In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import joblib
import warnings
warnings.filterwarnings("ignore")


In [2]:
# Load your uploaded CSV
df = pd.read_csv("MedicInfo.csv")

# Show basic info
df.head()


Unnamed: 0,id,name,price(₹),Is_discontinued,manufacturer_name,type,pack_size_label,short_composition1,short_composition2
0,1,Augmentin 625 Duo Tablet,223.42,False,Glaxo SmithKline Pharmaceuticals Ltd,allopathy,strip of 10 tablets,Amoxycillin (500mg),Clavulanic Acid (125mg)
1,2,Azithral 500 Tablet,132.36,False,Alembic Pharmaceuticals Ltd,allopathy,strip of 5 tablets,Azithromycin (500mg),
2,3,Ascoril LS Syrup,118.0,False,Glenmark Pharmaceuticals Ltd,allopathy,bottle of 100 ml Syrup,Ambroxol (30mg/5ml),Levosalbutamol (1mg/5ml)
3,4,Allegra 120mg Tablet,218.81,False,Sanofi India Ltd,allopathy,strip of 10 tablets,Fexofenadine (120mg),
4,5,Avil 25 Tablet,10.96,False,Sanofi India Ltd,allopathy,strip of 15 tablets,Pheniramine (25mg),


In [3]:
df.info

<bound method DataFrame.info of             id                           name  price(₹)  Is_discontinued  \
0            1       Augmentin 625 Duo Tablet    223.42            False   
1            2            Azithral 500 Tablet    132.36            False   
2            3               Ascoril LS Syrup    118.00            False   
3            4           Allegra 120mg Tablet    218.81            False   
4            5                 Avil 25 Tablet     10.96            False   
...        ...                            ...       ...              ...   
253968  253969  Ziyapod 100mg Oral Suspension     62.30            False   
253969  253970            Zemhart 30mg Tablet     54.00            False   
253970  253971              Zivex 25mg Tablet     57.00            False   
253971  253972        ZI Fast 500mg Injection    152.00            False   
253972  253973      Zyvocol 1% Dusting Powder    110.00            False   

                            manufacturer_name       typ

In [4]:
df.isnull().sum()

id                         0
name                       0
price(₹)                   0
Is_discontinued            0
manufacturer_name          0
type                       0
pack_size_label            0
short_composition1         0
short_composition2    141802
dtype: int64

In [6]:
# Fill NaNs
# Clean column names for consistency
df.columns = df.columns.str.strip().str.lower().str.replace(' ', '_')

# Fill missing values
df['short_composition1'] = df['short_composition1'].fillna('')
df['short_composition2'] = df['short_composition2'].fillna('')
df['name'] = df['name'].fillna('')

# Create searchable field
df['combined_info'] = df['short_composition1'] + ' ' + df['short_composition2'] + ' ' + df['name']


In [7]:
# Initialize and fit TF-IDF vectorizer
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(df['combined_info'])

# Save the TF-IDF model and dataframe
joblib.dump(tfidf, "tfidf_vectorizer.joblib")
joblib.dump(tfidf_matrix, "tfidf_matrix.joblib")
df.to_csv("medicine_data_cleaned.csv", index=False)

print("✅ TF-IDF model and matrix saved!")


✅ TF-IDF model and matrix saved!


In [8]:
# Load saved objects
tfidf = joblib.load("tfidf_vectorizer.joblib")
tfidf_matrix = joblib.load("tfidf_matrix.joblib")
df = pd.read_csv("medicine_data_cleaned.csv")


In [11]:
def recommend_medicines(symptom_query, top_n=5):
    query_vec = tfidf.transform([symptom_query])
    similarity = cosine_similarity(query_vec, tfidf_matrix)
    top_indices = similarity[0].argsort()[-top_n:][::-1]
    
    results = df.iloc[top_indices][['name', 'manufacturer_name', 'price(₹)', 'is_discontinued']]
    results['similarity_score'] = similarity[0][top_indices]
    return results.reset_index(drop=True)



In [12]:
# Try example query
recommend_medicines("cold and cough", top_n=5)


Unnamed: 0,name,manufacturer_name,price(₹),is_discontinued,similarity_score
0,Kenadol Plus Cold & Cough Tablet,Fawn Incorporation,79.8,False,0.609974
1,Clistin Cold and Cough Syrup,Strides shasun Ltd,75.0,True,0.579068
2,Respimol Cough N Cold Tablet,BRD MediLabs,39.0,False,0.524344
3,Febrinil-CC Cough & Cold Capsule,Maneesh Pharmaceuticals Ltd,44.0,False,0.493696
4,D COLD SYRUP,Paras Pharmaceuticals Ltd,35.0,True,0.440902
