In [1]:
import pandas as pd

# Load the datasets
final_df = pd.read_csv("final.csv")
disease_df = pd.read_csv("Disease_symptom_and_patient_profile_dataset.csv")

# Drop rows with null values
final_df.dropna(inplace=True)
disease_df.dropna(inplace=True)

# Merge the datasets on disease or drug
merged_df = pd.merge(final_df, disease_df, left_on='disease', right_on='Disease', how='inner')

# Drop unnecessary columns
merged_df.drop(columns=['Disease'], inplace=True)


In [2]:
# Save the merged dataset
merged_df.to_csv("merged_dataset.csv", index=False)
merged_df.head()

Unnamed: 0.1,Unnamed: 0,disease,drug,Fever,Cough,Fatigue,Difficulty Breathing,Age,Gender,Blood Pressure,Cholesterol Level,Outcome Variable
0,75,Acne,doxycycline,No,No,Yes,No,40,Male,Normal,Normal,Negative
1,76,Acne,spironolactone,No,No,Yes,No,40,Male,Normal,Normal,Negative
2,77,Acne,minocycline,No,No,Yes,No,40,Male,Normal,Normal,Negative
3,78,Acne,clindamycin,No,No,Yes,No,40,Male,Normal,Normal,Negative
4,79,Acne,tretinoin,No,No,Yes,No,40,Male,Normal,Normal,Negative


In [3]:
df = merged_df

# Convert the 'Fever', 'Cough', 'Fatigue', 'Difficulty Breathing' columns to binary
df['Fever'] = df['Fever'].map({'Yes': 1, 'No': 0})
df['Cough'] = df['Cough'].map({'Yes': 1, 'No': 0})
df['Fatigue'] = df['Fatigue'].map({'Yes': 1, 'No': 0})
df['Difficulty Breathing'] = df['Difficulty Breathing'].map({'Yes': 1, 'No': 0})
df['Blood Pressure'] = df['Blood Pressure'].map({'Normal': 0, 'Low': 1, 'high':1})
df['Cholesterol Level'] = df['Cholesterol Level'].map({'Normal': 0, 'Low': 1, 'high':1})

# Calculate the mean occurrence of each symptom for each disease
mean_symptoms = df.groupby('disease')[['Fever', 'Cough', 'Fatigue', 'Blood Pressure','Cholesterol Level' ,'Difficulty Breathing']].mean()

# Find the top 3 symptoms for each disease
top_3_symptoms = mean_symptoms.apply(lambda x: x.nlargest(3).index.tolist(), axis=1)

print("Top 3 most probable symptoms for each disease:")
print(top_3_symptoms)
print(type(top_3_symptoms))

Top 3 most probable symptoms for each disease:
disease
Acne                                   [Fatigue, Fever, Cough]
Allergic Rhinitis          [Cough, Fatigue, Cholesterol Level]
Alzheimer's Disease                    [Fatigue, Fever, Cough]
Anemia                                 [Fatigue, Fever, Cough]
Appendicitis                           [Fever, Fatigue, Cough]
                                          ...                 
Thyroid Cancer                         [Fatigue, Fever, Cough]
Turner Syndrome                        [Fatigue, Fever, Cough]
Typhoid Fever                [Cough, Cholesterol Level, Fever]
Ulcerative Colitis                     [Cough, Fatigue, Fever]
Urinary Tract Infection                [Fever, Cough, Fatigue]
Length: 64, dtype: object
<class 'pandas.core.series.Series'>


In [4]:
top_3_symptoms_df = top_3_symptoms.reset_index()
top_3_symptoms_df.columns = ['Disease', 'Top Symptoms']

# Save the DataFrame to a CSV file
top_3_symptoms_df.to_csv("top_3_symptoms_per_disease.csv", index=False)

In [6]:
# Load the files
top_3_symptoms_df = pd.read_csv("top_3_symptoms_per_disease.csv")

merged_df = pd.read_csv("merged_dataset.csv")
merged_df = merged_df[["disease", "drug"]]
merged_df = merged_df.rename(columns={"disease": "Disease"})

drugs_df = pd.read_csv("drugs_for_common_treatments.csv")
drugs_df = drugs_df[['drug_name', 'rating']]
drugs_df = drugs_df.rename(columns={"drug_name": "drug"})

# Merge top symptoms with merged dataset
merged_df['drug'] = merged_df['drug'].str.strip()
top_3_symptoms_df = pd.merge(top_3_symptoms_df, merged_df, on='Disease', how='inner')

# Merge drug ratings
merged_with_ratings_df = pd.merge(top_3_symptoms_df, drugs_df, on='drug', how='left')

merged_with_ratings_df.to_csv("tmp.csv", index=False)
merged_with_ratings_df.head()

Unnamed: 0,Disease,Top Symptoms,drug,rating
0,Acne,"['Fatigue', 'Fever', 'Cough']",doxycycline,6.8
1,Acne,"['Fatigue', 'Fever', 'Cough']",doxycycline,5.1
2,Acne,"['Fatigue', 'Fever', 'Cough']",doxycycline,6.5
3,Acne,"['Fatigue', 'Fever', 'Cough']",doxycycline,8.0
4,Acne,"['Fatigue', 'Fever', 'Cough']",doxycycline,6.6


In [14]:
# 删除 rating 列为空的行
df = merged_with_ratings_df.dropna(subset=['rating'])
# 根据 drug 计算 rating 的平均值
avg_rating_per_drug = df.groupby(['Disease','Top Symptoms','drug'])['rating'].mean().reset_index()

# 根据 Disease、Top Symptoms 和 drug 进行分组，并根据 rating 进行排序
sorted_df = avg_rating_per_drug.groupby(['Disease', 'Top Symptoms']).apply(lambda x: x.sort_values('rating', ascending=False)).reset_index(drop=True)
# 仅保留每种药物的平均评分
sorted_df.to_csv("avg_rating_per_drug.csv", index=False)
sorted_df.head()

Unnamed: 0,Disease,Top Symptoms,drug,rating
0,Acne,"['Fatigue', 'Fever', 'Cough']",cephalexin,8.4
1,Acne,"['Fatigue', 'Fever', 'Cough']",isotretinoin,8.0
2,Acne,"['Fatigue', 'Fever', 'Cough']",benzoyl peroxide,7.8
3,Acne,"['Fatigue', 'Fever', 'Cough']",tretinoin,7.7
4,Acne,"['Fatigue', 'Fever', 'Cough']",minocycline,7.533333
