In [1]:
import pandas as pd

In [2]:
data_dir=r"C:\Users\BIT\Downloads\patient_medical_history_50k.csv"
raw_df=pd.read_csv(data_dir)

In [3]:
raw_df=raw_df.dropna()

In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Assume raw_df is your DataFrame loaded with your data.
# Define features and targets.
x = raw_df[['Age','Gender','Past_Medical_History','Condition','Prescribed_Antibiotic']]
y = raw_df[['Take_Medication','Suggested_Antibiotic']]

# Replace binary values.
raw_df['Take_Medication'] = raw_df['Take_Medication'].replace({'Yes': 1, 'No': 0})
raw_df['Gender'] = raw_df['Gender'].replace({'Male': 1, 'Female': 0})

# Create separate LabelEncoder instances for each column.
le_history = LabelEncoder()
le_condition = LabelEncoder()
le_antibiotic = LabelEncoder()
le_target = LabelEncoder()

# Encode feature columns.
x['Past_Medical_History'] = le_history.fit_transform(x['Past_Medical_History'])
x['Condition'] = le_condition.fit_transform(x['Condition'])
x['Prescribed_Antibiotic'] = le_antibiotic.fit_transform(x['Prescribed_Antibiotic'])

# Encode target column for Suggested_Antibiotic.
y['Suggested_Antibiotic'] = le_target.fit_transform(y['Suggested_Antibiotic'])

print("Processed Dataset Sample:\n", raw_df.head())

# Split data into training and testing sets.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# Check for overlapping indices to ensure no data leakage from splitting.
overlap = set(x_train.index).intersection(set(x_test.index))
if overlap:
    print("Data leak detected: Overlapping indices found:", overlap)
else:
    print("No overlap between training and test sets. Data splitting is fine.")

# Initialize MultiOutputClassifier with RandomForestClassifier.
model = MultiOutputClassifier(RandomForestClassifier(random_state=42))

# Define hyperparameter grid.
param_grid = {
    'estimator__n_estimators': [50, 100],
    'estimator__max_depth': [None, 10, 20],
    'estimator__min_samples_split': [2, 5]
}

grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=3,
    scoring='accuracy',
    verbose=1,
    n_jobs=-1
)

grid_search.fit(x_train, y_train)

print("Best Parameters:", grid_search.best_params_)
print("Best Cross-Validation Score:", grid_search.best_score_)

# Predict on the test set.
predictions = grid_search.predict(x_test)
predictions_df = pd.DataFrame(predictions, columns=y.columns)
print("\nPredictions:\n", predictions_df)

# Convert numerical predictions for Suggested_Antibiotic back to original labels.
predictions_df['Suggested_Antibiotic'] = le_target.inverse_transform(predictions_df['Suggested_Antibiotic'])
print("\nFinal Predictions with String Labels:\n", predictions_df)

# Accuracy metrics.
accuracy1 = accuracy_score(y_test["Take_Medication"], predictions_df["Take_Medication"])
accuracy2 = accuracy_score(y_test["Suggested_Antibiotic"], predictions_df["Suggested_Antibiotic"])

print("\nAccuracy for Target1 (Binary Classification - Yes/No):", accuracy1)
print("Accuracy for Target2 (Categorical Classification - String Labels):", accuracy2)

# Additional Accuracy Tests: Classification reports and confusion matrices.
print("\nClassification Report for Target1 (Take_Medication):")
print(classification_report(y_test["Take_Medication"], predictions_df["Take_Medication"]))
print("Confusion Matrix for Target1 (Take_Medication):")
print(confusion_matrix(y_test["Take_Medication"], predictions_df["Take_Medication"]))

print("\nClassification Report for Target2 (Suggested_Antibiotic):")
print(classification_report(y_test["Suggested_Antibiotic"], predictions_df["Suggested_Antibiotic"]))
print("Confusion Matrix for Target2 (Suggested_Antibiotic):")
print(confusion_matrix(y_test["Suggested_Antibiotic"], predictions_df["Suggested_Antibiotic"]))


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x['Past_Medical_History'] = le_history.fit_transform(x['Past_Medical_History'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x['Condition'] = le_condition.fit_transform(x['Condition'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x['Prescribed_Antibiotic'] = le_antibiotic.fit_transform(x['Presc

Processed Dataset Sample:
    Age  Gender Past_Medical_History    Condition Prescribed_Antibiotic  \
2   61       1            Allergies    Sinusitis        Clarithromycin   
3   72       1             Diabetes  Sore Throat           Amoxicillin   
4   19       1             Diabetes   Bronchitis           Doxycycline   
5   33       1        Liver Disease  Sore Throat           Amoxicillin   
6   71       0        Heart Problem  Sore Throat           Amoxicillin   

   Take_Medication Suggested_Antibiotic  
2                1       Clarithromycin  
3                1          Amoxicillin  
4                1          Doxycycline  
5                1          Amoxicillin  
6                1          Amoxicillin  
No overlap between training and test sets. Data splitting is fine.
Fitting 3 folds for each of 12 candidates, totalling 36 fits




Best Parameters: {'estimator__max_depth': None, 'estimator__min_samples_split': 2, 'estimator__n_estimators': 50}
Best Cross-Validation Score: nan

Predictions:
       Take_Medication  Suggested_Antibiotic
0                   1                     2
1                   1                     4
2                   1                     3
3                   1                     4
4                   1                     4
...               ...                   ...
8594                0                     5
8595                0                     2
8596                1                     3
8597                1                     2
8598                1                     5

[8599 rows x 2 columns]

Final Predictions with String Labels:
       Take_Medication Suggested_Antibiotic
0                   1        Ciprofloxacin
1                   1          Doxycycline
2                   1       Clarithromycin
3                   1          Doxycycline
4                   1         

ValueError: Mix of label input types (string and number)