**Data loading and preprocessing**

In [29]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# Load the training and testing datasets
train_df = pd.read_csv("/content/drive/MyDrive/Train_data.csv")
test_df = pd.read_csv("/content/drive/MyDrive/test_data.csv")

# Concatenate train and test datasets for preprocessing
combined_df = pd.concat([train_df, test_df], ignore_index=True)

# Data Cleaning
combined_df.dropna(inplace=True)

# Perform label encoding for the Disease column
le = LabelEncoder()
combined_df['Disease'] = le.fit_transform(combined_df['Disease'])

# Split data into features and target variable
X = combined_df.drop(columns=['Disease'])  # Features
y = combined_df['Disease']  # Target variable

# Splitting the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


**Model initialization and training**

In [30]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV

# Initialize models with pipelines
models = {
    'Random Forest': Pipeline([('scaler', StandardScaler()), ('clf', RandomForestClassifier())]),
    'Decision Tree': Pipeline([('scaler', StandardScaler()), ('clf', DecisionTreeClassifier())]),
    'Naive Bayes': Pipeline([('scaler', StandardScaler()), ('clf', GaussianNB())])
}

# Hyperparameter tuning with GridSearchCV
param_grid = {
    'Random Forest': {'clf__n_estimators': [100, 200, 300]},
    'Decision Tree': {'clf__max_depth': [None, 10, 20, 30]},
    'Naive Bayes': {}
}

best_models = {}
for name, model in models.items():
    grid_search = GridSearchCV(model, param_grid[name], cv=5, scoring='f1_macro')
    grid_search.fit(X_train, y_train)
    best_models[name] = grid_search.best_estimator_


**Model evaluation**

In [31]:
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.model_selection import cross_val_score
import warnings

# Model evaluation
results = {}
for name, model in best_models.items():
    scores = cross_val_score(model, X_train, y_train, cv=5, scoring='f1_macro')
    with warnings.catch_warnings():
        warnings.filterwarnings("ignore", message="Precision is ill-defined and being set to 0.0 in labels with no predicted samples.")
        results[name] = {
            'Accuracy': scores.mean(),
            'Precision': precision_score(y_test, model.predict(X_test), average='macro', zero_division=1),
            'Recall': recall_score(y_test, model.predict(X_test), average='macro'),
            'F1-score': f1_score(y_test, model.predict(X_test), average='macro')
        }

# Display results
results_df = pd.DataFrame(results).T
print("Model Performance on Test Data:")
print(results_df)

# Choose the best model based on F1-score
best_model_name = results_df.idxmax()['F1-score']
best_model = best_models[best_model_name]


Model Performance on Test Data:
               Accuracy  Precision    Recall  F1-score
Random Forest  0.800083   0.978947  0.814786  0.838484
Decision Tree  0.844819   0.762464  0.778674  0.770080
Naive Bayes    0.735341   0.886976  0.701421  0.709297


**Prediction**

In [32]:
# Hardcoded input data
input_data = {
    'Glucose': 0.413680476,
    'Cholesterol': 0.536058397,
    'Hemoglobin': 0.222889694,
    'Platelets': 0.130596361,
    'White Blood Cells': 0.678140913,
    'Red Blood Cells': 0.832631669,
    'Hematocrit': 0.461772312,
    'Mean Corpuscular Volume': 0.388528562,
    'Mean Corpuscular Hemoglobin': 0.33082952,
    'Mean Corpuscular Hemoglobin Concentration': 0.496490935,
    'Insulin': 0.118212034,
    'BMI': 0.400445712,
    'Systolic Blood Pressure': 0.743976602,
    'Diastolic Blood Pressure': 0.831235426,
    'Triglycerides': 0.943581849,
    'HbA1c': 0.464659192,
    'LDL Cholesterol': 0.167301875,
    'HDL Cholesterol': 0.067760585,
    'ALT': 0.121632728,
    'AST': 0.384449662,
    'Heart Rate': 0.928104624,
    'Creatinine': 0.063159835,
    'Troponin': 0.814727419,
    'C-reactive Protein': 0.49286176
}

# Manual input and prediction function
def predict_disease_manually(model):
    input_features = pd.DataFrame(input_data, index=[0])
    predicted_disease = model.predict(input_features)
    return predicted_disease

# Example of manual input and prediction using the best model
print(f"\nUsing the best model ({best_model_name}):")
predicted_disease = predict_disease_manually(best_model)
print(f"Predicted disease: {le.inverse_transform(predicted_disease)}")



Using the best model (Random Forest):
Predicted disease: ['Thromboc']
