## Split the data into training and testing

In [12]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [13]:
# load the dataset
df = pd.read_csv("../../data/engineered_COPD_data.csv")

In [14]:
# define the features and the target variable
X, y = df.drop(columns=["COPD_Diagnosis"]), df["COPD_Diagnosis"]

In [15]:
# split the data into training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Model training
- Based on the data, it is a binary classification problem set as we are prediction if someone has COPD or not, hence we are going to use following models:
    - Logistic Regression
    - Decision Trees
    - Random Forest

In [16]:
# import necessary libraries
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import pickle

In [17]:
# initialize the model
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier()
}

In [20]:
# Train the models
for name, model in models.items():
    model.fit(X_train, y_train)

    # Save the models
    with open(f"../../models/{name.replace(' ', '_')}.pkl", "wb") as file:
        pickle.dump(model, file)
    
    print(f"{name} model trained and saved")

print("Model training completed")


Logistic Regression model trained and saved
Decision Tree model trained and saved
Random Forest model trained and saved
Model training completed


## Evaluate the models
- Accuracy, Precision, Recall ,F1 score

In [21]:
for name, model in models.items():
    y_pred = model.predict(X_test)
    print(f"\n{name} Evaluation: ")
    print(classification_report(y_test, y_pred))


Logistic Regression Evaluation: 
              precision    recall  f1-score   support

           0       0.97      0.98      0.97       134
           1       0.95      0.94      0.95        66

    accuracy                           0.96       200
   macro avg       0.96      0.96      0.96       200
weighted avg       0.96      0.96      0.96       200


Decision Tree Evaluation: 
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       134
           1       1.00      1.00      1.00        66

    accuracy                           1.00       200
   macro avg       1.00      1.00      1.00       200
weighted avg       1.00      1.00      1.00       200


Random Forest Evaluation: 
              precision    recall  f1-score   support

           0       0.99      1.00      1.00       134
           1       1.00      0.98      0.99        66

    accuracy                           0.99       200
   macro avg       1.00      0.99     

## Model refinement

In [23]:
from sklearn.model_selection import GridSearchCV

In [25]:
# define the parameter grid for random forest
params_grid = {
    "n_estimators": [50, 100, 200],
    "max_depth": [None, 10, 20, 30],
    "min_sample_split": [2, 3, 10]
}

In [26]:
# initialize the grid search cv\
grid_search = GridSearchCV(estimator=RandomForestClassifier(), param_grid=params_grid, cv=5, n_jobs = -1, scoring="accuracy")

In [28]:
# fit the grid search cv
grid_search.fit(X_train, y_train)

TypeError: estimator should be an estimator implementing 'fit' method,      Age  Biomass_Fuel_Exposure  Occupational_Exposure  Family_History_COPD  \
29    50                      0                      0                    0   
535   35                      1                      0                    1   
695   76                      1                      0                    0   
557   53                      1                      0                    0   
836   77                      0                      1                    0   
..   ...                    ...                    ...                  ...   
106   50                      1                      0                    0   
270   49                      1                      0                    0   
860   65                      0                      0                    0   
435   77                      0                      0                    0   
102   36                      1                      1                    1   

       BMI  Air_Pollution_Level  Respiratory_Infections_Childhood  \
29   25.34                  135                                 1   
535  20.41                  123                                 1   
695  24.52                   63                                 1   
557  25.27                  106                                 0   
836  33.60                  135                                 0   
..     ...                  ...                               ...   
106  28.27                  131                                 1   
270  34.63                  171                                 1   
860  33.96                  109                                 1   
435  25.50                  179                                 0   
102  30.63                   72                                 1   

     Pollution_Risk_Score  Smoking_Status_encoded  Gender_  \
29                      0                     0.0        0   
535                     0                     1.0        0   
695                     0                     0.0        0   
557                     0                     0.5        1   
836                     0                     0.0        0   
..                    ...                     ...      ...   
106                     0                     0.5        1   
270                     1                     0.5        1   
860                     0                     1.0        1   
435                     1                     1.0        1   
102                     0                     0.0        1   

     Smoking_Pollution_Interaction  Location_Biratnagar  Location_Butwal  \
29                             0.0                False            False   
535                          123.0                False            False   
695                            0.0                False             True   
557                           53.0                False            False   
836                            0.0                False            False   
..                             ...                  ...              ...   
106                           65.5                False            False   
270                           85.5                 True            False   
860                          109.0                False            False   
435                          179.0                False            False   
102                            0.0                False            False   

     Location_Chitwan  Location_Dharan  Location_Hetauda  Location_Kathmandu  \
29              False            False              True               False   
535             False            False             False               False   
695             False            False             False               False   
557             False            False             False               False   
836             False            False             False               False   
..                ...              ...               ...                 ...   
106             False            False             False               False   
270             False            False             False               False   
860             False            False             False               False   
435             False            False             False               False   
102              True            False             False               False   

     Location_Lalitpur  Location_Nepalgunj  Location_Pokhara  
29               False               False             False  
535              False               False             False  
695              False               False             False  
557              False               False             False  
836              False               False             False  
..                 ...                 ...               ...  
106              False               False              True  
270              False               False             False  
860               True               False             False  
435              False                True             False  
102              False               False             False  

[800 rows x 20 columns] was passed

In [None]:
# best parameters
print(f"Best Parameters: {grid_search.best_params_}")
best_model = grid_search.best_estimator_