# Asthma Diesase Project

Possible models
1. Random Forest
2. SVM (Support Vector Machine)
3. KNN

In [1]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.metrics import classification_report, accuracy_score

# Path to your CSV file
zip_file_path = r'C:\Users\Admin\Downloads\archive.zip'
csv_file_name = 'asthma_disease_data.csv'

# Read the CSV file
data = pd.read_csv(zip_file_path)

# Display the first few rows of the dataset
print(data.head())


# Drop confidential column
data.drop(columns=['DoctorInCharge'], inplace=True)

# Define features and target
X = data.drop(columns=['Diagnosis', 'PatientID'])
y = data['Diagnosis']

# Define numerical and categorical columns
numerical_cols = ['Age', 'BMI', 'PhysicalActivity', 'DietQuality', 'SleepQuality', 'PollutionExposure', 'PollenExposure', 'DustExposure', 'LungFunctionFEV1', 'LungFunctionFVC']
categorical_cols = ['Gender', 'Ethnicity', 'EducationLevel', 'Smoking', 'PetAllergy', 'FamilyHistoryAsthma', 'HistoryOfAllergies', 'Eczema', 'HayFever', 'GastroesophagealReflux', 'Wheezing', 'ShortnessOfBreath', 'ChestTightness', 'Coughing', 'NighttimeSymptoms', 'ExerciseInduced']

# Preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols),
        ('cat', OneHotEncoder(), categorical_cols)
    ])


   PatientID  Age  Gender  Ethnicity  EducationLevel        BMI  Smoking  \
0       5034   63       0          1               0  15.848744        0   
1       5035   26       1          2               2  22.757042        0   
2       5036   57       0          2               1  18.395396        0   
3       5037   40       1          2               1  38.515278        0   
4       5038   61       0          0               3  19.283802        0   

   PhysicalActivity  DietQuality  SleepQuality  ...  LungFunctionFEV1  \
0          0.894448     5.488696      8.701003  ...          1.369051   
1          5.897329     6.341014      5.153966  ...          2.197767   
2          6.739367     9.196237      6.840647  ...          1.698011   
3          1.404503     5.826532      4.253036  ...          3.032037   
4          4.604493     3.127048      9.625799  ...          3.470589   

   LungFunctionFVC  Wheezing  ShortnessOfBreath  ChestTightness  Coughing  \
0         4.941206         

## Random forest model

In [2]:
# Random forest Model
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))
])

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
RandomForestClassifier(random_state=42, class_weight='balanced')

#Training Model

smote = SMOTE()
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)
model.fit(X_resampled, y_resampled)

model.fit(X_train, y_train)

# Evaluate the model
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))
print("Accuracy:", accuracy_score(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.95      1.00      0.98       456
           1       0.00      0.00      0.00        23

    accuracy                           0.95       479
   macro avg       0.48      0.50      0.49       479
weighted avg       0.91      0.95      0.93       479

Accuracy: 0.9519832985386222


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## Support Vector Model

In [3]:
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.model_selection import train_test_split
import pandas as pd

# Load the dataset
file_path = r'C:\Users\Admin\Downloads\asthma_disease_data.csv'
df = pd.read_csv(file_path)
 
# Drop unnecessary columns
df_cleaned = df.drop(columns=['PatientID', 'DoctorInCharge'])
 
# Separate features and target
X = df_cleaned.drop(columns=['Diagnosis'])
y = df_cleaned['Diagnosis']
 
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
 
# Feature scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
 
# Best SVM hyperparameters
best_params = {'C': 0.1, 'gamma': 1, 'kernel': 'linear'}
best_svc = SVC(**best_params)
 
# Manually oversample the minority class in the training set
class_0_indices = np.where(y_train == 0)[0]
class_1_indices = np.where(y_train == 1)[0]
 
# Resample the minority class to match the majority class
oversampled_class_1_indices = np.random.choice(class_1_indices, size=len(class_0_indices), replace=True)
 
# Combine indices
oversampled_indices = np.concatenate([class_0_indices, oversampled_class_1_indices])
 
# Create oversampled training set
X_train_oversampled = X_train_scaled[oversampled_indices]
y_train_oversampled = y_train.iloc[oversampled_indices]
 
# Refit the SVM model with the best hyperparameters on the oversampled data
best_svc.fit(X_train_oversampled, y_train_oversampled)
 
# Predictions and Evaluation
y_pred_oversampled = best_svc.predict(X_test_scaled)
classification_rep_oversampled = classification_report(y_test, y_pred_oversampled)
roc_auc_oversampled = roc_auc_score(y_test, best_svc.decision_function(X_test_scaled))
 
print("Classification Report:\n", classification_rep_oversampled)
print("ROC AUC Score:", roc_auc_oversampled)

Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.52      0.67       681
           1       0.06      0.57      0.11        37

    accuracy                           0.52       718
   macro avg       0.51      0.54      0.39       718
weighted avg       0.91      0.52      0.64       718

ROC AUC Score: 0.5189506687304044


## K-nearest neighbors Model

In [8]:
import statsmodels.api as sm
from sklearn.neighbors import KNeighborsClassifier 
import random

# print the summary of the data
print(df.describe())

random.seed(42)

# Define the model k = 3
knn3 = KNeighborsClassifier(n_neighbors=3)
knn3.fit(X_train,y_train)

# Prediction and evaluation
predictions = knn3.predict(X_test)
accuracy_rate = knn3.score(X_test,y_test)
print(accuracy_rate)

         PatientID          Age       Gender    Ethnicity  EducationLevel  \
count  2392.000000  2392.000000  2392.000000  2392.000000     2392.000000   
mean   6229.500000    42.137960     0.493311     0.669732        1.307274   
std     690.655244    21.606655     0.500060     0.986120        0.898242   
min    5034.000000     5.000000     0.000000     0.000000        0.000000   
25%    5631.750000    23.000000     0.000000     0.000000        1.000000   
50%    6229.500000    42.000000     0.000000     0.000000        1.000000   
75%    6827.250000    61.000000     1.000000     1.000000        2.000000   
max    7425.000000    79.000000     1.000000     3.000000        3.000000   

               BMI      Smoking  PhysicalActivity  DietQuality  SleepQuality  \
count  2392.000000  2392.000000       2392.000000  2392.000000   2392.000000   
mean     27.244877     0.141722          5.051786     5.022867      7.019012   
std       7.201628     0.348838          2.903574     2.909980    