# Asthma Disease Project

Possible models
1. Random Forest
2. SVM (Support Vector Machine)
3. KNN

In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.metrics import classification_report, accuracy_score

# Location of the downloaded dataset archive and the CSV it is expected to hold.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
zip_file_path = r'C:\Users\Admin\Downloads\archive.zip'
csv_file_name = 'asthma_disease_data.csv'

# Read the CSV straight from the archive. pandas infers zip compression from
# the ".zip" extension, which only works when the archive contains exactly one
# file (presumably csv_file_name — TODO confirm against the actual archive).
data = pd.read_csv(zip_file_path)

# Display the first few rows of the dataset
print(data.head())

# Drop confidential column. Rebinding (instead of inplace=True) leaves the
# frame returned by read_csv untouched and keeps the step easy to re-run.
data = data.drop(columns=['DoctorInCharge'])

# Define features and target: everything except the label and the row id.
X = data.drop(columns=['Diagnosis', 'PatientID'])
y = data['Diagnosis']

# Define numerical and categorical columns
numerical_cols = ['Age', 'BMI', 'PhysicalActivity', 'DietQuality', 'SleepQuality', 'PollutionExposure', 'PollenExposure', 'DustExposure', 'LungFunctionFEV1', 'LungFunctionFVC']
categorical_cols = ['Gender', 'Ethnicity', 'EducationLevel', 'Smoking', 'PetAllergy', 'FamilyHistoryAsthma', 'HistoryOfAllergies', 'Eczema', 'HayFever', 'GastroesophagealReflux', 'Wheezing', 'ShortnessOfBreath', 'ChestTightness', 'Coughing', 'NighttimeSymptoms', 'ExerciseInduced']

# Preprocessing: standardise the numeric columns and one-hot encode the
# categorical ones. handle_unknown='ignore' makes the encoder emit an all-zero
# row for a category level it did not see during fit, instead of raising at
# predict time when a rare level only appears in the test split.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
    ])


## Random forest model

In [None]:
# Cell-local import, following this notebook's per-cell import style.
from imblearn.over_sampling import SMOTENC

# Random forest model: shared preprocessing followed by the classifier.
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))
])

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Balance the classes in the TRAINING split only, so the test split keeps its
# original class distribution. SMOTENC is used instead of plain SMOTE because
# the features mix numeric and integer-coded categorical columns: plain SMOTE
# would interpolate the category codes into fractional values, which the
# one-hot encoder would then treat as brand-new categories.
# Assumes every name in categorical_cols is a column of X_train — TODO confirm.
categorical_positions = [X_train.columns.get_loc(col) for col in categorical_cols]
smote = SMOTENC(categorical_features=categorical_positions, random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

# Training Model: fit exactly once, on the resampled data. (Fitting again on
# the raw X_train afterwards would discard the effect of the oversampling.)
model.fit(X_resampled, y_resampled)

# Evaluate the model on the untouched test split
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))
print("Accuracy:", accuracy_score(y_test, y_pred))


## Support Vector Machine Model

In [None]:
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.model_selection import train_test_split
import pandas as pd

# Load the dataset
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
file_path = r'C:\Users\Admin\Downloads\asthma_disease_data.csv'
df = pd.read_csv(file_path)

# Drop unnecessary columns (row identifier and the doctor column)
df_cleaned = df.drop(columns=['PatientID', 'DoctorInCharge'])

# Separate features and target
# NOTE: X, y and the split variables below replace the ones created by the
# earlier cells of this notebook.
X = df_cleaned.drop(columns=['Diagnosis'])
y = df_cleaned['Diagnosis']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Feature scaling: fit on the training split only, then apply to both splits
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Best SVM hyperparameters
# ('gamma' has no effect with the linear kernel; it is kept as recorded.)
best_params = {'C': 0.1, 'gamma': 1, 'kernel': 'linear'}
best_svc = SVC(**best_params)

# Manually oversample class 1 in the training set.
# np.where yields POSITIONS within y_train, which is what both the scaled
# NumPy matrix and y_train.iloc expect below.
class_0_indices = np.where(y_train == 0)[0]
class_1_indices = np.where(y_train == 1)[0]

# Draw class-1 rows with replacement until there are as many as class-0 rows.
# A seeded generator makes the draw, and therefore the reported metrics,
# reproducible from run to run.
rng = np.random.default_rng(42)
oversampled_class_1_indices = rng.choice(class_1_indices, size=len(class_0_indices), replace=True)

# Combine indices
oversampled_indices = np.concatenate([class_0_indices, oversampled_class_1_indices])

# Create oversampled training set
X_train_oversampled = X_train_scaled[oversampled_indices]
y_train_oversampled = y_train.iloc[oversampled_indices]

# Fit the SVM model with the chosen hyperparameters on the oversampled data
best_svc.fit(X_train_oversampled, y_train_oversampled)

# Predictions and Evaluation on the untouched (scaled) test split
y_pred_oversampled = best_svc.predict(X_test_scaled)
classification_rep_oversampled = classification_report(y_test, y_pred_oversampled)
# ROC AUC is computed from the signed decision-function scores, not the labels
roc_auc_oversampled = roc_auc_score(y_test, best_svc.decision_function(X_test_scaled))

print("Classification Report:\n", classification_rep_oversampled)
print("ROC AUC Score:", roc_auc_oversampled)

## K-nearest neighbors Model

In [None]:
import statsmodels.api as sm
from sklearn.neighbors import KNeighborsClassifier
import random

# print the summary of the data
print(df.describe())

random.seed(42)

# Define the model k = 3
knn3 = KNeighborsClassifier(n_neighbors=3)

# KNN classifies by distance, so features on large scales would dominate the
# neighbour search. Train and evaluate on the standardised matrices produced
# by the scaling step of the SVM cell (X_train_scaled / X_test_scaled), which
# correspond row-for-row to y_train / y_test.
knn3.fit(X_train_scaled, y_train)

# Prediction and evaluation
predictions = knn3.predict(X_test_scaled)
accuracy_rate = knn3.score(X_test_scaled, y_test)
print(accuracy_rate)