<a href="https://colab.research.google.com/github/yasmiinalii7/Asthma-Machine-Learning/blob/main/asthma_risk_severity_dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# !pip install imbalanced-learn
# !pip install ydata-profiling

In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt                                                                      # Visualization
import seaborn as sns

from ydata_profiling import ProfileReport                                                           # !pip install ydata-profiling

from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder, PolynomialFeatures     # Preprocessing
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE                                                            # To handle class imbalance

from scipy import stats

from sklearn.linear_model import LogisticRegression                                                  # Machine learning models
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier

from keras.models import Sequential                                                                  # ANN
from keras.layers import Dense
from keras.optimizers import Adam

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix                  # Evaluation

import joblib                                                                                        # Save and load

import warnings
warnings.filterwarnings('ignore')

In [None]:
df = pd.read_csv('/content/synthetic_asthma_dataset.csv')

In [None]:
df.head()

### Understand the Data

In [None]:
df.info()

In [None]:
df.shape

### Missing Values

In [None]:
df.isnull().sum()

In [None]:
df.drop('Patient_ID',axis=1,inplace=True)

In [None]:
df['Allergies'].fillna(df['Allergies'].mode()[0],inplace=True)

In [None]:
df.drop('Asthma_Control_Level',axis=1,inplace=True)

In [None]:
df.drop('Comorbidities',axis=1,inplace=True)

In [None]:
df.isnull().sum()

# YData profiling

In [None]:
# Build the ydata-profiling report for the current frame and export it as HTML.
profile = ProfileReport(df)
profile.to_file('report.html')

### Visualize the Data

In [None]:
# Age Distribution
plt.figure(figsize=(8, 4))
sns.histplot(df['Age'], bins=30, kde=True)
plt.title('Age Distribution')
plt.show()

In [None]:
sns.countplot(x='Gender', hue='Has_Asthma', data=df)
plt.title('Asthma')
plt.show()

In [None]:
sns.histplot(df['Has_Asthma'])

In [None]:
plt.figure(figsize=(10, 6))
sns.barplot(x='Smoking_Status', y='Has_Asthma', data=df)
plt.title('Smoking Status vs. Asthma')
plt.show()

In [None]:
plt.figure(figsize=(10, 6))
sns.heatmap(df.corr(numeric_only=True), annot=True, cmap='coolwarm')
plt.title('Correlation Heatmap')
plt.show()

In [None]:
plt.figure(figsize=(10, 6))
sns.boxplot(x='Has_Asthma', y='Age', hue='Gender', data=df)
plt.title('Age Distribution by Asthma and Gender')
plt.xticks([0, 1], ['No Asthma', 'Has Asthma'])
plt.show()

In [None]:
sns.scatterplot(x='BMI', y='Has_Asthma', data=df)
plt.title('BMI vs. Asthma')
plt.show()

### Handling Outliers

In [None]:
# One horizontal boxplot per float64/int64 column, stacked vertically,
# to eyeball outliers.
numeric_cols = df.select_dtypes(include=['float64', 'int64']).columns

# squeeze=False keeps `axes` two-dimensional even when there is one column.
fig, axes = plt.subplots(
    len(numeric_cols), 1,
    figsize=(10, len(numeric_cols)*4),
    squeeze=False,
)

for ax, col in zip(axes.ravel(), numeric_cols):
    sns.boxplot(x=df[col], color='skyblue', ax=ax)
    ax.set_title(f'Boxplot for {col}')

# Lay the figure out once, after every subplot exists, instead of
# recomputing the layout on each loop iteration.
fig.tight_layout()
plt.show()

In [None]:
# List of features to clean
features = ['BMI', 'Number_of_ER_Visits', 'FeNO_Level']

# Rows are kept only if they fall inside the 1.5*IQR fences of EVERY feature.
# All fences are computed on the same, unfiltered frame and applied in one
# step. Filtering inside the loop would make later features' quartiles (and
# the printed counts) depend on the order of `features`.
keep_mask = pd.Series(True, index=df.index)

for col in features:
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1

    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    #  show how many outliers
    outliers = df[(df[col] < lower_bound) | (df[col] > upper_bound)]
    print(f"{col} → Outliers: {len(outliers)}")

    # between() is inclusive on both ends; NaN compares False, so rows with a
    # missing value in `col` are dropped as well.
    keep_mask &= df[col].between(lower_bound, upper_bound)

# Handling: remove outliers. copy() gives an independent frame so the column
# assignments in later cells do not operate on a slice of the original.
df = df[keep_mask].copy()


# Preprocessing

In [None]:
df.info()

In [None]:
for col in df.select_dtypes(include='object').columns:
    print(f"Unique values in column '{col}':")
    print(df[col].unique())

In [None]:
df['Gender']=df['Gender'].map({'Female':1,'Male':0, 'Other':2})

In [None]:
df['Smoking_Status']=df['Smoking_Status'].map({'Current':0,'Former':1,'Never':2})
df['Occupation_Type']=df['Occupation_Type'].map({'Outdoor':0,'Indoor':1})

In [None]:
le=LabelEncoder()
df['Allergies']=le.fit_transform(df['Allergies'])
df['Physical_Activity_Level']=le.fit_transform(df['Physical_Activity_Level'])
df['Air_Pollution_Level']=le.fit_transform(df['Air_Pollution_Level'])

In [None]:
df.info()

In [None]:
plt.figure(figsize=(10,10))
sns.heatmap(df.corr(),annot=True,fmt='.2f')
plt.show()

In [None]:
x=df.drop('Has_Asthma',axis=1)
y=df['Has_Asthma']

In [None]:
st=StandardScaler()
x=st.fit_transform(x)

In [None]:
smote=SMOTE()
x,y=smote.fit_resample(x,y)

## Feature Expansion

In [None]:
poly=PolynomialFeatures(degree=2)
x_poly = poly.fit_transform(x)

## Dimensionality reduction

In [None]:
pca = PCA(n_components=2)
x_pca = pca.fit_transform(x)

In [None]:
df_pca = pd.DataFrame(data=x_pca, columns=['PC1', 'PC2'])            # dataframe to draw
df_pca['Has_Asthma'] = y.values

In [None]:
plt.figure(figsize=(8,6))
sns.scatterplot(
    data=df_pca,
    x='PC1', y='PC2',
    hue='Has_Asthma',
    palette=['skyblue', 'tomato'],
    alpha=0.7
)
plt.title('PCA (2D) - Has_Asthma Distribution')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.legend(title='Has Asthma')
plt.grid(True)
plt.show()

# Data Splitting

In [None]:
x_train,x_test,y_train,y_test=train_test_split(x_poly,y,test_size=0.2,random_state=42 , stratify=y)

### Random Forest

In [None]:
rf=RandomForestClassifier(n_estimators=1000, random_state=42 , max_depth=20 , class_weight='balanced')

In [None]:
rf.fit(x_train, y_train)
y_pred_rf=rf.predict(x_test)

In [None]:
accuracy_score(y_test,y_pred_rf)

In [None]:
print(classification_report(y_test,y_pred_rf))

In [None]:
conf_matrix = confusion_matrix(y_test, y_pred_rf)
sns.heatmap(conf_matrix, annot=True, fmt="d")

In [None]:
importances=rf.feature_importances_
importances

In [None]:
feature_names=poly.get_feature_names_out(df.drop('Has_Asthma', axis=1).columns)
plt.figure(figsize=(10, 6))
plt.bar(feature_names, importances)
plt.xticks(rotation=90)
plt.title("Feature Importances")
plt.tight_layout()
plt.show()

### Support Vector Machine

In [None]:
svc=SVC(kernel='rbf', C=1.0, gamma='scale', class_weight='balanced')

In [None]:
svc.fit(x_train,y_train)

In [None]:
y_pred_svc=svc.predict(x_test)

In [None]:
accuracy_score(y_test,y_pred_svc)

In [None]:
print(classification_report(y_test,y_pred_svc))

# Decision Tree

In [None]:
dt=DecisionTreeClassifier(class_weight='balanced')

In [None]:
dt.fit(x_train,y_train)

In [None]:
y_pred_dt=dt.predict(x_test)

In [None]:
accuracy_score(y_test,y_pred_dt)

In [None]:
print(classification_report(y_test,y_pred_dt))

### Logistic Regression

In [None]:
lr=LogisticRegression(class_weight='balanced')

In [None]:
lr.fit(x_train,y_train)

In [None]:
y_pred_lr=lr.predict(x_test)

In [None]:
accuracy_score(y_test,y_pred_lr)

In [None]:
print(classification_report(y_test,y_pred_lr))

# KNN

In [None]:
knn=KNeighborsClassifier(n_neighbors=5)

In [None]:
knn.fit(x_train,y_train)

In [None]:
y_pred_knn=knn.predict(x_test)

In [None]:
accuracy_score(y_test,y_pred_knn)

In [None]:
print(classification_report(y_test,y_pred_knn))

# Naive Bayes

In [None]:
nb=GaussianNB()

In [None]:
nb.fit(x_train,y_train)

In [None]:
y_pred_nb=nb.predict(x_test)

In [None]:
accuracy_score(y_test,y_pred_nb)

In [None]:
print(classification_report(y_test,y_pred_nb))

# ADABOOST

In [None]:
base = DecisionTreeClassifier(max_depth=1)

In [None]:
ada_model = AdaBoostClassifier(estimator=base, n_estimators=100, random_state=42)

In [None]:
ada_model.fit(x_train, y_train)

In [None]:
y_pred_ada = ada_model.predict(x_test)

In [None]:
accuracy_score(y_test,y_pred_ada)

In [None]:
print(classification_report(y_test,y_pred_ada))

# Neural Network (ANN)

In [None]:
model = Sequential()
model.add(Dense(64, activation='relu', input_shape=(x_train.shape[1],)))
model.add(Dense(32, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

In [None]:
model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
model.fit(x_train, y_train, epochs=15, batch_size=32, validation_split=0.2)

In [None]:
loss, accuracy = model.evaluate(x_test, y_test)
print(f"Total Accuracy on Test Data: {accuracy:.2f}")

In [None]:
# Test accuracy per model, computed from the predictions made in the cells
# above rather than pasted in as literals, so the chart always matches the
# current run. `accuracy` is the ANN test accuracy returned by
# model.evaluate().
results = {
    'Logistic Regression': accuracy_score(y_test, y_pred_lr),
    'KNN': accuracy_score(y_test, y_pred_knn),
    'SVC': accuracy_score(y_test, y_pred_svc),
    'Decision Tree': accuracy_score(y_test, y_pred_dt),
    'Random Forest': accuracy_score(y_test, y_pred_rf),
    'Naive Bayes': accuracy_score(y_test, y_pred_nb),
    'AdaBoost': accuracy_score(y_test, y_pred_ada),
    'ANN': accuracy,
}
model_names = list(results)
accuracies = list(results.values())

# Draw the bar plot
plt.figure(figsize=(12, 6))
plt.bar(model_names, accuracies, color='mediumslateblue')
plt.xlabel('Models')
plt.ylabel('Accuracy')
plt.title('Model Comparison')
plt.xticks(rotation=45)
plt.ylim(0, 1)
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()
# Call show(); a bare `plt.show` only references the function.
plt.show()

# Save the best model

In [None]:
# Persist the fitted preprocessing alongside the model: the forest was trained
# on scaled, polynomial-expanded features, so new data cannot be scored without
# the same fitted scaler and PolynomialFeatures transformer.
joblib.dump(
    {
        'scaler': st,
        'poly': poly,
        'feature_columns': list(df.drop('Has_Asthma', axis=1).columns),
    },
    'preprocessing.pkl',
)
# The model itself keeps its original file name.
joblib.dump(rf, 'rf.pkl')