In [2]:
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
import seaborn as sns
import os
from sklearn.preprocessing import StandardScaler, MinMaxScaler, QuantileTransformer, RobustScaler, PowerTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

keys = ['A', 'D', 'S']
csv_files = [f for f in os.listdir() if f.split('.')[-1] in ['csv']]
csv_files

['asthma_dataset.csv', 'diabetes_data.csv', 'stroke_data.csv']

In [3]:
def check_skewness(data, cols):
    # Print the skewness of skewed numerical features, |skewness| > 0.5 is considered skewed
    df_skew = pd.DataFrame({"columns": cols, "skewness": data[cols].skew().values, "too skewed": abs(data[cols].skew().values) > 0.75})
    return df_skew

def model_test(model, Xtrain, Xtest, ytrain, ytest):
    model_ = model
    model_.fit(Xtrain, ytrain)
    print(classification_report(model_.predict(Xtest), ytest))
    return model_

def show_FI(model):
    return pd.DataFrame({"features": model.feature_names_in_[model.feature_importances_.argsort()], 
              "feature importances":np.sort(model.feature_importances_)})

In [4]:
diabetes = pd.read_csv("diabetes_data.csv").dropna()
X = diabetes.iloc[:, :-1]
y = diabetes.iloc[:, -1]

diabetes.head()

Unnamed: 0,Age,Sex,HighChol,CholCheck,BMI,Smoker,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,HvyAlcoholConsump,GenHlth,MentHlth,PhysHlth,DiffWalk,Stroke,HighBP,Diabetes
0,4.0,1.0,0.0,1.0,26.0,0.0,0.0,1.0,0.0,1.0,0.0,3.0,5.0,30.0,0.0,0.0,1.0,0.0
1,12.0,1.0,1.0,1.0,26.0,1.0,0.0,0.0,1.0,0.0,0.0,3.0,0.0,0.0,0.0,1.0,1.0,0.0
2,13.0,1.0,0.0,1.0,26.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,10.0,0.0,0.0,0.0,0.0
3,11.0,1.0,1.0,1.0,28.0,1.0,0.0,1.0,1.0,1.0,0.0,3.0,0.0,3.0,0.0,0.0,1.0,0.0
4,8.0,0.0,0.0,1.0,29.0,1.0,0.0,1.0,1.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0


In [20]:
y

0        0
1        0
2        0
3        0
4        0
        ..
70687    1
70688    1
70689    1
70690    1
70691    1
Name: Diabetes, Length: 70692, dtype: int32

In [4]:
# Check for null 
diabetes.isnull().any().any()

False

In [5]:
minmax_sc = MinMaxScaler()
X.loc[:,:] = minmax_sc.fit_transform(X)
X

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X.loc[:,:] = minmax_sc.fit_transform(X)


Unnamed: 0,Age,Sex,HighChol,CholCheck,BMI,Smoker,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,HvyAlcoholConsump,GenHlth,MentHlth,PhysHlth,DiffWalk,Stroke,HighBP
0,0.250000,1.0,0.0,1.0,0.162791,0.0,0.0,1.0,0.0,1.0,0.0,0.50,0.166667,1.000000,0.0,0.0,1.0
1,0.916667,1.0,1.0,1.0,0.162791,1.0,0.0,0.0,1.0,0.0,0.0,0.50,0.000000,0.000000,0.0,1.0,1.0
2,1.000000,1.0,0.0,1.0,0.162791,0.0,0.0,1.0,1.0,1.0,0.0,0.00,0.000000,0.333333,0.0,0.0,0.0
3,0.833333,1.0,1.0,1.0,0.186047,1.0,0.0,1.0,1.0,1.0,0.0,0.50,0.000000,0.100000,0.0,0.0,1.0
4,0.583333,0.0,0.0,1.0,0.197674,1.0,0.0,1.0,1.0,1.0,0.0,0.25,0.000000,0.000000,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
70687,0.416667,0.0,1.0,1.0,0.290698,0.0,0.0,0.0,0.0,1.0,0.0,0.75,0.000000,0.000000,0.0,0.0,0.0
70688,0.750000,1.0,1.0,1.0,0.197674,1.0,1.0,0.0,1.0,1.0,0.0,0.25,0.000000,0.000000,1.0,0.0,0.0
70689,1.000000,0.0,1.0,1.0,0.151163,0.0,1.0,0.0,1.0,0.0,0.0,1.00,0.500000,0.000000,1.0,0.0,1.0
70690,0.833333,0.0,1.0,1.0,0.069767,0.0,0.0,0.0,0.0,0.0,0.0,0.75,0.000000,0.000000,1.0,0.0,1.0


In [6]:
xgb_clf = XGBClassifier()
rf_clf = RandomForestClassifier()
gb_clf = GradientBoostingClassifier()
xgb_clf.fit(X, y)
rf_clf.fit(X, y)
gb_clf.fit(X, y)

average_FI = (xgb_clf.feature_importances_ + rf_clf.feature_importances_  + gb_clf.feature_importances_ )/3
print(np.sort(average_FI))
print(X.columns[average_FI.argsort()])

[0.00908694 0.01219291 0.0122946  0.01379646 0.01393612 0.01781
 0.01851758 0.01884285 0.02015051 0.02228598 0.0263526  0.03335162
 0.05366853 0.09628051 0.11963709 0.18120033 0.33059536]
Index(['Stroke', 'PhysActivity', 'Veggies', 'Smoker', 'Fruits', 'Sex',
       'CholCheck', 'HvyAlcoholConsump', 'HeartDiseaseorAttack', 'DiffWalk',
       'MentHlth', 'PhysHlth', 'HighChol', 'Age', 'BMI', 'GenHlth', 'HighBP'],
      dtype='object')


In [7]:
cols_drop = X.columns[average_FI.argsort()][:11].tolist()
# cols_drop.remove('PhysActivity')
print(cols_drop)
X = X.drop(columns=cols_drop)
X.head()

['Stroke', 'PhysActivity', 'Veggies', 'Smoker', 'Fruits', 'Sex', 'CholCheck', 'HvyAlcoholConsump', 'HeartDiseaseorAttack', 'DiffWalk', 'MentHlth']


Unnamed: 0,Age,HighChol,BMI,GenHlth,PhysHlth,HighBP
0,0.25,0.0,0.162791,0.5,1.0,1.0
1,0.916667,1.0,0.162791,0.5,0.0,1.0
2,1.0,0.0,0.162791,0.0,0.333333,0.0
3,0.833333,1.0,0.186047,0.5,0.1,1.0
4,0.583333,0.0,0.197674,0.25,0.0,0.0


In [8]:
numerical_cols = [col for col in X.columns if len(X[col].unique()) > 10]
categorical_cols = list(set(X.columns) - set(numerical_cols))
print(numerical_cols)
print(categorical_cols)

['Age', 'BMI', 'PhysHlth']
['GenHlth', 'HighChol', 'HighBP']


In [9]:
check_skewness(X, numerical_cols)

Unnamed: 0,columns,skewness,too skewed
0,Age,-0.545923,False
1,BMI,1.71918,True
2,PhysHlth,1.657304,True


In [10]:
X_qt = X.copy()
qt = QuantileTransformer(n_quantiles=500, output_distribution='normal')
# numerical_cols.remove('Age')
X[['BMI']] = qt.fit_transform(X[['BMI']]) 
X[['PhysHlth']] = qt.fit_transform(X[['PhysHlth']])

In [11]:
check_skewness(X, numerical_cols)

Unnamed: 0,columns,skewness,too skewed
0,Age,-0.545923,False
1,BMI,-0.003027,False
2,PhysHlth,0.692381,False


In [14]:
X

Unnamed: 0,Age,HighChol,BMI,GenHlth,PhysHlth,HighBP
0,0.250000,0.0,-0.505473,0.50,5.199338,1.0
1,0.916667,1.0,-0.505473,0.50,-5.199338,1.0
2,1.000000,0.0,-0.505473,0.00,0.822449,0.0
3,0.833333,1.0,-0.090543,0.50,0.471513,1.0
4,0.583333,0.0,0.065349,0.25,-5.199338,0.0
...,...,...,...,...,...,...
70687,0.416667,1.0,1.106438,0.75,-5.199338,0.0
70688,0.750000,1.0,0.065349,0.25,-5.199338,0.0
70689,1.000000,1.0,-0.698331,1.00,-5.199338,1.0
70690,0.833333,1.0,-2.456544,0.75,-5.199338,1.0


In [15]:
y = y.astype('int')
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [19]:
y

0        0
1        0
2        0
3        0
4        0
        ..
70687    1
70688    1
70689    1
70690    1
70691    1
Name: Diabetes, Length: 70692, dtype: int32

In [16]:
X_train.shape

(49484, 6)

In [21]:
model_test(XGBClassifier(), X_train, X_test, y_train, y_test)

              precision    recall  f1-score   support

           0       0.70      0.77      0.73      9635
           1       0.79      0.72      0.75     11573

    accuracy                           0.74     21208
   macro avg       0.74      0.74      0.74     21208
weighted avg       0.75      0.74      0.74     21208



In [18]:
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
# defining parameter range
param_grid = {'n_neighbors': [1,3,5,7,9,11,13,15,17,19],  #odd numbers because there are 2 classes in target coulmn
              'weights': ['distance', 'uniform']}  
gridKNN = GridSearchCV(KNeighborsClassifier(), param_grid, refit = True, verbose = 3)
  
# fitting the model for grid search
gridKNN.fit(X_train, y_train)

Fitting 5 folds for each of 20 candidates, totalling 100 fits
[CV 1/5] END ...n_neighbors=1, weights=distance;, score=0.660 total time=   0.2s
[CV 2/5] END ...n_neighbors=1, weights=distance;, score=0.657 total time=   0.1s
[CV 3/5] END ...n_neighbors=1, weights=distance;, score=0.660 total time=   0.2s
[CV 4/5] END ...n_neighbors=1, weights=distance;, score=0.661 total time=   0.3s
[CV 5/5] END ...n_neighbors=1, weights=distance;, score=0.664 total time=   0.2s
[CV 1/5] END ....n_neighbors=1, weights=uniform;, score=0.660 total time=   0.4s
[CV 2/5] END ....n_neighbors=1, weights=uniform;, score=0.657 total time=   0.4s
[CV 3/5] END ....n_neighbors=1, weights=uniform;, score=0.660 total time=   0.3s
[CV 4/5] END ....n_neighbors=1, weights=uniform;, score=0.661 total time=   0.3s
[CV 5/5] END ....n_neighbors=1, weights=uniform;, score=0.664 total time=   0.3s
[CV 1/5] END ...n_neighbors=3, weights=distance;, score=0.685 total time=   0.2s
[CV 2/5] END ...n_neighbors=3, weights=distance