In [1]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
import matplotlib.pyplot as plt

%matplotlib inline

In [2]:
df = pd.read_csv("diabetes.csv")

In [28]:
df.columns

Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome'],
      dtype='object')

In [27]:
df.loc[df.Pregnancies > 13,'Pregnancies'].describe()

count     4.000000
mean     15.000000
std       1.414214
min      14.000000
25%      14.000000
50%      14.500000
75%      15.500000
max      17.000000
Name: Pregnancies, dtype: float64

In [29]:
df[df.Glucose == 0].count()

Pregnancies                 5
Glucose                     5
BloodPressure               5
SkinThickness               5
Insulin                     5
BMI                         5
DiabetesPedigreeFunction    5
Age                         5
Outcome                     5
dtype: int64

## Removing 0 Values

In [58]:
df = df[df.Glucose != 0]
df = df[df.BloodPressure != 0]
df = df[df.BMI != 0]
df_y = df.Outcome
df_x = df.drop('Outcome',axis=1)

## Preprocessing

In [34]:
from itertools import combinations
from sklearn.preprocessing import PolynomialFeatures

#Y = dfmerged.is_promoted
#X = dfmerged.drop(['is_promoted'],1)
 


def add_interactions(df):
    combos = list(combinations(list(df.columns), 2))
    colnames = list(df.columns)+['_'.join(x) for x in combos]
    
    #scaler = MinMaxScaler()
    #scaler.fit(df)
    #df = scaler.transform(df)
    
    poly = PolynomialFeatures(interaction_only=True, include_bias=False)
    df = poly.fit_transform(df)
    df = pd.DataFrame(df)
    df.columns = colnames
    
    noint_indices = [i for i,x in enumerate(list((df==0).all())) if x]
    df= df.drop(df.columns[noint_indices], axis=1)
    
    return df
#X = add_interactions(X)
#X.shape

In [35]:
df_x = add_interactions(df_x)

In [36]:
df_x.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Pregnancies_Glucose,Pregnancies_BloodPressure,...,SkinThickness_Insulin,SkinThickness_BMI,SkinThickness_DiabetesPedigreeFunction,SkinThickness_Age,Insulin_BMI,Insulin_DiabetesPedigreeFunction,Insulin_Age,BMI_DiabetesPedigreeFunction,BMI_Age,DiabetesPedigreeFunction_Age
0,6.0,148.0,72.0,35.0,0.0,33.6,0.627,50.0,888.0,432.0,...,0.0,1176.0,21.945,1750.0,0.0,0.0,0.0,21.0672,1680.0,31.35
1,1.0,85.0,66.0,29.0,0.0,26.6,0.351,31.0,85.0,66.0,...,0.0,771.4,10.179,899.0,0.0,0.0,0.0,9.3366,824.6,10.881
2,8.0,183.0,64.0,0.0,0.0,23.3,0.672,32.0,1464.0,512.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,15.6576,745.6,21.504
3,1.0,89.0,66.0,23.0,94.0,28.1,0.167,21.0,89.0,66.0,...,2162.0,646.3,3.841,483.0,2641.4,15.698,1974.0,4.6927,590.1,3.507
4,0.0,137.0,40.0,35.0,168.0,43.1,2.288,33.0,0.0,0.0,...,5880.0,1508.5,80.08,1155.0,7240.8,384.384,5544.0,98.6128,1422.3,75.504


In [59]:
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import StandardScaler
scaler = RobustScaler()
#scaler = StandardScaler()
df_x= scaler.fit_transform(df_x)
df_x = pd.DataFrame(df_x)
x_train, x_test, y_train, y_test = train_test_split(df_x, df_y, test_size=0.10, random_state=42)

## Log Reg

In [60]:
lr = LogisticRegression(solver = 'newton-cg', max_iter=150,random_state=42,penalty='l2',C=0.1)

In [61]:
lr.fit(x_train,y_train)

LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=150, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=42, solver='newton-cg',
          tol=0.0001, verbose=0, warm_start=False)

In [62]:
lr.score(x_test,y_test)

0.7808219178082192

In [63]:
from sklearn.model_selection import cross_val_score
cross_val_score(lr,x_test,y_test,cv=5).mean()

0.7013095238095237

In [42]:
lr.predict(x_test)

array([0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0,
       0, 0, 1, 1, 0, 0, 0], dtype=int64)

In [43]:
from sklearn.metrics import roc_auc_score
roc_auc_score(y_test,lr.predict(x_test))

0.7170231729055259

## Log Reg CV

In [380]:
lr_cv = LogisticRegressionCV(cv=5,max_iter=150,solver='liblinear')

In [381]:
lr_cv.fit(x_train,y_train)

LogisticRegressionCV(Cs=10, class_weight=None, cv=5, dual=False,
           fit_intercept=True, intercept_scaling=1.0, max_iter=150,
           multi_class='warn', n_jobs=None, penalty='l2',
           random_state=None, refit=True, scoring=None, solver='liblinear',
           tol=0.0001, verbose=0)

In [469]:
lr_cv.score(x_test,y_test)

0.7534246575342466

In [468]:
cross_val_score(lr_cv,x_test,y_test,cv=5).mean()

0.6985714285714286

In [470]:
roc_auc_score(y_test,lr_cv.predict(x_test))

0.7201426024955436

## Grad Boost

In [44]:
from sklearn.ensemble import GradientBoostingClassifier

In [45]:
gbc = XGBClassifier(random_state=42,n_estimators=17,max_depth=3
                                )

NameError: name 'XGBClassifier' is not defined

In [989]:
gbc.fit(x_train,y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0, learning_rate=0.1,
       max_delta_step=0, max_depth=3, min_child_weight=1, missing=None,
       n_estimators=17, n_jobs=1, nthread=None,
       objective='binary:logistic', random_state=42, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=None,
       subsample=1, verbosity=1)

In [990]:
gbc.score(x_test,y_test)

0.7808219178082192

In [991]:
cross_val_score(gbc,x_test,y_test,cv=9).mean()

0.7466931216931217

In [992]:
roc_auc_score(y_test,gbc.predict(x_test))

0.7397504456327986

## SVC

In [443]:
from sklearn.svm import SVC

In [476]:
svc = SVC(random_state=42, C=0.05)

In [450]:
svc

SVC(C=0.1, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=42,
  shrinking=True, tol=0.001, verbose=False)

In [477]:
svc.fit(x_train,y_train)



SVC(C=0.05, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=42,
  shrinking=True, tol=0.001, verbose=False)

In [478]:
svc.score(x_test,y_test)

0.7534246575342466

In [479]:
roc_auc_score(y_test,svc.predict(x_test))

0.6038324420677361

In [480]:
from sklearn.metrics import accuracy_score

In [481]:
accuracy_score(y_test,svc.predict(x_test))

0.7534246575342466

## MLP

In [942]:
from sklearn.neural_network import MLPClassifier
mlp = MLPClassifier(max_iter=1000,hidden_layer_sizes=(100,100,2),random_state=42)

In [549]:
mlp

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=100, learning_rate='constant',
       learning_rate_init=0.001, max_iter=2000, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=None, shuffle=True, solver='adam', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=False)

In [943]:
mlp.fit(x_train,y_train)

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100, 100, 2), learning_rate='constant',
       learning_rate_init=0.001, max_iter=1000, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=42, shuffle=True, solver='adam', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=False)

In [944]:
mlp.score(x_test,y_test)

0.6986301369863014

In [625]:
roc_auc_score(y_test,mlp.predict(x_test))

0.5

## Random Forest

In [46]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(random_state=42,n_estimators=20)
rfc.fit(x_train,y_train)
print(rfc.score(x_test,y_test))
roc_auc_score(y_test,rfc.predict(x_test))

0.8082191780821918


0.7981283422459893

In [865]:
rfc

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=20, n_jobs=None,
            oob_score=False, random_state=42, verbose=0, warm_start=False)

## KNN

In [47]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=35)
knn.fit(x_train,y_train)
#knn.score(y_train,y_test)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=35, p=2,
           weights='uniform')

In [48]:
knn.score(x_test,y_test)

0.7671232876712328

In [49]:
roc_auc_score(y_test,knn.predict(x_test))

0.6911764705882353

## Stacking

In [993]:
from mlxtend.classifier import StackingCVClassifier

In [1003]:
cl1 = XGBClassifier(**xgb_best)
cl2 = LogisticRegression(random_state=42,C=0.1,solver = 'newton-cg')
cl3 = RandomForestClassifier(**rf_best)
#cl4 = MLPClassifier(random_state=42, hidden_layer_sizes=(100,100,50),max_iter=500)
cl4 = KNeighborsClassifier(n_neighbors=35)
sc = StackingClassifier(classifiers=[cl1,cl2,cl3,cl4],meta_classifier=LogisticRegression(solver = 'liblinear',random_state=42))
sc.fit(x_train,y_train)

StackingClassifier(average_probas=False,
          classifiers=[XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=0.8500000000000001,
       gamma=0.30000000000000004, learning_rate=0.11803936277945995,
       max_delta_step=0, max_depth=5, min_child_weight=8.0, missing=None,
       n_es...ki',
           metric_params=None, n_jobs=None, n_neighbors=35, p=2,
           weights='uniform')],
          drop_last_proba=False,
          meta_classifier=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=42, solver='liblinear',
          tol=0.0001, verbose=0, warm_start=False),
          store_train_meta_features=False, use_clones=True,
          use_features_in_secondary=False, use_probas=False, verbose=0)

In [1004]:
sc.score(x_test,y_test)

0.8356164383561644

In [1005]:
roc_auc_score(y_test,sc.predict(x_test))

0.8306595365418896

In [794]:
import xgboost

In [None]:
from xgboost import 

## HyperOPT!!!! XGB

In [68]:
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
#from sklearn.cross_validation import cross_val_score
from hyperopt.pyll.base import scope
from xgboost import XGBClassifier

def hyperopt_train_test(params):
    clf = XGBClassifier(**params)
    clf.fit(x_train,y_train)
    p = clf.predict(x_test)
    #return cross_val_score(clf,x_test,y_test,cv=5).mean()
    return roc_auc_score(p,y_test)
    #return clf.score(x_test,y_test)

space4xgb = {
    'n_estimators':scope.int(hp.quniform('n_estimators',50,500,10)),
   'max_depth':scope.int(hp.quniform('max_depth',1,12,1)),
   'min_child_weight': hp.quniform ('min_child_weight', 1, 12, 1),
    'subsample': hp.quniform ('subsample', 0.12, 1,0.05),
      'learning_rate': hp.uniform('learning_rate', 0.05,0.3),
    'gamma': hp.quniform('gamma', 0, 0.6, 0.05),
    'colsample_bytree': hp.quniform('colsample_bytree', 0.5, 1, 0.05),
    'seed':0,
    'random_state':42
#         'reg_alpha': hp.quniform ('reg_alpha', 0, 1,0.05),
    #'eval_metric':'auc'
    
}

def f(params):
    acc = hyperopt_train_test(params)
    return {'loss': -acc, 'status': STATUS_OK}

trials = Trials()
xgb_best = fmin(f, space4xgb, algo=tpe.suggest, max_evals=100, trials=trials)

 64%|███████████████████████████████▎                 | 64/100 [00:08<00:04,  7.68it/s, best loss: -0.7879166666666667]


ValueError: Only one class present in y_true. ROC AUC score is not defined in that case.

In [67]:
xgb_best['n_estimators'] = int(xgb_best['n_estimators'])
xgb_best['max_depth'] = int(xgb_best['max_depth'])
xgb_best

{'colsample_bytree': 0.65,
 'gamma': 0.2,
 'learning_rate': 0.2673269860887836,
 'max_depth': 7,
 'min_child_weight': 6.0,
 'n_estimators': 380,
 'subsample': 1.0}

In [54]:
from xgboost import XGBClassifier
xgb = XGBClassifier(**xgb_best)
xgb.fit(x_train,y_train)
xgb_pred = xgb.predict(x_test)
roc_auc_score(xgb_pred,y_test)

0.7648902821316614

In [55]:
xgb.score(x_test,y_test)

0.7945205479452054

In [863]:
rf = RandomForestClassifier()
rf

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

## Hyperopt RF

In [56]:
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
#from sklearn.cross_validation import cross_val_score
from hyperopt.pyll.base import scope
from sklearn.ensemble import RandomForestClassifier

def hyperopt_train_test(params):
    clf = RandomForestClassifier(**params)
    clf.fit(x_train,y_train)
    p = clf.predict(x_test)
    #return cross_val_score(clf,x_test,y_test,cv=5).mean()
    return roc_auc_score(p,y_test)
    #return clf.score(x_test,y_test)

space4xgb = {
    'n_estimators':scope.int(hp.quniform('n_estimators',5,100,5)),
    'max_depth':scope.int(hp.quniform('max_depth',1,12,1)),
    'max_features':hp.choice('max_features',['auto','sqrt']),
    #'min_samples_split':scope.int(hp.quniform('min_samples_split',1,12,1)),
    'min_samples_leaf':scope.int(hp.quniform('min_samples_leaf',2,50,4)),
    'bootstrap':hp.choice('bootstrap',['True','False']),
    'random_state':42,
#         'reg_alpha': hp.quniform ('reg_alpha', 0, 1,0.05),
    #'eval_metric':'auc'
    
}

def f(params):
    acc = hyperopt_train_test(params)
    return {'loss': -acc, 'status': STATUS_OK}

trials = Trials()
rf_best = fmin(f, space4xgb, algo=tpe.suggest, max_evals=200, trials=trials)

100%|████████████████████████████████████████████████| 200/200 [00:15<00:00, 12.33it/s, best loss: -0.7872807017543859]


In [57]:
rf_best

{'bootstrap': 1,
 'max_depth': 1.0,
 'max_features': 0,
 'min_samples_leaf': 24.0,
 'n_estimators': 45.0}

In [927]:
rf_best['n_estimators'] = int(rf_best['n_estimators'])
rf_best['max_depth'] = int(rf_best['max_depth'])
rf_best['min_samples_leaf'] = int(rf_best['min_samples_leaf'])
rf_best['bootstrap'] = 'False'
rf_best['max_features'] = 'sqrt'
rf_best['random_state'] = 42

In [928]:
rf_best

{'bootstrap': 'False',
 'max_depth': 1,
 'max_features': 'sqrt',
 'min_samples_leaf': 24,
 'n_estimators': 25,
 'random_state': 42}

In [929]:
#from xgboost import XGBClassifier
rf = RandomForestClassifier(**rf_best)
rf.fit(x_train,y_train)
rf_pred = rf.predict(x_test)
roc_auc_score(rf_pred,y_test)

0.8805970149253731

In [930]:
rf.score(x_test,y_test)

0.7808219178082192

## Cat Boost First Try

In [30]:
!pip install catboost

Collecting catboost
  Downloading https://files.pythonhosted.org/packages/26/8b/97ed7dc482cdf54cc53f19dbf1d5aecc9f77baa3b03ff64cdc6f57f476de/catboost-0.15.1-cp37-none-win_amd64.whl (60.7MB)
Collecting graphviz (from catboost)
  Downloading https://files.pythonhosted.org/packages/af/ae/e1c63ac4c531d69a7960a99af99e184d4f3da15e29f67767c4252bf19cce/graphviz-0.11-py2.py3-none-any.whl
Installing collected packages: graphviz, catboost
Successfully installed catboost-0.15.1 graphviz-0.11
