In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold 
from sklearn.model_selection import GridSearchCV

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression
from scipy import stats

from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import roc_curve, auc, accuracy_score, confusion_matrix, classification_report, mean_squared_error
from sklearn.preprocessing import StandardScaler


sns.set(color_codes = True)

pd.set_option('display.max_rows', 50)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

plt.style.use('fivethirtyeight')

%matplotlib inline

# warnings.filterwarnings('ignore')

In [None]:
data_file = 'IBM-Employee-Attrition.csv'

# load the IBM employee attrition data into a dataframe
emp = pd.read_csv(data_file)

In [None]:
emp.head()

In [None]:
emp.shape

In [None]:
emp.info()

In [None]:
emp.isnull().sum()

In [None]:
emp[emp.Over18!='Y'].shape

In [None]:
# errors='ignore' keeps this cell re-runnable: on a second run the columns are
# already gone and a plain drop() would raise KeyError.
emp = emp.drop(columns=['EmployeeCount', 'EmployeeNumber','Over18'], errors='ignore')

In [None]:
emp.shape

In [None]:
sns.pairplot(data=emp,hue='Attrition')

In [None]:

attrition_map={'Yes':1,'No':0}

# Encode the target only while it still holds the raw 'Yes'/'No' labels.
# Re-running .map() on the already encoded 0/1 values would silently turn
# every row into NaN, so the guard makes this cell idempotent.
if not pd.api.types.is_numeric_dtype(emp['Attrition']):
    emp['Attrition'] = emp['Attrition'].map(attrition_map)

In [None]:
def plot_heatmap(data):
    """Draw an annotated lower-triangle heatmap of the correlations in ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to correlate. Only its numeric columns take part in the
        correlation; all other columns are ignored.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    sns.set(style="white")

    # Correlate numeric columns only and compute the matrix a single time;
    # calling .corr() on a frame that still has string columns raises in
    # recent pandas versions.
    corr = data.select_dtypes(include='number').corr()

    # Mask the upper triangle (diagonal included) so each pair is shown once.
    # The builtin bool replaces the removed np.bool alias.
    mask = np.triu(np.ones(corr.shape, dtype=bool))

    # Set up the matplotlib figure
    f, ax = plt.subplots(figsize=(30, 25))

    # Generate a custom diverging colormap
    cmap = sns.diverging_palette(220, 10, as_cmap=True)

    # Draw the heatmap with the mask and correct aspect ratio
    sns.heatmap(corr, mask=mask, cmap=cmap, vmax=.3, center=0,
                square=True, linewidths=.5, cbar_kws={"shrink": .5}, annot=True, ax=ax)

    plt.show()

plot_heatmap(emp)

In [None]:
figure = plt.figure(figsize=(12,7))
# Correlation of every numeric feature with the target, strongest positive first.
# Restricting to numeric columns avoids the error .corr() raises on string
# columns in recent pandas versions.
attrition_corr = (emp.select_dtypes(include='number')
                     .corr()[['Attrition']]
                     .sort_values('Attrition', ascending=False))
sns.heatmap(attrition_corr, annot=True, cmap='coolwarm', center=0);


In [None]:
# collect the names of all columns stored with a numeric dtype
numerical_dtypes = ['int16','int32', 'int64','float16','float32','float64']
num_cols = [name for name, col_dtype in emp.dtypes.items() if col_dtype in numerical_dtypes]

print(len(num_cols))
print(num_cols)

In [None]:
# get all category columns (everything that is not numeric)

# Keep the dataframe's own column order: a set difference yields an arbitrary
# order that can change between kernel sessions, which reshuffles every plot
# built from this list.
cat_cols = [c for c in emp.columns if c not in num_cols]
print(len(cat_cols))
print(cat_cols)

In [None]:
# plot the target column against each of the given feature columns

def plot_multi_charts(data, x_column_list, y, title, y_label, plot_type, figsize):
    """Plot ``y`` against every column in ``x_column_list``, one subplot per column.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding both the feature columns and the ``y`` column.
    x_column_list : list of str
        Feature columns to plot, one subplot each.
    y : str
        Name of the column shown on the y-axis.
    title : str
        Figure title; also used as the file name (``title + '.png'``) the
        figure is saved under.
    y_label : str
        Label for the y-axis of every subplot.
    plot_type : {'scatter', 'bar'}
        'scatter' plots the raw points; 'bar' plots the mean of ``y`` for each
        distinct value of the feature. Any other value leaves the subplots empty.
    figsize : tuple
        Figure size passed to ``plt.figure``.
    """
    fig = plt.figure(figsize=figsize)
    fig.subplots_adjust(hspace=0.4, wspace=0.4, top=0.975)
    fig.suptitle(title)

    for i, col in enumerate(list(data[x_column_list]),1):

        # The grid keeps one row per column (3 slots wide), as before, so the
        # figsize values callers already pass keep producing the same layout.
        ax = fig.add_subplot(len(x_column_list), 3, i)

        if plot_type == 'scatter':
            ax.scatter(x=data[col], y=data[y])
            ax.set_xlabel('{}'.format(col), size=15,labelpad=12.5)
            ax.set_ylabel(y_label, size=15, labelpad=12.5)
        elif plot_type == 'bar':
            # Aggregate y per feature value. Plotting the bare groupby object
            # would draw every column of every group onto the same axes and
            # never look at `y` at all.
            data.groupby(col)[y].mean().plot.bar(ax=ax)
            ax.set_xlabel('{}'.format(col), size=15,labelpad=12.5)
            ax.set_ylabel(y_label, size=15, labelpad=12.5)

    figname = title + '.png'
    fig.savefig(figname,transparent=False, bbox_inches='tight', dpi=300)

    print('Total cols: ',len(x_column_list))
    plt.show()
    

In [None]:
# Observe attrition across all numerical features
# (title and axis label previously said 'SalePrice', a leftover from another dataset;
#  the title is also the file name the figure is saved under)

plot_multi_charts(data=emp, x_column_list=num_cols,y='Attrition',
                  title='Attrition vs numerical features',y_label='Attrition',plot_type='bar',figsize=(18,180))

In [None]:
fig = plt.figure(figsize=(20,100))
fig.subplots_adjust(hspace=1.2, wspace=0.5, top=0.975)

# One distribution panel (histogram + KDE) per numerical column.
for i, col in enumerate((num_cols),1):
    ax = fig.add_subplot(len(num_cols), 3, i)
    # draw on the subplot just created instead of relying on the implicit current axes
    sns.distplot(emp[col], ax=ax)
    plt.xlabel('{}'.format(col), size=15,labelpad=12.5)
    # distplot normalises the histogram whenever it overlays a KDE, so the
    # y-axis is a density, not a count (the old label also misspelt 'Frequency').
    plt.ylabel('Density', size=15, labelpad=12.5)
    plt.xticks(rotation=45)

plt.show()

    

In [None]:
attri_yes = emp[emp.Attrition==1]
attri_no =  emp[emp.Attrition==0]

In [None]:
def plot_pair_charts(data1, data2, column_list, legend_text,fig_size):
    """Overlay the histograms of two data subsets, one subplot per column.

    Parameters
    ----------
    data1, data2 : pandas.DataFrame
        The two subsets to compare. ``data1`` is drawn first, ``data2`` on top.
    column_list : list of str
        Columns to plot; each one gets its own subplot.
    legend_text : list of str
        Legend labels in drawing order: first the label for ``data1``, then
        the label for ``data2``.
    fig_size : tuple
        Figure size passed to ``plt.figure``.
    """
    fig = plt.figure(figsize=fig_size)
    fig.subplots_adjust(hspace=1, wspace=0.5, top=0.96)
    fig.suptitle('Attrition Yes vs No')

    for i, col in enumerate(list(data1[column_list]),1):
        ax = fig.add_subplot(len(column_list), 3, i)
        ax.hist(data1[col],alpha=0.8)
        ax.hist(data2[col],alpha=0.8)
        ax.set_xlabel('{}'.format(col), size=15,labelpad=12.5)
        ax.set_ylabel('Frequency', size=15, labelpad=12.5)
        ax.legend(legend_text)
        plt.xticks(rotation=45)

    plt.show()

In [None]:
# plot histograms of the categorical features, split by attrition
cols_to_plot = cat_cols
plot_pair_charts(attri_no,attri_yes,cols_to_plot,legend_text=['Attrition No','Attrition Yes'],fig_size=(15, 30))   

In [None]:
# plot histograms of the numerical features, split by attrition
cols_to_plot = num_cols
# Legend labels must follow the data order: attri_no is drawn first, attri_yes second.
plot_pair_charts(attri_no,attri_yes,cols_to_plot,legend_text=['Attrition No','Attrition Yes'],fig_size=(15, 58))

In [None]:
# Numeric features used for modelling, kept in two groups.
cols1 = ['WorkLifeBalance',
         'JobSatisfaction',
         'JobInvolvement',
         'YearsAtCompany',
         'StockOptionLevel',
         'YearsWithCurrManager',
         'Age',
         'MonthlyIncome',
         'YearsInCurrentRole',
         'JobLevel',
         'TotalWorkingYears']

cols2 = ['DistanceFromHome',
        'NumCompaniesWorked',
        'PerformanceRating',
        'HourlyRate',
        'PercentSalaryHike',
        'Education',
        'YearsSinceLastPromotion',
        'RelationshipSatisfaction',
        'DailyRate',
        'TrainingTimesLastYear']

feature_cols = cols1 + cols2

target_col = 'Attrition'

# feature_cols = [c for c in emp.columns if c != target_col]

X = emp[feature_cols]

# use the declared target name instead of repeating the literal
y = emp[target_col]

# 80/20 hold-out split with a fixed seed so results are reproducible
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=42)

In [None]:
X.head()


In [None]:
def plot_ROC_curve(model,X_test,y_test):
    """Plot the ROC curve of a fitted classifier on a hold-out set.

    ``model`` must expose ``predict_proba``; the second probability column is
    used as the score. The AUC is shown in the legend and the figure is
    displayed with ``plt.show()``.
    """
    # score = probability taken from the second column of predict_proba
    positive_scores = model.predict_proba(X_test)[:,1]

    # false/true positive rates over all thresholds, and the area under them
    false_pos_rate, true_pos_rate, _ = roc_curve(y_test, positive_scores)
    area = auc(false_pos_rate, true_pos_rate)

    line_width = 2
    _, axis = plt.subplots()
    axis.plot(false_pos_rate, true_pos_rate, color='darkorange', lw = line_width,
              label = 'ROC curve (area = %0.2f)' % area)
    # diagonal reference line
    axis.plot([0, 1], [0, 1], color = 'navy', lw = line_width, linestyle = '--')
    axis.set_xlim([0.0, 1.0])
    axis.set_ylim([0.0, 1.0])
    axis.set_xlabel('False Positive Rate')
    axis.set_ylabel('True Positive Rate')
    axis.set_title('Receiver operating characteristic example')
    axis.legend(loc = "lower right")
    plt.show()


In [None]:
from sklearn.dummy import DummyClassifier

dummy_clf = DummyClassifier(strategy='most_frequent')
dummy_clf.fit(X_train,y_train)
# Baseline to beat: accuracy of always predicting the most frequent class,
# measured on the same hold-out split the real models are scored on
# (it was previously scored on the training data).
dummy_clf.score(X_test,y_test)



In [None]:
# RANDOMFOREST

from sklearn.ensemble import RandomForestClassifier

# Random forest with trees capped at depth 7; fixed seed for reproducibility.
rf_clf = RandomForestClassifier(criterion='gini',
                                max_depth=7, 
                                random_state=1)

rf_clf.fit(X_train,y_train)
y_pred = rf_clf.predict(X_test)

# Hold-out evaluation: accuracy, confusion-matrix heatmap and per-class report.
# Note: ac, cm and y_pred are notebook-level names that later cells overwrite.
ac = accuracy_score(y_test,y_pred)
print('Accuracy is: ',ac)
cm = confusion_matrix(y_test,y_pred)
sns.heatmap(cm,annot=True,fmt="d")
print(classification_report(y_test,y_pred))

    
# ROC curve built from the forest's predicted probabilities
plot_ROC_curve(rf_clf,X_test,y_test)

In [None]:
# DECISION TREE

from sklearn.tree import DecisionTreeClassifier

# Fully grown tree with class weights balancing the two attrition classes.
# random_state is fixed so the tree, and every metric below, is reproducible
# from run to run.
dtc = DecisionTreeClassifier(criterion='gini',splitter='best',max_depth=None,
                             class_weight='balanced',random_state=42)
dtc.fit(X_train,y_train)
y_pred = dtc.predict(X_test)

# Hold-out evaluation: accuracy, confusion-matrix heatmap and per-class report.
ac = accuracy_score(y_test,y_pred)
print('Accuracy is: ',ac)
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(cm,annot=True,fmt="d")
print(classification_report(y_test, y_pred))

plot_ROC_curve(dtc,X_test,y_test)

In [None]:
# LOGISTIC REGRESSION

# Logistic regression with the liblinear solver, fit on the training split.
lin_model = LogisticRegression(solver='liblinear')
lin_model.fit(X_train, y_train)
# NOTE(review): this score is not the cell's last expression, so it is computed
# but never displayed; the accuracy printed below reports the same figure.
lin_model.score(X_test, y_test)

# Hold-out evaluation: accuracy, confusion-matrix heatmap and per-class report.
y_pred = lin_model.predict(X_test)
ac = accuracy_score(y_test,y_pred)
print('Accuracy is: ',ac)
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(cm,annot=True,fmt="d")
print(classification_report(y_test, y_pred))

plot_ROC_curve(lin_model,X_test,y_test)

In [None]:
# SUPPORT VECTOR MACHINE


# Alternative kept for reference: linear kernel with probability estimates enabled.
# svc_model = SVC(kernel='linear', probability=True)
# Active model: SVC with library defaults, fit on the training split.
# NOTE(review): the features are not standardised at this point in the notebook;
# SVMs are usually scale-sensitive - confirm this is intended.
svc_model = SVC()
svc_model.fit(X_train, y_train)
# NOTE(review): not the cell's last expression, so this score is never displayed.
svc_model.score(X_test, y_test)

# Hold-out evaluation: accuracy, confusion-matrix heatmap and per-class report.
y_pred = svc_model.predict(X_test)
ac = accuracy_score(y_test, y_pred)
print('Accuracy is: ',ac)
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(cm,annot=True,fmt="d")
print(classification_report(y_test, y_pred))

# ROC plot disabled: plot_ROC_curve needs predict_proba, which presumably is
# unavailable because SVC() above is built without probability=True.
# plot_ROC_curve(svc_model,X_test,y_test)

In [None]:
# Share of each attrition class, with the value written above its bar.
attrition_share = emp['Attrition'].value_counts(normalize=True)
ax = attrition_share.plot(kind='bar')

# Bars sit at x = 0, 1, ... in value_counts order. Annotate each bar with its
# own share: the earlier loop walked over every row of the raw column and used
# an undefined `xlocs`, so it raised NameError.
for position, share in enumerate(attrition_share):
    ax.text(position, share + 0.01, '{:.3f}'.format(share), ha='center')
plt.show()

In [None]:
emp['Attrition'].value_counts(normalize=True)

In [None]:
# plot test accuracy by number of neighbors:
# NOTE: k is selected on the test split here, so the best score is optimistic.
k_values = list(range(1, 399))
test_acc = []
for i in k_values:
    knn = KNeighborsClassifier(n_neighbors=i)
    knn.fit(X_train, y_train)
    test_acc.append(knn.score(X_test, y_test))

fig, ax = plt.subplots(figsize=(8,6))
ax.plot(k_values, test_acc, lw=3.)
ax.set_xlabel('k')
ax.set_ylabel('test accuracy')
plt.show()
# test_acc[0] belongs to k=1, so look the best k up in k_values instead of
# reporting the raw list index (which was off by one).
best_k = k_values[test_acc.index(max(test_acc))]
print('Highest accuracy is', round(max(test_acc),2), ' when k=', best_k)


In [None]:
# Mean 5-fold cross-validated accuracy on the training split for each k
k_grid = np.arange(1,365)
mcv_accuracy=[]

for k in k_grid:
    # cross_val_score fits its own clones, so no separate .fit() is needed
    knclf_x = KNeighborsClassifier(n_neighbors = k)
    cv_score = cross_val_score(knclf_x,X_train,y_train,cv=5)
    mcv_accuracy.append(cv_score.mean())

# mcv_accuracy[0] belongs to k=1: map the list position back to k instead of
# printing the raw index (which was off by one).
best_k = int(k_grid[int(np.argmax(mcv_accuracy))])
print('Highest accuracy is', round(max(mcv_accuracy),3), ' when k=', best_k)

plt.figure(figsize=(10,7))
plt.plot(k_grid,mcv_accuracy,linestyle='dashed',marker='.',markerfacecolor='red', markersize=10)
plt.xlabel('k')
plt.ylabel('mean cv_accuracy')
plt.show()

In [None]:
folds = 5
# Largest k that is usable in cross-validation: the number of rows left for
# training once one fold is held out.
max_neighbors = np.floor(X.shape[0] - X.shape[0]/folds)

print(max_neighbors)
# Search at most up to k=398 to keep the run time reasonable, but never beyond
# what the fold size allows.
max_neighbors = min(399, int(max_neighbors))

# mean CV accuracy by number of neighbors (raw, unscaled features):
test_acc = []
for i in range(1, int(max_neighbors)):
    knn = KNeighborsClassifier(n_neighbors=i)
    test_acc.append(np.mean(cross_val_score(knn, X, y, cv=folds)))

print(max(test_acc))

In [None]:
ss = StandardScaler()
Xs = ss.fit_transform(X)

len(Xs)

In [None]:
# mean 5-fold CV accuracy for each k on the standardised features Xs
test_acc_std = [
    np.mean(cross_val_score(KNeighborsClassifier(n_neighbors=n_neighbors), Xs, y, cv=5))
    for n_neighbors in range(1, int(max_neighbors))
]

In [None]:
# Compare the k-accuracy curves obtained on the two feature sets.
fig, ax = plt.subplots(figsize=(8,6))
k_range = list(range(1, int(max_neighbors)))
ax.plot(k_range, test_acc, lw=3., label='test_acc')
ax.plot(k_range, test_acc_std, lw=3., color='darkred', label='test_acc_std (standardised features)')
# label the axes and curves so the figure can be read on its own
ax.set_xlabel('k')
ax.set_ylabel('accuracy')
ax.legend()
plt.show()

In [None]:
# test_acc_std[0] belongs to k=1, so the best k is the list position plus one
best_k_std = test_acc_std.index(max(test_acc_std)) + 1
print('Highest accuracy is', round(max(test_acc_std),3), ' when k=', best_k_std)


In [None]:
# PCA ANALYSIS

from sklearn.decomposition import PCA

# Instantiate the PCA class keeping all components (no n_components limit is set here)
pca = PCA()

# Fit PCA with standardised features
pca.fit(Xs)

len(pca.explained_variance_ratio_)

In [None]:
# Plot cumulative variance explained vs number of components
# The running sum of the per-component ratios is the cumulative share; the
# previous expression (100 - 100*ratio) was neither cumulative nor the
# explained share.
cum_var_pct = 100 * np.cumsum(pca.explained_variance_ratio_)
plt.plot(range(1, len(cum_var_pct)+1), cum_var_pct)
plt.xlabel('Number of components')
plt.ylabel('Explained cumulative variance %')
plt.show()

In [None]:
def calculate_cum_var_exp(eig_vals):
    '''
    Calculate cumulative explained variance from eigenvalues.

    Parameters
    ----------
    eig_vals : sequence or array of numbers
        Eigenvalues (explained variances) in any order.

    Returns
    -------
    numpy.ndarray
        Cumulative explained variance in percent, with the eigenvalues taken
        from largest to smallest. Empty input gives an empty array.
    '''
    # Largest eigenvalue first, so the curve rises as steeply as possible.
    ordered = np.sort(np.asarray(eig_vals, dtype=float))[::-1]

    # Total is computed once instead of being re-summed for every element.
    total = ordered.sum()

    return np.cumsum(ordered / total * 100)

In [None]:
def plot_var_exp(eig_vals):
    """Plot cumulative explained variance (%) against the component number.

    Parameters
    ----------
    eig_vals : sequence or array of numbers
        Eigenvalues / explained variances; passed to ``calculate_cum_var_exp``
        to obtain the cumulative percentages that are drawn.
    """
    cum_var_exp = calculate_cum_var_exp(eig_vals)

    plt.figure(figsize=(9,7))

    component_number = [i+1 for i in range(len(cum_var_exp))]
    print(component_number)

    plt.plot(component_number, cum_var_exp, lw=7)

    # reference lines at 0% and 100%
    plt.axhline(y=0, linewidth=5, color='grey', ls='dashed')
    plt.axhline(y=100, linewidth=3, color='grey', ls='dashed')

    ax = plt.gca()
    # keep the usual 1..30 window, widening it only when there are more components
    ax.set_xlim([1, max(30, len(component_number))])
    ax.set_ylim([-5,105])

    ax.set_ylabel('cumulative variance explained', fontsize=16)
    ax.set_xlabel('component', fontsize=16)

    # Tick.label was removed from Matplotlib; tick_params sets the same
    # label size on both axes and works across versions.
    ax.tick_params(axis='both', which='major', labelsize=12)

    ax.set_title('component vs cumulative variance explained\n', fontsize=20)

    plt.show()


In [None]:
plot_var_exp(pca.explained_variance_)

In [None]:
pca = PCA(n_components=16)
pca.fit(Xs)
principleComponents = pca.transform(Xs)

pca_df = pd.DataFrame(principleComponents)
pca_df.head()


In [None]:
pca_df.shape

In [None]:
# Plot PC1 vs PC2
plt.scatter(principleComponents[:, 0], principleComponents[:, 1])
plt.xlabel('PC1')
plt.ylabel('PC2')
plt.grid()
plt.show()

In [None]:
# Split the data twice with identical settings (same test_size and random_state),
# so the two splits differ only in the feature matrix that is split.

# WARNING: this rebinds X_train/X_test/y_train/y_test. From here on they hold the
# split of pca_df (the PCA-transformed features), not of the feature frame X
# that was split earlier in the notebook.
X_train, X_test, y_train, y_test = train_test_split(pca_df,y,test_size=0.2,random_state=42)

# Same split applied to the standardised (non-PCA) features Xs.
Xs_train, Xs_test, ys_train, ys_test = train_test_split(Xs,y,test_size=0.2,random_state=42)

In [None]:
# Set KNN classifier to use 5 neighbors
knn5 = KNeighborsClassifier(n_neighbors=5)

# CV accuracy of KNN using standardised data 
standard_scores = cross_val_score(knn5, Xs_train, ys_train, cv=5)
print("Number of features in standardised data:       ", Xs.shape[1])
print("5-fold CV accuracy using standardised data:    ", standard_scores.mean())

In [None]:
# CV accuracy of KNN using PCA-transformed data
pca_scores = cross_val_score(knn5, X_train, y_train, cv=5)
print("Number of features in PCA-transformed data:    ", pca_df.shape[1])
print("5-fold CV accuracy using PCA-transformed data: ", pca_scores.mean())

In [None]:
# ANSWER
score_pca=[]
test_range=X_train.shape[0]
# test_range = 500

for k in np.arange(1,test_range):
    knclf_x = KNeighborsClassifier(n_neighbors = k).fit(X_train,y_train)
    score_pca.append(knclf_x.score(X_test,y_test))
    
# score_pca

In [None]:
# score_pca[0] belongs to k=1, so the best k is the list position plus one
best_k_pca = score_pca.index(max(score_pca)) + 1
print('Highest accuracy is', round(max(score_pca),2), ' when k=', best_k_pca)

plt.figure(figsize=(10,7))
plt.plot(np.arange(1,test_range),score_pca, linestyle='dashed',marker='.',markerfacecolor='red', markersize=10)

plt.xlabel('k')
plt.ylabel('accuracy')
plt.show()


In [None]:
# Grid search over the regularisation type and strength of a logistic regression.
lr_params = {
    'penalty': ['l1','l2'],
    'C': [1, 10, 100]
}

# The estimator has to be defined before the search (log_reg was never created,
# which raised NameError). liblinear is used because it supports both the
# 'l1' and 'l2' penalties in the grid.
log_reg = LogisticRegression(solver='liblinear')

grid_cv = GridSearchCV(estimator = log_reg,
                       param_grid = lr_params,
                       scoring = 'accuracy',
                       verbose = 1,
                       n_jobs = -1)

grid_cv.fit(X_train, y_train)
print(grid_cv.best_params_)

In [None]:
from datetime import datetime

def timer(start_time=None):
    """Simple stopwatch.

    Called without a start time it returns the current ``datetime``. Called
    with the value returned earlier it prints the time elapsed since then as
    hours, minutes and seconds, and returns ``None``.
    """
    if not start_time:
        return datetime.now()

    elapsed_seconds = (datetime.now() - start_time).total_seconds()
    hours, remainder = divmod(elapsed_seconds, 3600)
    minutes, seconds = divmod(remainder, 60)
    print('\n Time taken: %i hours %i minutes and %s seconds.' % (hours, minutes, round(seconds, 2)))
        

In [None]:
import xgboost as xgb
from sklearn.model_selection import RandomizedSearchCV
        
xgb_cfl = xgb.XGBClassifier(n_jobs = -1)


# A parameter grid for XGBoost
params = {
        'n_estimators' : [100, 200, 500, 750],
        'learning_rate' : [0.01, 0.02, 0.05, 0.1, 0.25],
        'min_child_weight': [1, 5, 7, 10],
        'gamma': [0.1, 0.5, 1, 1.5, 5],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0],
        'max_depth': [3, 4, 5, 10, 12]
        }

folds = 5          # number of CV folds used for every sampled candidate
param_comb = 800   # number of parameter combinations sampled from the grid

# cv now uses `folds`, so the two settings cannot drift apart
random_search = RandomizedSearchCV(xgb_cfl, param_distributions=params, n_iter=param_comb, scoring='accuracy', n_jobs=-1, cv=folds, verbose=3, random_state=42)

# Here we go
start_time = timer(None) # timing starts from this point for "start_time" variable
# The search itself is disabled below, so the timer currently measures nothing.
#----------------------------# random_search.fit(X, y)
timer(start_time) # timing ends here for "start_time" variable

In [None]:
def cross_val_metrics(model) :
    """Print mean and standard deviation of the 5-fold cross-validated
    accuracy, precision and recall of ``model`` on the notebook-level X and y."""
    for metric in ('accuracy', 'precision', 'recall'):
        fold_scores = cross_val_score(model, X, y, cv = 5, scoring = metric)
        print('[%s] : %0.5f (+/- %0.5f)'%(metric, fold_scores.mean(), fold_scores.std()))

# NOTE(review): xgb_clf is only created in a later cell, so on a fresh kernel
# this call fails with NameError unless that cell has been run first.
cross_val_metrics(xgb_clf)

In [None]:
# XGBoost classifier with a fully spelled-out parameter set.
# NOTE(review): arguments such as silent / nthread / seed / missing=None look like
# they come from an older xgboost release - confirm they are still accepted by
# the installed version.
xgb_clf = xgb.XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
                           colsample_bytree=0.8, gamma=1.5, learning_rate=0.05,
                           max_delta_step=0, max_depth=3, min_child_weight=7, missing=None,
                           n_estimators=200, n_jobs=-1, nthread=None,
                           objective='binary:logistic', random_state=0, reg_alpha=0,
                           reg_lambda=1, scale_pos_weight=1, seed=None, silent=True,
                           subsample=0.6)

# Fit on the current training split, then keep both the hard predictions and the
# second probability column for the test split.
xgb_clf.fit(X_train, y_train)
y_pred = xgb_clf.predict(X_test)
y_score = xgb_clf.predict_proba(X_test)[:,1]

# Hold-out evaluation: accuracy, confusion-matrix heatmap and per-class report.
ac = accuracy_score(y_test, y_pred)
print('Accuracy is: ',ac)
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(cm,annot=True,fmt="d")
print(classification_report(y_test, y_pred))

In [None]:
xbg_score = cross_val_score(xgb_clf, Xs_train, ys_train, cv=5, scoring='recall')
xbg_score.mean()

In [None]:
ac = accuracy_score(y_test, y_pred)
print('Accuracy is: ',ac)
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(cm,annot=True,fmt="d")
print(classification_report(y_test, y_pred))

In [None]:
def plot_confusion_matrix(cm,target_names,title='Confusion Matrix',cmap=None,
                          normalize=False):
    """Draw a confusion matrix as an annotated image.

    Parameters
    ----------
    cm : array-like of shape (n_classes, n_classes)
        Confusion matrix with true labels on the rows and predicted labels on
        the columns.
    target_names : list of str or None
        Class names for the tick labels; ``None`` leaves the default ticks.
    title : str
        Title of the plot.
    cmap : matplotlib colormap or None
        Colormap for the cells; defaults to 'Blues'.
    normalize : bool
        If True, each row is divided by its sum so the cells show the share
        of the true class instead of raw counts.
    """
    import itertools

    cm = np.asarray(cm)

    if cmap is None:
        cmap = plt.get_cmap('Blues')

    # Normalise before drawing, so the cell colours, the printed numbers and
    # the text-colour threshold all refer to the same values. (Previously the
    # image showed raw counts while the labels showed normalised values.)
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]

    plt.figure(figsize=(8, 6))
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()

    if target_names is not None:
        tick_marks = np.arange(len(target_names))
        plt.xticks(tick_marks, target_names, rotation=45)
        plt.yticks(tick_marks, target_names)

    # cells above the threshold are dark, so they get white text
    thresh = cm.max() / 1.5 if normalize else cm.max() / 2
    cell_format = "{:0.4f}" if normalize else "{:,}"
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, cell_format.format(cm[i, j]),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.show()

In [None]:
plot_confusion_matrix(cm, 
                          normalize    = False,
                          target_names = ['Stay','Leave'],
                          title        = "Confusion matrix for one class ")

In [None]:
cm

In [None]:
from sklearn.utils import resample

# Combine the current training features and labels into one frame for resampling.
# NOTE(review): train_data is built here, but the active lines below do not use it.
train_data = pd.concat([X_train, y_train], axis=1)

# separate minority and majority classes
# (disabled variant that would take the classes from the training rows only)
# negative = train_data[train_data.Attrition==0]
# positive = train_data[train_data.Attrition==1]

# Active variant: the classes are taken from the full emp frame, i.e. from all
# rows rather than from a training split.
negative = emp[emp.Attrition==0]
positive = emp[emp.Attrition==1]

In [None]:
X_train.shape

In [None]:
help(resample)

In [None]:
# upsample minority
pos_upsampled = resample(positive,
                         replace=True, # sample with replacement
                         n_samples=len(negative), # match number in majority class
                         random_state=42) # reproducible results

# combine majority and upsampled minority
upsampled = pd.concat([negative, pos_upsampled])

# check new class counts
upsampled.Attrition.value_counts()


In [None]:
# downsample majority
# Downsampling draws WITHOUT replacement, so every kept majority row is a
# distinct observation. With replace=True the subset contained duplicates
# while other rows were dropped.
neg_downsampled = resample(negative,
                         replace=False, # sample without replacement
                         n_samples=len(positive), # match number in minority class
                         random_state=42) # reproducible results

# combine minority and downsampled majority
downsampled = pd.concat([positive, neg_downsampled])

# check new class counts
downsampled.Attrition.value_counts()


In [None]:
# Split FIRST, then oversample the minority class inside the training part only.
# Oversampling before the split puts copies of the same employee in both the
# training and the test set, so the models get tested on rows they were
# trained on and every score is inflated.
emp_res_train, emp_res_test = train_test_split(emp, test_size=0.2, random_state=42)

train_negative = emp_res_train[emp_res_train.Attrition==0]
train_positive = emp_res_train[emp_res_train.Attrition==1]

train_upsampled = pd.concat([
    train_negative,
    resample(train_positive,
             replace=True,                    # sample with replacement
             n_samples=len(train_negative),   # match number in majority class
             random_state=42),                # reproducible results
])

# Balanced (resampled) training data
X_res = train_upsampled[feature_cols]
y_res = train_upsampled['Attrition']
X_res_train, y_res_train = X_res, y_res

# Untouched hold-out rows with the original class balance
X_res_test = emp_res_test[feature_cols]
y_res_test = emp_res_test['Attrition']



In [None]:
X_res_test.head()

In [None]:
X_res_test.isnull().sum()


In [None]:
X_res_train.head()

In [None]:
# DECISION TREE

from sklearn.tree import DecisionTreeClassifier

# Fully grown decision tree on the resampled training data.
# random_state is fixed so the fitted tree, and every metric below, is
# reproducible from run to run.
dtc = DecisionTreeClassifier(criterion='gini',splitter='best',max_depth=None,random_state=42)
dtc.fit(X_res_train,y_res_train)
y_res_pred = dtc.predict(X_res_test)

# Hold-out evaluation: accuracy, confusion-matrix heatmap and per-class report.
ac = accuracy_score(y_res_test,y_res_pred)
print('Accuracy is: ',ac)
cm = confusion_matrix(y_res_test, y_res_pred)
sns.heatmap(cm,annot=True,fmt="d")
print(classification_report(y_res_test, y_res_pred))

In [None]:
fig, axes = plt.subplots(nrows = 1,ncols = 1,figsize = (4,4), dpi=2000)

from sklearn import tree

pt = tree.plot_tree(dtc,
                impurity = False,
                feature_names = X_res_test.columns.values,
                class_names = ['No', 'Yes'],
                rounded =True,
                filled= True )
fig.savefig('plot_tree.png')


In [None]:
# xgb 
xgb_clf = xgb.XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
                           colsample_bytree=0.8, gamma=1.5, learning_rate=0.05,
                           max_delta_step=0, max_depth=3, min_child_weight=7, missing=None,
                           n_estimators=200, n_jobs=-1, nthread=None,
                           objective='binary:logistic', random_state=0, reg_alpha=0,
                           reg_lambda=1, scale_pos_weight=1, seed=None, silent=True,
                           subsample=0.6)

xgb_clf.fit(X_res_train, y_res_train)
y_res_pred = xgb_clf.predict(X_res_test)
y_score = xgb_clf.predict_proba(X_res_test)[:,1]

ac = accuracy_score(y_res_test, y_res_pred)
print('Accuracy is: ',ac)
cm = confusion_matrix(y_res_test, y_res_pred)
sns.heatmap(cm,annot=True,fmt="d")
print(classification_report(y_res_test, y_res_pred))

In [None]:
# RANDOMFOREST

from sklearn.ensemble import RandomForestClassifier

rf_clf = RandomForestClassifier(criterion='gini',
                                max_depth=13, 
                                random_state=1)

rf_clf.fit(X_res_train,y_res_train)
y_res_pred = rf_clf.predict(X_res_test)

ac = accuracy_score(y_res_test,y_res_pred)
print('Accuracy is: ',ac)
cm = confusion_matrix(y_res_test, y_res_pred)
sns.heatmap(cm,annot=True,fmt="d")
print(classification_report(y_res_test, y_res_pred))

In [None]:

# Create a Gaussian Naive Bayes classifier
model = GaussianNB()

# Train the model on the current training split
# NOTE(review): if the notebook is run top to bottom, X_train/X_test here are the
# PCA-transformed split created earlier, not the feature frame used by the first
# models - confirm this is the intended input.
model.fit(X_train, y_train)

# Predict on the hold-out split and report accuracy, confusion matrix and per-class metrics
y_pred = model.predict(X_test)
ac = accuracy_score(y_test, y_pred)
print('Accuracy is: ',ac)
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(cm,annot=True,fmt="d");
print(classification_report(y_test, y_pred))

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

gb_clf = GradientBoostingClassifier(random_state=0)

gb_clf.fit(X_res_train,y_res_train)
y_res_pred = gb_clf.predict(X_res_test)

ac = accuracy_score(y_res_test,y_res_pred)
print('Accuracy is: ',ac)
cm = confusion_matrix(y_res_test, y_res_pred)
sns.heatmap(cm,annot=True,fmt="d")
print(classification_report(y_res_test, y_res_pred))

In [None]:
# Refit the same gradient-boosting estimator on the non-resampled split.
# This replaces the model fitted on the resampled data in the previous cell.
# NOTE(review): if the notebook is run top to bottom, X_train/X_test are the
# PCA-transformed split at this point - confirm this is the intended input.
gb_clf.fit(X_train,y_train)
y_pred = gb_clf.predict(X_test)

# Hold-out evaluation: accuracy, confusion-matrix heatmap and per-class report.
ac = accuracy_score(y_test,y_pred)
print('Accuracy is: ',ac)
cm = confusion_matrix(y_test,y_pred)
sns.heatmap(cm,annot=True,fmt="d")
print(classification_report(y_test,y_pred))

In [None]:
# ROC comparison of logistic regression, SVC and naive Bayes for class 1 (attrition).
#
# Each model is fitted here on the current X_train / y_train, so the cell does
# not depend on estimators fitted earlier on a different feature matrix. The
# curves are built from predicted probabilities: a ROC curve drawn from hard
# 0/1 predictions collapses to a single point. The earlier version also
# referenced fpr_svc / fpr_nb, which were never computed.
roc_models = [
    ('LR',  LogisticRegression(solver='liblinear'),  'darkblue'),
    ('SVC', SVC(probability=True, random_state=42),  'darkgreen'),  # probability=True enables predict_proba
    ('NB',  GaussianNB(),                            'darkorange'),
]

plt.figure(figsize=[8,8])

for roc_label, roc_model, roc_color in roc_models:
    roc_model.fit(X_train, y_train)
    roc_scores = roc_model.predict_proba(X_test)[:, 1]

    # Find fpr, tpr and the area under the curve
    roc_fpr, roc_tpr, _ = roc_curve(y_test, roc_scores)
    roc_area = auc(roc_fpr, roc_tpr)

    plt.plot(roc_fpr, roc_tpr, color=roc_color, lw = 2,
             label = '%s ROC curve (area = %0.2f)' % (roc_label, roc_area))

# diagonal reference line
plt.plot([0, 1], [0, 1], 'k--', linewidth=4)
plt.xlim([-0.05, 1.0])
plt.ylim([-0.05, 1.05])
plt.xlabel('False Positive Rate', fontsize=18)
plt.ylabel('True Positive Rate', fontsize=18)
plt.title('Receiver Operating Characteristic: Attrition', fontsize=18)
plt.legend(loc="lower right")
plt.show()