# Imports and Functions

In [354]:
#imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.utils import shuffle
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, roc_curve, f1_score, precision_recall_curve, confusion_matrix

In [355]:
def upsample_func(features, target, repeat):
    features_zeros = features[target == 0]
    features_ones = features[target == 1]
    target_zeros = target[target == 0]
    target_ones = target[target == 1]

    features_upsampled = pd.concat([features_zeros] + [features_ones] * repeat)
    target_upsampled = pd.concat([target_zeros] + [target_ones] * repeat)
    
    features_upsampled, target_upsampled = shuffle(
        features_upsampled, target_upsampled, random_state=12345)
    
    return features_upsampled, target_upsampled

In [356]:
def downsample_func(features, target, fraction):
    features_zeros = features[target == 0]
    features_ones = features[target == 1]
    target_zeros = target[target == 0]
    target_ones = target[target == 1]

    features_downsampled = pd.concat(
        [features_zeros.sample(frac=fraction, random_state=12345)] + [features_ones])
    target_downsampled = pd.concat(
        [target_zeros.sample(frac=fraction, random_state=12345)] + [target_ones])
    
    features_downsampled, target_downsampled = shuffle(
        features_downsampled, target_downsampled, random_state=12345)
    
    return features_downsampled, target_downsampled

In [357]:
def cross_validate(validator, model, features, target, score=['f1','AUC_ROC','confusion'], \
                   scaler=None ,scale=False,scale_col=None, upsample=False, downsample=False, sample_ratio=None):
    #save all scores in df
    scores = pd.DataFrame(index=range(0,validator.n_splits), columns=score)
#     scores = {}
#     for s in score:
#         scores[s] = []
    #save sanity checks in df
    sanity_check = pd.DataFrame(index=range(0,validator.n_splits), columns=['predicted 1s ratio','f1 constant 1s'])
    fold=0
    #iterate over splits
    for train_index, valid_index in validator.split(features, target):
        #copying sets to perform scales / sampling
        features_train_edit = features.loc[train_index,:].copy()
        target_train_edit = target[train_index].copy()
        features_valid_edit = features.loc[valid_index,:].copy()
        target_valid_edit = target[valid_index].copy()
#        print(target_train_edit.mean(), target_valid_edit.mean())
        #scale data
        if scale and scaler is not None and scale_col is not None:
            scaler.fit(features_train_edit[scale_col])
            features_train_edit.loc[:,scale_col] = scaler.transform(features_train_edit[scale_col])
            features_valid_edit.loc[:,scale_col] = scaler.transform(features_valid_edit[scale_col])
        #upsample
        if upsample and sample_ratio is not None:
            features_train_edit, target_train_edit = upsample_func(features_train_edit, target_train_edit, sample_ratio)
        #downsample
        if downsample and sample_ratio is not None:
            features_train_edit, target_train_edit = downsample_func(features_train_edit, target_train_edit, 1/sample_ratio)
        #train model
        model.fit(features_train_edit,target_train_edit)
        #evaluate predictions
        predicted_valid = model.predict(features_valid_edit)
        if 'f1' in score:
            scores.loc[fold,'f1'] = (round(f1_score(target_valid_edit, predicted_valid),5))
        if 'AUC_ROC' in score:
            probabilities_valid = model.predict_proba(features_valid_edit)
            probabilities_one_valid = probabilities_valid[:, 1]
            scores.loc[fold,'AUC_ROC'] = (round(roc_auc_score(target_valid_edit, probabilities_one_valid),5))
        if 'confusion' in score:
            scores.loc[fold,'confusion'] = (confusion_matrix(target_valid_edit, predicted_valid))
        #perform sanity checks
        #1's ratio in predicted data
        sanity_check.loc[fold,'predicted 1s ratio'] = predicted_valid.mean()
        #f1 score for a constant model with all 1's
        sanity_check.loc[fold,'f1 constant 1s'] = f1_score(target_valid_edit,(pd.Series(1,target_valid_edit.index)))
        fold+=1
    return scores, sanity_check
    

# Feature Preparation

In [358]:
#read file
data = pd.read_csv('/datasets/Churn.csv')
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
RowNumber          10000 non-null int64
CustomerId         10000 non-null int64
Surname            10000 non-null object
CreditScore        10000 non-null int64
Geography          10000 non-null object
Gender             10000 non-null object
Age                10000 non-null int64
Tenure             9091 non-null float64
Balance            10000 non-null float64
NumOfProducts      10000 non-null int64
HasCrCard          10000 non-null int64
IsActiveMember     10000 non-null int64
EstimatedSalary    10000 non-null float64
Exited             10000 non-null int64
dtypes: float64(3), int64(8), object(3)
memory usage: 1.1+ MB


In [359]:
#1's ratio
data['Exited'].mean()

0.2037

## Handling textual features

In [360]:
#examining textual features and their importance

#data['Surname'].head()
data['Geography'].head()
data['Geography'].unique()

array(['France', 'Spain', 'Germany'], dtype=object)

In [361]:
data['Gender'].unique()

array(['Female', 'Male'], dtype=object)

There are 3 textual features:
- 'Surname' we can drop since it's not categorical and probably of less importance
- 'Geography' and 'Gender' have only 2-3 categories - it's best to One-Hot encode them for both Logistic Regression and Random Forest use

In addition, columns 'RowNumber' and 'CustomerId' should be dropped since they are meaningless for prediction

In [362]:
#dropping meaningless columns
data_prep = data.drop(['RowNumber', 'CustomerId', 'Surname'], axis=1)

In [363]:
#One-Hot encoding of Geography and Gender
data_prep = pd.get_dummies(data_prep)
data_prep.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
CreditScore          10000 non-null int64
Age                  10000 non-null int64
Tenure               9091 non-null float64
Balance              10000 non-null float64
NumOfProducts        10000 non-null int64
HasCrCard            10000 non-null int64
IsActiveMember       10000 non-null int64
EstimatedSalary      10000 non-null float64
Exited               10000 non-null int64
Geography_France     10000 non-null uint8
Geography_Germany    10000 non-null uint8
Geography_Spain      10000 non-null uint8
Gender_Female        10000 non-null uint8
Gender_Male          10000 non-null uint8
dtypes: float64(3), int64(6), uint8(5)
memory usage: 752.1 KB


## Handling missing data

In [364]:
data_prep.corr()['Tenure']

CreditScore         -0.000062
Age                 -0.013134
Tenure               1.000000
Balance             -0.007911
NumOfProducts        0.011979
HasCrCard            0.027232
IsActiveMember      -0.032178
EstimatedSalary      0.010520
Exited              -0.016761
Geography_France     0.002167
Geography_Germany   -0.003299
Geography_Spain      0.000810
Gender_Female       -0.012634
Gender_Male          0.012634
Name: Tenure, dtype: float64

'Tenure' is not correlated with any other column so we can't recover it by another

In [365]:
#viewing Tenure statistics
data_prep.describe()['Tenure']

count    9091.000000
mean        4.997690
std         2.894723
min         0.000000
25%         2.000000
50%         5.000000
75%         7.000000
max        10.000000
Name: Tenure, dtype: float64

Since missing data is less than 10% it's ok to fill with median for training purposes

In [366]:
#filling missing Tenure
data_prep['Tenure'] = data_prep['Tenure'].fillna(data_prep['Tenure'].median())
data_prep.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
CreditScore          10000 non-null int64
Age                  10000 non-null int64
Tenure               10000 non-null float64
Balance              10000 non-null float64
NumOfProducts        10000 non-null int64
HasCrCard            10000 non-null int64
IsActiveMember       10000 non-null int64
EstimatedSalary      10000 non-null float64
Exited               10000 non-null int64
Geography_France     10000 non-null uint8
Geography_Germany    10000 non-null uint8
Geography_Spain      10000 non-null uint8
Gender_Female        10000 non-null uint8
Gender_Male          10000 non-null uint8
dtypes: float64(3), int64(6), uint8(5)
memory usage: 752.1 KB


## Splitting to cross-validate

In [367]:
#splitting with 20% test 
features_train, features_test, target_train, target_test = \
    train_test_split(data_prep.drop(['Exited'] , axis=1), data_prep['Exited'], \
                     test_size=0.2, random_state=1234)
features_train.reset_index(drop=True, inplace=True)
target_train.reset_index(drop=True, inplace=True)
#checking balance
print('train 1s: ', target_train.mean(), 'test 1s: ',target_test.mean() )

train 1s:  0.203125 test 1s:  0.206


In [368]:
#StratifiedKFold cross-validator
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=12345)

## Scaling

In [370]:
#columns to scale
numeric = ['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'EstimatedSalary']

#tunning scaler on train set
scaler = StandardScaler()

# Training model without balancing

In [373]:
#examining imbalace
data_prep['Exited'].mean()

0.2037

only 20% of data is class '1'

## Training with logistic regression

In [374]:
model = LogisticRegression(solver='liblinear', random_state = 12345)

In [375]:
scores, sanity_check = cross_validate(skf,  model, features_train, target_train, \
                                      score=['f1','AUC_ROC','confusion'], \
                        scaler=scaler,scale=True,scale_col=numeric, \
                                      upsample=False, downsample=False, sample_ratio=None)

In [376]:
scores

Unnamed: 0,f1,AUC_ROC,confusion
0,0.33257,0.76156,"[[1234, 41], [252, 73]]"
1,0.33411,0.80181,"[[1241, 34], [253, 72]]"
2,0.31081,0.78978,"[[1225, 50], [256, 69]]"
3,0.30131,0.73949,"[[1211, 64], [256, 69]]"
4,0.28306,0.74238,"[[1230, 45], [264, 61]]"


In [377]:
scores[['f1','AUC_ROC']].astype('float64').describe(percentiles=[]).round(3)

Unnamed: 0,f1,AUC_ROC
count,5.0,5.0
mean,0.312,0.767
std,0.022,0.028
min,0.283,0.739
50%,0.311,0.762
max,0.334,0.802


In [378]:
sanity_check

Unnamed: 0,predicted 1s ratio,f1 constant 1s
0,0.07125,0.337662
1,0.06625,0.337662
2,0.074375,0.337662
3,0.083125,0.337662
4,0.06625,0.337662


- The data is strongly skewed with only 20% '1's 
- predicted data has less than 10% 1's compared with 20% expected
- F1 score is low (worst case 0.28), lower than F1 score of a constant model with all 1's (0.337)
- AUC ROC score is moderate 0.73
- Since there is a strong imbalance we can't count on ROC AUC - it gave a moderately high graph because FPR is lowered due to large N and model guessing much more 0's (low FP), as can be seen in confusion matrix - FN is 3 times larger than TP. So the model actually didn't achieve a good TPR.

This model is not very good.

## Training with random forest

In [390]:
#searching for best n to maximize f1, AUC ROC

scores_dict = {} 
sanity_check_dict = {}

for n_estimators in range(8,50):
    model = RandomForestClassifier(random_state=12340+n_estimators, n_estimators=n_estimators)
    
    scores_dict[n_estimators], sanity_check_dict[n_estimators] = \
        cross_validate(skf,  model, features_train, target_train, \
                                          score=['f1','AUC_ROC','confusion'], \
                            scaler=scaler,scale=True,scale_col=numeric, \
                                          upsample=False, downsample=False, sample_ratio=None)
    

In [391]:
#N that maximize F1
max_N = 8
for N in scores_dict.keys():
    if scores_dict[max_N]['f1'].min() < scores_dict[N]['f1'].min():
        max_N = N        
print('max F1: ', scores_dict[max_N]['f1'].min(),'n_estimators: ', max_N, \
      'AUC ROC: ',scores_dict[max_N]['AUC_ROC'].min(), \
      '1\'s ratio: ', sanity_check_dict[max_N]['predicted 1s ratio'].min())

max F1:  0.57736 n_estimators:  47 AUC ROC:  0.82461 1's ratio:  0.120625


In [392]:
scores_dict[47]

Unnamed: 0,f1,AUC_ROC,confusion
0,0.58974,0.85088,"[[1215, 60], [164, 161]]"
1,0.59459,0.8772,"[[1236, 39], [171, 154]]"
2,0.58301,0.85367,"[[1233, 42], [174, 151]]"
3,0.57736,0.83791,"[[1223, 52], [172, 153]]"
4,0.57786,0.82461,"[[1221, 54], [171, 154]]"


- f1 score improves significanly to 0.577 (worst case) with Random Forest model with N=47 (greedy)
- AUC ROC score improves to 0.824
- Only 12% predictions of 1's, the same as in Logistic Regression

# Training model with balancing

## Training with Weight Adjustment

### Training with logistic resression

In [394]:
model = LogisticRegression(solver='liblinear', random_state = 12345, class_weight='balanced')
# model.fit(features_train, target_train)

In [395]:
scores, sanity_check = cross_validate(skf,  model, features_train, target_train, \
                                      score=['f1','AUC_ROC','confusion'], \
                        scaler=scaler,scale=True,scale_col=numeric, \
                                      upsample=False, downsample=False, sample_ratio=None)

In [396]:
scores

Unnamed: 0,f1,AUC_ROC,confusion
0,0.48939,0.76374,"[[924, 351], [106, 219]]"
1,0.5271,0.80531,"[[907, 368], [77, 248]]"
2,0.51173,0.79267,"[[934, 341], [96, 229]]"
3,0.47414,0.74435,"[[892, 383], [105, 220]]"
4,0.46053,0.74585,"[[898, 377], [115, 210]]"


In [397]:
scores[['f1','AUC_ROC']].astype('float64').describe(percentiles=[]).round(3)

Unnamed: 0,f1,AUC_ROC
count,5.0,5.0
mean,0.493,0.77
std,0.027,0.028
min,0.461,0.744
50%,0.489,0.764
max,0.527,0.805


In [398]:
sanity_check

Unnamed: 0,predicted 1s ratio,f1 constant 1s
0,0.35625,0.337662
1,0.385,0.337662
2,0.35625,0.337662
3,0.376875,0.337662
4,0.366875,0.337662


- f1 score improves to 0.46 compared with logistic resression with no balancing (0.28) but is lower than random forest with no balancing (0.56)
- AUC ROC still lower than in random forest
- 37% predictions of 1's when there are 20% in data
- TN decreases, TP increases

### Training with random forest

In [403]:
#searching for best n to maximize f1, AUC ROC

scores_dict = {} 
sanity_check_dict = {}

for n_estimators in range(8,50):
    model = RandomForestClassifier(random_state=12340+n_estimators, n_estimators=n_estimators, class_weight='balanced')
    
    scores_dict[n_estimators], sanity_check_dict[n_estimators] = \
        cross_validate(skf,  model, features_train, target_train, \
                                          score=['f1','AUC_ROC','confusion'], \
                            scaler=scaler,scale=True,scale_col=numeric, \
                                          upsample=False, downsample=False, sample_ratio=None)

In [404]:
#N that maximize F1
max_N = 8
for N in scores_dict.keys():
    if scores_dict[max_N]['f1'].min() < scores_dict[N]['f1'].min():
        max_N = N        
print('max F1: ', scores_dict[max_N]['f1'].min(),'n_estimators: ', max_N, \
      'AUC ROC: ',scores_dict[max_N]['AUC_ROC'].min(), \
      '1\'s ratio: ', sanity_check_dict[max_N]['predicted 1s ratio'].min())

max F1:  0.55133 n_estimators:  36 AUC ROC:  0.83135 1's ratio:  0.10625


In [405]:
scores_dict[36]

Unnamed: 0,f1,AUC_ROC,confusion
0,0.55877,0.85273,"[[1226, 49], [180, 145]]"
1,0.57874,0.86807,"[[1239, 36], [178, 147]]"
2,0.55758,0.85886,"[[1243, 32], [187, 138]]"
3,0.56699,0.83601,"[[1231, 44], [179, 146]]"
4,0.55133,0.83135,"[[1219, 56], [180, 145]]"


- f1 score is lower than in random forest with no balancing
- AUC ROC is a bit larger
- 10% predictions of 1's when there are 20% in data

Balancing with Weight Adjustment didn't improve the models

## Training with upsampling

In [408]:
#smapling ratio
(1-target_train.mean())/target_train.mean()

3.923076923076923

### Training with random forest

In [410]:
#searching for best n to maximize f1, AUC ROC

scores_dict = {} 
sanity_check_dict = {}

for n_estimators in range(8,50):
    model = RandomForestClassifier(random_state=12340+n_estimators, n_estimators=n_estimators)
    
    scores_dict[n_estimators], sanity_check_dict[n_estimators] = \
        cross_validate(skf,  model, features_train, target_train, \
                                          score=['f1','AUC_ROC','confusion'], \
                            scaler=scaler,scale=True,scale_col=numeric, \
                                          upsample=True, downsample=False, sample_ratio=4)

In [411]:
#N that maximize F1
max_N = 8
for N in scores_dict.keys():
    if scores_dict[max_N]['f1'].min() < scores_dict[N]['f1'].min():
        max_N = N        
print('max F1: ', scores_dict[max_N]['f1'].min(),'n_estimators: ', max_N, \
      'AUC ROC: ',scores_dict[max_N]['AUC_ROC'].min(), \
      '1\'s ratio: ', sanity_check_dict[max_N]['predicted 1s ratio'].min())

max F1:  0.59022 n_estimators:  41 AUC ROC:  0.82474 1's ratio:  0.1575


In [412]:
scores_dict[41]

Unnamed: 0,f1,AUC_ROC,confusion
0,0.60496,0.84185,"[[1178, 97], [142, 183]]"
1,0.6136,0.869,"[[1182, 93], [140, 185]]"
2,0.60312,0.85273,"[[1197, 78], [151, 174]]"
3,0.59022,0.83721,"[[1182, 93], [150, 175]]"
4,0.5906,0.82474,"[[1180, 95], [149, 176]]"


- f1 score improves a little to 0.59 comapred with 0.577 in random forest with no balancing
- AUC ROC is the same
- 15% predictions of 1's, an improvement compared with 12% with no balancing
- TN decreases, TP and FP increases

## Training with downsampling

### Training with random forest

In [415]:
#searching for best n to maximize f1, AUC ROC

scores_dict = {} 
sanity_check_dict = {}

for n_estimators in range(8,50):
    model = RandomForestClassifier(random_state=12340+n_estimators, n_estimators=n_estimators)
    
    scores_dict[n_estimators], sanity_check_dict[n_estimators] = \
        cross_validate(skf,  model, features_train, target_train, \
                                          score=['f1','AUC_ROC','confusion'], \
                            scaler=scaler,scale=True,scale_col=numeric, \
                                          upsample=False, downsample=True, sample_ratio=4)

In [416]:
#N that maximize F1
max_N = 8
for N in scores_dict.keys():
    if scores_dict[max_N]['f1'].min() < scores_dict[N]['f1'].min():
        max_N = N        
print('max F1: ', scores_dict[max_N]['f1'].min(),'n_estimators: ', max_N, \
      'AUC ROC: ',scores_dict[max_N]['AUC_ROC'].min(), \
      '1\'s ratio: ', sanity_check_dict[max_N]['predicted 1s ratio'].min())

max F1:  0.56912 n_estimators:  49 AUC ROC:  0.83414 1's ratio:  0.319375


In [417]:
scores_dict[49]

Unnamed: 0,f1,AUC_ROC,confusion
0,0.57809,0.83893,"[[990, 285], [77, 248]]"
1,0.6071,0.87259,"[[992, 283], [60, 265]]"
2,0.57279,0.85605,"[[1002, 273], [85, 240]]"
3,0.56912,0.83414,"[[979, 296], [78, 247]]"
4,0.57416,0.8387,"[[1004, 271], [85, 240]]"


- f1 score is lower random forest with upsampling
- AUC ROC is higher
- 31% predictions of 1's

# Final Training

Training with Random Forest N=41 with upsmapling

In [418]:
features = features_train.copy()
target = target_train.copy()

In [419]:
#fit scaler on all training set
scaler = StandardScaler()
scaler.fit(features[numeric])

StandardScaler(copy=True, with_mean=True, with_std=True)

In [420]:
#scale training set
features.loc[:,numeric] = scaler.transform(features[numeric])

In [421]:
#upsample
features_upsampled, target_upsampled = upsample(features, target, 4)

In [422]:
# #downsample
# features_downsampled, target_downsampled = downsample_func(features, target, 1/4)

In [423]:
target_upsampled.mean()

0.5048543689320388

In [424]:
final_model = RandomForestClassifier(random_state=12340+41, n_estimators=41)
final_model.fit(features_upsampled,target_upsampled)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=41,
                       n_jobs=None, oob_score=False, random_state=12381,
                       verbose=0, warm_start=False)

# Final testing

In [425]:
#scaling test set
features_test_edit = features_test.copy()
features_test_edit.loc[:,numeric] = scaler.transform(features_test[numeric])

In [426]:
predicted_test = final_model.predict(features_test_edit)

In [427]:
f1_score(target_test, predicted_test)

0.6008119079837619

In [429]:
probabilities_test = model.predict_proba(features_test_edit)
probabilities_one_test = probabilities_test[:, 1]
roc_auc_score(target_test, probabilities_one_test)

0.8386495500232325

In [428]:
predicted_test.mean()

0.1635

# Conclusion

- The data of 10000 entries is strongly skewed with 20% 1's
- Test part is 20% of data, train is 80%
- F1 score for constant model is 0.33 and AUC ROC is 0.5
- Logistic Regression models didn't perform well with or without balancing and had bad 1's ratio of less than 10%
- Random Forest models performed the best with upsampling with relatively large N=41
- F1 score on test set is 0.6, AUC ROC is 0.83, 1's ratio is 16%