## Import Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from math import exp

## Load Dataset

In [2]:
data = pd.read_csv('data/revenue-data.csv')

In [3]:
data.head()

Unnamed: 0,Age,SiteSpending,SiteTime,RecommendImpression,Education,WorkType,Sex,Region,Salary
0,32,314.06,30.14,0,Degree,Private sector,Male,London,60173.49
1,20,3758.36,149.36,4,GCSE,Private sector,Female,South East,42965.45
2,36,601.72,21.87,0,Masters,Private sector,Male,East of England,54924.41
3,21,44.89,182.8,9,Masters,Private sector,Female,Northern Ireland,26734.99
4,24,614.8,194.34,0,GCSE,Private sector,Male,Scotland,15325.23


## Data Processing

In [4]:
data['£35K+ Salary'] = np.where(data['Salary'] > 35000, True, False )

In [5]:
data.head()

Unnamed: 0,Age,SiteSpending,SiteTime,RecommendImpression,Education,WorkType,Sex,Region,Salary,£35K+ Salary
0,32,314.06,30.14,0,Degree,Private sector,Male,London,60173.49,True
1,20,3758.36,149.36,4,GCSE,Private sector,Female,South East,42965.45,True
2,36,601.72,21.87,0,Masters,Private sector,Male,East of England,54924.41,True
3,21,44.89,182.8,9,Masters,Private sector,Female,Northern Ireland,26734.99,False
4,24,614.8,194.34,0,GCSE,Private sector,Male,Scotland,15325.23,False


### Label Encoding

In [6]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()

data["Education"] = le.fit_transform(data['Education'])
data["WorkType"] = le.fit_transform(data["WorkType"])
data['Sex'] = le.fit_transform(data['Sex'])
data['Region'] =  le.fit_transform(data['Region'])
data['£35K+ Salary'] =  le.fit_transform(data['£35K+ Salary'])

In [7]:
data.head()

Unnamed: 0,Age,SiteSpending,SiteTime,RecommendImpression,Education,WorkType,Sex,Region,Salary,£35K+ Salary
0,32,314.06,30.14,0,1,0,1,2,60173.49,1
1,20,3758.36,149.36,4,2,0,0,7,42965.45,1
2,36,601.72,21.87,0,3,0,1,1,54924.41,1
3,21,44.89,182.8,9,3,0,0,5,26734.99,0
4,24,614.8,194.34,0,2,0,1,6,15325.23,0


### Feature Scaling

In [8]:
from sklearn import preprocessing
x = data.values #returns a numpy array
min_max_scaler = preprocessing.MinMaxScaler()
data_scaled = min_max_scaler.fit_transform(x)
df = pd.DataFrame(data_scaled)

In [9]:
df.columns = data.columns

In [10]:
df.head()

Unnamed: 0,Age,SiteSpending,SiteTime,RecommendImpression,Education,WorkType,Sex,Region,Salary,£35K+ Salary
0,0.206349,0.062878,0.084627,0.0,0.166667,0.0,1.0,0.181818,0.35947,1.0
1,0.015873,0.854345,0.48771,0.235294,0.333333,0.0,0.0,0.636364,0.229875,1.0
2,0.269841,0.128979,0.056666,0.0,0.5,0.0,1.0,0.090909,0.319939,1.0
3,0.031746,0.001025,0.600771,0.529412,0.5,0.0,0.0,0.454545,0.107643,0.0
4,0.079365,0.131985,0.639788,0.0,0.333333,0.0,1.0,0.545455,0.021716,0.0


### Defining Feature and Target variables

In [11]:
#feature variables
x = df.drop(['£35K+ Salary'], axis=1)
x = x.drop(['Salary'], axis=1)
x

Unnamed: 0,Age,SiteSpending,SiteTime,RecommendImpression,Education,WorkType,Sex,Region
0,0.206349,0.062878,0.084627,0.000000,0.166667,0.000000,1.0,0.181818
1,0.015873,0.854345,0.487710,0.235294,0.333333,0.000000,0.0,0.636364
2,0.269841,0.128979,0.056666,0.000000,0.500000,0.000000,1.0,0.090909
3,0.031746,0.001025,0.600771,0.529412,0.500000,0.000000,0.0,0.454545
4,0.079365,0.131985,0.639788,0.000000,0.333333,0.000000,1.0,0.545455
...,...,...,...,...,...,...,...,...
995,0.587302,0.031536,0.905298,0.294118,0.333333,0.333333,0.0,0.363636
996,0.555556,0.026424,0.764310,0.294118,0.833333,0.000000,1.0,0.818182
997,0.047619,0.536715,0.953579,0.647059,0.833333,0.000000,1.0,0.636364
998,0.476190,0.991456,0.076647,0.000000,0.166667,0.000000,1.0,0.454545


In [12]:
#target variable
y = df['£35K+ Salary']
y

0      1.0
1      1.0
2      1.0
3      0.0
4      0.0
      ... 
995    0.0
996    0.0
997    1.0
998    1.0
999    1.0
Name: £35K+ Salary, Length: 1000, dtype: float64

## Decision Tree Classifier

In [178]:
# Import Decision Tree Classifier
from sklearn.tree import DecisionTreeClassifier 

# Import train_test_split function
from sklearn.model_selection import train_test_split 

# Import train_test_split function
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=1)

In [179]:
# Create Decision Tree classifer object
model = DecisionTreeClassifier()

# Train Decision Tree Classifer
model = model.fit(x_train,y_train)

#Predict the response for test dataset
y_pred = model.predict(x_test)

In [180]:
#Evaluation using Accuracy score
from sklearn import metrics #Import scikit-learn metrics module for accuracy calculation
print("Accuracy:",metrics.accuracy_score(y_test, y_pred)*100)

Accuracy: 91.5


### Hyperparameter Tuning

In [181]:
from sklearn import decomposition, datasets
from sklearn import tree
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler

In [182]:
std_slc = StandardScaler()

In [183]:
pca = decomposition.PCA()

In [184]:
dec_tree = tree.DecisionTreeClassifier()

In [185]:
pipe = Pipeline(steps=[('std_slc', std_slc),
                           ('pca', pca),
                           ('dec_tree', dec_tree)])

In [186]:
n_components = list(range(1,x.shape[1]+1,1))

In [187]:
criterion = ['gini', 'entropy']
max_depth = [2,4,6,8,10,12]

In [188]:
parameters = dict(pca__n_components=n_components,
                      dec_tree__criterion=criterion,
                      dec_tree__max_depth=max_depth)

In [189]:
clf_GS = GridSearchCV(pipe, parameters)
clf_GS.fit(x, y)

GridSearchCV(estimator=Pipeline(steps=[('std_slc', StandardScaler()),
                                       ('pca', PCA()),
                                       ('dec_tree', DecisionTreeClassifier())]),
             param_grid={'dec_tree__criterion': ['gini', 'entropy'],
                         'dec_tree__max_depth': [2, 4, 6, 8, 10, 12],
                         'pca__n_components': [1, 2, 3, 4, 5, 6, 7, 8]})

In [190]:
 print('Best Criterion:', clf_GS.best_estimator_.get_params()['dec_tree__criterion'])
print('Best max_depth:', clf_GS.best_estimator_.get_params()['dec_tree__max_depth'])
print('Best Number Of Components:', clf_GS.best_estimator_.get_params()['pca__n_components'])
print(); print(clf_GS.best_estimator_.get_params()['dec_tree'])

Best Criterion: gini
Best max_depth: 12
Best Number Of Components: 7

DecisionTreeClassifier(max_depth=12)


In [191]:
DecisionTreeClassifier(
    #class_weight=None, 
                       criterion='gini', max_depth=12,
#             max_features=None, max_leaf_nodes=None,
            #min_impurity_decrease=0.0, 
            #min_impurity_split=None,
#             min_samples_leaf=1, min_samples_split=2,
#             min_weight_fraction_leaf=0.0, random_state=None,
#             splitter='best'
                      )

DecisionTreeClassifier(max_depth=12)

In [192]:
# Create Decision Tree classifer object
model = DecisionTreeClassifier(criterion='gini', max_depth=12)

# Train Decision Tree Classifer
model = model.fit(x_train,y_train)

#Predict the response for test dataset
y_pred = model.predict(x_test)

In [193]:
#Evaluation using Accuracy score
from sklearn import metrics #Import scikit-learn metrics module for accuracy calculation
print("Accuracy:",metrics.accuracy_score(y_test, y_pred)*100)

Accuracy: 92.5


### Evaluation

In [145]:
#Evaluation using Confusion matrix
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test,y_pred)

array([[ 58,   6],
       [  8, 128]], dtype=int64)

In [146]:
#Evaluation using Classification report
from sklearn.metrics import classification_report
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

         0.0       0.88      0.91      0.89        64
         1.0       0.96      0.94      0.95       136

    accuracy                           0.93       200
   macro avg       0.92      0.92      0.92       200
weighted avg       0.93      0.93      0.93       200



## Logistic Regression

In [227]:
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.20)

In [228]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RepeatedStratifiedKFold

In [229]:
lr = LogisticRegression()

In [230]:
lr.fit(X_train, y_train)

LogisticRegression()

In [231]:
y_pred = lr.predict(X_test)

In [232]:
y_pred

array([1., 1., 1., 1., 0., 0., 1., 1., 0., 1., 1., 0., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 1., 1., 0., 1., 1., 1.,
       1., 1., 0., 1., 0., 0., 1., 0., 1., 1., 0., 1., 1., 1., 0., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 1., 1., 0., 1., 1., 1., 1.,
       1., 0., 1., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 1., 1., 1.,
       1., 1., 0., 1., 1., 1., 0., 1., 1., 0., 0., 1., 0., 1., 1., 1., 1.,
       1., 1., 0., 0., 1., 1., 1., 1., 1., 1., 1., 1., 0., 1., 0., 1., 1.,
       1., 1., 1., 1., 1., 1., 0., 0., 1., 1., 1., 1., 1., 1., 0., 1., 1.,
       1., 0., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 1., 1., 1.,
       1., 1., 1., 0., 1., 1., 1., 1., 1., 1., 1., 1., 0., 1., 0., 0., 0.,
       1., 1., 1., 1., 0., 0., 1., 1., 1., 0., 1., 1., 1., 1., 1., 1., 0.,
       1., 1., 1., 0., 1., 1., 1., 1., 0., 0., 1., 1., 0.])

In [233]:
from sklearn.metrics import accuracy_score

In [234]:
print('Test Accuracy Score--')
accuracy_score(y_pred, y_test)

Test Accuracy Score--


0.625

In [235]:
y_pred_train = lr.predict(X_train)

In [236]:
print('Train Accuracy Score--')
accuracy_score(y_pred_train, y_train)

Train Accuracy Score--


0.64875

In [237]:
# print the scores on training and test set

print('Training set score: {:.4f}'.format(lr.score(X_train, y_train)))

print('Test set score: {:.4f}'.format(lr.score(X_test, y_test)))

Training set score: 0.6488
Test set score: 0.6250


### Hyperparameter Tuning

In [238]:
model = LogisticRegression()
solvers = ['newton-cg', 'lbfgs', 'liblinear']
penalty = ['l2']
c_values = [100, 10, 1.0, 0.1, 0.01]
# define grid search
grid = dict(solver=solvers,penalty=penalty,C=c_values)
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=cv, scoring='accuracy',error_score=0)
grid_result = grid_search.fit(x, y)
# summarize results
print("Best: %f using %s" %(grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" %(mean, stdev, param))

Best: 0.637667 using {'C': 1.0, 'penalty': 'l2', 'solver': 'lbfgs'}
0.634000 (0.042864) with: {'C': 100, 'penalty': 'l2', 'solver': 'newton-cg'}
0.634000 (0.042864) with: {'C': 100, 'penalty': 'l2', 'solver': 'lbfgs'}
0.633667 (0.042699) with: {'C': 100, 'penalty': 'l2', 'solver': 'liblinear'}
0.635333 (0.043107) with: {'C': 10, 'penalty': 'l2', 'solver': 'newton-cg'}
0.635333 (0.043107) with: {'C': 10, 'penalty': 'l2', 'solver': 'lbfgs'}
0.635667 (0.043103) with: {'C': 10, 'penalty': 'l2', 'solver': 'liblinear'}
0.637333 (0.043660) with: {'C': 1.0, 'penalty': 'l2', 'solver': 'newton-cg'}
0.637667 (0.043411) with: {'C': 1.0, 'penalty': 'l2', 'solver': 'lbfgs'}
0.637000 (0.040837) with: {'C': 1.0, 'penalty': 'l2', 'solver': 'liblinear'}
0.624333 (0.016059) with: {'C': 0.1, 'penalty': 'l2', 'solver': 'newton-cg'}
0.624333 (0.016059) with: {'C': 0.1, 'penalty': 'l2', 'solver': 'lbfgs'}
0.625000 (0.016279) with: {'C': 0.1, 'penalty': 'l2', 'solver': 'liblinear'}
0.631000 (0.003000) with: {

In [246]:
lr = LogisticRegression(C=1.0, penalty='l2', solver='lbfgs')

lr.fit(X_train, y_train)

y_pred = lr.predict(X_test)

In [247]:
print('Test Accuracy Score--')
accuracy_score(y_pred, y_test)

Test Accuracy Score--


0.625

In [248]:
y_pred_train = lr.predict(X_train)

In [249]:
print('Train Accuracy Score--')
accuracy_score(y_pred_train, y_train)

Train Accuracy Score--


0.64875

In [250]:
# print the scores on training and test set

print('Training set score: {:.4f}'.format(lr.score(X_train, y_train)))

print('Test set score: {:.4f}'.format(lr.score(X_test, y_test)))

Training set score: 0.6488
Test set score: 0.6250


### Evaluation

In [251]:
#Evaluation using Confusion matrix
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test,y_pred)

array([[ 21,  50],
       [ 25, 104]], dtype=int64)

In [252]:
#Evaluation using Classification report
from sklearn.metrics import classification_report
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

         0.0       0.46      0.30      0.36        71
         1.0       0.68      0.81      0.73       129

    accuracy                           0.62       200
   macro avg       0.57      0.55      0.55       200
weighted avg       0.60      0.62      0.60       200



## Random Forrest Classifier

In [296]:
from sklearn.datasets import make_classification

In [297]:
x.shape, y.shape

((1000, 8), (1000,))

In [298]:
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

In [299]:
X_train.shape, y_train.shape

((800, 8), (800,))

In [300]:
X_test.shape, y_test.shape

((200, 8), (200,))

In [301]:
from sklearn.ensemble import RandomForestClassifier

In [302]:
rf = RandomForestClassifier(max_features=5, n_estimators=100)

In [303]:
rf.fit(X_train, y_train)

RandomForestClassifier(max_features=5)

In [304]:
y_pred = rf.predict(X_test)

In [305]:
y_pred

array([1., 0., 1., 1., 1., 0., 1., 1., 0., 1., 0., 1., 0., 1., 0., 1., 1.,
       1., 1., 0., 1., 1., 0., 1., 1., 1., 0., 0., 1., 1., 1., 0., 1., 1.,
       0., 0., 0., 1., 0., 1., 1., 1., 1., 1., 0., 1., 1., 0., 1., 0., 0.,
       1., 1., 0., 1., 0., 1., 0., 0., 0., 0., 1., 1., 1., 0., 1., 0., 0.,
       1., 1., 1., 1., 0., 1., 1., 0., 0., 1., 1., 1., 1., 0., 1., 0., 1.,
       0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 1., 1., 1., 0.,
       1., 1., 1., 1., 1., 1., 1., 0., 1., 1., 1., 0., 0., 0., 1., 1., 0.,
       0., 0., 1., 1., 0., 1., 0., 0., 1., 1., 0., 1., 0., 0., 1., 1., 1.,
       0., 1., 1., 0., 1., 1., 0., 0., 1., 1., 1., 0., 1., 1., 1., 1., 0.,
       1., 1., 1., 1., 1., 1., 0., 1., 0., 0., 1., 0., 1., 1., 1., 0., 0.,
       0., 1., 0., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 1., 0., 0.,
       1., 1., 1., 1., 1., 1., 0., 1., 1., 0., 1., 1., 0.])

In [306]:
accuracy_score(y_pred, y_test)

0.92

### Hyperparameter Tuning

In [307]:
from sklearn.model_selection import GridSearchCV

In [308]:
max_features_range = np.arange(1,5,1)

In [309]:
n_estimators_range = np.arange(10,210,10)

In [310]:
param_grid = dict(max_features=max_features_range, n_estimators=n_estimators_range)

In [311]:
rf = RandomForestClassifier()

In [312]:
grid = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5)

In [313]:
grid.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=RandomForestClassifier(),
             param_grid={'max_features': array([1, 2, 3, 4]),
                         'n_estimators': array([ 10,  20,  30,  40,  50,  60,  70,  80,  90, 100, 110, 120, 130,
       140, 150, 160, 170, 180, 190, 200])})

In [314]:
print("Optimal parameters %s accuracy score of  %0.3f"
      % (grid.best_params_, grid.best_score_))

Optimal parameters {'max_features': 4, 'n_estimators': 150} accuracy score of  0.954


In [315]:
rf = RandomForestClassifier(max_features=4, n_estimators=10)

In [316]:
rf.fit(X_train, y_train)

RandomForestClassifier(max_features=4, n_estimators=10)

In [317]:
y_pred = rf.predict(X_test)

In [318]:
y_pred

array([1., 0., 1., 1., 1., 0., 1., 1., 0., 1., 0., 1., 0., 1., 0., 1., 1.,
       1., 1., 0., 1., 1., 0., 1., 1., 1., 0., 0., 1., 1., 1., 0., 1., 1.,
       0., 0., 0., 1., 0., 1., 1., 1., 1., 1., 0., 1., 1., 0., 1., 0., 0.,
       1., 0., 0., 1., 0., 1., 0., 0., 0., 0., 1., 1., 1., 0., 1., 0., 0.,
       1., 1., 1., 1., 0., 1., 1., 0., 0., 1., 1., 1., 1., 0., 1., 0., 1.,
       0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 1., 1., 1., 0.,
       1., 1., 1., 1., 1., 1., 1., 0., 1., 1., 1., 0., 0., 0., 1., 1., 0.,
       0., 0., 1., 1., 0., 1., 0., 0., 1., 1., 0., 1., 0., 0., 1., 1., 1.,
       0., 1., 1., 0., 1., 0., 0., 0., 1., 1., 1., 0., 1., 1., 1., 1., 0.,
       1., 1., 1., 1., 1., 1., 0., 1., 0., 0., 1., 0., 1., 1., 1., 0., 0.,
       0., 1., 0., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 0.,
       1., 1., 1., 1., 1., 1., 0., 1., 1., 0., 1., 1., 0.])

In [319]:
accuracy_score(y_pred, y_test)

0.925

### Evaluation

In [320]:
#Evaluation using Confusion matrix
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test,y_pred)

array([[ 61,   4],
       [ 11, 124]], dtype=int64)

In [321]:
#Evaluation using Classification report
from sklearn.metrics import classification_report
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

         0.0       0.85      0.94      0.89        65
         1.0       0.97      0.92      0.94       135

    accuracy                           0.93       200
   macro avg       0.91      0.93      0.92       200
weighted avg       0.93      0.93      0.93       200

