## Model Selection and Cross Validation

In [1]:
#Importing libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn import metrics

In [2]:
#Reading input data
df = pd.read_csv('Purchased_Dataset.csv')

In [3]:
df.head()

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15624510,Male,19,19000,0
1,15810944,Male,35,20000,0
2,15668575,Female,26,43000,0
3,15603246,Female,27,57000,0
4,15804002,Male,19,76000,0


In [4]:
df.shape

(400, 5)

In [5]:
## Defining X and y variables
x = df[['Age','EstimatedSalary']]
y = df['Purchased']

In [6]:
y.value_counts()/len(y)

0    0.6425
1    0.3575
Name: Purchased, dtype: float64

In [7]:
## Splitting data into train and test
x_train, x_test, y_train, y_test = train_test_split(x,y,random_state=9)

In [8]:
#Build logistic regression classifier
classifier = LogisticRegression(solver='lbfgs')
classifier.fit(x_train, y_train)
y_pred = classifier.predict(x_test)
print('Accuracy of logistic regression classifier on train set = {:.2f}'.format(classifier.score(x_train, y_train)))
print('Accuracy of logistic regression classifier on test set = {:.2f}'.format(classifier.score(x_test, y_test)))

Accuracy of logistic regression classifier on train set = 0.66
Accuracy of logistic regression classifier on test set = 0.60


In [11]:
## fitting model with Logistic Regression algorithm along with k-fold cross validation
logreg = LogisticRegression(solver='lbfgs')
print (cross_val_score(logreg, x_train, y_train, cv=10, scoring = 'accuracy').mean())

0.6701001112347051


In [14]:
logreg = LogisticRegression(solver='lbfgs')
print (cross_val_score(logreg, x_train, y_train, cv=45, scoring = 'accuracy').mean())

0.6801587301587303


In [15]:
from sklearn.metrics import confusion_matrix
print("Confusion Matrix below:")
confusion_matrix = confusion_matrix(y_test, y_pred)
print(confusion_matrix)

Confusion Matrix below:
[[60  0]
 [40  0]]


#### Accuracy of the model is very less due to imbalance data

In [16]:
# Creating random forest classifier object
RFclassifier = RandomForestClassifier(n_estimators=300, random_state=123)

In [17]:
## Fit the model with train and test data
RFclassifier.fit(x_train,y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=300, n_jobs=None,
            oob_score=False, random_state=123, verbose=0, warm_start=False)

In [18]:
## Predict target variable with test data
y_pred = RFclassifier.predict(x_test)

In [19]:
## Find accuracy score using predicted and actual
metrics.accuracy_score(y_test,y_pred)

0.94

In [20]:
RFclassifier.feature_importances_

array([0.4962602, 0.5037398])

In [29]:
x_train.head()

Unnamed: 0,Age,EstimatedSalary
155,31,15000
249,35,97000
20,45,22000
194,28,89000
222,37,144000


In [21]:
## Running model with different parameters
x_train, x_test, y_train, y_test = train_test_split(x,y,random_state =10)

RFclassifier = RandomForestClassifier(n_estimators=400, random_state=134)

RFclassifier.fit(x_train,y_train)

y_pred = RFclassifier.predict(x_test)

metrics.accuracy_score(y_test,y_pred)

0.92

In [58]:
x_train.head()

Unnamed: 0,Age,EstimatedSalary
155,31,15000
249,35,97000
20,45,22000
194,28,89000
222,37,144000


In [22]:
## fitting same model with k-fold cross validation

RFclassifier = RandomForestClassifier(n_estimators=300, random_state=123)
print(cross_val_score(RFclassifier, x, y, cv=45, scoring ='accuracy').mean())

0.8812962962962962
