## Classification
## Example: Predict survival on Titanic

In [1]:
import pandas as pd
import numpy as np
#import matplotlib.pyplot as plt
#%matplotlib inline

## Working with data

In [2]:
# Read both Kaggle Titanic splits from the local data/ folder.
train, test = pd.read_csv('data/train.csv'), pd.read_csv('data/test.csv')

In [3]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


In [4]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Name           418 non-null object
Sex            418 non-null object
Age            332 non-null float64
SibSp          418 non-null int64
Parch          418 non-null int64
Ticket         418 non-null object
Fare           417 non-null float64
Cabin          91 non-null object
Embarked       418 non-null object
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB


In [5]:
train.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S


### We'll need test_pas_id for the submission dataframe

In [6]:
# Save the ids now: 'PassengerId' is dropped from `test` in the feature-selection
# step, but the submission file needs it.
test_pas_id = test['PassengerId']

### Make a list from train and test

In [7]:
# The list holds references to the two frames (no copies), so column assignments
# made while looping over full_data show up in `train` and `test` themselves.
full_data=[train, test]

### Impute missing values

#### Embarked

In [8]:
train[train['Embarked'].isnull()]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
61,62,1,1,"Icard, Miss. Amelie",female,38.0,0,0,113572,80.0,B28,
829,830,1,1,"Stone, Mrs. George Nelson (Martha Evelyn)",female,62.0,0,0,113572,80.0,B28,


In [9]:
train[(train['Fare']>79) & (train['Fare']<81) & (train['Pclass']==1)].groupby('Embarked').size()

Embarked
C    4
S    3
dtype: int64

In [10]:
# Passengers with no recorded port get 'C', the most frequent port among
# first-class tickets in the same fare range (see the group sizes above).
for frame in full_data:
    missing_port = frame['Embarked'].isnull()
    frame.loc[missing_port, 'Embarked'] = 'C'

#### Fare

In [11]:
# Use the median fare of the training split as the fill value for both frames.
train_fare_median = train['Fare'].median()
for frame in full_data:
    frame['Fare'] = frame['Fare'].fillna(train_fare_median)

####  Age

In [12]:
# We have plenty of missing values in this feature. 
# Generate random numbers between (mean - std) and (mean + std). 

In [13]:
train['Age'].isnull().sum()

177

In [14]:
# Fill missing ages with random integers drawn around each frame's own mean age.
# The seed is fixed so a fresh run always imputes the same values.
np.random.seed(0)
for dataset in full_data:
    age_avg = dataset['Age'].mean()
    age_std = dataset['Age'].std()
    age_null_count = dataset['Age'].isnull().sum()

    # randint is specified for integer bounds, so truncate explicitly instead of
    # relying on implicit float coercion. The upper bound is exclusive.
    age_low = int(age_avg - age_std)
    age_high = int(age_avg + age_std)
    age_null_random_list = np.random.randint(age_low, age_high, size=age_null_count)

    # One random value per missing row; rows that already have an age are untouched.
    dataset.loc[dataset['Age'].isnull(), 'Age'] = age_null_random_list

In [15]:
train['Age'].isnull().sum()

0

### Data preprocessing

In [16]:
np.unique(full_data[0]['Sex'], return_counts = True)

(array(['female', 'male'], dtype=object), array([314, 577], dtype=int64))

In [17]:
  for dataset in full_data:
    # Mapping Sex
    dataset['Sex'] = dataset['Sex'].map( {'female': 0, 'male': 1} )

In [18]:
np.unique(full_data[0]['Embarked'], return_counts = True)

(array(['C', 'Q', 'S'], dtype=object), array([170,  77, 644], dtype=int64))

In [19]:
# Replace the port letters with integer codes in both frames.
embarked_codes = {'C': 0, 'Q': 1, 'S': 2}
for frame in full_data:
    encoded_port = frame['Embarked'].map(embarked_codes)
    frame['Embarked'] = encoded_port.astype(int)

In [20]:
train['Embarked'].dtype

dtype('int32')

In [21]:
np.unique(full_data[0]['Pclass'], return_counts = True)

(array([1, 2, 3], dtype=int64), array([216, 184, 491], dtype=int64))

In [22]:
# Categorical columns that will be one-hot encoded.
categorical_cols = ['Pclass', 'Embarked']
factors_train = train[categorical_cols]
factors_test = test[categorical_cols]

In [23]:
from sklearn import preprocessing

# Learn the category set from the training factors only; the same fitted
# encoder is reused for the test factors afterwards.
enc = preprocessing.OneHotEncoder(dtype = 'int32').fit(factors_train)
enc

OneHotEncoder(categorical_features='all', dtype='int32',
       handle_unknown='error', n_values='auto', sparse=True)

In [24]:
# First five class labels, for comparison with the encoded rows shown below.
# head() avoids indexing the Series with a tuple key (`[:5,]`), which newer
# pandas versions reject.
train['Pclass'].head()

0    3
1    1
2    3
3    1
4    3
Name: Pclass, dtype: int64

In [25]:
# Encode the training factors, then convert the encoder output to a plain ndarray.
onehot_train = enc.transform(factors_train)
tfactors_train = onehot_train.toarray()
tfactors_train[:5]

array([[0, 0, 1, 0, 0, 1],
       [1, 0, 0, 1, 0, 0],
       [0, 0, 1, 0, 0, 1],
       [1, 0, 0, 0, 0, 1],
       [0, 0, 1, 0, 0, 1]], dtype=int32)

In [26]:
# Apply the encoder fitted on the training factors to the test factors.
onehot_test = enc.transform(factors_test)
tfactors_test = onehot_test.toarray()
tfactors_test[:5]

array([[0, 0, 1, 0, 1, 0],
       [0, 0, 1, 0, 0, 1],
       [0, 1, 0, 0, 1, 0],
       [0, 0, 1, 0, 0, 1],
       [0, 0, 1, 0, 0, 1]], dtype=int32)

In [27]:
# Wrap the encoded training matrix in a frame: three Pclass indicators
# followed by three Embarked indicators.
onehot_columns = ['Pclass_1', 'Pclass_2', 'Pclass_3', 'Emb_C', 'Emb_Q', 'Emb_S']
train_pclass_emb = pd.DataFrame(tfactors_train, columns=onehot_columns)

In [28]:
train_pclass_emb.head()

Unnamed: 0,Pclass_1,Pclass_2,Pclass_3,Emb_C,Emb_Q,Emb_S
0,0,0,1,0,0,1
1,1,0,0,1,0,0
2,0,0,1,0,0,1
3,1,0,0,0,0,1
4,0,0,1,0,0,1


In [29]:
# Same column layout as the training indicators: Pclass first, then Embarked.
onehot_columns = ['Pclass_1', 'Pclass_2', 'Pclass_3', 'Emb_C', 'Emb_Q', 'Emb_S']
test_pclass_emb = pd.DataFrame(tfactors_test, columns=onehot_columns)

In [30]:
test_pclass_emb.head()

Unnamed: 0,Pclass_1,Pclass_2,Pclass_3,Emb_C,Emb_Q,Emb_S
0,0,0,1,0,1,0
1,0,0,1,0,0,1
2,0,1,0,0,1,0
3,0,0,1,0,0,1
4,0,0,1,0,0,1


### Feature Selection

In [31]:
# Identifier / free-text columns, plus the two categoricals that are
# re-added below in one-hot form.
# NOTE: `train` and `test` are rebound here, so this cell is meant to run once per kernel.
drop_elements = ['PassengerId', 'Name', 'Ticket', 'Cabin', 'Pclass', 'Embarked']
train, test = (frame.drop(columns=drop_elements) for frame in (train, test))

In [32]:
# Append the one-hot indicator columns to the right of the remaining features.
train, test = (
    pd.concat([features, indicators], axis=1)
    for features, indicators in ((train, train_pclass_emb), (test, test_pclass_emb))
)

In [33]:
train.columns

Index(['Survived', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Pclass_1',
       'Pclass_2', 'Pclass_3', 'Emb_C', 'Emb_Q', 'Emb_S'],
      dtype='object')

In [34]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
Survived    891 non-null int64
Sex         891 non-null int64
Age         891 non-null float64
SibSp       891 non-null int64
Parch       891 non-null int64
Fare        891 non-null float64
Pclass_1    891 non-null int32
Pclass_2    891 non-null int32
Pclass_3    891 non-null int32
Emb_C       891 non-null int32
Emb_Q       891 non-null int32
Emb_S       891 non-null int32
dtypes: float64(2), int32(6), int64(4)
memory usage: 62.7 KB


In [35]:
trainv = train.values

In [36]:
trainv.shape

(891, 12)

In [37]:
type(trainv)

numpy.ndarray

In [38]:
trainv.dtype

dtype('float64')

In [39]:
# Column 0 of the array is the target (Survived); the remaining columns are the features.
X, y = trainv[:, 1:], trainv[:, 0]

In [40]:
# Bring every feature column to zero mean and unit variance
# http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html
# http://scikit-learn.org/stable/modules/preprocessing.html#preprocessing-scaler
# NOTE(review): the scaler is fitted on all of X, i.e. before the hold-out split made later.

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X = scaler.fit_transform(X)

### Data prepared to predict for submission

In [41]:
test.columns

Index(['Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Pclass_1', 'Pclass_2',
       'Pclass_3', 'Emb_C', 'Emb_Q', 'Emb_S'],
      dtype='object')

In [42]:
# The models below are trained on standardized features, so the submission
# matrix must go through the same fitted scaler before it is used for prediction.
Xnew = scaler.transform(test.values)
Xnew.shape

(418, 11)

## Modeling

In [43]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.metrics import roc_auc_score, classification_report

In [44]:
# Hold out part of the labelled data for validation
# (no size is given, so the default 75% / 25% split applies)
holdout_parts = train_test_split(X, y, random_state=2)
Xtrain, Xtest, ytrain, ytest = holdout_parts
print(*(part.shape for part in (Xtrain, Xtest)))

(668, 11) (223, 11)


In [45]:
np.unique(ytrain, return_counts = True)

(array([0., 1.]), array([418, 250], dtype=int64))

In [46]:
np.unique(ytest, return_counts = True)

(array([0., 1.]), array([131,  92], dtype=int64))

## LogisticRegression

In [47]:
# http://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
# http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html

In [48]:
# Model
# Only the seed is set, so all other hyperparameters are library defaults;
# the printed repr shows the values used by the installed scikit-learn version.
model_lgr = LogisticRegression(random_state = 1)
print(model_lgr)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=1, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)


In [49]:
# C - Inverse of regularization strength; must be a positive float.
# Smaller values specify stronger regularization.

### Fit the model

In [50]:
model_lgr.fit(Xtrain, ytrain)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=1, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

### Model fit parameters

In [51]:
model_lgr.coef_

array([[-1.30311867, -0.51394113, -0.39622857, -0.00945507,  0.11178718,
         0.45518689,  0.12177519, -0.49129866,  0.09522484,  0.00531859,
        -0.08692794]])

In [52]:
# Flatten the (1, n_features) coefficient row; ravel() avoids hard-coding the feature count.
model_lgr.coef_.ravel()

array([-1.30311867, -0.51394113, -0.39622857, -0.00945507,  0.11178718,
        0.45518689,  0.12177519, -0.49129866,  0.09522484,  0.00531859,
       -0.08692794])

In [53]:
# Label each coefficient with its feature name. Column 0 of `train` is the
# target (Survived), so the feature names start at position 1.
# ravel() flattens the coefficient row without hard-coding the number of features.
params = pd.Series(model_lgr.coef_.ravel(), index=train.columns[1:])
params

Sex        -1.303119
Age        -0.513941
SibSp      -0.396229
Parch      -0.009455
Fare        0.111787
Pclass_1    0.455187
Pclass_2    0.121775
Pclass_3   -0.491299
Emb_C       0.095225
Emb_Q       0.005319
Emb_S      -0.086928
dtype: float64

In [54]:
model_lgr.intercept_

array([-0.7636303])

### Model validation

In [55]:
# In-sample predictions: class probabilities first, then hard labels
ypred_train_proba = model_lgr.predict_proba(Xtrain)
ypred_train = model_lgr.predict(Xtrain)

In [56]:
# Hold-out predictions: hard labels and class probabilities
ypred = model_lgr.predict(Xtest)
ypred_proba = model_lgr.predict_proba(Xtest)

print(ypred[:10])
print(ypred_proba[:5])

# Column 0 of ypred_proba is the probability of class zero (not survived),
# column 1 is the probability of class one (survived)

[0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]
[[0.69915956 0.30084044]
 [0.89445575 0.10554425]
 [0.18085052 0.81914948]
 [0.90835982 0.09164018]
 [0.67244666 0.32755334]]


#### Metrics: accuracy, confusion matrix, classification report, AUC
#### http://scikit-learn.org/stable/modules/classes.html#sklearn-metrics-metrics

In [57]:
# We can check our classification accuracy by comparing 
# the true values of the test set to the predictions:

In [58]:
# Accuracy on train
accuracy_score(ytrain, ypred_train)

0.8098802395209581

In [59]:
# Accuracy on test
accuracy_score(ytest, ypred)

0.7847533632286996

In [60]:
# Score for classification models is accuracy
model_lgr.score(Xtest, ytest)

0.7847533632286996

In [61]:
# Accuracy doesn't tell us where we've gone wrong: 
# one nice way to do this is to use the confusion matrix

In [62]:
print(confusion_matrix(ytest, ypred))

[[118  13]
 [ 35  57]]


In [63]:
# Per-class precision / recall / F1 on the hold-out set, with readable class labels
target_names = ['not survived', 'survived']
report_text = classification_report(ytest, ypred, target_names=target_names)
print(report_text)

              precision    recall  f1-score   support

not survived       0.77      0.90      0.83       131
    survived       0.81      0.62      0.70        92

 avg / total       0.79      0.78      0.78       223



In [64]:
# AUC
# roc_auc_score takes scores rather than hard labels: column 1 of predict_proba
# is the probability estimate of the positive class (survived)

print("AUC on train =", roc_auc_score(ytrain, ypred_train_proba[:, 1]))
print("AUC on test =", roc_auc_score(ytest, ypred_proba[:, 1]))

AUC on traint = 0.8610909090909091
AUC on test = 0.8375373382011284


#### <span style="color:red">Submitting the model_lgr predictions for Xnew to Kaggle gave a score (accuracy) of 0.7799</span>

### K-fold Cross-Validation

In [65]:
# http://scikit-learn.org/stable/modules/cross_validation.html

from sklearn.model_selection import cross_val_score

In [66]:
# Hold out 20% for the final check; the remaining 80% is used for cross-validation
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.2, random_state=2)

# 5-fold cross-validated accuracy of a default logistic regression
lgr = LogisticRegression(random_state = 1)
scores = cross_val_score(lgr, Xtrain, ytrain, cv=5)
scores

array([0.85314685, 0.8041958 , 0.81818182, 0.79577465, 0.78014184])

In [67]:
print("Mean cv accuracy : %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Mean cv accuracy : 0.81 (+/- 0.05)


In [68]:
# Refit on the whole 80% split and compare in-sample vs hold-out accuracy
lgr.fit(Xtrain, ytrain)
train_accuracy = lgr.score(Xtrain, ytrain)
test_accuracy = lgr.score(Xtest, ytest)
print("Accuracy on train", train_accuracy)
print("Accuracy on test", test_accuracy)

Accuracy on train 0.8160112359550562
Accuracy on test 0.7653631284916201


### Hyperparameters Grid Search

In [69]:
# http://scikit-learn.org/stable/modules/grid_search.html#grid-search

# GridSearchCV exhaustively considers all parameter combinations

from sklearn.model_selection import GridSearchCV

param_grid = {'C': [.001, .01, 1, 10],
              'penalty': ['l1', 'l2']}
# The grid includes the 'l1' penalty, which the lbfgs solver (the default in
# newer scikit-learn releases) does not support. Pin liblinear, the solver
# shown in the recorded output, so every grid point can be fitted.
lgr = LogisticRegression(solver='liblinear', random_state=1)
grid = GridSearchCV(lgr, param_grid, cv=5)

In [70]:
grid.fit(Xtrain, ytrain)

GridSearchCV(cv=5, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=1, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'C': [0.001, 0.01, 1, 10], 'penalty': ['l1', 'l2']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [71]:
# Mean cross-validated score of the best_estimator
grid.best_score_

0.8103932584269663

In [72]:
grid.best_params_

{'C': 1, 'penalty': 'l2'}

In [73]:
model = grid.best_estimator_

In [74]:
print(model)

LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=1, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)


In [75]:
model.score(Xtest,ytest)

0.7653631284916201

In [76]:
model.score(Xtrain,ytrain)

0.8160112359550562

### Save / load a model

In [120]:
# sklearn.externals.joblib was removed from newer scikit-learn releases: prefer the
# standalone joblib package and fall back to the bundled copy on old installs.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# Persist the tuned estimator; dump returns the list of file names it wrote.
joblib.dump(model, 'model.pkl')

['model.pkl']

In [121]:
# Reload the estimator written by the previous cell.
# NOTE: joblib files are pickle-based, so only load files you created or trust.
model1 = joblib.load('model.pkl') 

In [122]:
print(model1)

LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=1, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)


In [123]:
ypred = model1.predict(Xtest)
ypred[:10]

array([ 0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.])

## RandomForestClassifier

In [99]:
# http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html
from sklearn.ensemble import RandomForestClassifier

In [None]:
# rfc = RandomForestClassifier(random_state = ...)

In [None]:
# Split data into train and test
# Xtrain, Xtest, ytrain, ytest = ...

In [None]:
# Fit the model


In [None]:
# Model accuracy on train and on test data


In [None]:
# Feature importances
# pd.Series(rfc.feature_importances_, index=train.columns[1:])

In [None]:
# Predict on test

In [None]:
# Print classification report

#### Hyperparameters Grid Search

In [None]:
# from sklearn.model_selection import GridSearchCV
# param_grid = {'n_estimators': [5, 10, 15, 20],
#              'max_depth': [2, 3, 4, 5, 6, 7, 8]}
# grid = GridSearchCV(RandomForestClassifier(random_state = 2), param_grid, cv=5)

In [None]:
# Fit grid


In [None]:
# Mean cross-validated score of the best_estimator


In [None]:
# Parameters (max_depth and n_estimators) of the best_estimator


In [None]:
# Accuracy of the best_estimator on train and test

### GradientBoostingClassifier

In [166]:
# http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingClassifier.html
from sklearn.ensemble import GradientBoostingClassifier

In [None]:
# Continue by analogy with RandomForestClassifier

### XGBoost
#### http://xgboost.readthedocs.io/en/latest/python/python_intro.html

### Submission

In [54]:
# Generate Submission File 

# Use model with the best accuracy on test to predict on Xnew (ypred_Xnew should be int)

# Example: ypred_Xnew = model_lgr.predict(Xnew).astype(int)

# submission = pd.DataFrame({ 'PassengerId': test_pas_id,
#                             'Survived': ypred_Xnew })
# submission.to_csv("submission.csv", index=False)

1) Register on https://www.kaggle.com
2) Go to https://www.kaggle.com/c/titanic/submit
3) Submit your csv file and get the score (accuracy)