In [1]:

# Importing all the modules

%matplotlib inline  
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import linear_model
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error, r2_score
import re

In [2]:
# read train and test data

train = pd.read_csv(r'C:\Users\pv112g\Desktop\Kaggle\Titanic\titanic_train.csv')
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
train[train['Sex'] == 'female']['Age'].describe()

count    261.000000
mean      27.915709
std       14.110146
min        0.750000
25%       18.000000
50%       27.000000
75%       37.000000
max       63.000000
Name: Age, dtype: float64

In [5]:
# Creating dummy variables and adding them to train df.

Pclass_dum = pd.get_dummies(train['Pclass'],prefix = 'Pclass')
sex_dum = pd.get_dummies(train['Sex'],prefix = 'Gender')
Embarked_dum = pd.get_dummies(train['Embarked'],prefix = 'Embarked')

train = pd.merge(train,Pclass_dum,right_index=True, left_index=True)
train = pd.merge(train,sex_dum,right_index=True, left_index=True)
train = pd.merge(train,Embarked_dum,right_index=True, left_index=True)

train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Pclass_1,Pclass_2,Pclass_3,Gender_female,Gender_male,Embarked_C,Embarked_Q,Embarked_S
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,0,0,1,0,1,0,0,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,1,0,0,1,0,1,0,0
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,0,0,1,1,0,0,0,1
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,1,0,0,1,0,0,0,1
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,0,0,1,0,1,0,0,1


In [6]:
# Imputing missing age data

# Approach 1 - linear regression to predict age using fare, embarked, class & gender

reg_data = train[train['Age'].notnull()]
len(reg_data)

reg_x = reg_data[['Pclass_1','Pclass_2','Pclass_3','Gender_male','Gender_female','Fare','Embarked_C','Embarked_Q','Embarked_S']]
reg_y = reg_data[['Age']]

regr = linear_model.LinearRegression()
regr.fit(reg_x, reg_y)
pred = regr.predict(reg_x)

print('Coefficients: \n', regr.coef_)
# The mean squared error
print("Mean squared error: %.2f"
      % mean_squared_error(reg_y, pred))
# Explained variance score: 1 is perfect prediction
print('Variance score: %.2f' % r2_score(reg_y, pred))

# Based on the r-squared value below we can see it's a poor fit. 


# Approach 2 - Imputation of age using mean

'''
However, instead of taking mean of the entire age, I am trying to impute using their prefix their relationships
as spouse sibling or parent children and then find mean age for everything and apply that age so that we can get 
slightly better age range. 
'''

def prefix(x):
    if re.search(r',(.*)',x):
        if re.search(r'(.*)\.',x):
            return re.findall(r'(.*)\.',re.findall(r',(.*)',x)[0])[0].strip()

train['prefix'] = train['Name'].apply(lambda x : prefix(x))
train.head()

mean_imputation = pd.DataFrame(train.groupby(['Parch','SibSp','prefix'])['Age'].mean()).reset_index().rename(columns = {'Age':
                                                                                                                       'Imp_Age'})
train = pd.merge(train,mean_imputation, on = ['Parch','SibSp','prefix'],how = 'left')
train['Age'] = train['Age'].fillna(train['Imp_Age'])
train['Age'].fillna((train['Age'].mean()), inplace=True)

Coefficients: 
 [[  9.77425833  -2.02744852  -7.74680981   2.03550361  -2.03550361
   -0.043053   -14.96224471 -11.03381036 -13.02521089]]
Mean squared error: 171.82
Variance score: 0.18


In [7]:
# Apply different machine learning model (asssumption is that Age, Pclass, gender and embarked are the biggest drivers)
from sklearn.cross_validation import train_test_split
train = train[train['Age'].notnull()]

train_x = train[['Pclass_1','Pclass_2','Pclass_3','Gender_male','Gender_female','Fare','Embarked_C','Embarked_Q',
                      'Embarked_S', 'Age']]
train_y = train[['Survived']]

X_train, X_test, y_train, y_test = train_test_split(train_x , 
                                                    train_y, 
                                                    test_size = 0.2, 
                                                    random_state = 0)


# from sklearn.metrics import classification_report
# print(classification_report(train_y, y_pred))



In [8]:
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE

clf_A = LinearSVC(random_state = 42)
clf_B = LogisticRegression(random_state = 42)
clf_C = RandomForestClassifier(random_state = 42)

for clf in [clf_A,clf_B, clf_C]:
    clf.fit(train_x, train_y)
    print('Accuracy of clf classifier on train set: {:.2f}'.format(clf.score(X_train,y_train)))
    print('Accuracy of clf classifier on train set: {:.2f}'.format(clf.score(X_test,y_test)))

# clf_A.fit(train_x, train_y)

# print('Accuracy of SVM regression classifier on test set: {:.2f}'.format(clf_A.score(train_x, train_y)))

# clf_C.fit(train_x, train_y)

# print('Accuracy of decision tree regression classifier on test set: {:.2f}'.format(clf_C.score(train_x, train_y)))

Accuracy of clf classifier on train set: 0.74
Accuracy of clf classifier on train set: 0.70
Accuracy of clf classifier on train set: 0.79
Accuracy of clf classifier on train set: 0.80
Accuracy of clf classifier on train set: 0.96
Accuracy of clf classifier on train set: 0.98


  y = column_or_1d(y, warn=True)
  # Remove the CWD from sys.path while we load stuff.


In [9]:
test = pd.read_csv(r'C:\Users\pv112g\Desktop\Kaggle\Titanic\test.csv')
Pclass_dum = pd.get_dummies(test['Pclass'],prefix = 'Pclass')
sex_dum = pd.get_dummies(test['Sex'],prefix = 'Gender')
Embarked_dum = pd.get_dummies(test['Embarked'],prefix = 'Embarked')

test = pd.merge(test,Pclass_dum,right_index=True, left_index=True)
test = pd.merge(test,sex_dum,right_index=True, left_index=True)
test = pd.merge(test,Embarked_dum,right_index=True, left_index=True)

test['prefix'] = test['Name'].apply(lambda x : prefix(x))
test = pd.merge(test,mean_imputation, on = ['Parch','SibSp','prefix'],how = 'left')
test['Age'] = test['Age'].fillna(test['Imp_Age'])
test['Age'].fillna((test['Age'].mean()), inplace=True)
test['Fare'].fillna((test['Fare'].mean()), inplace=True)
test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 418 entries, 0 to 417
Data columns (total 21 columns):
PassengerId      418 non-null int64
Pclass           418 non-null int64
Name             418 non-null object
Sex              418 non-null object
Age              418 non-null float64
SibSp            418 non-null int64
Parch            418 non-null int64
Ticket           418 non-null object
Fare             418 non-null float64
Cabin            91 non-null object
Embarked         418 non-null object
Pclass_1         418 non-null uint8
Pclass_2         418 non-null uint8
Pclass_3         418 non-null uint8
Gender_female    418 non-null uint8
Gender_male      418 non-null uint8
Embarked_C       418 non-null uint8
Embarked_Q       418 non-null uint8
Embarked_S       418 non-null uint8
prefix           418 non-null object
Imp_Age          404 non-null float64
dtypes: float64(3), int64(4), object(6), uint8(8)
memory usage: 49.0+ KB


In [10]:
test_x = test[['Pclass_1','Pclass_2','Pclass_3','Gender_male','Gender_female','Fare','Embarked_C','Embarked_Q',
                      'Embarked_S', 'Age']]

test_y = pd.DataFrame({'Survived': clf.predict(test_x)})
test_a = pd.merge(test,test_y,right_index=True, left_index=True)
test_a.head()

submission_1 = test_a[['PassengerId','Survived']]

In [11]:
submission_1.to_csv(r'C:\Users\pv112g\Desktop\Kaggle\Titanic\subvmission_2.csv')