In [11]:
import sys
import numpy as np
import matplotlib
import sklearn
import pandas as pd
import scipy as sp

#import the dataset
import os

# misc librabries
import warnings
import random
import time
import IPython

from IPython import display
from subprocess import check_output

In [19]:
print ("Python version: {}".format(sys.version))
print ("pandas version : {}".format(pd.__version__))
print ("matplotlib version: {}".format(matplotlib.__version__))
print ("Numpy version: {}".format(np.__version__))
print ("Scipy version: {}".format(sp.__version__))
print ("IPython version: {}".format(IPython.__version__))
print ("scikit-learn version: {}".format(sklearn.__version__))
warnings.filterwarnings('ignore')
print('-' * 66)

print(check_output(["ls", "../datasets"]).decode("utf8"))

Python version: 2.7.14 |Anaconda, Inc.| (default, Dec  7 2017, 11:07:58) 
[GCC 4.2.1 Compatible Clang 4.0.1 (tags/RELEASE_401/final)]
pandas version : 0.22.0
matplotlib version: 2.1.2
Numpy version: 1.14.0
Scipy version: 1.0.0
IPython version: 5.4.1
scikit-learn version: 0.19.1
------------------------------------------------------------------
gender_submission.csv
test.csv
train.csv



In [22]:
from sklearn import svm, tree, linear_model, neighbors, naive_bayes, ensemble, discriminant_analysis, gaussian_process
from xgboost import XGBClassifier

#Common Model Helpers
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn import feature_selection
from sklearn import model_selection
from sklearn import metrics

#Visualization
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
import seaborn as sns
from pandas.tools.plotting import scatter_matrix

#Configure Visualization Defaults
#%matplotlib inline = show plots in jupyter Notebook browser
%matplotlib inline
mpl.style.use('ggplot')
sns.set_style('white')
pylab.rcParams['figure.figsize'] = 12, 8

In [28]:
#import data from file
data_raw = pd.read_csv('../datasets/train.csv')

#A dataset must be divided into 3 parts: train, test and cross validation
#the test file provided is the validation file for competition submission
#we will split the train and test data in future sections
data_val = pd.read_csv('../datasets/test.csv')

# to play with the data we should create a copy 

data1 = data_raw.copy(deep = True)

#however passing the reference is conveninent, because we can clean both dayasets at once
data_cleaner = [data1, data_val]

#preview data
print(data_raw.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB
None


In [29]:
print('Train columns with null values: \n', data1.isnull().sum())
print('-' *  10)

print('Test/Validation columns with null values:\n', data_val.isnull().sum())
print('-' * 10)

data_raw.describe(include = 'all')

('Train columns with null values: \n', PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64)
----------
('Test/Validation columns with null values:\n', PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64)
----------


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
count,891.0,891.0,891.0,891,891,714.0,891.0,891.0,891,891.0,204,889
unique,,,,891,2,,,,681,,147,3
top,,,,"Graham, Mr. George Edward",male,,,,CA. 2343,,C23 C25 C27,S
freq,,,,1,577,,,,7,,4,644
mean,446.0,0.383838,2.308642,,,29.699118,0.523008,0.381594,,32.204208,,
std,257.353842,0.486592,0.836071,,,14.526497,1.102743,0.806057,,49.693429,,
min,1.0,0.0,1.0,,,0.42,0.0,0.0,,0.0,,
25%,223.5,0.0,2.0,,,20.125,0.0,0.0,,7.9104,,
50%,446.0,0.0,3.0,,,28.0,0.0,0.0,,14.4542,,
75%,668.5,1.0,3.0,,,38.0,1.0,0.0,,31.0,,


In [30]:
###COMPLETING : complete or delete missing values in train and test/Vaslidation dataset
for dataset in data_cleaner:
    #complete missing age with median
    dataset['Age'].fillna(dataset['Age'].median(), inplace = True)
    
    #complete embarked with mode
    dataset['Embarked'].fillna(dataset['Embarked'].mode()[0],inplace = True)
    
    #complete missing fare with median
    dataset['Fare'].fillna(dataset['Fare'].median(), inplace = True)

    
#delete the cabin feature/column and others previously stated to exclude in train dataset
drop_column = ['PassengerId', 'Cabin', 'Ticket']
data1.drop(drop_column, axis = 1, inplace = True)

print(data1.isnull().sum)
print('_' * 20)
print(data_val.isnull().sum())

<bound method DataFrame.sum of      Survived  Pclass   Name    Sex    Age  SibSp  Parch   Fare  Embarked
0       False   False  False  False  False  False  False  False     False
1       False   False  False  False  False  False  False  False     False
2       False   False  False  False  False  False  False  False     False
3       False   False  False  False  False  False  False  False     False
4       False   False  False  False  False  False  False  False     False
5       False   False  False  False  False  False  False  False     False
6       False   False  False  False  False  False  False  False     False
7       False   False  False  False  False  False  False  False     False
8       False   False  False  False  False  False  False  False     False
9       False   False  False  False  False  False  False  False     False
10      False   False  False  False  False  False  False  False     False
11      False   False  False  False  False  False  False  False     False
12     

In [37]:
for dataset in data_cleaner:
    #Discrete variables
    dataset['FamilySize'] = dataset['SibSp'] + dataset['Parch'] + 1
    
    dataset['IsAlone'] = 1 #initialize to yes/1 is alone
    dataset['IsAlone'].loc[dataset['FamilySize'] > 1] = 0#now update to no/0 if familysize greater than 1
    
    #dirty and quick code split title 
    dataset['Title'] = dataset['Name'].str.split(", ", expand = True)[1].str.split(".", expand = True)[0]
    #dataset['Title'] = dataset.Name.str.extract('([A-Za-z]+)\.',expand = False)
    
    #Continuous variable bins; qcut vs cut
    #Fare bins/Buckets using qcut or frequency bins
    dataset['FareBin'] = pd.qcut(dataset['Fare'], 4)
    dataset['AgeBin'] = pd.cut(dataset['Age'].astype(int), 5)
    
    

#cleanup rare title names
#print(data1['Title'].value_counts())
stat_min = 10

title_names = (data1['Title'].value_counts() < stat_min)#this will create a true false series with title name as index

data1['Title'] = data1['Title'].apply(lambda x: 'Misc' if title_names.loc[x] == True else x)
print(data1['Title'].value_counts())
print("_" * 20)

#preview data again
data1.info()
data_val.info()
data1.sample(10)

Mr        517
Miss      182
Mrs       125
Master     40
Misc       27
Name: Title, dtype: int64
____________________
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 14 columns):
Survived      891 non-null int64
Pclass        891 non-null int64
Name          891 non-null object
Sex           891 non-null object
Age           891 non-null float64
SibSp         891 non-null int64
Parch         891 non-null int64
Fare          891 non-null float64
Embarked      891 non-null object
FamilySize    891 non-null int64
IsAlone       891 non-null int64
Title         891 non-null object
FareBin       891 non-null category
AgeBin        891 non-null category
dtypes: category(2), float64(2), int64(6), object(4)
memory usage: 85.5+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 16 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Name           418 non-null object
Sex            418

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked,FamilySize,IsAlone,Title,FareBin,AgeBin
572,1,1,"Flynn, Mr. John Irwin (""Irving"")",male,36.0,0,0,26.3875,S,1,1,Mr,"(14.454, 31.0]","(32.0, 48.0]"
195,1,1,"Lurette, Miss. Elise",female,58.0,0,0,146.5208,C,1,1,Miss,"(31.0, 512.329]","(48.0, 64.0]"
633,0,1,"Parr, Mr. William Henry Marsh",male,28.0,0,0,0.0,S,1,1,Mr,"(-0.001, 7.91]","(16.0, 32.0]"
441,0,3,"Hampe, Mr. Leon",male,20.0,0,0,9.5,S,1,1,Mr,"(7.91, 14.454]","(16.0, 32.0]"
523,1,1,"Hippach, Mrs. Louis Albert (Ida Sophia Fischer)",female,44.0,0,1,57.9792,C,2,0,Mrs,"(31.0, 512.329]","(32.0, 48.0]"
715,0,3,"Soholt, Mr. Peter Andreas Lauritz Andersen",male,19.0,0,0,7.65,S,1,1,Mr,"(-0.001, 7.91]","(16.0, 32.0]"
593,0,3,"Bourke, Miss. Mary",female,28.0,0,2,7.75,Q,3,0,Miss,"(-0.001, 7.91]","(16.0, 32.0]"
9,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,30.0708,C,2,0,Mrs,"(14.454, 31.0]","(-0.08, 16.0]"
719,0,3,"Johnson, Mr. Malkolm Joackim",male,33.0,0,0,7.775,S,1,1,Mr,"(-0.001, 7.91]","(32.0, 48.0]"
196,0,3,"Mernagh, Mr. Robert",male,28.0,0,0,7.75,Q,1,1,Mr,"(-0.001, 7.91]","(16.0, 32.0]"


In [41]:
#Convert : convert objects to category using Label Encoder for train and test/Validation dataset

#code categorical dataset
label = LabelEncoder()
for dataset in data_cleaner:
    dataset['Sex_Code'] = label.fit_transform(dataset['Sex'])
    dataset['Embarked_Code'] = label.fit_transform(dataset['Embarked'])
    dataset['Title_Code'] = label.fit_transform(dataset['Title'])
    dataset['AgeBin_Code'] = label.fit_transform(dataset['AgeBin'])
    dataset['FareBin_Code'] = label.fit_transform(dataset['FareBin'])

#define y variable aka target/outcome
Target = ['Survived']

#define x variables for original features aka feature selection
data1_x = ['Sex', 'Pclass', 'Embarked', 'Title', 'SibSp','Parch','Age', 'Fare', 'FamilySize', 'IsAlone']

data1_x_calc = ['Sex_Code', 'Pclass', 'Embarked_Code', 'Title_Code','SibSp', 'Parch', 'Age', 'Fare']


data1_xy = Target + data1_x
print('Original X Y:', data1_xy, '\n')

#define x variables for original w/bin features to remove continuous variables
data1_x_bin = ['Sex_Code', 'Pclass', 'Embarked_Code', 'Title_Code', 'FamilySize', 'AgeBin_Code', 'FareBin_Code']
data1_xy_bin = Target + data1_x_bin
print('Bin X Y:' , data1_xy_bin, '\n')


#define x and y variable for dummy features original
data1_dummy = pd.get_dummies(data1[data1_x])
data1_x_dummy = data1_dummy.columns.tolist()
data1_xy_dummy = Target + data1_x_dummy
print('Dummy X Y: ', data1_xy_dummy, '\n')

data1_dummy.head()

('Original X Y:', ['Survived', 'Sex', 'Pclass', 'Embarked', 'Title', 'SibSp', 'Parch', 'Age', 'Fare', 'FamilySize', 'IsAlone'], '\n')
('Bin X Y:', ['Survived', 'Sex_Code', 'Pclass', 'Embarked_Code', 'Title_Code', 'FamilySize', 'AgeBin_Code', 'FareBin_Code'], '\n')
('Dummy X Y: ', ['Survived', 'Pclass', 'SibSp', 'Parch', 'Age', 'Fare', 'FamilySize', 'IsAlone', 'Sex_female', 'Sex_male', 'Embarked_C', 'Embarked_Q', 'Embarked_S', 'Title_Master', 'Title_Misc', 'Title_Miss', 'Title_Mr', 'Title_Mrs'], '\n')


Unnamed: 0,Pclass,SibSp,Parch,Age,Fare,FamilySize,IsAlone,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S,Title_Master,Title_Misc,Title_Miss,Title_Mr,Title_Mrs
0,3,1,0,22.0,7.25,2,0,0,1,0,0,1,0,0,0,1,0
1,1,1,0,38.0,71.2833,2,0,1,0,1,0,0,0,0,0,0,1
2,3,0,0,26.0,7.925,1,1,1,0,0,0,1,0,0,1,0,0
3,1,1,0,35.0,53.1,2,0,1,0,0,0,1,0,0,0,0,1
4,3,0,0,35.0,8.05,1,1,0,1,0,0,1,0,0,0,1,0
