In [42]:
# This is the library import session.
import sys # system parameter
#ignore warnings
import warnings
warnings.filterwarnings('ignore')
import pandas as pd # database processing package similar to SQL

# Common Machine Learning Algorithms
from sklearn import svm, tree, linear_model, neighbors, \
naive_bayes, ensemble, discriminant_analysis, gaussian_process
#from xgboost import XGBClassifier missing wait for add
#Common Model Helper package
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn import feature_selection
from sklearn import model_selection
from sklearn import metrics

#Visualization
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
import seaborn as sns
from pandas.tools.plotting import scatter_matrix

#Configure Visualization Defaults
%matplotlib inline
mpl.style.use('ggplot')
sns.set_style('white')
pylab.rcParams['figure.figsize'] = 12, 8

In [9]:
# Data import
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')
train_test = train_df.copy(deep = True)
data_cleaner = [train_test, test_df]

In [18]:
# Data preview and exploration
train_df.info()
train_df.sample(10)
train_df.groupby('Survived').count()
print('Train Columns with null:\n', train_test.isnull().sum())
print("-" * 10)

print('Test/Validation columns with null:\n', test_df.isnull().sum())
print("-" * 10)

train_df.describe(include = 'all')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB
Train Columns with null:
 PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64
----------
Test/Validation columns with null:
 PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp 

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
count,891.0,891.0,891.0,891,891,714.0,891.0,891.0,891.0,891.0,204,889
unique,,,,891,2,,,,681.0,,147,3
top,,,,"Chambers, Mr. Norman Campbell",male,,,,1601.0,,C23 C25 C27,S
freq,,,,1,577,,,,7.0,,4,644
mean,446.0,0.383838,2.308642,,,29.699118,0.523008,0.381594,,32.204208,,
std,257.353842,0.486592,0.836071,,,14.526497,1.102743,0.806057,,49.693429,,
min,1.0,0.0,1.0,,,0.42,0.0,0.0,,0.0,,
25%,223.5,0.0,2.0,,,20.125,0.0,0.0,,7.9104,,
50%,446.0,0.0,3.0,,,28.0,0.0,0.0,,14.4542,,
75%,668.5,1.0,3.0,,,38.0,1.0,0.0,,31.0,,


In [38]:
# Data Cleaning
for dataset in data_cleaner:
    # fill the missing data with median
    dataset['Age'].fillna(dataset['Age'].median(), inplace = True)
    
    #Complete Embark information with mode
    dataset['Embarked'].fillna(dataset['Embarked'].mode()[0], inplace = True) #mode is the number with largest frequency
    #complete the missing fare with median
    dataset['Fare'].fillna(dataset['Fare'].median(), inplace = True)
    
drop_column = ['PassengerId', 'Cabin', 'Ticket']
train_test.drop(drop_column, axis=1, inplace=True)
    
print(train_test.isnull().sum())
print("-" * 10)
print(test_df.isnull().sum())

KeyError: "labels ['PassengerId' 'Cabin' 'Ticket'] not contained in axis"

In [55]:
### Feature engineering for train and test/validation dataset
for dataset in data_cleaner:
    # count the total number of family number
    dataset['FamilySize'] = dataset['SibSp'] + dataset['Parch'] + 1
    
    dataset['IsAlone'] = 1 # use 1 to denote alone
    dataset['IsAlone'].loc[dataset['FamilySize'] > 1] = 0
    # filter out the title like Mr. Ms. Miss etc
    dataset['Title'] = dataset['Name'].str.split(", ", expand=True)[1]\
    .str.split(".", expand = True)[0]
    # catogorize using the frequency distribution 0%, 25%, 50%, 75%, 100%
    dataset['FareBin'] = pd.qcut(dataset['Fare'], 4)
    #catogorize using the age range evenly separated into 5
    dataset['AgeBin'] = pd.cut(dataset['Age'].astype(int), 5)

stat_min = 10
title_names = (train_test['Title'].value_counts() < stat_min)
    
train_test['Title'] = train_test['Title'].apply(lambda x: 'Misc' if title_names.loc[x] == True else x)
print(train_test['Title'].value_counts())
print("-" * 10)
    
train_test.info()
test_df.info()
train_test.sample(10)

Mr        517
Miss      182
Mrs       125
Master     40
Misc       27
Name: Title, dtype: int64
----------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 14 columns):
Survived      891 non-null int64
Pclass        891 non-null int64
Name          891 non-null object
Sex           891 non-null object
Age           891 non-null float64
SibSp         891 non-null int64
Parch         891 non-null int64
Fare          891 non-null float64
Embarked      891 non-null object
FamilySize    891 non-null int64
IsAlone       891 non-null int64
Title         891 non-null object
FareBin       891 non-null category
AgeBin        891 non-null category
dtypes: category(2), float64(2), int64(6), object(4)
memory usage: 85.5+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 16 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Name           418 non-null object
Sex            418 non-null 

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked,FamilySize,IsAlone,Title,FareBin,AgeBin
488,0,3,"Somerton, Mr. Francis William",male,30.0,0,0,8.05,S,1,1,Mr,"(7.91, 14.454]","(16.0, 32.0]"
463,0,2,"Milling, Mr. Jacob Christian",male,48.0,0,0,13.0,S,1,1,Mr,"(7.91, 14.454]","(32.0, 48.0]"
152,0,3,"Meo, Mr. Alfonzo",male,55.5,0,0,8.05,S,1,1,Mr,"(7.91, 14.454]","(48.0, 64.0]"
780,1,3,"Ayoub, Miss. Banoura",female,13.0,0,0,7.2292,C,1,1,Miss,"(-0.001, 7.91]","(-0.08, 16.0]"
44,1,3,"Devaney, Miss. Margaret Delia",female,19.0,0,0,7.8792,Q,1,1,Miss,"(-0.001, 7.91]","(16.0, 32.0]"
288,1,2,"Hosono, Mr. Masabumi",male,42.0,0,0,13.0,S,1,1,Mr,"(7.91, 14.454]","(32.0, 48.0]"
821,1,3,"Lulic, Mr. Nikola",male,27.0,0,0,8.6625,S,1,1,Mr,"(7.91, 14.454]","(16.0, 32.0]"
327,1,2,"Ball, Mrs. (Ada E Hall)",female,36.0,0,0,13.0,S,1,1,Mrs,"(7.91, 14.454]","(32.0, 48.0]"
104,0,3,"Gustafsson, Mr. Anders Vilhelm",male,37.0,2,0,7.925,S,3,0,Mr,"(7.91, 14.454]","(32.0, 48.0]"
91,0,3,"Andreasson, Mr. Paul Edvin",male,20.0,0,0,7.8542,S,1,1,Mr,"(-0.001, 7.91]","(16.0, 32.0]"
