## Python titanic exercise

[exercise description](https://github.com/tlmohren/python320kincaid/blob/master/exercises/optional_exercise_machine_learning.md)

[titanic data description](https://www.kaggle.com/c/titanic/data)

In [1]:
import numpy as np
import pandas as pd
import random as rnd

# visualization
import seaborn as sns
import matplotlib.pyplot as plt

# %matplotlib inline
# # machine learning
# from sklearn.linear_model import LogisticRegression
# from sklearn.svm import SVC, LinearSVC
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.neighbors import KNeighborsClassifier
# from sklearn.naive_bayes import GaussianNB
# from sklearn.linear_model import Perceptron
# from sklearn.linear_model import SGDClassifier
# from sklearn.tree import DecisionTreeClassifier

In [2]:
train_df = pd.read_csv('data\\titanic_train.csv')
test_df = pd.read_csv('data\\titanic_test.csv')
combine = [train_df,test_df]
print(train_df.columns.values)

['PassengerId' 'Survived' 'Pclass' 'Name' 'Sex' 'Age' 'SibSp' 'Parch'
 'Ticket' 'Fare' 'Cabin' 'Embarked']


In [3]:
train_df.groupby(['Sex','Survived'])['Survived'].count()

Sex     Survived
female  0            81
        1           233
male    0           468
        1           109
Name: Survived, dtype: int64

In [4]:
train_df.describe(include=['O'])

Unnamed: 0,Name,Sex,Ticket,Cabin,Embarked
count,891,891,891,204,889
unique,891,2,681,147,3
top,"Brocklebank, Mr. William Alfred",male,347082,C23 C25 C27,S
freq,1,577,7,4,644


In [5]:
print( train_df[['Pclass','Survived']].groupby(['Pclass'],as_index=False).mean() )
print('_'*20)
print(train_df[["Sex","Survived"]].groupby(['Sex'],as_index=False).mean() )
print('_'*20)
print( train_df[["SibSp","Survived"]].groupby(['SibSp'],as_index=False).mean() )
print('_'*20)
print( train_df[["Parch","Survived"]].groupby(["Parch"]).mean() )

   Pclass  Survived
0       1  0.629630
1       2  0.472826
2       3  0.242363
____________________
      Sex  Survived
0  female  0.742038
1    male  0.188908
____________________
   SibSp  Survived
0      0  0.345395
1      1  0.535885
2      2  0.464286
3      3  0.250000
4      4  0.166667
5      5  0.000000
6      8  0.000000
____________________
       Survived
Parch          
0      0.343658
1      0.550847
2      0.500000
3      0.600000
4      0.000000
5      0.200000
6      0.000000


In [6]:
guess_ages = np.zeros((2,3))
title_mapping = {"Mr":1, "Miss": 2, "Mrs": 3, "Master": 4, "Rare": 5}

for dataset in combine:
    # find title
    dataset['Title'] = dataset.Name.str.extract(' ([A-Za-z]+)\.',expand=False)
    
    #replace rare titles by 'rare'
    dataset['Title'] = dataset['Title'].replace(['Lady', 'Countess','Capt', 'Col',\
                'Don','Dr','Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'],'Rare')
    # replace misspellings 
    dataset['Title'] = dataset['Title'].replace('Mlle', 'Miss')
    dataset['Title'] = dataset['Title'].replace('Ms', 'Miss')
    dataset['Title'] = dataset['Title'].replace('Mme', 'Mrs')
    # map title into cardinal value
    dataset['Title'] = dataset['Title'].map(title_mapping)
    dataset['Title'] = dataset['Title'].fillna(0)

    #map gender into categorical feature
    dataset['Sex'] = dataset['Sex'].map( {'female': 1, 'male':0}).astype(int)
    
    # guess non given ages 
    for i in range(0, 2):
        for j in range(0, 3):
            guess_df = dataset[(dataset['Sex'] == i) & \
                               (dataset['Pclass'] == j+1)]['Age'].dropna()
            age_guess = guess_df.median()
            guess_ages[i,j] = int( age_guess/0.5 + 0.5 ) * 0.5
    for i in range(0,2):
        for j in range(0,3):
            dataset.loc[ (dataset.Age.isnull()) & (dataset.Sex ==i) & (dataset.Pclass == j+1), \
                       'Age'] = guess_ages[i,j] 
    dataset['Age'] = dataset['Age'].astype(int)

In [7]:
train_df['FareBand'] = pd.qcut(train_df['Fare'], 4)
train_df[['FareBand', 'Survived']].groupby(['FareBand'], \
                        as_index=False).mean().sort_values(by='FareBand', ascending=True)

# # convert age into cardinal values 
train_df['AgeBand'] = pd.cut( train_df['Age'],5)
train_df[['AgeBand', 'Survived']].groupby(['AgeBand'], as_index=False).mean()

freq_port = train_df.Embarked.dropna().mode()[0] 
test_df['Fare'].fillna(test_df['Fare'].dropna().median(), inplace=True)
                                           
dataset.head(10)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title
0,892,3,"Kelly, Mr. James",0,34,0,0,330911,7.8292,,Q,1
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",1,47,1,0,363272,7.0,,S,3
2,894,2,"Myles, Mr. Thomas Francis",0,62,0,0,240276,9.6875,,Q,1
3,895,3,"Wirz, Mr. Albert",0,27,0,0,315154,8.6625,,S,1
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",1,22,1,1,3101298,12.2875,,S,3
5,897,3,"Svensson, Mr. Johan Cervin",0,14,0,0,7538,9.225,,S,1
6,898,3,"Connolly, Miss. Kate",1,30,0,0,330972,7.6292,,Q,2
7,899,2,"Caldwell, Mr. Albert Francis",0,26,1,1,248738,29.0,,S,1
8,900,3,"Abrahim, Mrs. Joseph (Sophie Halaut Easu)",1,18,0,0,2657,7.2292,,C,3
9,901,3,"Davies, Mr. John Samuel",0,21,2,0,A/4 48871,24.15,,S,1


In [8]:
for dataset in combine:
    dataset.loc[dataset['Age'] <= 16, 'Age'] = 0 
    dataset.loc[(dataset['Age'] >16) & (dataset['Age'] <=32), 'Age'] = 1
    dataset.loc[(dataset['Age'] >32) & (dataset['Age'] <=48), 'Age'] = 2
    dataset.loc[(dataset['Age'] >48) & (dataset['Age'] <=64), 'Age'] = 3
    dataset.loc[(dataset['Age'] >64),'Age']  = 4

    dataset['Embarked'] = dataset['Embarked'].fillna(freq_port)

    dataset['FamilySize'] = dataset['SibSp']+ dataset['Parch'] + 1
    dataset['IsAlone'] = 0
    
    dataset.loc[dataset['FamilySize'] == 1, 'IsAlone'] = 1
    
    dataset['Age*Class'] = dataset.Age * dataset.Pclass
    dataset['Embarked'] = dataset['Embarked'].map( {'S':0, 'C': 1, 'Q':2 } ).astype(int)
    
    dataset.loc[ dataset['Fare'] <= 7.91, 'Fare'] = 0
    dataset.loc[(dataset['Fare'] > 7.91) & (dataset['Fare'] <= 14.454), 'Fare'] = 1
    dataset.loc[(dataset['Fare'] > 14.454) & (dataset['Fare'] <= 31), 'Fare']   = 2
    dataset.loc[ dataset['Fare'] > 31, 'Fare'] = 3
    dataset['Fare'] = dataset['Fare'].astype(int)
combine = [train_df, test_df]

In [9]:
# train_df = train_df.drop(['AgeBand'] , axis= 1)
# #drop names 
train_df = train_df.drop(['AgeBand','FareBand','Name','Cabin','Ticket','Parch', 'SibSp', 'FamilySize'],axis=1) 
test_df = test_df.drop(['AgeBand','FareBand','Name','Cabin','Ticket','Parch', 'SibSp', 'FamilySize'],axis=1) 
# 'AgeBand','FareBand'
# 
# # train_df =  train_df.drop('AgeBand', axis=1)
# train_df.head()

KeyError: "['FamilySize'] not in index"

In [11]:
# for dataset in combine:
train_df.head()
# train_df[['IsAlone', 'Survived']].groupby(['IsAlone'], as_index=False).mean()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,Fare,Embarked,Title,FareBand,AgeBand,IsAlone,Age*Class
0,1,0,3,0,1,0,0,1,"(-0.001, 7.91]","(16.0, 32.0]",0,3
1,2,1,1,1,2,3,1,3,"(31.0, 512.329]","(32.0, 48.0]",0,2
2,3,1,3,1,1,1,0,2,"(7.91, 14.454]","(16.0, 32.0]",1,3
3,4,1,1,1,2,3,0,3,"(31.0, 512.329]","(32.0, 48.0]",0,2
4,5,0,3,0,2,1,0,1,"(7.91, 14.454]","(32.0, 48.0]",1,6


In [None]:

combine = [train_df, test_df]

train_df.head()

In [None]:
for dataset in combine: 
    
train_df.loc[:, ['Age*Class', 'Age', 'Pclass']].head(10)

In [None]:

print( freq_port)

In [None]:
for dataset in combine:
    
train_df[['Embarked', 'Survived']].groupby(['Embarked'],as_index=False).mean()

In [None]:
for dataset in combine:
    
train_df.head()

In [None]:
test_df['Fare'].fillna(test_df['Fare'].dropna().median(), inplace=True)
test_df.head()

In [None]:
train_df['FareBand'] = pd.qcut(train_df['Fare'], 4)
train_df[['FareBand', 'Survived']].groupby(['FareBand'], 
as_index=False).mean().sort_values(by='FareBand', ascending=True)

In [None]:
for dataset in combine:
    dataset.loc[ dataset['Fare'] <= 7.91, 'Fare'] = 0
    dataset.loc[(dataset['Fare'] > 7.91) & (dataset['Fare'] <= 14.454), 'Fare'] = 1
    dataset.loc[(dataset['Fare'] > 14.454) & (dataset['Fare'] <= 31), 'Fare']   = 2
    dataset.loc[ dataset['Fare'] > 31, 'Fare'] = 3
    dataset['Fare'] = dataset['Fare'].astype(int)

train_df = train_df.drop(['FareBand'], axis=1)
combine = [train_df, test_df]
    
train_df.head(10)

In [None]:
sns.heatmap(train_df.corr(),annot=True,cmap='RdYlGn',linewidths=0.2,annot_kws={'size':20})
fig=plt.gcf()
fig.set_size_inches(18,15)
plt.xticks(fontsize=14)
plt.yticks(fontsize=14)
plt.show()

In [None]:
g = sns.FacetGrid(train_df,col='Survived')
g.map(plt.hist, 'Age', bins=20)

In [None]:
grid = sns.FacetGrid(train_df,col='Survived', row='Pclass', size=2.2, aspect=1.6)
grid.map(plt.hist,'Age', alpha =.5, bins=20)

In [None]:
grid = sns.FacetGrid(train_df,row='Embarked', size=2.2, aspect= 1.6)
grid.map(sns.pointplot, 'Pclass', 'Survived', 'Sex', palette='deep')
grid.add_legend()

In [None]:
grid = sns.FacetGrid(train_df, row='Embarked', col='Survived',  size=2.2, aspect=1.6)
grid.map(sns.barplot, 'Sex', 'Fare', alpha=.5, ci=None)
grid.add_legend()