In [2]:
import os
import sys
sys.path.insert(0, os.path.abspath('../'))

In [3]:
import math
import common 
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OneHotEncoder

In [4]:
train_df = pd.read_csv("./data/train.csv", index_col="PassengerId")
test_df = pd.read_csv("./data/test.csv", index_col="PassengerId")

In [5]:
train_size = train_df.shape[0]
test_size = test_df.shape[0]

In [6]:
combined_df = pd.concat([train_df, test_df])

# Generate Female + Children Related Features

In [7]:
def age_encoding(x):
    if x is np.nan or x is None or math.isnan(x):
        return 0
        
    elif x <= 15: 
        return 1
    elif x <= 60:
        return 2
    else:
        return 3

combined_df['AgeEncoded'] = combined_df['Age'].map(age_encoding)

In [8]:
def sex_encoding(x):
    if x == 'female':
        return 1
    elif x == 'male':
        return 2
    else:
        return 0
    
combined_df['SexEncoded'] = combined_df['Sex'].map(sex_encoding)

sexEncoder = OneHotEncoder(handle_unknown='ignore')
sexEncoder.fit(combined_df[['SexEncoded']])
sex_temp_df = pd.DataFrame(sexEncoder.transform(combined_df[['SexEncoded']]).toarray(), columns=sexEncoder.get_feature_names(['SexEncoded']), index=combined_df.index)

combined_df = pd.concat([combined_df, sex_temp_df], axis=1)

In [13]:
combined_df.head(train_size)[combined_df['Sex'] == 'male'][:5]

  combined_df.head(train_size)[combined_df['Sex'] == 'male'][:5]


Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,AgeEncoded,SexEncoded,SexEncoded_1,SexEncoded_2
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
1,0.0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,2,2,0.0,1.0
5,0.0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,2,2,0.0,1.0
6,0.0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q,0,2,0.0,1.0
7,0.0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S,2,2,0.0,1.0
8,0.0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S,1,2,0.0,1.0


## generate family ID

In [15]:
def extract_surename(x):
    return x.split(",")[0].strip().lower()
combined_df['Surename'] = combined_df['Name'].apply(lambda x: extract_surename(x))

In [16]:
def get_family_id(x):
    return "{}_{}_{}".format(x['Surename'], x['Pclass'], x['Embarked'])
combined_df['Family'] = combined_df.apply(lambda x: get_family_id(x), axis=1)

In [17]:
combined_df['Family'].value_counts()

sage_3_S           11
andersson_3_S      11
goodwin_3_S         8
asplund_3_S         8
panula_3_S          6
                   ..
charters_3_Q        1
morley_3_S          1
birnbaum_1_C        1
berglund_3_S        1
francatelli_1_C     1
Name: Family, Length: 915, dtype: int64

In [24]:
combined_df.head(train_size)[combined_df['Sex'] == 'male'].sort_values('Family')[:50]

  combined_df.head(train_size)[combined_df['Sex'] == 'male'].sort_values('Family')[:50]


Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,AgeEncoded,SexEncoded,SexEncoded_1,SexEncoded_2,Surename,Family
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
846,0.0,3,"Abbing, Mr. Anthony",male,42.0,0,0,C.A. 5547,7.55,M,S,2,2,0.0,1.0,abbing,abbing_3_S
747,0.0,3,"Abbott, Mr. Rossmore Edward",male,16.0,1,1,C.A. 2673,20.25,M,S,2,2,0.0,1.0,abbott,abbott_3_S
309,0.0,2,"Abelson, Mr. Samuel",male,30.0,1,0,P/PP 3381,24.0,M,C,2,2,0.0,1.0,abelson,abelson_2_C
366,0.0,3,"Adahl, Mr. Mauritz Nils Martin",male,30.0,0,0,C 7076,7.25,M,S,2,2,0.0,1.0,adahl,adahl_3_S
402,0.0,3,"Adams, Mr. John",male,26.0,0,0,341826,8.05,M,S,2,2,0.0,1.0,adams,adams_3_S
208,1.0,3,"Albimona, Mr. Nassef Cassem",male,26.0,0,0,2699,18.7875,M,C,2,2,0.0,1.0,albimona,albimona_3_C
811,0.0,3,"Alexander, Mr. William",male,26.0,0,0,3474,7.8875,M,S,2,2,0.0,1.0,alexander,alexander_3_S
841,0.0,3,"Alhomaki, Mr. Ilmari Rudolf",male,20.0,0,0,SOTON/O2 3101287,7.925,M,S,2,2,0.0,1.0,alhomaki,alhomaki_3_S
785,0.0,3,"Ali, Mr. William",male,25.0,0,0,SOTON/O.Q. 3101312,7.05,M,S,2,2,0.0,1.0,ali,ali_3_S
211,0.0,3,"Ali, Mr. Ahmed",male,24.0,0,0,SOTON/O.Q. 3101311,7.05,M,S,2,2,0.0,1.0,ali,ali_3_S


In [23]:
combined_df['Cabin'].fillna('M', inplace=True)
combined_df['Cabin'] = combined_df['Cabin'].astype(str).str[0]

## generate family survive rate

In [37]:
def generate_family_survive_rate(x):
    x_filtered = x[(x['Sex'] == 'female') | (x['AgeEncoded'] == 1)]
    if x_filtered.shape[0] >= 1:
        return np.mean(x_filtered['Survived'])
    else:
        return None

In [38]:
def cal_dataframe_survive_rate(df):
    family_survive_rate = df.groupby('Family').apply(lambda x: generate_family_survive_rate(x)).to_frame()
    family_survive_rate.reset_index(inplace=True)
    family_survive_rate.columns = ['Family', 'Familay_Survive_Rate']
    return family_survive_rate

In [39]:
family_survive_rate = cal_dataframe_survive_rate(combined_df)
temp3 = combined_df.merge(family_survive_rate, left_on='Family', right_on='Family', how='left')
combined_df['Familay_Survive_Rate'] = temp3['Familay_Survive_Rate'].values

KeyError: 'Familay_Survive_Rate'

In [14]:
combined_df[combined_df['Familay_Survive_Rate'] == 0].sort_values('Ticket')[:10]

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,AgeEncoded,SexEncoded,Surename,Family,Familay_Survive_Rate
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
118,0.0,2,"Turpin, Mr. William John Robert",male,29.0,1,0,11668,21.0,,S,2,2,turpin,turpin_2_S,0.0
42,0.0,2,"Turpin, Mrs. William John Robert (Dorothy Ann ...",female,27.0,1,0,11668,21.0,,S,2,1,turpin,turpin_2_S,0.0
358,0.0,2,"Funk, Miss. Annie Clemmer",female,38.0,0,0,237671,13.0,,S,2,1,funk,funk_2_S,0.0
855,0.0,2,"Carter, Mrs. Ernest Courtenay (Lilian Hughes)",female,44.0,1,0,244252,26.0,,S,2,1,carter,carter_2_S,0.0
250,0.0,2,"Carter, Rev. Ernest Courtenay",male,54.0,1,0,244252,26.0,,S,2,2,carter,carter_2_S,0.0
200,0.0,2,"Yrois, Miss. Henriette (""Mrs Harbeck"")",female,24.0,0,0,248747,13.0,,S,2,1,yrois,yrois_2_S,0.0
1041,,2,"Lahtinen, Rev. William",male,30.0,1,1,250651,26.0,,S,2,2,lahtinen,lahtinen_2_S,0.0
313,0.0,2,"Lahtinen, Mrs. William (Anna Sylfven)",female,26.0,1,1,250651,26.0,,S,2,1,lahtinen,lahtinen_2_S,0.0
115,0.0,3,"Attalah, Miss. Malake",female,17.0,0,0,2627,14.4583,,C,2,1,attalah,attalah_3_C,0.0
599,0.0,3,"Boulos, Mr. Hanna",male,,0,0,2664,7.225,,C,0,2,boulos,boulos_3_C,0.0


# decison rules:

1) male 
    
    child NO => Die
    
    child Yes => family rate? follow family rate
    
    => Die
    
2) female
    
    family rate? follow family rate
    
    Survive

In [15]:
def gender_family_model(x, family_survive_column="Familay_Survive_Rate"):
    if x['Sex'] == 'female':
        if x[family_survive_column] >= 0 and x[family_survive_column] <= 0.5:
                return 0
        return 1
    else:
        if x['AgeEncoded'] == 1:
            if x[family_survive_column] > 0.5:
                return 1
            else:
                return 0
        else:
            return 0

# Cross Validation

In [16]:
kf3 = KFold(n_splits=5, shuffle=True)
original_train_df = combined_df.head(train_size)
accuracies = []

for tune_train_index, tune_test_index in kf3.split(original_train_df):
    X_train = original_train_df.iloc[tune_train_index].copy()
    X_test = original_train_df.iloc[tune_test_index].copy()
    
    X_train.drop('Familay_Survive_Rate', inplace=True, axis=1)
    X_test.drop('Familay_Survive_Rate', inplace=True, axis=1)
    
    family_survive_rate = cal_dataframe_survive_rate(X_train)
    temp3 = X_test.merge(family_survive_rate, left_on='Family', right_on='Family', how='left')
    X_test['Familay_Survive_Rate'] = temp3['Familay_Survive_Rate'].values
    
    true_y = X_test['Survived'].values
    prediction_y = X_test.apply(lambda x: gender_family_model(x), axis=1).values
    accuracies.append(accuracy_score(true_y, prediction_y))
    print({"accuracy": accuracy_score(true_y, prediction_y)})

print("average accuracy:",  np.mean(accuracies))

{'accuracy': 0.8547486033519553}
{'accuracy': 0.7865168539325843}
{'accuracy': 0.8089887640449438}
{'accuracy': 0.8595505617977528}
{'accuracy': 0.8707865168539326}
average accuracy: 0.8361182599962339


In [33]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.model_selection import ParameterGrid
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import export_graphviz
from subprocess import call
from random import randrange


# Create the parameter grid based on the results of random search 
param_grid = {
    'max_depth' : [4],
    'n_estimators': [10, 20, 50, 100]
}

kf3 = KFold(n_splits=5, shuffle=True)
original_train_df = combined_df.head(train_size)
accuracies = []

for params in ParameterGrid(param_grid):
    for tune_train_index, tune_test_index in kf3.split(original_train_df):
        X_train = original_train_df.iloc[tune_train_index].copy()
        X_test = original_train_df.iloc[tune_test_index].copy()

        X_train.drop('Familay_Survive_Rate', inplace=True, axis=1)
        X_test.drop('Familay_Survive_Rate', inplace=True, axis=1)

        family_survive_rate = cal_dataframe_survive_rate(X_train)
        temp2 = X_train.merge(family_survive_rate, left_on='Family', right_on='Family', how='left')
        X_train['Familay_Survive_Rate'] = temp2['Familay_Survive_Rate'].values
        X_train['Familay_Survive_Rate'].fillna(np.mean(X_train['Familay_Survive_Rate']), inplace=True)

        print(X_train['Familay_Survive_Rate'].describe())
        
        temp3 = X_test.merge(family_survive_rate, left_on='Family', right_on='Family', how='left')
        X_test['Familay_Survive_Rate'] = temp3['Familay_Survive_Rate'].values
        X_test['Familay_Survive_Rate'].fillna(np.mean(X_train['Familay_Survive_Rate']), inplace=True)

        rf_model = RandomForestClassifier(random_state=31, max_depth=params['max_depth'], n_estimators=params['n_estimators'])
        rf_model.fit(X_train[['SexEncoded_1','SexEncoded_2','AgeEncoded','Familay_Survive_Rate']], X_train['Survived'])

        print(X_test[['SexEncoded','AgeEncoded','Familay_Survive_Rate']].apply(lambda x: x.isna().sum()))
        
        true_y = X_test['Survived'].values
        prediction_y = rf_model.predict(X_test[['SexEncoded_1','SexEncoded_2','AgeEncoded','Familay_Survive_Rate']])
        accuracies.append(accuracy_score(true_y, prediction_y))
#         print({"accuracy": accuracy_score(true_y, prediction_y)})

        export_graphviz(rf_model.estimators_[0],
                        out_file='tree.dot',
                        feature_names=['SexEncoded_1','SexEncoded_2','AgeEncoded','Familay_Survive_Rate'],
                        precision = 2,
                        filled=True,
                        rounded=True)
    
        call(['/usr/local/bin/dot', '-Tpng', 'tree.dot', '-o', 'tree_{}_{}_{}.png'.format(params['max_depth'], params['n_estimators'], randrange(100)), '-Gdpi=600'])
        
    print({"average accuracy":  np.mean(accuracies),
          "params": params})

count    712.000000
mean      -0.163858
std        0.908190
min       -1.000000
25%       -1.000000
50%       -1.000000
75%        1.000000
max        1.000000
Name: Familay_Survive_Rate, dtype: float64
SexEncoded              0
AgeEncoded              0
Familay_Survive_Rate    0
dtype: int64
count    713.000000
mean      -0.165498
std        0.908543
min       -1.000000
25%       -1.000000
50%       -1.000000
75%        1.000000
max        1.000000
Name: Familay_Survive_Rate, dtype: float64
SexEncoded              0
AgeEncoded              0
Familay_Survive_Rate    0
dtype: int64
count    713.000000
mean      -0.156522
std        0.912213
min       -1.000000
25%       -1.000000
50%       -1.000000
75%        1.000000
max        1.000000
Name: Familay_Survive_Rate, dtype: float64
SexEncoded              0
AgeEncoded              0
Familay_Survive_Rate    0
dtype: int64
count    713.000000
mean      -0.208575
std        0.902490
min       -1.000000
25%       -1.000000
50%       -1.00000

KeyboardInterrupt: 

In [20]:
import sys
print(sys.path)
sys.path.append('/usr/local/bin/')

['/Users/yanxu/Documents/kaggles', '/Users/yanxu/Documents/kaggles/Titanic', '/Users/yanxu/opt/anaconda3/lib/python38.zip', '/Users/yanxu/opt/anaconda3/lib/python3.8', '/Users/yanxu/opt/anaconda3/lib/python3.8/lib-dynload', '', '/Users/yanxu/opt/anaconda3/lib/python3.8/site-packages', '/Users/yanxu/opt/anaconda3/lib/python3.8/site-packages/aeosa', '/Users/yanxu/opt/anaconda3/lib/python3.8/site-packages/IPython/extensions', '/Users/yanxu/.ipython', '/usr/local/bin/']


# Simple Gender + Family Model (Training)

In [None]:
true_y = combined_df.head(train_size)['Survived'].values
prediction_y = combined_df.head(train_size).apply(lambda x: gender_family_model(x), axis=1)
print(accuracy_score(true_y, prediction_y))

In [None]:
combined_df.groupby(['Sex', 'Survived']).agg({'Survived': ['mean', 'count']})

# Prediction

In [38]:
combined_df['Prediction'] = combined_df.apply(lambda x: gender_family_model(x), axis=1)

In [39]:
combined_df.tail()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,AgeEncoded,Surename,Family,Familay_Type,Familay_Survive_Rate,Prediction
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
1305,,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.05,,S,0,spector,spector_3_3,XXM,-1.0,0
1306,,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9,C105,C,2,oliva y ocana,oliva y ocana_1_1,XFX,,1
1307,,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.25,,S,2,saether,saether_3_3,XXM,-1.0,0
1308,,3,"Ware, Mr. Frederick",male,,0,0,359309,8.05,,S,0,ware,ware_3_3,XXM,-1.0,0
1309,,3,"Peter, Master. Michael J",male,,1,1,2668,22.3583,,C,0,peter,peter_3_3,XFM,1.0,0


In [47]:
true_y = combined_df.head(train_size)['Survived'].values
prediction_y = combined_df.head(train_size)['Prediction'].values
print(accuracy_score(true_y, prediction_y))

0.8978675645342312


In [40]:
combined_df.head(train_size).groupby(['Sex','Survived']).agg({'Survived': ['mean', 'count']})

Unnamed: 0_level_0,Unnamed: 1_level_0,Survived,Survived
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,count
Sex,Survived,Unnamed: 2_level_2,Unnamed: 3_level_2
female,0.0,0.0,81
female,1.0,1.0,233
male,0.0,0.0,468
male,1.0,1.0,109


In [41]:
combined_df.head(train_size).groupby(['Sex','Prediction']).agg({'Prediction': ['mean', 'count']})

Unnamed: 0_level_0,Unnamed: 1_level_0,Prediction,Prediction
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,count
Sex,Prediction,Unnamed: 2_level_2,Unnamed: 3_level_2
female,0,0,82
female,1,1,232
male,0,0,556
male,1,1,21


In [42]:
combined_df.head(train_size).shape

(891, 17)

In [43]:
combined_df.tail(test_size).groupby(['Sex','Prediction']).agg({'Prediction': ['mean', 'count']})

Unnamed: 0_level_0,Unnamed: 1_level_0,Prediction,Prediction
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,count
Sex,Prediction,Unnamed: 2_level_2,Unnamed: 3_level_2
female,0,0,10
female,1,1,142
male,0,0,257
male,1,1,9


In [44]:
combined_df[combined_df['Ticket'] == '3701']

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,AgeEncoded,Surename,Family,Familay_Type,Familay_Survive_Rate,Prediction
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
1044,,3,"Storey, Mr. Thomas",male,60.5,0,0,3701,,,S,3,storey,storey_3_3,XXM,-1.0,0


# submission

In [45]:
temp1 = combined_df.tail(test_size)
temp1['PassengerId'] = temp1.index
tremp2 = temp1[['PassengerId','Prediction']]
tremp2.columns = ['PassengerId','Survived']
tremp2[['PassengerId', 'Survived']].to_csv("random_forest_submission_gender_family_model_v7.csv", index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp1['PassengerId'] = temp1.index


In [46]:
tremp2

Unnamed: 0_level_0,PassengerId,Survived
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1
892,892,0
893,893,1
894,894,0
895,895,0
896,896,1
...,...,...
1305,1305,0
1306,1306,1
1307,1307,0
1308,1308,0
