In [1]:
import os
import sys
sys.path.insert(0, os.path.abspath('../'))

In [29]:
import math
import common 
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [3]:
train_df = pd.read_csv("./data/train.csv", index_col="PassengerId")
test_df = pd.read_csv("./data/test.csv", index_col="PassengerId")

In [4]:
train_size = train_df.shape[0]
test_size = test_df.shape[0]

In [5]:
combined_df = pd.concat([train_df, test_df])

# Generate Female + Children Related Features

In [6]:
def age_encoding(x):
    if x is np.nan or x is None or math.isnan(x):
        return 0
        
    elif x <= 15: 
        return 1
    elif x > 60:
        return 3
    else:
        return 2
    
combined_df['AgeEncoded'] = combined_df['Age'].map(age_encoding)

## generate family type

In [7]:
def extract_surename(x):
    return x.split(",")[0].strip().lower()
combined_df['Surename'] = combined_df['Name'].apply(lambda x: extract_surename(x))

In [8]:
def get_family_id(x):
    return "{}_{}_{}".format(x['Surename'], x['Pclass'], x['Embarked'])
combined_df['Family'] = combined_df.apply(lambda x: get_family_id(x), axis=1)

In [9]:
combined_df['Family'].value_counts()

andersson_3_S    11
sage_3_S         11
goodwin_3_S       8
asplund_3_S       8
panula_3_S        6
                 ..
maenpaa_3_S       1
hedman_3_S        1
hoyt_1_C          1
lindblom_3_S      1
heikkinen_3_S     1
Name: Family, Length: 915, dtype: int64

In [10]:
def generate_family_type(x):
    familay_type = ""
    
    if 1 in x['AgeEncoded'].values:
        familay_type += 'C'
    else:
        familay_type += 'X'
        
    if 'female' in x[x['AgeEncoded'] != 1]['Sex'].values:
        familay_type += 'F'
    else:
        familay_type += 'X'
        
    if 'male' in x[x['AgeEncoded'] != 1]['Sex'].values:
        familay_type += 'M'
    else:
        familay_type += 'X'
    return familay_type

In [11]:
family_type = combined_df.groupby('Family').apply(lambda x: generate_family_type(x)).to_frame()

In [12]:
family_type.reset_index(inplace=True)
family_type.columns = ['Family', 'Familay_Type']

In [13]:
combined_df.head()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,AgeEncoded,Surename,Family
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1,0.0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,2,braund,braund_3_S
2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,2,cumings,cumings_1_C
3,1.0,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,2,heikkinen,heikkinen_3_S
4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,2,futrelle,futrelle_1_S
5,0.0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,2,allen,allen_3_S


In [14]:
if 'Familay_Type' in combined_df.columns:
    combined_df.drop('Familay_Type', inplace=True, axis=1)
temp2 = combined_df.merge(family_type, left_on='Family', right_on='Family', how='left')

In [15]:
temp2.head()

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,AgeEncoded,Surename,Family,Familay_Type
0,0.0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,2,braund,braund_3_S,XXM
1,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,2,cumings,cumings_1_C,XFM
2,1.0,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,2,heikkinen,heikkinen_3_S,XFX
3,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,2,futrelle,futrelle_1_S,XFM
4,0.0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,2,allen,allen_3_S,XXM


In [16]:
combined_df.loc[:, 'Familay_Type'] = temp2['Familay_Type'].values

In [17]:
combined_df.groupby('Familay_Type')['Familay_Type'].count()

Familay_Type
CFM    149
CFX     58
CXM     25
CXX      9
XFM    274
XFX    203
XXM    591
Name: Familay_Type, dtype: int64

## generate family survive rate

In [18]:
def generate_family_survive_rate(x):
    x_filtered = x[(x['Sex'] == 'female') | (x['AgeEncoded'] == 1)]
    if x_filtered.shape[0] >= 1:
        return np.mean(x_filtered['Survived'])
    else:
        return -1

In [19]:
def cal_dataframe_survive_rate(df):
    family_survive_rate = df.groupby('Family').apply(lambda x: generate_family_survive_rate(x)).to_frame()
    family_survive_rate.reset_index(inplace=True)
    family_survive_rate.columns = ['Family', 'Familay_Survive_Rate']
    return family_survive_rate

In [20]:
family_survive_rate = cal_dataframe_survive_rate(combined_df)
temp3 = combined_df.merge(family_survive_rate, left_on='Family', right_on='Family', how='left')
combined_df['Familay_Survive_Rate'] = temp3['Familay_Survive_Rate'].values

In [21]:
combined_df[combined_df['Familay_Survive_Rate'] == 0].sort_values('Ticket')[:10]

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,AgeEncoded,Surename,Family,Familay_Type,Familay_Survive_Rate
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
118,0.0,2,"Turpin, Mr. William John Robert",male,29.0,1,0,11668,21.0,,S,2,turpin,turpin_2_S,XFM,0.0
42,0.0,2,"Turpin, Mrs. William John Robert (Dorothy Ann ...",female,27.0,1,0,11668,21.0,,S,2,turpin,turpin_2_S,XFM,0.0
358,0.0,2,"Funk, Miss. Annie Clemmer",female,38.0,0,0,237671,13.0,,S,2,funk,funk_2_S,XFX,0.0
855,0.0,2,"Carter, Mrs. Ernest Courtenay (Lilian Hughes)",female,44.0,1,0,244252,26.0,,S,2,carter,carter_2_S,XFM,0.0
250,0.0,2,"Carter, Rev. Ernest Courtenay",male,54.0,1,0,244252,26.0,,S,2,carter,carter_2_S,XFM,0.0
200,0.0,2,"Yrois, Miss. Henriette (""Mrs Harbeck"")",female,24.0,0,0,248747,13.0,,S,2,yrois,yrois_2_S,XFX,0.0
1041,,2,"Lahtinen, Rev. William",male,30.0,1,1,250651,26.0,,S,2,lahtinen,lahtinen_2_S,XFM,0.0
313,0.0,2,"Lahtinen, Mrs. William (Anna Sylfven)",female,26.0,1,1,250651,26.0,,S,2,lahtinen,lahtinen_2_S,XFM,0.0
115,0.0,3,"Attalah, Miss. Malake",female,17.0,0,0,2627,14.4583,,C,2,attalah,attalah_3_C,XFM,0.0
599,0.0,3,"Boulos, Mr. Hanna",male,,0,0,2664,7.225,,C,0,boulos,boulos_3_C,CFM,0.0


# decison rules:

1) male 
    
    child NO => Die
    
    child Yes => family rate? follow family rate
    
    => Die
    
2) female
    
    family rate? follow family rate
    
    Survive

In [30]:
def gender_family_model(x, family_survive_column="Familay_Survive_Rate"):
    if x['Sex'] == 'female':
        if x[family_survive_column] >= 0 and x[family_survive_column] <= 0.5:
                return 0
        return 1
    else:
        if x['AgeEncoded'] == 1:
            if x[family_survive_column] > 0.5:
                return 1
            else:
                return 0
        else:
            return 0

# Cross Validation

In [32]:
kf3 = KFold(n_splits=3, shuffle=False)

for tune_train_index, tune_test_index in kf3.split(combined_df):
    X_train = combined_df.iloc[tune_train_index].copy()
    X_test = combined_df.iloc[tune_test_index].copy()
    
    family_survive_rate = cal_dataframe_survive_rate(X_train)
    temp3 = X_test.merge(family_survive_rate, left_on='Family', right_on='Family', how='left')
    X_test['Familay_Survive_Rate'] = temp3['Familay_Survive_Rate'].values
    
    true_y = X_test['Survived'].values
    prediction_y = X_test.apply(lambda x: gender_family_model(x), axis=1)
    print(accuracy_score(true_y, prediction_y))

KeyError: 'Familay_Survive_Rate'

In [22]:
true_y = combined_df.head(train_size)['Survived'].values
prediction_y = combined_df.head(train_size).apply(lambda x: 1 if x['Sex'] == 'female' else 0, axis=1)
print(accuracy_score(true_y, prediction_y))

0.7867564534231201


# Simple Gender + Family Model (Training)

In [26]:
true_y = combined_df.head(train_size)['Survived'].values
prediction_y = combined_df.head(train_size).apply(lambda x: gender_family_model(x), axis=1)
print(accuracy_score(true_y, prediction_y))

0.8978675645342312


In [28]:
combined_df.groupby(['Sex', 'Survived']).agg({'Survived': ['mean', 'count']})

Unnamed: 0_level_0,Unnamed: 1_level_0,Survived,Survived
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,count
Sex,Survived,Unnamed: 2_level_2,Unnamed: 3_level_2
female,0.0,0.0,81
female,1.0,1.0,233
male,0.0,0.0,468
male,1.0,1.0,109


# Prediction

In [38]:
combined_df['Prediction'] = combined_df.apply(lambda x: gender_family_model(x), axis=1)

In [39]:
combined_df.tail()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,AgeEncoded,Surename,Family,Familay_Type,Familay_Survive_Rate,Prediction
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
1305,,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.05,,S,0,spector,spector_3_3,XXM,-1.0,0
1306,,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9,C105,C,2,oliva y ocana,oliva y ocana_1_1,XFX,,1
1307,,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.25,,S,2,saether,saether_3_3,XXM,-1.0,0
1308,,3,"Ware, Mr. Frederick",male,,0,0,359309,8.05,,S,0,ware,ware_3_3,XXM,-1.0,0
1309,,3,"Peter, Master. Michael J",male,,1,1,2668,22.3583,,C,0,peter,peter_3_3,XFM,1.0,0


In [47]:
true_y = combined_df.head(train_size)['Survived'].values
prediction_y = combined_df.head(train_size)['Prediction'].values
print(accuracy_score(true_y, prediction_y))

0.8978675645342312


In [40]:
combined_df.head(train_size).groupby(['Sex','Survived']).agg({'Survived': ['mean', 'count']})

Unnamed: 0_level_0,Unnamed: 1_level_0,Survived,Survived
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,count
Sex,Survived,Unnamed: 2_level_2,Unnamed: 3_level_2
female,0.0,0.0,81
female,1.0,1.0,233
male,0.0,0.0,468
male,1.0,1.0,109


In [41]:
combined_df.head(train_size).groupby(['Sex','Prediction']).agg({'Prediction': ['mean', 'count']})

Unnamed: 0_level_0,Unnamed: 1_level_0,Prediction,Prediction
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,count
Sex,Prediction,Unnamed: 2_level_2,Unnamed: 3_level_2
female,0,0,82
female,1,1,232
male,0,0,556
male,1,1,21


In [42]:
combined_df.head(train_size).shape

(891, 17)

In [43]:
combined_df.tail(test_size).groupby(['Sex','Prediction']).agg({'Prediction': ['mean', 'count']})

Unnamed: 0_level_0,Unnamed: 1_level_0,Prediction,Prediction
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,count
Sex,Prediction,Unnamed: 2_level_2,Unnamed: 3_level_2
female,0,0,10
female,1,1,142
male,0,0,257
male,1,1,9


In [44]:
combined_df[combined_df['Ticket'] == '3701']

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,AgeEncoded,Surename,Family,Familay_Type,Familay_Survive_Rate,Prediction
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
1044,,3,"Storey, Mr. Thomas",male,60.5,0,0,3701,,,S,3,storey,storey_3_3,XXM,-1.0,0


# submission

In [45]:
temp1 = combined_df.tail(test_size)
temp1['PassengerId'] = temp1.index
tremp2 = temp1[['PassengerId','Prediction']]
tremp2.columns = ['PassengerId','Survived']
tremp2[['PassengerId', 'Survived']].to_csv("random_forest_submission_gender_family_model_v7.csv", index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp1['PassengerId'] = temp1.index


In [46]:
tremp2

Unnamed: 0_level_0,PassengerId,Survived
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1
892,892,0
893,893,1
894,894,0
895,895,0
896,896,1
...,...,...
1305,1305,0
1306,1306,1
1307,1307,0
1308,1308,0
