In [1]:
import os
import sys
sys.path.insert(0, os.path.abspath('../'))

In [2]:
import math
import common 
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [3]:
# Read the train and test CSVs, using the PassengerId column as the index of each frame.
csv_options = {"index_col": "PassengerId"}
train_df = pd.read_csv("./data/train.csv", **csv_options)
test_df = pd.read_csv("./data/test.csv", **csv_options)

# Generate Family Related Features

In [4]:
# Row counts of each split; used later to cut the combined frame back into train/test.
train_size, test_size = len(train_df), len(test_df)

In [5]:
# Stack the training rows on top of the test rows so features are engineered on both at once.
combined_df = pd.concat((train_df, test_df), axis=0)

In [6]:
# Number of passengers travelling on each ticket (train + test), as a two-column
# frame: Ticket, Family_size.
family_df = (
    combined_df.groupby('Ticket')['Pclass']
    .count()
    .rename('Family_size')
    .reset_index()
)

In [7]:
# Sanity check: the per-ticket counts should add up to the number of rows in combined_df.
total_ticket_members = family_df['Family_size'].sum()
print(total_ticket_members, combined_df.shape)

1309 (1309, 11)


In [8]:
# Look up each passenger's ticket group size. The merge result does not keep the
# PassengerId index, so the column is copied back positionally as a plain array.
temp = pd.merge(combined_df, family_df, on='Ticket', how='left')
combined_df['Family_size'] = temp['Family_size'].to_numpy()

In [9]:
# Mean of 'Survived' per ticket. Tickets whose members all have a missing
# 'Survived' value get the mean of the per-ticket rates instead.
# NOTE(review): this feature is derived from 'Survived', which is also the
# training target, so a training row's own label contributes to its feature.
ticket_survival = combined_df.groupby('Ticket')['Survived'].mean()
family_avg_survive_rate = (
    ticket_survival
    .fillna(ticket_survival.mean())
    .rename('Avg_Family_Survived_Rate')
    .reset_index()
)

In [10]:
# Attach the per-ticket survival rate to every passenger. As with Family_size,
# the merged frame has a fresh index, so the values are assigned by position.
temp2 = pd.merge(combined_df, family_avg_survive_rate, on='Ticket', how='left')
combined_df['Avg_Family_Survived_Rate'] = temp2['Avg_Family_Survived_Rate'].to_numpy()

In [11]:
# Show the combined frame, now including Family_size and Avg_Family_Survived_Rate.
combined_df

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Family_size,Avg_Family_Survived_Rate
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1,0.0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S,1,0.000000
2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,2,1.000000
3,1.0,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S,1,1.000000
4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S,2,0.500000
5,0.0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S,1,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1305,,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S,1,0.349311
1306,,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C,3,0.500000
1307,,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S,1,0.349311
1308,,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S,1,0.349311


# Update Fare to Individual Passenger's Fare, instead of Group Fare

In [12]:
# Inspect one large ticket group (ticket 347077): every member row carries the same Fare value.
is_large_group = combined_df['Family_size'] > 4
is_ticket_347077 = combined_df['Ticket'] == '347077'
combined_df[is_large_group & is_ticket_347077]

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Family_size,Avg_Family_Survived_Rate
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
26,1.0,3,"Asplund, Mrs. Carl Oscar (Selma Augusta Emilia...",female,38.0,1,5,347077,31.3875,,S,7,0.75
183,0.0,3,"Asplund, Master. Clarence Gustaf Hugo",male,9.0,4,2,347077,31.3875,,S,7,0.75
234,1.0,3,"Asplund, Miss. Lillian Gertrud",female,5.0,4,2,347077,31.3875,,S,7,0.75
262,1.0,3,"Asplund, Master. Edvin Rojj Felix",male,3.0,4,2,347077,31.3875,,S,7,0.75
1046,,3,"Asplund, Master. Filip Oscar",male,13.0,4,2,347077,31.3875,,S,7,0.75
1066,,3,"Asplund, Mr. Carl Oscar Vilhelm Gustafsson",male,40.0,1,5,347077,31.3875,,S,7,0.75
1271,,3,"Asplund, Master. Carl Edgar",male,5.0,4,2,347077,31.3875,,S,7,0.75


In [13]:
# All passengers whose ticket is shared by more than four people.
combined_df.query('Family_size > 4')

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Family_size,Avg_Family_Survived_Rate
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
8,0.0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.0750,,S,5,0.00
14,0.0,3,"Andersson, Mr. Anders Johan",male,39.0,1,5,347082,31.2750,,S,7,0.00
17,0.0,3,"Rice, Master. Eugene",male,2.0,4,1,382652,29.1250,,Q,6,0.00
25,0.0,3,"Palsson, Miss. Torborg Danira",female,8.0,3,1,349909,21.0750,,S,5,0.00
26,1.0,3,"Asplund, Mrs. Carl Oscar (Selma Augusta Emilia...",female,38.0,1,5,347077,31.3875,,S,7,0.75
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1267,,1,"Bowen, Miss. Grace Scott",female,45.0,0,0,PC 17608,262.3750,,C,7,1.00
1271,,3,"Asplund, Master. Carl Edgar",male,5.0,4,2,347077,31.3875,,S,7,0.75
1277,,2,"Herman, Miss. Kate",female,24.0,1,2,220845,65.0000,,S,5,1.00
1281,,3,"Palsson, Master. Paul Folke",male,6.0,3,1,349909,21.0750,,S,5,0.00


In [14]:
# Mean recorded Fare for each (Pclass, Family_size) combination.
combined_df.groupby(['Pclass','Family_size'])['Fare'].mean()

Pclass  Family_size
1       1               29.113715
        2               64.760228
        3              107.597767
        4              196.291150
        5              191.175000
        6              207.275000
        7              262.375000
2       1               12.176285
        2               23.595719
        3               26.788785
        4               36.269800
        5               65.000000
        7               73.500000
3       1                7.889955
        2               14.549358
        3               18.722019
        4               15.820825
        5               26.972233
        6               28.512500
        7               34.116667
        8               51.697900
        11              69.550000
Name: Fare, dtype: float64

In [15]:
def cal_passenger_fare(x):
    """Split a ticket's fare evenly across the people sharing the ticket.

    Args:
        x: Mapping-like row providing 'Fare' and 'Family_size'.

    Returns:
        Fare divided by Family_size when Fare is >= 0; otherwise the Fare
        value unchanged (this covers NaN, which fails the comparison, and
        negative values).
    """
    fare = x['Fare']
    # Guard clause: anything that is not a non-negative number is passed through.
    if not fare >= 0:
        return fare
    return fare / x['Family_size']
    
# Row-wise per-passenger fare; the function is passed directly, no lambda wrapper needed.
combined_df['PassengerFare'] = combined_df.apply(cal_passenger_fare, axis=1)

In [16]:
# Preview the first rows to check the new PassengerFare column against Fare and Family_size.
combined_df.head()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Family_size,Avg_Family_Survived_Rate,PassengerFare
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1,0.0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,1,0.0,7.25
2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,2,1.0,35.64165
3,1.0,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,1,1.0,7.925
4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,2,0.5,26.55
5,0.0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,1,0.0,8.05


# Handle Missing Values

In [17]:
# Number of missing values in each column.
combined_df.isna().sum()

Survived                     418
Pclass                         0
Name                           0
Sex                            0
Age                          263
SibSp                          0
Parch                          0
Ticket                         0
Fare                           1
Cabin                       1014
Embarked                       2
Family_size                    0
Avg_Family_Survived_Rate       0
PassengerFare                  1
dtype: int64

In [18]:
# Impute missing fares with the mean fare of the passenger's own Pclass.
# This replaces a hard-coded PassengerId (1044) and class (3), so it stays
# correct for any row that happens to be missing a fare.
missing_fare_mask = combined_df['PassengerFare'].isna()
if missing_fare_mask.any():
    for fare_col in ('PassengerFare', 'Fare'):   # 'Fare' is not used in this notebook
        # transform('mean') ignores NaN and broadcasts each class mean back to its rows.
        class_mean_fare = combined_df.groupby('Pclass')[fare_col].transform('mean')
        combined_df[fare_col] = combined_df[fare_col].fillna(class_mean_fare)
    # Report the value(s) that were filled in.
    for imputed_fare in combined_df.loc[missing_fare_mask, 'PassengerFare']:
        print(imputed_fare)

7.32914555084747


In [19]:
# Re-count missing values per column after the fare imputation.
combined_df.isna().sum()

Survived                     418
Pclass                         0
Name                           0
Sex                            0
Age                          263
SibSp                          0
Parch                          0
Ticket                         0
Fare                           0
Cabin                       1014
Embarked                       2
Family_size                    0
Avg_Family_Survived_Rate       0
PassengerFare                  0
dtype: int64

In [20]:
# Treat a missing Embarked value as its own separate category, 'U'.
# The result is assigned back to the column: calling fillna(inplace=True) on a
# column selection works on an intermediate object and, under pandas
# Copy-on-Write, leaves combined_df unchanged.
combined_df['Embarked'] = combined_df['Embarked'].fillna('U')

In [21]:
# Re-count missing values per column after filling Embarked.
combined_df.isna().sum()

Survived                     418
Pclass                         0
Name                           0
Sex                            0
Age                          263
SibSp                          0
Parch                          0
Ticket                         0
Fare                           0
Cabin                       1014
Embarked                       0
Family_size                    0
Avg_Family_Survived_Rate       0
PassengerFare                  0
dtype: int64

In [22]:
# Cabin is missing for most rows (see the counts above), so the column is dropped entirely.
combined_df = combined_df.drop(columns='Cabin')

In [23]:
# Re-count missing values per column after dropping Cabin.
combined_df.isna().sum()

Survived                    418
Pclass                        0
Name                          0
Sex                           0
Age                         263
SibSp                         0
Parch                         0
Ticket                        0
Fare                          0
Embarked                      0
Family_size                   0
Avg_Family_Survived_Rate      0
PassengerFare                 0
dtype: int64

In [24]:
def age_encoding(x):
    """Map an age to an integer bucket.

    Args:
        x: Age as a number, or None / NaN when unknown.

    Returns:
        0 for a missing age, 1 for ages below 18, 3 for ages above 60,
        and 2 for everything in between (18 through 60 inclusive).
    """
    # Missing values come first so the numeric comparisons never see None/NaN.
    if x is None or x is np.nan or math.isnan(x):
        return 0
    if x < 18:
        return 1
    return 3 if x > 60 else 2
# Replace the raw Age column with its bucketed form (new column appended, Age removed).
combined_df = (
    combined_df
    .assign(AgeEncoded=combined_df['Age'].map(age_encoding))
    .drop(columns='Age')
)

In [25]:
# Re-count missing values per column; only 'Survived' (absent for test rows) should remain.
combined_df.isna().sum()

Survived                    418
Pclass                        0
Name                          0
Sex                           0
SibSp                         0
Parch                         0
Ticket                        0
Fare                          0
Embarked                      0
Family_size                   0
Avg_Family_Survived_Rate      0
PassengerFare                 0
AgeEncoded                    0
dtype: int64

# Feature Encoding

In [26]:
# Integer code per Sex value; any value missing from the mapping becomes 0.
gender_encoding = dict(male=1, female=2)
sex_codes = combined_df['Sex'].map(gender_encoding)
combined_df['SexEncoded'] = sex_codes.fillna(0)

In [27]:
# in practice, we will never know the future distribution, so we should not be able to one-hot encode based on future distribution, but for simplicity...
from sklearn.preprocessing import OneHotEncoder


def _one_hot_encode_column(frame, column):
    """One-hot encode a single column of ``frame``.

    Args:
        frame: DataFrame containing ``column``.
        column: Name of the column to encode.

    Returns:
        Tuple ``(encoder, encoded_frame)``: the fitted OneHotEncoder and a
        DataFrame of indicator columns named ``<column>_<category>`` that
        shares ``frame``'s index.
    """
    encoder = OneHotEncoder(handle_unknown='ignore')
    indicator_values = encoder.fit_transform(frame[[column]]).toarray()
    # get_feature_names was removed in scikit-learn 1.2; use its replacement and
    # fall back only on releases that predate get_feature_names_out.
    feature_names_fn = getattr(encoder, 'get_feature_names_out', None)
    if feature_names_fn is None:
        feature_names_fn = encoder.get_feature_names
    encoded_frame = pd.DataFrame(
        indicator_values,
        columns=feature_names_fn([column]),
        index=frame.index,
    )
    return encoder, encoded_frame


# One encoder / indicator frame per categorical feature (fitted on train + test together).
pclassEncoder, pclass_temp_df = _one_hot_encode_column(combined_df, 'Pclass')
embarkedEncoder, embarked_temp_df = _one_hot_encode_column(combined_df, 'Embarked')
sexEncoder, sex_temp_df = _one_hot_encode_column(combined_df, 'SexEncoded')
ageEncoder, age_temp_df = _one_hot_encode_column(combined_df, 'AgeEncoded')

In [28]:
# Append all one-hot indicator frames to the engineered columns (column-wise concat).
indicator_frames = [pclass_temp_df, embarked_temp_df, sex_temp_df, age_temp_df]
combined_df_encoded = pd.concat([combined_df, *indicator_frames], axis=1)

# Alternative, wider feature set (disabled):
# selected_features = ['SibSp', 'Parch',
#        'Family_size', 'PassengerFare',
#        'Pclass_1', 'Pclass_2', 'Pclass_3', 'Embarked_C', 'Embarked_Q',
#        'Embarked_S', 'Embarked_U', 'SexEncoded_1', 'SexEncoded_2',
#        'AgeEncoded_0', 'AgeEncoded_1', 'AgeEncoded_2', 'AgeEncoded_3',
#        'AgeEncoded_4', 'AgeEncoded_5', 'AgeEncoded_6', 'AgeEncoded_7', 'Avg_Family_Survived_Rate']

# Features actually fed to the model.
selected_features = ['SexEncoded_1', 'SexEncoded_2', 'Avg_Family_Survived_Rate']

In [29]:
# List every column available after encoding.
combined_df_encoded.columns

Index(['Survived', 'Pclass', 'Name', 'Sex', 'SibSp', 'Parch', 'Ticket', 'Fare',
       'Embarked', 'Family_size', 'Avg_Family_Survived_Rate', 'PassengerFare',
       'AgeEncoded', 'SexEncoded', 'Pclass_1', 'Pclass_2', 'Pclass_3',
       'Embarked_C', 'Embarked_Q', 'Embarked_S', 'Embarked_U', 'SexEncoded_1',
       'SexEncoded_2', 'AgeEncoded_0', 'AgeEncoded_1', 'AgeEncoded_2',
       'AgeEncoded_3', 'AgeEncoded_4', 'AgeEncoded_5', 'AgeEncoded_6',
       'AgeEncoded_7'],
      dtype='object')

In [30]:
# Preview the first rows of the encoded frame.
combined_df_encoded.head()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,SibSp,Parch,Ticket,Fare,Embarked,Family_size,...,SexEncoded_1,SexEncoded_2,AgeEncoded_0,AgeEncoded_1,AgeEncoded_2,AgeEncoded_3,AgeEncoded_4,AgeEncoded_5,AgeEncoded_6,AgeEncoded_7
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,3,"Braund, Mr. Owen Harris",male,1,0,A/5 21171,7.25,S,1,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,1,0,PC 17599,71.2833,C,2,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,1.0,3,"Heikkinen, Miss. Laina",female,0,0,STON/O2. 3101282,7.925,S,1,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,1,0,113803,53.1,S,2,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
5,0.0,3,"Allen, Mr. William Henry",male,0,0,373450,8.05,S,1,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


# Split into Training and Test Sets

In [31]:
# First rows of the selected model features (these come from the training split).
combined_df_encoded[selected_features].head()

Unnamed: 0_level_0,SexEncoded_1,SexEncoded_2,Avg_Family_Survived_Rate
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,1.0,0.0,0.0
2,0.0,1.0,1.0
3,0.0,1.0,1.0
4,0.0,1.0,0.5
5,1.0,0.0,0.0


In [32]:
# Last rows of the selected model features (these come from the test split).
combined_df_encoded[selected_features].tail()

Unnamed: 0_level_0,SexEncoded_1,SexEncoded_2,Avg_Family_Survived_Rate
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1305,1.0,0.0,0.349311
1306,0.0,1.0,0.5
1307,1.0,0.0,0.349311
1308,1.0,0.0,0.349311
1309,1.0,0.0,1.0


In [33]:
# The combined frame was built as [train rows, test rows], so head/tail with the
# recorded sizes recover the two splits.
X_train = combined_df_encoded[selected_features].head(train_size)
y = combined_df_encoded.head(train_size)['Survived']
# Explicit copy: the prediction cells add columns to X_test, and writing into a
# slice of combined_df_encoded would trigger SettingWithCopyWarning.
X_test = combined_df_encoded[selected_features].tail(test_size).copy()

In [34]:
# Preview the training feature matrix.
X_train.head()

Unnamed: 0_level_0,SexEncoded_1,SexEncoded_2,Avg_Family_Survived_Rate
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,1.0,0.0,0.0
2,0.0,1.0,1.0
3,0.0,1.0,1.0
4,0.0,1.0,0.5
5,1.0,0.0,0.0


In [35]:
# Preview the end of the test feature matrix.
X_test.tail()

Unnamed: 0_level_0,SexEncoded_1,SexEncoded_2,Avg_Family_Survived_Rate
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1305,1.0,0.0,0.349311
1306,0.0,1.0,0.5
1307,1.0,0.0,0.349311
1308,1.0,0.0,0.349311
1309,1.0,0.0,1.0


In [36]:
# Training target: the 'Survived' column of the training rows.
y

PassengerId
1      0.0
2      1.0
3      1.0
4      1.0
5      0.0
      ... 
887    0.0
888    1.0
889    0.0
890    1.0
891    0.0
Name: Survived, Length: 891, dtype: float64

# Tuning Random Forest

In [37]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.model_selection import ParameterGrid


def _fold_family_survival_rate(frame, fit_index):
    """Per-row ticket survival rate estimated only from the rows at ``fit_index``.

    Args:
        frame: DataFrame with 'Ticket' and 'Survived' columns.
        fit_index: Positional indices of the rows whose labels may be used.

    Returns:
        numpy array aligned with ``frame``'s rows. Tickets that do not occur
        in the fitting rows receive the mean of the per-ticket rates.
    """
    ticket_rate = frame.iloc[fit_index].groupby('Ticket')['Survived'].mean()
    # Take the mean of the rate Series itself: averaging a frame that still
    # holds the string 'Ticket' column raises on pandas >= 2.0.
    fallback_rate = ticket_rate.mean()
    # map + fillna returns a new Series, avoiding fillna(inplace=True) on a
    # column selection (which does not update the frame under Copy-on-Write).
    return frame['Ticket'].map(ticket_rate).fillna(fallback_rate).to_numpy()


# Create the parameter grid based on the results of random search
param_grid = {
    'max_depth' : [2, 3, 4, 5, 6, 7, 8],
    'n_estimators': [2, 10, 20, 50, 100, 200, 500]
}

X_train_temp = combined_df_encoded.head(train_size)
X_train_cp = X_train_temp.copy()
K = 3

kf3 = KFold(n_splits=K, shuffle=False)

# The fold-specific feature depends only on the split, not on the
# hyper-parameters, so it is computed once per fold instead of once per
# (parameter combination, fold) pair.
# NOTE(review): within the fitting rows, a passenger's own label still
# contributes to its ticket rate.
fold_plan = [
    (tune_train_index, tune_test_index, _fold_family_survival_rate(X_train_cp, tune_train_index))
    for tune_train_index, tune_test_index in kf3.split(X_train_cp)
]

for params in ParameterGrid(param_grid):
    scores = []
    for tune_train_index, tune_test_index, fold_rate in fold_plan:
        # Overwrite the feature with the version that only saw this fold's training labels.
        X_train_cp['Avg_Family_Survived_Rate'] = fold_rate
        tune_train = X_train_cp.iloc[tune_train_index]
        tune_test = X_train_cp.iloc[tune_test_index]

        rf_model = RandomForestClassifier(random_state=31, max_depth=params['max_depth'], n_estimators=params['n_estimators'])
        rf_model.fit(tune_train[selected_features], tune_train['Survived'])
        score = accuracy_score(tune_test['Survived'], rf_model.predict(tune_test[selected_features]))
        scores.append(score)

    print({
        "params": params,
        "score": np.mean(scores),
        "scores": scores
    })

# Built-in alternative (disabled): it would not recompute the ticket survival rate per fold.
# rf_model = RandomForestClassifier(random_state=31)
# grid_search = GridSearchCV(estimator = rf_model, param_grid = param_grid, cv = 3, n_jobs = -1, verbose=200)
# grid_search.fit(X_train, y)
# print(grid_search.best_params_)

{'params': {'max_depth': 2, 'n_estimators': 2}, 'score': 0.7194163860830528, 'scores': [0.797979797979798, 0.6296296296296297, 0.7306397306397306]}
{'params': {'max_depth': 2, 'n_estimators': 10}, 'score': 0.7620650953984288, 'scores': [0.797979797979798, 0.7643097643097643, 0.7239057239057239]}
{'params': {'max_depth': 2, 'n_estimators': 20}, 'score': 0.7620650953984288, 'scores': [0.797979797979798, 0.7643097643097643, 0.7239057239057239]}
{'params': {'max_depth': 2, 'n_estimators': 50}, 'score': 0.7631874298540966, 'scores': [0.797979797979798, 0.7676767676767676, 0.7239057239057239]}
{'params': {'max_depth': 2, 'n_estimators': 100}, 'score': 0.7631874298540966, 'scores': [0.797979797979798, 0.7676767676767676, 0.7239057239057239]}
{'params': {'max_depth': 2, 'n_estimators': 200}, 'score': 0.7620650953984288, 'scores': [0.797979797979798, 0.7643097643097643, 0.7239057239057239]}
{'params': {'max_depth': 2, 'n_estimators': 500}, 'score': 0.7620650953984288, 'scores': [0.7979797979797

In [38]:
# X_train_cp

In [50]:
# Fit the final forest on the full training split, then predict on that same split.
# NOTE(review): y_hat is an in-sample prediction, so any score computed from it
# measures fit to the training data, not generalisation.
the_rf_model = RandomForestClassifier(
    max_depth=2,
    n_estimators=50,
    random_state=31,
).fit(X_train, y)
y_hat = the_rf_model.predict(X_train)

In [51]:
from sklearn.metrics import accuracy_score

# Accuracy of the final model on the data it was trained on.
train_accuracy = accuracy_score(y, y_hat)
print(train_accuracy)

0.9820426487093153


# Prediction

In [43]:
# Predict the test split and store the labels as integers alongside the features.
test_predictions = the_rf_model.predict(X_test[selected_features])
X_test['Survived'] = test_predictions.astype(int)

In [44]:
# Expose the PassengerId index as a regular column so it can be written to the CSV.
X_test = X_test.assign(PassengerId=X_test.index)

In [45]:
# Write the two-column submission file (no index column).
submission = X_test[['PassengerId', 'Survived']]
submission.to_csv("random_forest_submission_with_sex_and_family_survive_rate.csv", index=False)

In [46]:
# Pair each selected feature with the importance reported by the fitted forest.
for feature_name, importance in zip(selected_features, the_rf_model.feature_importances_):
    print({"feature": feature_name, "weight": importance})

{'feature': 'SexEncoded_1', 'weight': 0.109830844003989}
{'feature': 'SexEncoded_2', 'weight': 0.10732431299836533}
{'feature': 'Avg_Family_Survived_Rate', 'weight': 0.7828448429976457}


In [47]:
# Count and mean of the predicted 'Survived' label for each value of the SexEncoded_1 indicator.
X_test.groupby('SexEncoded_1').agg({'Survived': ['count', 'mean']})

Unnamed: 0_level_0,Survived,Survived
Unnamed: 0_level_1,count,mean
SexEncoded_1,Unnamed: 1_level_2,Unnamed: 2_level_2
0.0,152,0.828947
1.0,266,0.142857


In [48]:
# 'SibSp', 'Parch', 'Fare', 'Pclass_1', 'Pclass_2', 'Pclass_3', 'Embarked_C', 'Embarked_Q', 'Embarked_S', 'Embarked_U', 'SexEncoded_1', 'SexEncoded_2', 
#                      'AgeEncoded_0', 'AgeEncoded_1', 'AgeEncoded_2', 'AgeEncoded_3', 'AgeEncoded_4', 'AgeEncoded_5', 'AgeEncoded_6', 'AgeEncoded_7'