In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.ensemble import GradientBoostingClassifier

In [2]:
train_file = "train.csv"
test_file = "test.csv"

train_df = pd.read_csv(train_file, index_col = 0)
test_df = pd.read_csv(test_file, index_col = 0)

train_df['train'] = True
test_df['train'] = False
test_df['Survived'] = None
df = pd.concat([train_df, test_df], sort=False)
del train_df
del test_df

In [3]:
print(df.tail(5))

            Survived  Pclass                          Name     Sex   Age  \
PassengerId                                                                
1305            None       3            Spector, Mr. Woolf    male   NaN   
1306            None       1  Oliva y Ocana, Dona. Fermina  female  39.0   
1307            None       3  Saether, Mr. Simon Sivertsen    male  38.5   
1308            None       3           Ware, Mr. Frederick    male   NaN   
1309            None       3      Peter, Master. Michael J    male   NaN   

             SibSp  Parch              Ticket      Fare Cabin Embarked  train  
PassengerId                                                                    
1305             0      0           A.5. 3236    8.0500   NaN        S  False  
1306             0      0            PC 17758  108.9000  C105        C  False  
1307             0      0  SOTON/O.Q. 3101262    7.2500   NaN        S  False  
1308             0      0              359309    8.0500   NaN      

In [4]:
#drop useless ticket column
df = df.drop(columns = 'Ticket')

# drop non_embarked people
df = df.dropna(how='all', subset = ['Embarked'])
df['Age'] = df['Age'].fillna(value = df['Age'].median())

#nb of training exemples
m = df.shape[0]

#find the Surname to det families
df['Surname'] = df['Name'].str.split(pat = ',').str[0]
#add total nb of relatives on the boat
df['Nb_rel'] = df['SibSp'] + df['Parch']

#create an index for the cabin, after sorting since similar numbers should be close
tmp_df = df
df = df.sort_values(by = 'Cabin')
le1 = preprocessing.LabelEncoder()
le1.fit(df['Cabin'].values.astype(str))
df['Cabin'] = le1.transform(df['Cabin'].values.astype(str))
df = df.reset_index()
df = df.set_index('PassengerId')

#create an index for each family by comparing Surname and numbers giving relatives
le2 = preprocessing.LabelEncoder()
arrays = [df['SibSp'].values, df['Parch'].values, df['Surname'].values]
joint_criterias = [str(arrays[0][i]) + str(arrays[1][i]) + str(arrays[2][i]) for i in range(m)]
le2.fit(joint_criterias)
df['Family_ind'] = le2.transform(joint_criterias)

#index for the departing port, the sex and the surname while we use it
cols = ['Embarked', 'Sex', 'Surname']
encoders = []
for col in cols:    
    le = preprocessing.LabelEncoder()
    le.fit(df[col].values.astype(str))
    df[col] = le.transform(df[col].values.astype(str))
    encoders.append(le)

#add mean, min and max age of the family
df['Mean_age'] = df.groupby('Family_ind')['Age'].transform(lambda x: x.mean())
df['Min_age'] = df.groupby('Family_ind')['Age'].transform(lambda x: x.min())
df['Max_age'] = df.groupby('Family_ind')['Age'].transform(lambda x: x.max())

#name as well as family are no longer useful, since information is now contained elsewhere ??? essayons avec d'abord puis sans
df = df.drop(columns = 'Name')

#normalize the rest of the Data ?

In [5]:
print(df.tail(5))

            Survived  Pclass  Sex   Age  SibSp  Parch     Fare  Cabin  \
PassengerId                                                             
1304            None       3    0  28.0      0      0   7.7750    185   
1305            None       3    1  28.0      0      0   8.0500    185   
1307            None       3    1  38.5      0      0   7.2500    185   
1308            None       3    1  28.0      0      0   8.0500    185   
1309            None       3    1  28.0      1      1  22.3583    185   

             Embarked  train  Surname  Nb_rel  Family_ind  Mean_age  Min_age  \
PassengerId                                                                    
1304                2  False      333       0         262      28.0     28.0   
1305                2  False      752       0         604      28.0     28.0   
1307                2  False      698       0         559      38.5     38.5   
1308                2  False      825       0         663      29.5     28.0   
1309    

In [27]:
train_df = df.loc[lambda df:df['train'] == True].drop(columns = 'train')
test_df = df.loc[lambda df:df['train'] == False].drop(columns = 'train')

y = train_df['Survived'].astype(bool)
X = train_df.drop(columns = 'Survived')

In [28]:
print(y.astype(bool))
print(X.shape)

PassengerId
584    False
476    False
557     True
285    False
600     True
631     True
868    False
648     True
210     True
186    False
446     True
807    False
97     False
24      True
175    False
738     True
816    False
330     True
524     True
171    False
691     True
782     True
541     True
746    False
780     True
55     False
370     True
642     True
488    False
537    False
       ...  
853    False
855    False
856     True
857     True
859     True
860    False
861    False
862    False
864    False
865    False
866     True
867     True
869    False
870     True
871    False
874    False
875     True
876     True
877    False
878    False
879    False
881     True
882    False
883    False
884    False
885    False
886    False
887    False
889    False
891    False
Name: Survived, Length: 889, dtype: bool
(889, 14)


In [32]:
model = GradientBoostingClassifier()
model.fit(X, y)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              n_iter_no_change=None, presort='auto', random_state=None,
              subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False)

In [34]:
model.apply(X)

array([[[13.],
        [13.],
        [13.],
        ...,
        [ 3.],
        [13.],
        [ 3.]],

       [[13.],
        [13.],
        [13.],
        ...,
        [ 3.],
        [ 3.],
        [10.]],

       [[ 3.],
        [ 4.],
        [ 4.],
        ...,
        [ 3.],
        [13.],
        [ 3.]],

       ...,

       [[14.],
        [14.],
        [14.],
        ...,
        [ 6.],
        [13.],
        [10.]],

       [[ 6.],
        [ 7.],
        [ 7.],
        ...,
        [ 6.],
        [13.],
        [14.]],

       [[14.],
        [14.],
        [14.],
        ...,
        [ 6.],
        [ 3.],
        [ 3.]]])