In [1]:
import pandas as pd
import numpy as np

# Load the training and test sets from the working directory.
df_train = pd.read_csv('./train.csv')
df_test = pd.read_csv('./test.csv')

# Test set overview: dimensions, column labels, then per-column dtypes.
for test_summary in (df_test.shape, df_test.columns, df_test.dtypes):
    print(test_summary)

# Blank lines to visually separate the two summaries.
print("\n\n")

# Training set overview: same three summaries plus a preview of the first rows.
for train_summary in (df_train.shape, df_train.columns, df_train.dtypes, df_train.head()):
    print(train_summary)

(418, 11)
Index(['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch',
       'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')
PassengerId      int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object



(891, 12)
Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')
PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object
   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3    

In [2]:
# Count missing (NaN) values in each column of the training set.
df_train.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [3]:
# Count missing (NaN) values in each column of the test set.
df_test.isna().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In [4]:
# Separate the target column from the predictors.
Y = df_train['Survived']
X = df_train.drop(columns='Survived')

In [5]:
from sklearn.impute import SimpleImputer

# Fill missing values. Every imputer is fitted on the TRAINING data only and the
# learned statistic is then applied to the test set, so both frames are
# preprocessed identically and no test-set statistics are used.
# `missing_values` is passed by keyword: SimpleImputer's parameters are
# keyword-only in current scikit-learn releases, so a positional np.nan fails.

# Age: replace NaN with the training-set mean age.
age_imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
X[['Age']] = age_imputer.fit_transform(X[['Age']])
df_test[['Age']] = age_imputer.transform(df_test[['Age']])

# Embarked: replace NaN with the most frequent training-set port.
embarked_imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
X[['Embarked']] = embarked_imputer.fit_transform(X[['Embarked']])
df_test[['Embarked']] = embarked_imputer.transform(df_test[['Embarked']])

# Fare: only the test set has a missing fare; fill it with the training-set mean.
fare_imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
fare_imputer.fit(X[['Fare']])
df_test[['Fare']] = fare_imputer.transform(df_test[['Fare']])

In [6]:
# Preview the first rows of the feature frame after the imputation step.
X.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [7]:
from sklearn.preprocessing import LabelEncoder

# One shared encoder is refitted per column; each categorical column gets an
# integer-coded companion column alongside the original.
encoder = LabelEncoder()
for source_col, encoded_col in (('Sex', 'Sex_enc'), ('Embarked', 'Emb_enc')):
    X[encoded_col] = encoder.fit_transform(X[source_col])

In [8]:
# Preview the feature frame again to check the new Sex_enc / Emb_enc columns.
X.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Sex_enc,Emb_enc
0,1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,1,2
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,0,0
2,3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,0,2
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,0,2
4,5,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,1,2


In [9]:
# Encode the test-set categoricals with the label -> integer mapping learned
# from the TRAINING column, not from the test column. Refitting on the test set
# would derive the codes from whichever categories happen to appear there, which
# can silently disagree with the codes the model is trained on. With this form,
# a category unseen in training raises a ValueError instead.
for source_col, encoded_col in (('Sex', 'Sex_enc'), ('Embarked', 'Emb_enc')):
    df_test[encoded_col] = encoder.fit(X[source_col]).transform(df_test[source_col])

In [10]:
# Select the model features: every numeric column except the row identifier.
# select_dtypes(include='number') is used rather than `dtype != "object"` so
# that non-numeric, non-object dtypes (string, category, datetime, bool) can
# never be picked up as features.
numeric_variables = list(X.select_dtypes(include='number').columns)
numeric_variables.remove('PassengerId')
print(numeric_variables)

# Import the random forest *regressor*. The 0/1 target is fitted as a
# regression; later cells read model.oob_prediction_ and threshold
# model.predict() at 0.5.
from sklearn.ensemble import RandomForestRegressor

# Instantiate the model: oob_score=True enables the out-of-bag estimates and
# random_state fixes the seed so reruns give the same forest.
model = RandomForestRegressor(n_estimators=100, oob_score=True, random_state=42)

# Fit the model (last expression of the cell, so the fitted estimator is shown).
model.fit(X[numeric_variables], Y)

['Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'Sex_enc', 'Emb_enc']


RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
           oob_score=True, random_state=42, verbose=0, warm_start=False)

In [11]:
# Out-of-bag estimate of how well the fitted forest generalises.
# For this regressor, oob_score_ is the R^2 computed from the out-of-bag predictions.
model.oob_score_

0.4064601324824336

In [12]:
from sklearn.metrics import roc_auc_score

# Use the out-of-bag predictions as scores and report the area under the ROC
# curve against the true training labels.
Y_oob = model.oob_prediction_
print("C-stat", roc_auc_score(y_true=Y, y_score=Y_oob))

C-stat 0.8577610541228602


In [54]:
# Score the test passengers, then label as survived where the score exceeds 0.5.
test_scores = model.predict(df_test[numeric_variables])
Y_res = test_scores > 0.5
Y_res.shape

(418,)

In [49]:
# Attach the thresholded predictions to the test frame as a new 'Survived' column.
df_test['Survived'] = Y_res

In [63]:
# Convert the predictions to 0/1 integers. astype('int64') maps True -> 1 and
# False -> 0 directly, so the separate replace() calls are unnecessary. Calling
# replace(..., inplace=True) on a selected column is also deprecated and does
# not update df_test when pandas Copy-on-Write is active. Re-running this cell
# is safe: an already-integer column is left unchanged.
df_test['Survived'] = df_test['Survived'].astype('int64')

# Spot-check a few rows (positions 5-9) of the test frame.
df_test[['Name', 'PassengerId']].iloc[5:10]

Unnamed: 0,Name,PassengerId
5,"Svensson, Mr. Johan Cervin",897
6,"Connolly, Miss. Kate",898
7,"Caldwell, Mr. Albert Francis",899
8,"Abrahim, Mrs. Joseph (Sophie Halaut Easu)",900
9,"Davies, Mr. John Samuel",901


In [62]:
# Build the submission table (PassengerId as index, Survived as the only
# column), write it to CSV, and preview the first rows.
df_res = df_test[['PassengerId', 'Survived']].set_index('PassengerId')
df_res.to_csv('./res_v2.csv')
df_res.head()

Unnamed: 0_level_0,Survived
PassengerId,Unnamed: 1_level_1
892,0
893,0
894,1
895,1
896,1
