In [91]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier

In [175]:
# Read the training and test CSVs from the working directory.
train_df, test_df = (
    pd.read_csv(csv_path) for csv_path in ('./train.csv', './test.csv')
)

# Both frames in one list, so the same steps can be applied to each in a loop.
dataset = [train_df, test_df]

In [176]:
# Preview the first five rows of the raw training frame.
train_df.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [177]:
# Missing-value count per column in the training frame.
train_df.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [178]:
# Missing-value count per column in the test frame.
test_df.isna().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In [179]:
# Most common 'Embarked' value in the training frame.
train_df['Embarked'].mode().iloc[0]

'S'

In [180]:
# Feature engineering for every frame in `dataset`, mutating train_df / test_df
# in place.
#
# Everything that defines an encoding (imputation values, bin edges, category
# orderings, the set of frequent titles) is learned from the training frame
# only and then applied unchanged to each frame, so a given integer code means
# the same thing in train and test. Codes produced for the training frame are
# the same as when it was binned/encoded on its own.
#
# NOTE: this cell consumes the raw 'Name', 'Ticket' and 'Cabin' columns and
# then drops them, so it must be run exactly once after the CSVs are loaded.

min_freq = 10  # titles seen fewer than this many times in train become 'Rare'
drop_column = ['Name', 'Ticket', 'Cabin']


def extract_title(names):
    """Return, for each name, the text between ', ' and the next '.'."""
    return names.apply(lambda name: name.split(', ')[1].split('.')[0])


# --- learn imputation values from the training frame ---
age_fill = train_df['Age'].mean()
fare_fill = train_df['Fare'].mean()
embarked_fill = train_df['Embarked'].mode()[0]

# --- learn bin edges from the training frame ---
# Age: 4 equal-width bins; Fare: 4 quantile bins.
_, age_edges = pd.cut(train_df['Age'].fillna(age_fill), 4, retbins=True)
_, fare_edges = pd.qcut(train_df['Fare'].fillna(fare_fill), 4, retbins=True)
# Open the outermost edges so values outside the training range still fall in
# the first/last bin instead of becoming NaN (which would be encoded as -1).
age_edges = [float('-inf'), *age_edges[1:-1], float('inf')]
fare_edges = [float('-inf'), *fare_edges[1:-1], float('inf')]

# --- learn category orderings from the training frame ---
# Sorted to match the ordering that astype('category') would produce.
sex_levels = sorted(train_df['Sex'].dropna().unique())
embarked_levels = sorted(train_df['Embarked'].dropna().unique())

# True for titles that occur fewer than min_freq times in the training frame.
title_names = extract_title(train_df['Name']).value_counts() < min_freq
frequent_titles = set(title_names.index[~title_names])
title_levels = sorted(frequent_titles | {'Rare'})

for data in dataset:
    # Fill nulls and discretise. Columns are assigned back rather than filled
    # with inplace=True on a column selection: that is chained assignment and
    # does not update the frame under pandas Copy-on-Write.
    data['Age'] = pd.cut(data['Age'].fillna(age_fill), age_edges).cat.codes
    data['Fare'] = pd.cut(data['Fare'].fillna(fare_fill), fare_edges).cat.codes

    data['Embarked'] = pd.Categorical(
        data['Embarked'].fillna(embarked_fill), categories=embarked_levels
    ).codes
    data['Sex'] = pd.Categorical(data['Sex'], categories=sex_levels).codes

    # New feature: title from the name; any title that is not frequent in the
    # training frame (including ones never seen there) is replaced by 'Rare'.
    titles = extract_title(data['Name'])
    titles = titles.where(titles.isin(frequent_titles), 'Rare')
    data['Title'] = pd.Categorical(titles, categories=title_levels).codes

    # New features: family size (self included) and a travelling-alone flag.
    data['FamilySize'] = data['SibSp'] + data['Parch'] + 1
    data['IsAlone'] = (data['FamilySize'] == 1).astype('int64')

    # Drop the raw text columns that are not used as features.
    data.drop(drop_column, axis=1, inplace=True)

In [183]:
# Preview the first five rows of the training frame as it stands at this point.
train_df.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title,FamilySize,IsAlone
0,1,0,3,1,1,1,0,0,2,2,2,0
1,2,1,1,0,1,1,0,3,0,3,2,0
2,3,1,3,0,1,0,0,1,2,1,1,1
3,4,1,1,0,1,1,0,3,2,3,2,0
4,5,0,3,1,1,0,0,1,2,2,1,1


In [184]:
# Print the training frame's index range, per-column non-null counts and dtypes,
# and memory usage.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Sex            891 non-null int8
Age            891 non-null int8
SibSp          891 non-null int64
Parch          891 non-null int64
Fare           891 non-null int8
Embarked       891 non-null int8
Title          891 non-null int8
FamilySize     891 non-null int64
IsAlone        891 non-null int64
dtypes: int64(7), int8(5)
memory usage: 53.2 KB


In [185]:
# Drop PassengerId from the training frame.
# errors="ignore" makes the cell safe to re-run: once the column is gone, a second
# execution is a no-op instead of raising KeyError.
train_df.drop(columns=["PassengerId"], inplace=True, errors="ignore")

In [186]:
# Split the training frame into target and features.
Y_train = train_df["Survived"]
# PassengerId is excluded here as well (ignored if already dropped), so this cell
# builds the same feature matrix whether or not the drop cell has been run.
X_train = train_df.drop(columns=["Survived", "PassengerId"], errors="ignore")
# Select the test columns by the training feature names so both matrices have the
# same columns in the same order; a missing feature raises KeyError here rather
# than surfacing later at predict time.
X_test = test_df[X_train.columns].copy()
X_train.shape, Y_train.shape, X_test.shape

((891, 10), (891,), (418, 10))

In [189]:
# Fit a 100-tree random forest on the training features and predict the test set.
# random_state pins the forest's randomness so that re-running this cell yields
# the same model, the same Y_pred and the same score.
random_forest = RandomForestClassifier(n_estimators=100, random_state=42)
random_forest.fit(X_train, Y_train)
Y_pred = random_forest.predict(X_test)
# Accuracy on the same rows the model was fitted on: an optimistic figure, not an
# estimate of performance on unseen data.
random_forest.score(X_train, Y_train)

0.8866442199775533

In [188]:
# Pair each test PassengerId with its predicted Survived value and write the
# two-column table to CSV without the index.
submission = test_df[["PassengerId"]].assign(Survived=Y_pred)
submission.to_csv('./submission_random_forest.csv', index=False)