In [1]:
#import libraries

import pandas as pd
from sklearn import model_selection
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [2]:
#load the datasets

train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [3]:
#get an idea of the data

train.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
test.head(5)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [5]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


In [6]:
#descriptive statistics

train.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [7]:
#drop columns we don't need

train.drop(['PassengerId','Cabin', 'Ticket', 'Name'], axis=1, inplace = True)
test.drop(['Cabin', 'Ticket', 'Name'], axis=1, inplace = True)

In [8]:
train.shape

(891, 8)

In [9]:
#create a list with both datasets

datasets = [train, test]

In [11]:
#Replace NaN with values (clean dataset)

for dataset in datasets:    
    dataset['Age'].fillna(dataset['Age'].median(), inplace = True)
    dataset['Embarked'].fillna(dataset['Embarked'].mode()[0], inplace = True)
    dataset['Fare'].fillna(dataset['Fare'].median(), inplace = True)

In [12]:
train.isnull().sum()

Survived    0
Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Embarked    0
dtype: int64

In [13]:
test.isnull().sum()

PassengerId    0
Pclass         0
Sex            0
Age            0
SibSp          0
Parch          0
Fare           0
Embarked       0
dtype: int64

In [14]:
#set features and target/label
features = ['Sex','Pclass', 'Embarked','SibSp', 'Parch', 'Age', 'Fare']
target = ['Survived']

features_for_test = ['PassengerId', 'Sex','Pclass', 'Embarked','SibSp', 'Parch', 'Age', 'Fare']


In [15]:
#create dummy columns from categorical

train_dummy =pd.get_dummies(train[features], columns=['SibSp','Parch','Sex','Embarked','Pclass'])
test_dummy = pd.get_dummies(test[features_for_test], columns=['SibSp','Parch','Sex','Embarked','Pclass'])


train_dummy.head()

Unnamed: 0,Age,Fare,SibSp_0,SibSp_1,SibSp_2,SibSp_3,SibSp_4,SibSp_5,SibSp_8,Parch_0,...,Parch_5,Parch_6,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S,Pclass_1,Pclass_2,Pclass_3
0,22.0,7.25,0,1,0,0,0,0,0,1,...,0,0,0,1,0,0,1,0,0,1
1,38.0,71.2833,0,1,0,0,0,0,0,1,...,0,0,1,0,1,0,0,1,0,0
2,26.0,7.925,1,0,0,0,0,0,0,1,...,0,0,1,0,0,0,1,0,0,1
3,35.0,53.1,0,1,0,0,0,0,0,1,...,0,0,1,0,0,0,1,1,0,0
4,35.0,8.05,1,0,0,0,0,0,0,1,...,0,0,0,1,0,0,1,0,0,1


In [16]:
#split the dataset
new_features = train_dummy.columns.tolist()
train_x, test_x, train_y, test_y = model_selection.train_test_split(train_dummy[new_features], train[target], train_size=0.7)



In [19]:
train_y.head(5)

Unnamed: 0,Survived
85,1
6,0
725,0
497,0
554,1


In [20]:
clf = RandomForestClassifier(n_jobs=2, random_state=0)
clf.fit(train_x[new_features], train_y)


  


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=2,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [27]:
train_y.head(5)

Unnamed: 0,Survived
85,1
6,0
725,0
497,0
554,1


In [21]:
for i in range(len(new_features)):
    print(new_features[i],round(clf.feature_importances_[i],5))

Age 0.25785
Fare 0.24241
SibSp_0 0.00907
SibSp_1 0.01933
SibSp_2 0.00393
SibSp_3 0.00402
SibSp_4 0.00196
SibSp_5 0.00226
SibSp_8 0.003
Parch_0 0.01997
Parch_1 0.0111
Parch_2 0.00686
Parch_3 5e-05
Parch_4 0.00038
Parch_5 0.00369
Parch_6 0.00046
Sex_female 0.17629
Sex_male 0.11561
Embarked_C 0.01014
Embarked_Q 0.00872
Embarked_S 0.01271
Pclass_1 0.01351
Pclass_2 0.01849
Pclass_3 0.05819


In [22]:
predictions = clf.predict(test_x)
predictions

array([0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1,
       0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0,
       0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0,
       0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0,
       0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0,
       1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1,
       0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1])

In [23]:
print("Train Accuracy :: ", accuracy_score(train_y, clf.predict(train_x)))
print("Test Accuracy  :: ", accuracy_score(test_y, predictions))

Train Accuracy ::  0.969502407705
Test Accuracy  ::  0.798507462687


In [24]:
test_dummy['Survived'] = clf.predict(test_dummy[new_features])

In [25]:
test_dummy[['PassengerId','Survived']].to_csv('predictions.csv',index=False)