## Library

In [1]:
import pandas as pd
import numpy as np 
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

#Ignore warnings 
import warnings
warnings.filterwarnings('ignore')

## Import Files

In [2]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


## Change Sex to boolean values

In [3]:
train.Sex[train.Sex == 'male'] = 0
train.Sex[train.Sex == 'female'] = 1

test.Sex[test.Sex == 'male'] = 0
test.Sex[test.Sex == 'female'] = 1

In [4]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",0,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",1,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",0,35.0,0,0,373450,8.05,,S


## Find missing values in Train dataset

In [5]:
train.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

## Fill in missing age values with median age values

In [6]:
train['Age'] = train['Age'].fillna(train['Age'].median())
train.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

## Do same with Test dataset

In [7]:
test.isnull().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

## Fill Age in Test dataset

In [8]:
test['Age'] = test['Age'].fillna(test['Age'].median())
test.isnull().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

## Fare should not be filled by just a median, since Fare varies by Class - we need to find
## the median by each Class. First lets find the specific row.

In [9]:
null = test.columns[test.isnull().any()]
test[test['Fare'].isnull()][null]

Unnamed: 0,Fare,Cabin
152,,


## Display the whole row

In [10]:
test.loc[152,:]

PassengerId                  1044
Pclass                          3
Name           Storey, Mr. Thomas
Sex                             0
Age                          60.5
SibSp                           0
Parch                           0
Ticket                       3701
Fare                          NaN
Cabin                         NaN
Embarked                        S
Name: 152, dtype: object

## Let's group by Pclass & Sex since these two factors affect Fare, 
## then get mean Fare by Class

In [11]:
group = test.groupby(['Pclass','Sex'])
group.Fare.mean()

Pclass  Sex
1       0       75.586551
        1      115.591168
2       0       20.184654
        1       26.438750
3       0       11.826350
        1       13.735129
Name: Fare, dtype: float64

## Our passenger was a male in Class 3. Let's fill in the missing value

In [12]:
test.at[152,'Fare'] = 11.826350

## Define dependent & independent variables, and drop columns not needed.

In [13]:
depvar = train.iloc[:,1:2]
features = train.loc[:,['PassengerId','Pclass','Sex','Age','SibSp','Parch','Fare']]
test = test.loc[:,['PassengerId','Pclass','Sex','Age','SibSp','Parch','Fare']]

In [14]:
features.dtypes, test.dtypes

(PassengerId      int64
 Pclass           int64
 Sex             object
 Age            float64
 SibSp            int64
 Parch            int64
 Fare           float64
 dtype: object, PassengerId      int64
 Pclass           int64
 Sex             object
 Age            float64
 SibSp            int64
 Parch            int64
 Fare           float64
 dtype: object)

## Change data types as needed

## Split Train dataset

In [15]:
#Split train data 
x_train, x_test, y_train, y_test = train_test_split(features, depvar, test_size = 0.2)

## Models

In [28]:
tree = DecisionTreeClassifier()
RF = RandomForestClassifier(n_estimators = 500)

tree.fit(x_train, y_train)
RF.fit(x_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=500,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

## Model Scores

In [29]:
tree.score(x_test, y_test)

0.776536312849162

In [30]:
RF.score(x_test, y_test)

0.8435754189944135

## Predict

In [31]:
prediction = RF.predict(test)
submission = pd.DataFrame({'PassengerId':test['PassengerId'],'Survived':prediction})

In [32]:
submission.to_csv('Submission.csv', index = False)