In [50]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# (Re)load the autoreload extension and have it reload all modules before each
# cell executes, so edits to imported source files are picked up without a restart.
%reload_ext autoreload
%autoreload 2

In [51]:
from fastai.structured import *
from fastai.column_data import *
# Folder that holds train.csv / test.csv, relative to the working directory.
PATH = 'data/titanic/'

# Keep printed arrays short: summarise anything longer than 50 elements,
# showing 20 items at each edge.
np.set_printoptions(edgeitems=20, threshold=50)

In [52]:
import warnings
warnings.filterwarnings('ignore')

## Create Dataset

In [53]:
# Load both splits from PATH with identical reader options.
csv_opts = {'low_memory': False}
train = pd.read_csv(f'{PATH}train.csv', **csv_opts)
test = pd.read_csv(f'{PATH}test.csv', **csv_opts)

In [54]:
# Peek at the first rows of the training data to check that the load worked.
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [55]:
# Per-column summary of the test split (descriptive stats, non-null counts, unique counts).
DataFrameSummary(test).summary()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
count,418,418,,,332,418,418,,417,,
mean,1100.5,2.26555,,,30.2726,0.447368,0.392344,,35.6272,,
std,120.81,0.841838,,,14.1812,0.89676,0.981429,,55.9076,,
min,892,1,,,0.17,0,0,,0,,
25%,996.25,1,,,21,0,0,,7.8958,,
50%,1100.5,3,,,27,0,0,,14.4542,,
75%,1204.75,3,,,39,1,0,,31.5,,
max,1309,3,,,76,8,9,,512.329,,
counts,418,418,418,418,332,418,418,418,417,91,418
uniques,418,3,418,2,79,7,8,363,169,76,3


In [56]:
# Same per-column summary for the training split, to compare against the test split.
DataFrameSummary(train).summary()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
count,891,891,891,,,714,891,891,,891,,
mean,446,0.383838,2.30864,,,29.6991,0.523008,0.381594,,32.2042,,
std,257.354,0.486592,0.836071,,,14.5265,1.10274,0.806057,,49.6934,,
min,1,0,1,,,0.42,0,0,,0,,
25%,223.5,0,2,,,20.125,0,0,,7.9104,,
50%,446,0,3,,,28,0,0,,14.4542,,
75%,668.5,1,3,,,38,1,0,,31,,
max,891,1,3,,,80,8,6,,512.329,,
counts,891,891,891,891,891,714,891,891,891,891,204,889
uniques,891,2,3,891,2,88,7,7,681,248,147,3


After loading the datasets, we'll fill in the missing values in some of the columns. In particular, we'll fill the missing values of `Age` and `Fare` with approximately the mean of those columns. I tested other values as well, such as setting the NAs to 0, but there was no improvement to the validation or test scores.

In [57]:
# Constant fill value per column (roughly each column's mean). After filling,
# the column is cast to int32, which also truncates fractional ages and fares.
fill_values = {'Age': 30, 'Fare': 33}
for frame in (train, test):
    for col, fill in fill_values.items():
        frame[col] = frame[col].fillna(fill).astype(np.int32)

We'll also drop the names and passenger IDs, since we won't be using them. Additionally, we'll drop the 'Ticket' column, since it is predominantly unique categorical values. This improved the final score by several percentage points, likely because it made it harder for the neural network to overfit.

In [58]:
# Columns that are not used as model inputs.
unused_cols = ['Name', 'Ticket', 'PassengerId']

# Keep a PassengerId-indexed copy of the test frame before the id column is
# dropped; the ids are needed again when building the submission.
test_passengers = test.set_index('PassengerId')

train = train.drop(unused_cols, axis='columns')
test = test.drop(unused_cols, axis='columns')

Next, we'll turn some of the variables into categorical variables.

In [59]:
# Columns treated as continuous inputs.
cont_vars = ['Age', 'SibSp', 'Parch', 'Fare']
# Columns treated as categorical inputs (each one gets its own embedding).
cat_vars = ['Pclass', 'Sex', 'Cabin', 'Embarked']

In [60]:
# The test split has no 'Survived' column; add a dummy one so the test frame can be
# passed through proc_df with the same target field name as the training frame.
test['Survived'] = 0

In [61]:
# Convert every categorical column of the training frame to an ordered
# pandas categorical.
for col in cat_vars:
    train[col] = train[col].astype('category').cat.as_ordered()

In [62]:
# Give the test frame the categorical encoding defined on train — presumably so that
# category codes line up between the two frames (TODO confirm against fastai's apply_cats).
apply_cats(test, train)

Then we use `proc_df` to scale the data and apply the final manipulations.

In [63]:
# Split off the 'Survived' target and scale the features (do_scale=True).
# `nas` and `mapper` are kept so the test frame can be processed with the same settings.
df, y, nas, mapper = proc_df(train, 'Survived', do_scale=True)

In [64]:
# Process the test frame with the mapper and NA dict obtained from the training frame.
# The returned target is the dummy 'Survived' column and is discarded (`_`).
df_test, _, nas, mapper = proc_df(test, 'Survived', do_scale=True, mapper=mapper, na_dict=nas)

We set up the validation indices.

In [65]:
# Number of training rows, taken from the processed frame that the validation
# indices will index into. This avoids re-opening train.csv and counting physical
# lines, which leaves a file handle open and over-counts if a quoted field
# contains a newline.
n = len(df)
# Row indices held out for validation (get_cv_idxs defaults).
val_idxs = get_cv_idxs(n)
n

891

# Model

In [66]:
# Cast the targets to int64 — presumably because the classification loss expects
# integer class labels (TODO confirm).
y = y.astype(np.int64)

In [67]:
# Bundle the train/validation/test data for the columnar model. is_reg=False and
# is_multi=False presumably select single-label classification (verify against fastai).
md = ColumnarModelData.from_data_frame(
    PATH, val_idxs, df, y,
    cat_flds=cat_vars,
    bs=128,
    is_reg=False,
    is_multi=False,
    test_df=df_test,
)

In [68]:
# (column, number of categories + 1) for every categorical variable; the extra
# slot is presumably reserved for missing/unseen values.
cat_sz = [
    (col, len(train[col].cat.categories) + 1)
    for col in cat_vars
]
cat_sz

[('Pclass', 4), ('Sex', 3), ('Cabin', 148), ('Embarked', 4)]

We set the embedding sizes here: each categorical variable gets an embedding of roughly half its cardinality, capped at 50.

In [69]:
# (cardinality, embedding width) pairs: the width is half the cardinality
# (rounded up), capped at 50.
emb_szs = [
    (n_cat, min(50, (n_cat + 1) // 2))
    for _, n_cat in cat_sz
]

Next, we need to create our learner. Since we're classifying for two classes, we need to set `out_sz=2`. We also choose the size of the network here, which we set to `[200, 100]`. For the `StructuredLearner` this means that after the input layers, we'll have BatchNorm and Dropout applied to the 200 activations, followed by a Linear layer of 200x100, followed by BatchNorm and Dropout applied to the 100 activations, and a final Linear layer of 100x2.

Other important inputs are the dropout parameters and `use_bn`. Using BatchNorm and high dropout was instrumental to keeping the network from overfitting.

In [70]:
# Every column of df that is not categorical is fed in as a continuous input.
n_cont = len(df.columns) - len(cat_vars)

learner = md.get_learner(
    emb_szs,
    n_cont,
    emb_drop=0.04,       # dropout on the embedding outputs
    out_sz=2,            # two classes
    szs=[200, 100],      # hidden layer widths
    drops=[0.4, 0.6],    # dropout after each hidden layer
    use_bn=True,
)

In [71]:
# Learning rate used by the fit calls that follow.
lr = 1e-3
# Run the learning-rate finder sweep; its epoch log is printed below the cell.
learner.lr_find()

HBox(children=(IntProgress(value=0, description='Epoch', max=1), HTML(value='')))

epoch      trn_loss   val_loss       
    0      0.931637   1141.300171



In [72]:
# First training pass: 5 epochs at `lr`, reporting accuracy on the validation set.
learner.fit(lr, 5, metrics=[accuracy])

HBox(children=(IntProgress(value=0, description='Epoch', max=5), HTML(value='')))

epoch      trn_loss   val_loss   accuracy   
    0      0.689479   0.665828   0.589888  
    1      0.669288   0.64624    0.589888  
    2      0.650909   0.627271   0.589888  
    3      0.63772    0.60289    0.589888  
    4      0.625669   0.573314   0.691011  



[0.5733138918876648, 0.6910112500190735]

In [73]:
# Five more cycles of one epoch each (cycle_len=1), i.e. 5 epochs in total.
learner.fit(lr, 5, cycle_len=1, metrics=[accuracy])

HBox(children=(IntProgress(value=0, description='Epoch', max=5), HTML(value='')))

epoch      trn_loss   val_loss   accuracy   
    0      0.571847   0.559634   0.741573  
    1      0.559744   0.541527   0.769663  
    2      0.550977   0.525173   0.792135  
    3      0.542157   0.510883   0.792135  
    4      0.533977   0.497695   0.792135  



[0.49769535660743713, 0.7921348214149475]

In [74]:
# Long run: 5 cycles starting at 3 epochs and doubling each time (3+6+12+24+48 = 93 epochs,
# matching the progress bar). best_save_name presumably checkpoints the best weights seen
# as 'best_nn'; that name is loaded back before predicting.
learner.fit(lr, 5, cycle_len=3, cycle_mult=2, metrics=[accuracy], best_save_name='best_nn')

HBox(children=(IntProgress(value=0, description='Epoch', max=93), HTML(value='')))

epoch      trn_loss   val_loss   accuracy   
    0      0.513347   0.486209   0.792135  
    1      0.49838    0.478866   0.792135  
    2      0.497944   0.477311   0.792135  
    3      0.495271   0.466266   0.792135  
    4      0.489223   0.458115   0.792135  
    5      0.483104   0.453469   0.797753  
    6      0.479184   0.451366   0.797753  
    7      0.473696   0.450659   0.797753  
    8      0.469131   0.450527   0.797753  
    9      0.462742   0.445753   0.803371  
    10     0.461105   0.442956   0.803371  
    11     0.454413   0.441394   0.797753  
    12     0.449944   0.440421   0.803371  
    13     0.445436   0.439483   0.797753  
    14     0.443116   0.438952   0.792135  
    15     0.439567   0.438459   0.792135  
    16     0.436677   0.438076   0.792135  
    17     0.43263    0.438037   0.792135  
    18     0.431056   0.438077   0.792135  
    19     0.426782   0.438092   0.792135  
    20     0.425621   0.438099   0.792135  
    21     0.422645   0.439114 

[0.5085940361022949, 0.7977527976036072]

## Prediction

For the prediction step we load the weights that we had saved previously and run `learner.predict(True)`

In [75]:
# Restore the weights saved under 'best_nn' during the last fit call.
learner.load('best_nn')

We take the `argmax` of the predictions, since we're doing binary classification.

In [76]:
# Score the test set (is_test=True), then pick the higher-scoring of the two
# classes for each row.
class_scores = learner.predict(is_test=True)
pred = np.argmax(class_scores, axis=1)

In [77]:
# Submission frame: one row per test passenger, pairing the saved ids with the
# predicted class.
df_pred = pd.DataFrame({
    'PassengerId': test_passengers.index,
    'Survived': pred,
})

In [78]:
# Sanity-check the first rows of the submission frame before writing it out.
df_pred.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0


In [79]:
import os

# Create the output folder first: to_csv does not create missing directories and
# would raise FileNotFoundError if 'pred/' does not exist yet.
pred_dir = f'{PATH}pred'
os.makedirs(pred_dir, exist_ok=True)

pred_path = f'{pred_dir}/pred.csv'
df_pred.to_csv(pred_path, index=False)
# Last expression of the cell: renders a link to the written file.
FileLink(pred_path)