In [357]:
from dataclasses import dataclass

from keras.models import Sequential
from keras.layers import Dense, Input, Embedding, Dropout, BatchNormalization
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from tensorflow import keras
import xgboost as xgb

In [212]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/titanic/train.csv
/kaggle/input/titanic/test.csv
/kaggle/input/titanic/gender_submission.csv


Training data has labels; test does not have labels.

In [213]:
train_raw = pd.read_csv('/kaggle/input/titanic/train.csv')
df = train_raw.copy()

In [214]:
train_raw

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [215]:
test_raw = pd.read_csv('/kaggle/input/titanic/test.csv')

In [216]:
test_raw

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0000,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S
...,...,...,...,...,...,...,...,...,...,...,...
413,1305,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S
414,1306,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C
415,1307,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S
416,1308,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S


## Overall Plan
1. Quick survey of every column
2. Make a simple naive neural model, see how it performs
3. Try with XGBoost
4. Compare with NN model with embeddings
5. See how XGBoost performs with the embeddings I created

# Data Preprocessing

## Column Review
Here we'll do a quick review of all of the columns to determine how we want to handle each column.

### Pclass

In [217]:
df['Pclass'].unique()

array([3, 1, 2])

No missing data.  This is an ordinal field, where `1` is strictly better than `2` and `3`, but the native values are already sorted, so no special processing will be needed. We'll just use this one as a categorical variable and we're good.


## Name

In [218]:
list(df['Name'])

['Braund, Mr. Owen Harris',
 'Cumings, Mrs. John Bradley (Florence Briggs Thayer)',
 'Heikkinen, Miss. Laina',
 'Futrelle, Mrs. Jacques Heath (Lily May Peel)',
 'Allen, Mr. William Henry',
 'Moran, Mr. James',
 'McCarthy, Mr. Timothy J',
 'Palsson, Master. Gosta Leonard',
 'Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)',
 'Nasser, Mrs. Nicholas (Adele Achem)',
 'Sandstrom, Miss. Marguerite Rut',
 'Bonnell, Miss. Elizabeth',
 'Saundercock, Mr. William Henry',
 'Andersson, Mr. Anders Johan',
 'Vestrom, Miss. Hulda Amanda Adolfina',
 'Hewlett, Mrs. (Mary D Kingcome) ',
 'Rice, Master. Eugene',
 'Williams, Mr. Charles Eugene',
 'Vander Planke, Mrs. Julius (Emelia Maria Vandemoortele)',
 'Masselmani, Mrs. Fatima',
 'Fynney, Mr. Joseph J',
 'Beesley, Mr. Lawrence',
 'McGowan, Miss. Anna "Annie"',
 'Sloper, Mr. William Thompson',
 'Palsson, Miss. Torborg Danira',
 'Asplund, Mrs. Carl Oscar (Selma Augusta Emilia Johansson)',
 'Emir, Mr. Farred Chehab',
 'Fortune, Mr. Charles Alexander',
 '

There is a little bit of extra information in here.  For the women, we can see if they are married or not.  Also some people have nicknames; it's possible that can correlate with survival but I'm going to ignore that for now.

## Age

In [219]:
df['Age'].unique()

array([22.  , 38.  , 26.  , 35.  ,   nan, 54.  ,  2.  , 27.  , 14.  ,
        4.  , 58.  , 20.  , 39.  , 55.  , 31.  , 34.  , 15.  , 28.  ,
        8.  , 19.  , 40.  , 66.  , 42.  , 21.  , 18.  ,  3.  ,  7.  ,
       49.  , 29.  , 65.  , 28.5 ,  5.  , 11.  , 45.  , 17.  , 32.  ,
       16.  , 25.  ,  0.83, 30.  , 33.  , 23.  , 24.  , 46.  , 59.  ,
       71.  , 37.  , 47.  , 14.5 , 70.5 , 32.5 , 12.  ,  9.  , 36.5 ,
       51.  , 55.5 , 40.5 , 44.  ,  1.  , 61.  , 56.  , 50.  , 36.  ,
       45.5 , 20.5 , 62.  , 41.  , 52.  , 63.  , 23.5 ,  0.92, 43.  ,
       60.  , 10.  , 64.  , 13.  , 48.  ,  0.75, 53.  , 57.  , 80.  ,
       70.  , 24.5 ,  6.  ,  0.67, 30.5 ,  0.42, 34.5 , 74.  ])

This column is going to require some special handling.
* For infants, age is fractional
* If age is unknown, then it's estimated as xx.5
* There are some unknown ages (nan).

In [220]:
def age_processor(df):
    df['age_known'] = (df.Age < 1) | (df.Age == np.floor(df.Age))
    # People with an age > 1 and a fraction of 0.5, their age is estimated.
    df['adjusted_age'] = np.where(df.age_known, df.Age, np.floor(df.Age))
    default_age = df[df.age_known].adjusted_age.mean()
    df['adjusted_age'] = np.where(df.adjusted_age.isna(), default_age, df.adjusted_age)
    return df

In [221]:
df = age_processor(train_raw.copy())
df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,age_known,adjusted_age
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S,True,22.000000
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,True,38.000000
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S,True,26.000000
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S,True,35.000000
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S,True,35.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S,True,27.000000
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S,True,19.000000
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S,False,29.554842
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C,True,26.000000


In [222]:
df[df.age_known].adjusted_age.mean()

29.554841954022987

In [223]:
df[~df.age_known]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,age_known,adjusted_age
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q,False,29.554842
17,18,1,2,"Williams, Mr. Charles Eugene",male,,0,0,244373,13.0000,,S,False,29.554842
19,20,1,3,"Masselmani, Mrs. Fatima",female,,0,0,2649,7.2250,,C,False,29.554842
26,27,0,3,"Emir, Mr. Farred Chehab",male,,0,0,2631,7.2250,,C,False,29.554842
28,29,1,3,"O'Dwyer, Miss. Ellen ""Nellie""",female,,0,0,330959,7.8792,,Q,False,29.554842
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
859,860,0,3,"Razi, Mr. Raihed",male,,0,0,2629,7.2292,,C,False,29.554842
863,864,0,3,"Sage, Miss. Dorothy Edith ""Dolly""",female,,8,2,CA. 2343,69.5500,,S,False,29.554842
868,869,0,3,"van Melkebeke, Mr. Philemon",male,,0,0,345777,9.5000,,S,False,29.554842
878,879,0,3,"Laleff, Mr. Kristo",male,,0,0,349217,7.8958,,S,False,29.554842


## SibSp, Parch
Looks ok; not thinking of any changes I want to make to these columns right now.

In [224]:
df['SibSp'].unique()

array([1, 0, 3, 4, 2, 5, 8])

In [225]:
df['Parch'].unique()

array([0, 1, 2, 5, 3, 4, 6])

## Ticket

In [226]:
df['Ticket'].unique()

array(['A/5 21171', 'PC 17599', 'STON/O2. 3101282', '113803', '373450',
       '330877', '17463', '349909', '347742', '237736', 'PP 9549',
       '113783', 'A/5. 2151', '347082', '350406', '248706', '382652',
       '244373', '345763', '2649', '239865', '248698', '330923', '113788',
       '347077', '2631', '19950', '330959', '349216', 'PC 17601',
       'PC 17569', '335677', 'C.A. 24579', 'PC 17604', '113789', '2677',
       'A./5. 2152', '345764', '2651', '7546', '11668', '349253',
       'SC/Paris 2123', '330958', 'S.C./A.4. 23567', '370371', '14311',
       '2662', '349237', '3101295', 'A/4. 39886', 'PC 17572', '2926',
       '113509', '19947', 'C.A. 31026', '2697', 'C.A. 34651', 'CA 2144',
       '2669', '113572', '36973', '347088', 'PC 17605', '2661',
       'C.A. 29395', 'S.P. 3464', '3101281', '315151', 'C.A. 33111',
       'S.O.C. 14879', '2680', '1601', '348123', '349208', '374746',
       '248738', '364516', '345767', '345779', '330932', '113059',
       'SO/C 14885', '31012

There could be some useful data in here.  Let's find the unique tokens.

In [227]:
tokens = set()
numerics = set()
for t in df['Ticket'].unique():
    t = t.upper()
    t = t.replace('.','')
    parts = t.split(' ')
    for part in parts:
        if part.isnumeric():
            numerics.add(int(part))
            continue
        tokens.add(part)

In [228]:
sorted(tokens)

['A/4',
 'A/5',
 'A/S',
 'A4',
 'A5',
 'BASLE',
 'C',
 'CA',
 'CA/SOTON',
 'FA',
 'FC',
 'FCC',
 'LINE',
 'P/PP',
 'PC',
 'PP',
 'SC',
 'SC/A4',
 'SC/AH',
 'SC/PARIS',
 'SCO/W',
 'SO/C',
 'SO/PP',
 'SOC',
 'SOP',
 'SOTON/O2',
 'SOTON/OQ',
 'SP',
 'STON/O',
 'STON/O2',
 'SW/PP',
 'W/C',
 'WE/P',
 'WEP']

In [229]:
sorted(numerics)

[2,
 3,
 541,
 693,
 695,
 751,
 752,
 851,
 1166,
 1585,
 1601,
 1748,
 2003,
 2079,
 2123,
 2131,
 2133,
 2144,
 2146,
 2149,
 2151,
 2152,
 2163,
 2167,
 2223,
 2314,
 2315,
 2343,
 2466,
 2620,
 2623,
 2624,
 2625,
 2626,
 2627,
 2628,
 2629,
 2631,
 2641,
 2647,
 2648,
 2649,
 2650,
 2651,
 2653,
 2659,
 2661,
 2662,
 2663,
 2664,
 2665,
 2666,
 2667,
 2668,
 2669,
 2671,
 2672,
 2673,
 2674,
 2677,
 2678,
 2680,
 2683,
 2685,
 2686,
 2687,
 2689,
 2690,
 2691,
 2693,
 2694,
 2695,
 2697,
 2699,
 2700,
 2816,
 2817,
 2908,
 2926,
 3085,
 3235,
 3336,
 3337,
 3381,
 3411,
 3460,
 3464,
 3474,
 3536,
 3540,
 3594,
 3902,
 4001,
 4133,
 4134,
 4135,
 4136,
 4137,
 4138,
 4348,
 4579,
 5547,
 5727,
 5734,
 5735,
 6212,
 6563,
 6607,
 6608,
 6609,
 7075,
 7076,
 7077,
 7267,
 7534,
 7540,
 7545,
 7546,
 7552,
 7553,
 7598,
 8471,
 8475,
 9234,
 9549,
 10482,
 11206,
 11668,
 11751,
 11752,
 11753,
 11755,
 11765,
 11767,
 11769,
 11771,
 11774,
 11813,
 11967,
 12233,
 12460,
 12749,
 

Seems like there could be something useful in those alpha tokens in the tickets.  Also it seems interesting that there are some large groups of tickets in the ticket numbers.  This could also be useful information.

In [230]:
df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,age_known,adjusted_age
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S,True,22.000000
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,True,38.000000
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S,True,26.000000
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S,True,35.000000
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S,True,35.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S,True,27.000000
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S,True,19.000000
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S,False,29.554842
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C,True,26.000000


In [231]:
def ticket_tokens(ticket_str):
    t = ticket_str.upper()
    for remove in ['.','/']:
        t = t.replace(remove,'')
    for token in t.split(' '):
        yield token

def ticket_code(ticket_str):
    for token in ticket_tokens(ticket_str):
        if not token.isnumeric():
            return token
    return ''

def ticket_numeric(ticket_str):
    for token in ticket_tokens(ticket_str):
        if token.isnumeric():
            return int(token)
    return 0
        
def ticket_processor(df):
    df['ticket_code'] = df.Ticket.apply(ticket_code)
    df['ticket_numeric'] = df.Ticket.apply(ticket_numeric)
    return df
    

In [232]:
ticket = 'PC 17599'
list(ticket_tokens(ticket))

['PC', '17599']

In [233]:
ticket_code(ticket)

'PC'

In [234]:
ticket_numeric(ticket)

17599

In [235]:
df = ticket_processor(train_raw.copy())


In [236]:
sorted(list(df['ticket_code'].unique()))

['',
 'A4',
 'A5',
 'AS',
 'C',
 'CA',
 'CASOTON',
 'FA',
 'FC',
 'FCC',
 'LINE',
 'PC',
 'PP',
 'PPP',
 'SC',
 'SCA4',
 'SCAH',
 'SCOW',
 'SCPARIS',
 'SOC',
 'SOP',
 'SOPP',
 'SOTONO2',
 'SOTONOQ',
 'SP',
 'STONO',
 'STONO2',
 'SWPP',
 'WC',
 'WEP']

There might be more room to play with this; it's possible that things like `WE/P` and `WEP` are really the same thing.  And maybe there's some correlation of meaning between `SC/A4`, `SC`, and `A4`.  In fact, let's go ahead and integrate those by dropping slashes.

In [237]:
df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,ticket_code,ticket_numeric
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S,A5,21171
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,PC,17599
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S,STONO2,3101282
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S,,113803
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S,,373450
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S,,211536
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S,,112053
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S,WC,6607
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C,,111369


## Fare

In [238]:
sorted(list(df['Fare'].unique()))

[0.0,
 4.0125,
 5.0,
 6.2375,
 6.4375,
 6.45,
 6.4958,
 6.75,
 6.8583,
 6.95,
 6.975,
 7.0458,
 7.05,
 7.0542,
 7.125,
 7.1417,
 7.225,
 7.2292,
 7.25,
 7.3125,
 7.4958,
 7.5208,
 7.55,
 7.6292,
 7.65,
 7.725,
 7.7292,
 7.7333,
 7.7375,
 7.7417,
 7.75,
 7.775,
 7.7875,
 7.7958,
 7.8,
 7.8292,
 7.8542,
 7.875,
 7.8792,
 7.8875,
 7.8958,
 7.925,
 8.0292,
 8.05,
 8.1125,
 8.1375,
 8.1583,
 8.3,
 8.3625,
 8.4042,
 8.4333,
 8.4583,
 8.5167,
 8.6542,
 8.6625,
 8.6833,
 8.7125,
 8.85,
 9.0,
 9.2167,
 9.225,
 9.35,
 9.475,
 9.4833,
 9.5,
 9.5875,
 9.825,
 9.8375,
 9.8417,
 9.8458,
 10.1708,
 10.4625,
 10.5,
 10.5167,
 11.1333,
 11.2417,
 11.5,
 12.0,
 12.275,
 12.2875,
 12.35,
 12.475,
 12.525,
 12.65,
 12.875,
 13.0,
 13.4167,
 13.5,
 13.7917,
 13.8583,
 13.8625,
 14.0,
 14.1083,
 14.4,
 14.4542,
 14.4583,
 14.5,
 15.0,
 15.0458,
 15.05,
 15.1,
 15.2458,
 15.5,
 15.55,
 15.7417,
 15.75,
 15.85,
 15.9,
 16.0,
 16.1,
 16.7,
 17.4,
 17.8,
 18.0,
 18.75,
 18.7875,
 19.2583,
 19.5,
 19.9667,
 20.2

Looks ok.  I'm not sure why some of them are zero.  The fractional numbers suggests maybe there is currency conversion involved.  I don't see any NaN or things that need processing.

Actually, while there are no missing items in the training data, one of the test items is missing the fare.  Let's fill in the average fare for that.

In [239]:
def fare_processor(df):
    avg_fare = df.Fare.mean()
    df['adjusted_fare'] = df.Fare.fillna(avg_fare)
    return df

## Cabin

In [240]:
def cabin_tokens(s):
    if pd.isna(s):
        return None
    return s
    
for s in df.Cabin:
    t = cabin_tokens(s)
    if not t:
        continue
    print(t)

C85
C123
E46
G6
C103
D56
A6
C23 C25 C27
B78
D33
B30
C52
B28
C83
F33
F G73
C23 C25 C27
E31
A5
D10 D12
D26
C110
B58 B60
E101
D26
F E69
D47
C123
B86
F2
C2
E33
B19
A7
C49
F4
A32
F2
B4
B80
G6
A31
D36
D15
C93
C83
C78
D35
G6
C87
B77
E67
B94
C125
C99
C118
D7
A19
B49
D
C22 C26
C106
B58 B60
E101
C22 C26
C65
E36
C54
B57 B59 B63 B66
C7
E34
C32
D
B18
C124
C91
C2
E40
T
F2
C23 C25 C27
F33
C128
E33
D37
B35
E50
C82
B96 B98
D36
G6
C78
E10
C52
E44
B96 B98
C23 C25 C27
A34
C104
C111
C92
E38
D21
E12
E63
D
A14
B49
C93
B37
C30
D20
C22 C26
B79
C65
E25
D46
F33
B73
B18
C95
B38
B39
B22
C86
C70
A16
E67
C101
E25
E44
C68
A10
E68
B41
D20
A20
C125
F4
D19
D50
D9
A23
B50
B35
D33
A26
D48
E58
C126
B71
B51 B53 B55
D49
B5
B20
C68
F G63
C62 C64
E24
E24
C90
C124
C126
F G73
C45
E101
E8
B5
B101
D45
C46
B57 B59 B63 B66
B22
D30
E121
B77
B96 B98
D11
E77
F38
B3
B20
D6
B82 B84
D17
B96 B98
A36
E8
B102
B69
E121
B28
E49
C47
C92
D28
E17
D17
A24
D35
B51 B53 B55
C50
B42
C148


Some people have multiple cabins, apparently?  Number of cabins assigned could indicate how large of a group you're in, and/or a level of comfort.  Let's add a feature that exposes the room count, since we're going to be trimming this down to exposing a single room.  Plus there are some other interesting things in here.

In [241]:
df[df.Cabin.isin(['B57 B59 B63 B66', 'F G73'])].sort_values(['Cabin'])

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,ticket_code,ticket_numeric
311,312,1,1,"Ryerson, Miss. Emily Borie",female,18.0,2,2,PC 17608,262.375,B57 B59 B63 B66,C,PC,17608
742,743,1,1,"Ryerson, Miss. Susan Parker ""Suzette""",female,21.0,2,2,PC 17608,262.375,B57 B59 B63 B66,C,PC,17608
75,76,0,3,"Moen, Mr. Sigurd Hansen",male,25.0,0,0,348123,7.65,F G73,S,,348123
715,716,0,3,"Soholt, Mr. Peter Andreas Lauritz Andersen",male,19.0,0,0,348124,7.65,F G73,S,,348124


For now, I'll divide the cabin information into 2 fields:
* A single code for the cabin section, `A`,`B`, etc. Select the first code from a cabin with a room number.  So for `F G73` we will return `G`.
* A single numeric for the room number.  If there are multiple, I'll use the first one.

In [242]:
def room_number(cabin):
    if pd.isna(cabin):
        return 0
    for part in cabin.split(' '):
        if len(part) <=1: # Skip single letter things
            continue
        return int(part[1:])
    return 0

def room_known(cabin):
    return (room_number(cabin) > 0)

def room_section(cabin):
    if pd.isna(cabin):
        return 'Unk'
    result = 'Unk'
    for part in cabin.split(' '):
        if len(part) <=1: # Skip single letter things
            if result == 'Unk': # Use this for the section if we haven't found something better.
                result = part
        return part[0:1]
    return result

def room_count(cabin):
    if pd.isna(cabin):
        return 0
    return len(cabin.split(' '))
        

In [243]:
test_cabins = ['B57 B58', 'F G73','A12', 'T']
[room_number(c) for c in test_cabins], [room_count(c) for c in test_cabins]

([57, 73, 12, 0], [2, 2, 1, 1])

In [244]:
[room_section(c) for c in test_cabins]

['B', 'F', 'A', 'T']

In [245]:
def cabin_processor(df):
    df['cabin_known'] = df.Cabin.apply(room_known)
    df['cabin_number'] = df.Cabin.apply(room_number)
    df['cabin_section'] = df.Cabin.apply(room_section)
    df['cabin_count'] = df.Cabin.apply(room_count)
    return df

In [246]:
df = cabin_processor(test_raw.copy())
df

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,cabin_known,cabin_number,cabin_section,cabin_count
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q,False,0,Unk,0
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0000,,S,False,0,Unk,0
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q,False,0,Unk,0
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S,False,0,Unk,0
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S,False,0,Unk,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
413,1305,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S,False,0,Unk,0
414,1306,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C,True,105,C,1
415,1307,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S,False,0,Unk,0
416,1308,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S,False,0,Unk,0


In [247]:
df[df.cabin_section != 'Unk']

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,cabin_known,cabin_number,cabin_section,cabin_count
12,904,1,"Snyder, Mrs. John Pillsbury (Nelle Stevenson)",female,23.0,1,0,21228,82.2667,B45,S,True,45,B,1
14,906,1,"Chaffee, Mrs. Herbert Fuller (Carrie Constance...",female,47.0,1,0,W.E.P. 5734,61.1750,E31,S,True,31,E,1
24,916,1,"Ryerson, Mrs. Arthur Larned (Emily Maria Borie)",female,48.0,1,3,PC 17608,262.3750,B57 B59 B63 B66,C,True,57,B,4
26,918,1,"Ostby, Miss. Helene Ragnhild",female,22.0,0,1,113509,61.9792,B36,C,True,36,B,1
28,920,1,"Brady, Mr. John Bertram",male,41.0,0,0,113054,30.5000,A21,S,True,21,A,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
404,1296,1,"Frauenthal, Mr. Isaac Gerald",male,43.0,1,0,17765,27.7208,D40,C,True,40,D,1
405,1297,2,"Nourney, Mr. Alfred (Baron von Drachstedt"")""",male,20.0,0,0,SC/PARIS 2166,13.8625,D38,C,True,38,D,1
407,1299,1,"Widener, Mr. George Dunton",male,50.0,1,1,113503,211.5000,C80,C,True,80,C,1
411,1303,1,"Minahan, Mrs. William Edward (Lillian E Thorpe)",female,37.0,1,0,19928,90.0000,C78,Q,True,78,C,1


## Embarked

In [248]:
df.Embarked.unique()

array(['Q', 'S', 'C'], dtype=object)

It looks like that's all for the data preprocessing.  Let's bring all the processors together.

## Final Data Prep

In [249]:
processors = [
    age_processor,
    fare_processor,
    ticket_processor,
    cabin_processor,
]

@dataclass
class Dataset:
    raw: pd.DataFrame # The dataframe directly from the csv.
    enhanced: pd.DataFrame # Updated, pre-processed fields added to raw
    trimmed: pd.DataFrame # Cut down to just the fields we will run models on
    encoded: pd.DataFrame # What we run models on, one-hot-encoded

    def X(self):
        """
        Gets a numpy array suitable for feeding into a model.
        """
        return self.encoded.drop(columns=['Survived']).to_numpy()
    
    def Y(self):
        return self.encoded[['Survived']].to_numpy()

    def Y_flat(self):
        """Returns predictions as a single-dimensional array."""
        return self.encoded[['Survived']].to_numpy().ravel()
    
def select_rows(df, train=True):
    """
    Get rows either for train or test.
    """
    if train:
        return df[~df.Survived.isna()]
    else:
        return df[df.Survived.isna()]
    
def load_data():
    # Load test and train together so that they share the same encodings.
    df_train = pd.read_csv('/kaggle/input/titanic/train.csv')
    df_test = pd.read_csv('/kaggle/input/titanic/test.csv')
    df_all = df_train.copy().append(df_test, ignore_index=True)

    # Preprocess.
    df = df_all
    for processor in processors:
        df = processor(df)

    # Trim down to just the columns we'll look at.
    trimmed = df.copy().drop(columns=['Name','Age','Ticket','Cabin'])
    
    # The columns we want one-hot encoded are the categorical columns that don't have intrinsic numerical meaning.
    # We are leaving Pclass out of this list because even though it is categorical, it has intrinsic and sorted numeric meaning.
    one_hot_cols = ['Sex','Embarked','ticket_code','cabin_section']
    continuous_cols = ['Survived', 'Pclass', 'SibSp', 'Parch', 'adjusted_fare', 'age_known', 'adjusted_age', 'ticket_numeric', 'cabin_known', 'cabin_number', 'cabin_count']
    one_hot_encoded_fields = pd.get_dummies(trimmed[one_hot_cols])
    # Join continuous fields with categorical fields.
    encoded_inputs = trimmed[continuous_cols].join(one_hot_encoded_fields)
    
    ds_train = Dataset(raw=df_train,
                       enhanced=select_rows(df, train=True),
                       trimmed=select_rows(trimmed, train=True),
                       encoded=select_rows(encoded_inputs, train=True)
                      )
    ds_test = Dataset(raw=df_test,
                       enhanced=select_rows(df, train=False),
                       trimmed=select_rows(trimmed, train=False),
                       encoded=select_rows(encoded_inputs, train=False)
                      )
    return ds_train, ds_test

In [250]:
ds_train, ds_test = load_data()

Check for nan's:

In [251]:
ds_train.encoded.isnull().values.any()

False

In [252]:
ds = ds_test
#ds = ds_train
for col in ds.encoded.columns:
    null_count = ds.encoded[col].isnull().sum()
    if null_count == 0:
        continue
    print(f'col {col} has null={null_count}')

col Survived has null=418


In [253]:
ds_train.raw

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [254]:
ds_test.raw

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0000,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S
...,...,...,...,...,...,...,...,...,...,...,...
413,1305,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S
414,1306,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C
415,1307,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S
416,1308,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S


In [255]:
ds_train.enhanced

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,...,Embarked,age_known,adjusted_age,adjusted_fare,ticket_code,ticket_numeric,cabin_known,cabin_number,cabin_section,cabin_count
0,1,0.0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,...,S,True,22.000000,7.2500,A5,21171,False,0,Unk,0
1,2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,...,C,True,38.000000,71.2833,PC,17599,True,85,C,1
2,3,1.0,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,...,S,True,26.000000,7.9250,STONO2,3101282,False,0,Unk,0
3,4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,...,S,True,35.000000,53.1000,,113803,True,123,C,1
4,5,0.0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,...,S,True,35.000000,8.0500,,373450,False,0,Unk,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0.0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,...,S,True,27.000000,13.0000,,211536,False,0,Unk,0
887,888,1.0,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,...,S,True,19.000000,30.0000,,112053,True,42,B,1
888,889,0.0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,...,S,False,29.798786,23.4500,WC,6607,False,0,Unk,0
889,890,1.0,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,...,C,True,26.000000,30.0000,,111369,True,148,C,1


In [256]:
ds_test.enhanced

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,...,Embarked,age_known,adjusted_age,adjusted_fare,ticket_code,ticket_numeric,cabin_known,cabin_number,cabin_section,cabin_count
891,892,,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,...,Q,False,34.000000,7.8292,,330911,False,0,Unk,0
892,893,,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0000,...,S,True,47.000000,7.0000,,363272,False,0,Unk,0
893,894,,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,...,Q,True,62.000000,9.6875,,240276,False,0,Unk,0
894,895,,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,...,S,True,27.000000,8.6625,,315154,False,0,Unk,0
895,896,,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,...,S,True,22.000000,12.2875,,3101298,False,0,Unk,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1304,1305,,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,...,S,False,29.798786,8.0500,A5,3236,False,0,Unk,0
1305,1306,,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,...,C,True,39.000000,108.9000,PC,17758,True,105,C,1
1306,1307,,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,...,S,False,38.000000,7.2500,SOTONOQ,3101262,False,0,Unk,0
1307,1308,,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,...,S,False,29.798786,8.0500,,359309,False,0,Unk,0


In [257]:
ds_train.encoded

Unnamed: 0,Survived,Pclass,SibSp,Parch,adjusted_fare,age_known,adjusted_age,ticket_numeric,cabin_known,cabin_number,...,ticket_code_WEP,cabin_section_A,cabin_section_B,cabin_section_C,cabin_section_D,cabin_section_E,cabin_section_F,cabin_section_G,cabin_section_T,cabin_section_Unk
0,0.0,3,1,0,7.2500,True,22.000000,21171,False,0,...,0,0,0,0,0,0,0,0,0,1
1,1.0,1,1,0,71.2833,True,38.000000,17599,True,85,...,0,0,0,1,0,0,0,0,0,0
2,1.0,3,0,0,7.9250,True,26.000000,3101282,False,0,...,0,0,0,0,0,0,0,0,0,1
3,1.0,1,1,0,53.1000,True,35.000000,113803,True,123,...,0,0,0,1,0,0,0,0,0,0
4,0.0,3,0,0,8.0500,True,35.000000,373450,False,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0.0,2,0,0,13.0000,True,27.000000,211536,False,0,...,0,0,0,0,0,0,0,0,0,1
887,1.0,1,0,0,30.0000,True,19.000000,112053,True,42,...,0,0,1,0,0,0,0,0,0,0
888,0.0,3,1,2,23.4500,False,29.798786,6607,False,0,...,0,0,0,0,0,0,0,0,0,1
889,1.0,1,0,0,30.0000,True,26.000000,111369,True,148,...,0,0,0,1,0,0,0,0,0,0


Here is what we'll train with:

In [258]:
ds_train.X()

array([[3, 1, 0, ..., 0, 0, 1],
       [1, 1, 0, ..., 0, 0, 0],
       [3, 0, 0, ..., 0, 0, 1],
       ...,
       [3, 1, 2, ..., 0, 0, 1],
       [1, 0, 0, ..., 0, 0, 0],
       [3, 0, 0, ..., 0, 0, 1]], dtype=object)

In [259]:
ds_train.X().shape

(891, 60)

In [260]:
ds_train.Y()

array([[0.],
       [1.],
       [1.],
       [1.],
       [0.],
       [0.],
       [0.],
       [0.],
       [1.],
       [1.],
       [1.],
       [1.],
       [0.],
       [0.],
       [0.],
       [1.],
       [0.],
       [1.],
       [0.],
       [1.],
       [0.],
       [1.],
       [1.],
       [1.],
       [0.],
       [1.],
       [0.],
       [0.],
       [1.],
       [0.],
       [0.],
       [1.],
       [1.],
       [0.],
       [0.],
       [0.],
       [1.],
       [0.],
       [0.],
       [1.],
       [0.],
       [0.],
       [0.],
       [1.],
       [1.],
       [0.],
       [0.],
       [1.],
       [0.],
       [0.],
       [0.],
       [0.],
       [1.],
       [1.],
       [0.],
       [1.],
       [1.],
       [0.],
       [1.],
       [0.],
       [0.],
       [1.],
       [0.],
       [0.],
       [0.],
       [1.],
       [1.],
       [0.],
       [1.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [1.],
       [0.],
       [0.],

In [261]:
ds_test.trimmed

Unnamed: 0,PassengerId,Survived,Pclass,Sex,SibSp,Parch,Fare,Embarked,age_known,adjusted_age,adjusted_fare,ticket_code,ticket_numeric,cabin_known,cabin_number,cabin_section,cabin_count
891,892,,3,male,0,0,7.8292,Q,False,34.000000,7.8292,,330911,False,0,Unk,0
892,893,,3,female,1,0,7.0000,S,True,47.000000,7.0000,,363272,False,0,Unk,0
893,894,,2,male,0,0,9.6875,Q,True,62.000000,9.6875,,240276,False,0,Unk,0
894,895,,3,male,0,0,8.6625,S,True,27.000000,8.6625,,315154,False,0,Unk,0
895,896,,3,female,1,1,12.2875,S,True,22.000000,12.2875,,3101298,False,0,Unk,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1304,1305,,3,male,0,0,8.0500,S,False,29.798786,8.0500,A5,3236,False,0,Unk,0
1305,1306,,1,female,0,0,108.9000,C,True,39.000000,108.9000,PC,17758,True,105,C,1
1306,1307,,3,male,0,0,7.2500,S,False,38.000000,7.2500,SOTONOQ,3101262,False,0,Unk,0
1307,1308,,3,male,0,0,8.0500,S,False,29.798786,8.0500,,359309,False,0,Unk,0


# Models
1. Random forest
2. XGBoost.
3. Neural net, simple
4. Neural net, embeddings

## Random Forest
As a baseline, let's try the random forest classifier in [the Alexis Tutorial](https://www.kaggle.com/alexisbcook/titanic-tutorial).

In [262]:
forest_model = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=1)
forest_model.fit(ds_train.X(), ds_train.Y().ravel())
#predictions = forest_model.predict(ds_test.X())

RandomForestClassifier(max_depth=5, random_state=1)

In [263]:
forest_predictions = forest_model.predict(ds_test.X()).astype(int)


In [264]:
forest_predictions

array([0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1,
       1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0,
       0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,

In [265]:
def submit(predictions, passenger_ids, output_csv_name):
    # Float values result in zero score. Convert to int.
    predictions = predictions.astype(int)
    output = pd.DataFrame({
        'PassengerId': passenger_ids,
        'Survived': predictions
    })
    output.to_csv(output_csv_name, index=False)

## Evaluating the baseline model
1. Run the `submit()` function below for the model
2. Click "Save Version" at the top right
3. Ensure "Save and Run All" option is selected, the click "Save"
4. After the notebook has finished running, click on the number to the right of the "Save Version" button.  Click on the ... and select Open in Viewer.
5. Click on the Output tab on the right, then click on "Submit to the Competition"

In [266]:
submit(forest_predictions, ds_test.raw.PassengerId, 'submit-baseline.csv')

That model has an accuracy of 0.76794.

## XGBoost

In [267]:
xgb_model = xgb.XGBClassifier(use_label_encoder=False)

In [268]:
xgb_model.fit(ds_train.X(), ds_train.Y_flat())



XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=4, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', use_label_encoder=False,
              validate_parameters=1, verbosity=None)

In [269]:
xgb_predictions = xgb_model.predict(ds_test.X())

Compare versus baseline:

In [270]:
comparison = pd.DataFrame(data={
    'forest': forest_predictions,
    'xgb': xgb_predictions
})
comparison

Unnamed: 0,forest,xgb
0,0,0
1,0,0
2,0,0
3,0,1
4,1,1
...,...,...
413,0,0
414,1,1
415,0,0
416,0,0


In [271]:
comparison[comparison.forest != comparison.xgb]

Unnamed: 0,forest,xgb
3,0,1
6,1,0
11,0,1
18,0,1
21,0,1
...,...,...
376,1,0
379,0,1
380,0,1
404,1,0


In [272]:
submit(xgb_predictions, ds_test.raw.PassengerId, 'submit-xgb.csv')

xgboost model was 0.76555 accuracy, which is a little worse but pretty much identical to the random forest.

## Neural Model

In [341]:
class NN:
    def __init__(self, input_width, layer_width=100, layer_count=3):
        self.model = self._build_model(input_width, layer_width, layer_count)
    
    def _build_model(self, input_width, layer_width, layer_count):
        model = Sequential()
        model.add(BatchNormalization(input_shape=(input_width,)))
        model.add(Dropout(0.2))
        
        model.add(Dense(layer_width, activation='relu'))
        #model.add(BatchNormalization(input_shape=(input_width,)))#
        model.add(Dropout(0.2))
        w = layer_width
        for _ in range(layer_count - 1):
            w = int(w/2)
            model.add(Dense(w, activation='relu'))
            #model.add(BatchNormalization(input_shape=(input_width,)))#
            model.add(Dropout(0.2))
        model.add(Dense(1, activation='sigmoid'))
        model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
        return model
    
    def train(self, X, Y, epochs=50):
        self.model.fit(X.astype(float), Y.astype(float), epochs=epochs)
    
    def accuracy(self, X_test, Y_test):
        loss, accuracy = self.model.evaluate(X_test.astype(float), Y_test)
        return accuracy

In [274]:
input_width = ds_train.X().shape[1]
input_width

60

In [275]:
nn_model = NN(input_width, layer_width=500, layer_count=5)

In [276]:
nn_model.model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
batch_normalization_2 (Batch (None, 60)                240       
_________________________________________________________________
dropout_12 (Dropout)         (None, 60)                0         
_________________________________________________________________
dense_12 (Dense)             (None, 500)               30500     
_________________________________________________________________
dropout_13 (Dropout)         (None, 500)               0         
_________________________________________________________________
dense_13 (Dense)             (None, 250)               125250    
_________________________________________________________________
dropout_14 (Dropout)         (None, 250)               0         
_________________________________________________________________
dense_14 (Dense)             (None, 125)              

In [277]:
nn_model.model.fit(ds_train.X().astype(float), ds_train.Y().astype(float), epochs=100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<tensorflow.python.keras.callbacks.History at 0x7f89d2169650>

In [278]:
survive_chance = nn_model.model.predict(ds_test.X().astype(float)).flatten()
survive_chance.shape

(418,)

In [279]:
r = pd.DataFrame(index=ds_test.raw.PassengerId, data={'survive_chance': survive_chance})
r['Survived'] = [int(p>0.5) for p in survive_chance]
r

Unnamed: 0_level_0,survive_chance,Survived
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1
892,0.076844,0
893,0.544190,1
894,0.030226,0
895,0.098928,0
896,0.785195,1
...,...,...
1305,0.005772,0
1306,0.910648,1
1307,0.000087,0
1308,0.113271,0


In [280]:
submit(r.Survived, r.index, 'submit-nn.csv')

In [281]:
r['forest'] = forest_predictions

In [282]:
r

Unnamed: 0_level_0,survive_chance,Survived,forest
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
892,0.076844,0,0
893,0.544190,1,0
894,0.030226,0,0
895,0.098928,0,0
896,0.785195,1,1
...,...,...,...
1305,0.005772,0,0
1306,0.910648,1,1
1307,0.000087,0,0
1308,0.113271,0,0


In [283]:
len(r[r.Survived != r.forest])

54

In [284]:
def submit2(predictions, passenger_ids):
    predictions = predictions.astype(int)
    output = pd.DataFrame({
        'PassengerId': passenger_ids,
        'Survived': predictions
    })
    return output

In [285]:
submit2(r.Survived, r.index)

Unnamed: 0_level_0,PassengerId,Survived
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1
892,892,0
893,893,1
894,894,0
895,895,0
896,896,1
...,...,...
1305,1305,0
1306,1306,1
1307,1307,0
1308,1308,0


Official accuracy for that was 0.75837.

## Internal Accuracy Measurement
Submitting to the official Kaggle thing is a time-consuming affair.  Let's set up our own internal accuracy measurement.


In [342]:
X = ds_train.X()
Y = ds_train.Y_flat()

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=0)

In [343]:
nn_model = NN(input_width, layer_width=100, layer_count=3)
nn_model.train(X_train, Y_train, epochs=100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


In [344]:
accuracy = nn_model.accuracy(X_test, Y_test)
accuracy #83



0.8320895433425903

In [353]:
#nn_model = NN(input_width, layer_width=1000, layer_count=3)
#nn_model.train(X_train, Y_train, epochs=500) # 77
#nn_model = NN(input_width, layer_width=100, layer_count=2)
#nn_model.train(X_train, Y_train, epochs=100) # 82
nn_model = NN(input_width, layer_width=100, layer_count=1)
nn_model.train(X_train, Y_train, epochs=100) # 82

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


In [354]:
loss, accuracy = nn_model.model.evaluate(X_test.astype(float), Y_test)
accuracy



0.8283582329750061

## NN with Embeddings

In [362]:
class NN_functional_style(NN):
    def _build_model(self, input_width, layer_width, layer_count):
        inputs = Input(shape=(input_width,))
        batch0 = BatchNormalization()(inputs)
        drop0 = Dropout(0.2)(batch0)

        dense1 = Dense(layer_width, activation='relu')(drop0)
        drop1 = Dropout(0.2)(dense1)
        
        prev_layer = drop1
        w = layer_width
        for _ in range(layer_count - 1):
            w = int(w/2)
            dense = Dense(w, activation='relu')(prev_layer)
            drop = Dropout(0.2)(dense)
            prev_layer = drop
        
        output = Dense(1, activation='sigmoid')(prev_layer)
        
        model = keras.Model(inputs=[inputs], outputs=output)
        model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
        return model

In [None]:
class Embedded_NN(NN):
    def _build_model(self, input_width, layer_width, layer_count):
        inputs = Input(shape=(input_width,))
        batch0 = BatchNormalization()(inputs)
        drop0 = Dropout(0.2)(batch0)

        dense1 = Dense(layer_width, activation='relu')(drop0)
        drop1 = Dropout(0.2)(dense1)
        
        prev_layer = drop1
        w = layer_width
        for _ in range(layer_count - 1):
            w = int(w/2)
            dense = Dense(w, activation='relu')(prev_layer)
            drop = Dropout(0.2)(dense)
            prev_layer = drop
        
        output = Dense(1, activation='sigmoid')(prev_layer)
        
        model = keras.Model(inputs=[inputs], outputs=output)
        model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
        return model

In [360]:
ee_model = Embedded_NN(input_width, layer_width=100, layer_count=3)
ee_model.train(X_train, Y_train, epochs=100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


In [361]:
ee_model.accuracy(X_test, Y_test)



0.8358209133148193