In [1]:
from fastai.tabular.all import *
df = pd.read_csv('/kaggle/input/titanic/train.csv')
df.drop('PassengerId', axis=1, inplace=True)
df.head()

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Thayer)",female,38.0,1,0,PC 17599,71.2833,C85,C
2,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [2]:
dls = TabularDataLoaders.from_csv('/kaggle/input/titanic/train.csv', y_names="Survived", y_block = CategoryBlock,
    cat_names = ['PassengerId', 'Pclass', 'Name', 'Sex', 'Cabin', 'Embarked', 'Ticket'],
    cont_names = ['Age', 'Fare', 'SibSp', 'Parch'],
    procs = [Categorify, FillMissing, Normalize])

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  to[n].fillna(self.na_dict[n], inplace=True)


In [3]:
dls.show_batch()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Cabin,Embarked,Ticket,Age_na,Age,Fare,SibSp,Parch,Survived
0,470,3,"Baclini, Miss. Helene Barbara",female,#na#,C,2666,False,0.749999,19.258301,2.0,1.0,1
1,106,3,"Mionoff, Mr. Stoytcho",male,#na#,S,349207,False,28.0,7.895799,-5.799985e-10,6.166637e-09,0
2,760,1,"Rothes, the Countess. of (Lucy Noel Martha Dyer-Edwards)",female,B77,S,110152,False,33.0,86.500003,-5.799985e-10,6.166637e-09,1
3,727,2,"Renouf, Mrs. Peter Henry (Lillian Jefferys)",female,#na#,S,31027,False,30.0,21.0,3.0,6.166637e-09,1
4,863,1,"Swift, Mrs. Frederick Joel (Margaret Welles Barron)",female,D17,S,17466,False,47.999999,25.929199,-5.799985e-10,6.166637e-09,1
5,167,1,"Chibnall, Mrs. (Edith Martha Bowerman)",female,E33,S,113505,True,28.0,55.0,-5.799985e-10,1.0,1
6,151,2,"Bateman, Rev. Robert James",male,#na#,S,S.O.P. 1166,False,51.0,12.524999,-5.799985e-10,6.166637e-09,0
7,618,3,"Lobb, Mrs. William Arthur (Cordelia K Stanlick)",female,#na#,S,A/5. 3336,False,26.0,16.1,1.0,6.166637e-09,0
8,474,2,"Jerwan, Mrs. Amin S (Marie Marthe Thuillard)",female,D,C,SC/AH Basle 541,False,23.0,13.791701,-5.799985e-10,6.166637e-09,1
9,161,3,"Cribb, Mr. John Hatfield",male,#na#,S,371362,False,44.0,16.1,-5.799985e-10,1.0,0


In [4]:
learn = tabular_learner(dls, metrics=accuracy)

In [5]:
learn.fit_one_cycle(3)

epoch,train_loss,valid_loss,accuracy,time
0,0.643348,0.66778,0.617977,00:00
1,0.554879,0.656821,0.634831,00:00
2,0.460976,0.648176,0.713483,00:00


In [6]:
learn.show_results()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Cabin,Embarked,Ticket,Age_na,Age,Fare,SibSp,Parch,Survived,Survived_pred
0,0.0,1.0,699.0,2.0,0.0,3.0,12.0,1.0,1.211115,-0.111217,-0.494076,-0.46753,1.0,0.0
1,0.0,3.0,408.0,2.0,0.0,3.0,345.0,1.0,-1.946924,-0.424167,0.492692,0.767095,1.0,1.0
2,0.0,1.0,95.0,2.0,9.0,1.0,18.0,1.0,0.825988,-0.020885,-0.494076,-0.46753,1.0,1.0
3,0.0,3.0,108.0,2.0,0.0,3.0,316.0,1.0,-0.021291,-0.507141,0.492692,-0.46753,0.0,0.0
4,0.0,3.0,789.0,2.0,0.0,3.0,663.0,1.0,0.13276,-0.489294,-0.494076,-0.46753,1.0,0.0
5,0.0,3.0,730.0,2.0,0.0,3.0,536.0,1.0,-0.714519,-0.486757,-0.494076,-0.46753,0.0,0.0
6,0.0,3.0,372.0,2.0,0.0,3.0,544.0,1.0,1.057064,-0.519236,-0.494076,-0.46753,0.0,0.0
7,0.0,2.0,78.0,1.0,145.0,3.0,115.0,1.0,-1.946924,0.14151,1.47946,0.767095,1.0,1.0
8,0.0,3.0,574.0,2.0,0.0,1.0,188.0,1.0,-0.714519,-0.33062,0.492692,0.767095,1.0,0.0


Lets try now by using pandas

In [7]:
splits = RandomSplitter(valid_pct=0.2)(range_of(df))

In [8]:
to = TabularPandas(df, procs=[Categorify, FillMissing,Normalize],
                   cat_names = ['Pclass', 'Name', 'Sex', 'Cabin', 'Embarked', 'Ticket'],
                   cont_names = ['Age', 'Fare', 'SibSp', 'Parch'],
                   y_names='Survived',
                   splits=splits)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  to[n].fillna(self.na_dict[n], inplace=True)


In [9]:
to.xs.iloc[:2]

Unnamed: 0,Pclass,Name,Sex,Cabin,Embarked,Ticket,Age_na,Age,Fare,SibSp,Parch
678,3,303,1,0,3,567,1,1.089565,0.308853,0.409827,6.890134
22,3,524,1,0,2,279,1,-1.085462,-0.465232,-0.465044,-0.475182


In [10]:
dls2 = to.dataloaders(bs=64)

In [11]:
dls2.show_batch()

Unnamed: 0,Pclass,Name,Sex,Cabin,Embarked,Ticket,Age_na,Age,Fare,SibSp,Parch,Survived
0,3,"Badt, Mr. Mohamed",male,#na#,C,2623,False,40.0,7.225,1.390937e-08,7.796364e-09,0.0
1,3,"Lindblom, Miss. Augusta Charlotta",female,#na#,S,347073,False,45.0,7.750001,1.390937e-08,7.796364e-09,0.0
2,3,"Henry, Miss. Delia",female,#na#,Q,382649,True,28.0,7.750001,1.390937e-08,7.796364e-09,0.0
3,3,"Sage, Master. Thomas Henry",male,#na#,S,CA. 2343,True,28.0,69.550002,8.0,2.0,0.0
4,3,"Lefebre, Miss. Jeannie",female,#na#,S,4133,True,28.0,25.4667,3.0,1.0,0.0
5,1,"Partner, Mr. Austen",male,C124,S,113043,False,45.499999,28.5,1.390937e-08,7.796364e-09,0.0
6,1,"Graham, Mrs. William Thompson (Edith Junkins)",female,C125,S,PC 17582,False,58.0,153.462491,1.390937e-08,1.0,1.0
7,3,"Markun, Mr. Johann",male,#na#,S,349257,False,33.0,7.8958,1.390937e-08,7.796364e-09,0.0
8,3,"Ilmakangas, Miss. Pieta Sofia",female,#na#,S,STON/O2. 3101271,False,25.0,7.925,1.0,7.796364e-09,0.0
9,1,"Kent, Mr. Edward Austin",male,B37,C,11771,False,58.0,29.700001,1.390937e-08,7.796364e-09,0.0


In [12]:
learn2 = tabular_learner(dls2, metrics=accuracy)

In [13]:
learn2.fit_one_cycle(3)

epoch,train_loss,valid_loss,accuracy,time
0,0.55695,0.36388,0.646067,00:00
1,0.442881,0.341257,0.646067,00:00
2,0.361664,0.314855,0.646067,00:00


Now lets tree random forests (better at tabular data in general see: https://medium.com/geekculture/why-tree-based-models-beat-deep-learning-on-tabular-data-fcad692b1456)

The following code is from a tutorial on the titanic submissions https://www.kaggle.com/code/alexisbcook/titanic-tutorial

In [14]:
train_data = pd.read_csv("/kaggle/input/titanic/train.csv")
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Thayer)",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [15]:
test_data = pd.read_csv("/kaggle/input/titanic/test.csv")
test_data.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [16]:
women = train_data.loc[train_data.Sex == 'female']["Survived"]
rate_women = sum(women)/len(women)

print("% of women who survived:", rate_women)

% of women who survived: 0.7420382165605095


In [17]:
men = train_data.loc[train_data.Sex == 'male']["Survived"]
rate_men = sum(men)/len(men)

print("% of men who survived:", rate_men)

% of men who survived: 0.18890814558058924


In [18]:
from sklearn.ensemble import RandomForestClassifier

y = train_data["Survived"]

features = ["Pclass", "Sex", "SibSp", "Parch"] # sqrt(12) == 3.5
X = pd.get_dummies(train_data[features])
X_test = pd.get_dummies(test_data[features])

model = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=1)
model.fit(X, y)
predictions = model.predict(X_test)

output = pd.DataFrame({'PassengerId': test_data.PassengerId, 'Survived': predictions})

output.to_csv('submission.csv', index=False)
print("Your submission was successfully saved!")

Your submission was successfully saved!


let check against trained data since test does not have survived

In [19]:
X_test_train = pd.get_dummies(train_data[features])
predictions = model.predict(X_test_train)
wrong = 0
for index, prediction in enumerate(predictions):
    if prediction != train_data.at[index, 'Survived']:
        wrong = wrong + 1
print("accuracy", 1 - (wrong / len(predictions)))

accuracy 0.8159371492704826
