# Goal of the analysis

- Check rates of various cols
- Fare comparison
- Survival rate
- Guess survival status in test data
    - age
    - gender
    - pclass
    - sex
    - fare

# Preprocessing

## Load data

In [1]:
# import library
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import seaborn as sns
sns.set_theme(style='whitegrid')

import sklearn
from sklearn.model_selection import KFold, train_test_split

In [2]:
# load data
submission = pd.read_csv('./data/gender_submission.csv')
test = pd.read_csv('./data/test.csv')
train = pd.read_csv('./data/train.csv')

## Train data

In [3]:
train

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [4]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [5]:
train.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


## Null values

In [6]:
# quick check of the null vals in train data
train.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

##### Can see from above that train dataset has age: 177 null vals, cabin: 687 null vals, embarked: 2 null vals

### Cabin data

In [7]:
# too many null values in cabin col and no consistency of dataflow, delete.
train = train.drop(columns='Cabin', axis=1)

### Embarked data

In [8]:
# check null vals in Embarked col
# data indicates that both passengers are 1st class
train[train['Embarked'].isnull()]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
61,62,1,1,"Icard, Miss. Amelie",female,38.0,0,0,113572,80.0,
829,830,1,1,"Stone, Mrs. George Nelson (Martha Evelyn)",female,62.0,0,0,113572,80.0,


In [9]:
# check the categorical amt of embarked, who are female 1st class passengers
train[(train['Pclass']==1)&(train['Sex']=='female')]['Embarked'].value_counts()

S    48
C    43
Q     1
Name: Embarked, dtype: int64

##### Embarked data from https://www.encyclopedia-titanica.org/titanic-first-class-passengers/

In [10]:
# replace nulls vals with 'S' in embarked col
train['Embarked'] = train['Embarked'].fillna('S')

### Age data

In [11]:
# check null vals
train[train['Age'].isnull()]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,Q
17,18,1,2,"Williams, Mr. Charles Eugene",male,,0,0,244373,13.0000,S
19,20,1,3,"Masselmani, Mrs. Fatima",female,,0,0,2649,7.2250,C
26,27,0,3,"Emir, Mr. Farred Chehab",male,,0,0,2631,7.2250,C
28,29,1,3,"O'Dwyer, Miss. Ellen ""Nellie""",female,,0,0,330959,7.8792,Q
...,...,...,...,...,...,...,...,...,...,...,...
859,860,0,3,"Razi, Mr. Raihed",male,,0,0,2629,7.2292,C
863,864,0,3,"Sage, Miss. Dorothy Edith ""Dolly""",female,,8,2,CA. 2343,69.5500,S
868,869,0,3,"van Melkebeke, Mr. Philemon",male,,0,0,345777,9.5000,S
878,879,0,3,"Laleff, Mr. Kristo",male,,0,0,349217,7.8958,S


##### Age will be difference based on pclass

In [12]:
train[train['Pclass']==1].describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,216.0,216.0,216.0,186.0,216.0,216.0,216.0
mean,461.597222,0.62963,1.0,38.233441,0.416667,0.356481,84.154687
std,246.737616,0.484026,0.0,14.802856,0.611898,0.693997,78.380373
min,2.0,0.0,1.0,0.92,0.0,0.0,0.0
25%,270.75,0.0,1.0,27.0,0.0,0.0,30.92395
50%,472.0,1.0,1.0,37.0,0.0,0.0,60.2875
75%,670.5,1.0,1.0,49.0,1.0,0.0,93.5
max,890.0,1.0,1.0,80.0,3.0,4.0,512.3292


In [13]:
train[train['Pclass']==1]['Sex'].value_counts()

male      122
female     94
Name: Sex, dtype: int64

In [14]:
train[(train['Pclass']==1)&(train['Sex']=='male')]['Age'].mean()

41.28138613861386

In [15]:
train[(train['Pclass']==1)&(train['Sex']=='male')]['Age'].median()

40.0

In [16]:
train[(train['Pclass']==1)&(train['Sex']=='male')]['Age'].mode()

0    36.0
dtype: float64

In [17]:
train[(train['Pclass']==1)&(train['Sex']=='female')]['Age'].mean()

34.61176470588235

In [18]:
train[(train['Pclass']==1)&(train['Sex']=='female')]['Age'].median()

35.0

In [19]:
train[(train['Pclass']==1)&(train['Sex']=='female')]['Age'].mode()

0    35.0
dtype: float64

In [36]:
train.loc[(train['Pclass']==1)&(train['Sex']=='male')]['Age'].fillna(train[(train['Pclass']==1)&(train['Sex']=='male')]['Age'].median(), inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return self._update_inplace(result)


In [29]:
# df.loc[df['a'] == 1,'b'].fillna(df['b'].median(), inplace=True)
train.loc[(train['Pclass']==1)&(train['Sex']=='male')]['Age'].isnull() = train[(train['Pclass']==1)&(train['Sex']=='male')]['Age'].fillna(train[(train['Pclass']==1)&(train['Sex']=='male')]['Age'].median())

SyntaxError: cannot assign to function call (1677820910.py, line 2)

In [25]:
train[train['Age'].isnull()]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,Q
17,18,1,2,"Williams, Mr. Charles Eugene",male,,0,0,244373,13.0000,S
19,20,1,3,"Masselmani, Mrs. Fatima",female,,0,0,2649,7.2250,C
26,27,0,3,"Emir, Mr. Farred Chehab",male,,0,0,2631,7.2250,C
28,29,1,3,"O'Dwyer, Miss. Ellen ""Nellie""",female,,0,0,330959,7.8792,Q
...,...,...,...,...,...,...,...,...,...,...,...
859,860,0,3,"Razi, Mr. Raihed",male,,0,0,2629,7.2292,C
863,864,0,3,"Sage, Miss. Dorothy Edith ""Dolly""",female,,8,2,CA. 2343,69.5500,S
868,869,0,3,"van Melkebeke, Mr. Philemon",male,,0,0,345777,9.5000,S
878,879,0,3,"Laleff, Mr. Kristo",male,,0,0,349217,7.8958,S


In [40]:
train['Age'] = train['Age'].fillna(train[(train['Pclass']==1)&(train['Sex']=='male')]['Age'].median())

In [41]:
train.isnull().sum()

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       2
dtype: int64

In [None]:
train[train['Age']] = train['Age'].fillna()

In [16]:
train[train['Pclass']==2].describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,184.0,184.0,184.0,173.0,184.0,184.0,184.0
mean,445.956522,0.472826,2.0,29.87763,0.402174,0.380435,20.662183
std,250.852161,0.500623,0.0,14.001077,0.601633,0.690963,13.417399
min,10.0,0.0,2.0,0.67,0.0,0.0,0.0
25%,234.5,0.0,2.0,23.0,0.0,0.0,13.0
50%,435.5,0.0,2.0,29.0,0.0,0.0,14.25
75%,668.0,1.0,2.0,36.0,1.0,1.0,26.0
max,887.0,1.0,2.0,70.0,3.0,3.0,73.5


In [17]:
train[train['Pclass']==3].describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,491.0,491.0,491.0,355.0,491.0,491.0,491.0
mean,439.154786,0.242363,3.0,25.14062,0.615071,0.393075,13.67555
std,264.441453,0.428949,0.0,12.495398,1.374883,0.888861,11.778142
min,1.0,0.0,3.0,0.42,0.0,0.0,0.0
25%,200.0,0.0,3.0,18.0,0.0,0.0,7.75
50%,432.0,0.0,3.0,24.0,0.0,0.0,8.05
75%,666.5,0.0,3.0,32.0,1.0,0.0,15.5
max,891.0,1.0,3.0,74.0,8.0,6.0,69.55


SyntaxError: invalid syntax (1037885886.py, line 1)

In [12]:
train[train['Fare']==0]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
179,180,0,3,"Leonard, Mr. Lionel",male,36.0,0,0,LINE,0.0,S
263,264,0,1,"Harrison, Mr. William",male,40.0,0,0,112059,0.0,S
271,272,1,3,"Tornquist, Mr. William Henry",male,25.0,0,0,LINE,0.0,S
277,278,0,2,"Parkes, Mr. Francis ""Frank""",male,,0,0,239853,0.0,S
302,303,0,3,"Johnson, Mr. William Cahoone Jr",male,19.0,0,0,LINE,0.0,S
413,414,0,2,"Cunningham, Mr. Alfred Fleming",male,,0,0,239853,0.0,S
466,467,0,2,"Campbell, Mr. William",male,,0,0,239853,0.0,S
481,482,0,2,"Frost, Mr. Anthony Wood ""Archie""",male,,0,0,239854,0.0,S
597,598,0,3,"Johnson, Mr. Alfred",male,49.0,0,0,LINE,0.0,S
633,634,0,1,"Parr, Mr. William Henry Marsh",male,,0,0,112052,0.0,S


### Test data

In [17]:
test

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0000,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S
...,...,...,...,...,...,...,...,...,...,...,...
413,1305,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S
414,1306,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C
415,1307,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S
416,1308,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S


In [18]:
test.describe()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare
count,418.0,418.0,332.0,418.0,418.0,417.0
mean,1100.5,2.26555,30.27259,0.447368,0.392344,35.627188
std,120.810458,0.841838,14.181209,0.89676,0.981429,55.907576
min,892.0,1.0,0.17,0.0,0.0,0.0
25%,996.25,1.0,21.0,0.0,0.0,7.8958
50%,1100.5,3.0,27.0,0.0,0.0,14.4542
75%,1204.75,3.0,39.0,1.0,0.0,31.5
max,1309.0,3.0,76.0,8.0,9.0,512.3292


In [19]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Name         418 non-null    object 
 3   Sex          418 non-null    object 
 4   Age          332 non-null    float64
 5   SibSp        418 non-null    int64  
 6   Parch        418 non-null    int64  
 7   Ticket       418 non-null    object 
 8   Fare         417 non-null    float64
 9   Cabin        91 non-null     object 
 10  Embarked     418 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB


In [20]:
test.isnull().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64