In [1]:
import pandas as pd

In [2]:
from sklearn.linear_model import LogisticRegression

In [3]:
from sklearn.model_selection import train_test_split

In [4]:
from sklearn.metrics import classification_report

In [5]:
# Load the raw passenger table from the CSV file in the working directory.
titanic_df = pd.read_csv(filepath_or_buffer='train.csv')

In [6]:
# Remove the columns that are not used as model inputs in this notebook.
# The result is assigned back to the same name, so on a second run of this
# cell the columns are already gone; errors='ignore' keeps the cell
# idempotent instead of raising KeyError when it is re-executed.
titanic_df = titanic_df.drop(
    columns=['PassengerId', 'Name', 'Ticket', 'Cabin'],
    errors='ignore',
)

In [7]:
# Inspect dtypes and non-null counts of the remaining columns to spot missing values.
titanic_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Sex       891 non-null    object 
 3   Age       714 non-null    float64
 4   SibSp     891 non-null    int64  
 5   Parch     891 non-null    int64  
 6   Fare      891 non-null    float64
 7   Embarked  889 non-null    object 
dtypes: float64(2), int64(4), object(2)
memory usage: 55.8+ KB


In [8]:
# Preview the first ten rows of the trimmed frame.
titanic_df.head(10)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S
5,0,3,male,,0,0,8.4583,Q
6,0,1,male,54.0,0,0,51.8625,S
7,0,3,male,2.0,3,1,21.075,S
8,1,3,female,27.0,0,2,11.1333,S
9,1,2,female,14.0,1,0,30.0708,C


In [9]:
# Start the missing-age indicator at 0 for every passenger.
titanic_df = titanic_df.assign(AgeIsMissing=0)

In [10]:
# Flag the rows whose Age is NaN. This must run before Age is imputed,
# otherwise no row will be NaN any more and the flag stays 0 everywhere.
age_unknown = titanic_df['Age'].isna()
titanic_df.loc[age_unknown, 'AgeIsMissing'] = 1

In [11]:
# Preview the first ten rows to compare the AgeIsMissing flag with rows where Age is NaN.
titanic_df.head(10)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,AgeIsMissing
0,0,3,male,22.0,1,0,7.25,S,0
1,1,1,female,38.0,1,0,71.2833,C,0
2,1,3,female,26.0,0,0,7.925,S,0
3,1,1,female,35.0,1,0,53.1,S,0
4,0,3,male,35.0,0,0,8.05,S,0
5,0,3,male,,0,0,8.4583,Q,1
6,0,1,male,54.0,0,0,51.8625,S,0
7,0,3,male,2.0,3,1,21.075,S,0
8,1,3,female,27.0,0,2,11.1333,S,0
9,1,2,female,14.0,1,0,30.0708,C,0


In [12]:
# Mean of the known ages, rounded to the nearest whole number; used below
# to impute missing Age values. It is computed over every row of
# titanic_df, i.e. before the train/test split made later in the notebook.
age_mean = round(titanic_df.Age.dropna().mean())

In [13]:
# Display the rounded mean age that will be used for imputation.
age_mean

30

In [14]:
# Impute missing ages with the rounded mean age.
# The filled Series is assigned back to the column rather than calling
# fillna(inplace=True) on the attribute-accessed column: that chained
# in-place form acts on an intermediate object and, under pandas
# Copy-on-Write, leaves titanic_df unchanged.
titanic_df['Age'] = titanic_df['Age'].fillna(age_mean)

In [15]:
# Re-check non-null counts after filling Age.
titanic_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Survived      891 non-null    int64  
 1   Pclass        891 non-null    int64  
 2   Sex           891 non-null    object 
 3   Age           891 non-null    float64
 4   SibSp         891 non-null    int64  
 5   Parch         891 non-null    int64  
 6   Fare          891 non-null    float64
 7   Embarked      889 non-null    object 
 8   AgeIsMissing  891 non-null    int64  
dtypes: float64(2), int64(5), object(2)
memory usage: 62.8+ KB


In [16]:
# Impute the missing embarkation ports with 'S'.
# NOTE(review): 'S' is hard-coded; presumably it is the most frequent port —
# confirm with titanic_df['Embarked'].value_counts().
# The filled Series is assigned back to the column rather than calling
# fillna(inplace=True) on the attribute-accessed column: that chained
# in-place form acts on an intermediate object and, under pandas
# Copy-on-Write, leaves titanic_df unchanged.
titanic_df['Embarked'] = titanic_df['Embarked'].fillna('S')

In [17]:
# Re-check non-null counts after filling Embarked.
titanic_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Survived      891 non-null    int64  
 1   Pclass        891 non-null    int64  
 2   Sex           891 non-null    object 
 3   Age           891 non-null    float64
 4   SibSp         891 non-null    int64  
 5   Parch         891 non-null    int64  
 6   Fare          891 non-null    float64
 7   Embarked      891 non-null    object 
 8   AgeIsMissing  891 non-null    int64  
dtypes: float64(2), int64(5), object(2)
memory usage: 62.8+ KB


In [18]:
# Bin edges (in years) used to discretise Age into five groups.
cut_points = [
    0,
    18,
    25,
    40,
    60,
    100,
]

In [19]:
# Discretise Age into the intervals defined by cut_points.
# Intervals are right-closed, e.g. (18, 25]; a value equal to the lowest
# edge or above the highest edge would fall outside every bin and become NaN.
titanic_df['Age_bin'] = pd.cut(titanic_df['Age'], bins=cut_points, right=True)

In [20]:
# Preview the new Age_bin column.
titanic_df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,AgeIsMissing,Age_bin
0,0,3,male,22.0,1,0,7.25,S,0,"(18, 25]"
1,1,1,female,38.0,1,0,71.2833,C,0,"(25, 40]"
2,1,3,female,26.0,0,0,7.925,S,0,"(25, 40]"
3,1,1,female,35.0,1,0,53.1,S,0,"(25, 40]"
4,0,3,male,35.0,0,0,8.05,S,0,"(25, 40]"


In [21]:
# Split Fare into five quantile-based bins (bin edges are derived from the data).
titanic_df['Fare_bin'] = pd.qcut(titanic_df['Fare'], q=5)

In [22]:
# List the distinct fare intervals produced by the quantile binning.
titanic_df.Fare_bin.unique()

[(-0.001, 7.854], (39.688, 512.329], (7.854, 10.5], (10.5, 21.679], (21.679, 39.688]]
Categories (5, interval[float64]): [(-0.001, 7.854] < (7.854, 10.5] < (10.5, 21.679] < (21.679, 39.688] < (39.688, 512.329]]

In [23]:
# Preview the new Fare_bin column.
titanic_df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,AgeIsMissing,Age_bin,Fare_bin
0,0,3,male,22.0,1,0,7.25,S,0,"(18, 25]","(-0.001, 7.854]"
1,1,1,female,38.0,1,0,71.2833,C,0,"(25, 40]","(39.688, 512.329]"
2,1,3,female,26.0,0,0,7.925,S,0,"(25, 40]","(7.854, 10.5]"
3,1,1,female,35.0,1,0,53.1,S,0,"(25, 40]","(39.688, 512.329]"
4,0,3,male,35.0,0,0,8.05,S,0,"(25, 40]","(7.854, 10.5]"


In [24]:
# Family size = SibSp + Parch + 1, where the +1 counts the passenger themselves.
titanic_df['FamilySize'] = titanic_df.SibSp.add(titanic_df.Parch).add(1)

In [25]:
# Preview the new FamilySize column.
titanic_df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,AgeIsMissing,Age_bin,Fare_bin,FamilySize
0,0,3,male,22.0,1,0,7.25,S,0,"(18, 25]","(-0.001, 7.854]",2
1,1,1,female,38.0,1,0,71.2833,C,0,"(25, 40]","(39.688, 512.329]",2
2,1,3,female,26.0,0,0,7.925,S,0,"(25, 40]","(7.854, 10.5]",1
3,1,1,female,35.0,1,0,53.1,S,0,"(25, 40]","(39.688, 512.329]",2
4,0,3,male,35.0,0,0,8.05,S,0,"(25, 40]","(7.854, 10.5]",1


In [26]:
# Start the travelling-alone indicator at 0 for every passenger.
titanic_df = titanic_df.assign(IsAlone=0)

In [27]:
# A passenger is alone when the family size is exactly 1 (only themselves).
travels_solo = titanic_df['FamilySize'].eq(1)
titanic_df.loc[travels_solo, 'IsAlone'] = 1

In [28]:
# Preview the new IsAlone column.
titanic_df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,AgeIsMissing,Age_bin,Fare_bin,FamilySize,IsAlone
0,0,3,male,22.0,1,0,7.25,S,0,"(18, 25]","(-0.001, 7.854]",2,0
1,1,1,female,38.0,1,0,71.2833,C,0,"(25, 40]","(39.688, 512.329]",2,0
2,1,3,female,26.0,0,0,7.925,S,0,"(25, 40]","(7.854, 10.5]",1,1
3,1,1,female,35.0,1,0,53.1,S,0,"(25, 40]","(39.688, 512.329]",2,0
4,0,3,male,35.0,0,0,8.05,S,0,"(25, 40]","(7.854, 10.5]",1,1


In [29]:
# Survival rate within each IsAlone group: every column is divided by its
# own total, so each column sums to 1.
pd.crosstab(
    titanic_df['Survived'],
    titanic_df['IsAlone'],
    normalize='columns',
)

IsAlone,0,1
Survived,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0.49435,0.696462
1,0.50565,0.303538


In [30]:
# Start the mother indicator at 0 for every passenger.
titanic_df = titanic_df.assign(IsMother=0)

In [31]:
# Heuristic: a female passenger older than 20 with Parch > 0 is flagged as a mother.
# Age has already been imputed at this point, so rows whose age was
# originally missing are compared using the imputed value.
# NOTE(review): Parch presumably counts parents as well as children, so
# this can also flag adult daughters travelling with a parent — confirm.
is_adult_woman_with_parch = (
    titanic_df['Sex'].eq('female')
    & titanic_df['Parch'].gt(0)
    & titanic_df['Age'].gt(20)
)
titanic_df.loc[is_adult_woman_with_parch, 'IsMother'] = 1

In [32]:
# Preview the new IsMother column.
titanic_df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,AgeIsMissing,Age_bin,Fare_bin,FamilySize,IsAlone,IsMother
0,0,3,male,22.0,1,0,7.25,S,0,"(18, 25]","(-0.001, 7.854]",2,0,0
1,1,1,female,38.0,1,0,71.2833,C,0,"(25, 40]","(39.688, 512.329]",2,0,0
2,1,3,female,26.0,0,0,7.925,S,0,"(25, 40]","(7.854, 10.5]",1,1,0
3,1,1,female,35.0,1,0,53.1,S,0,"(25, 40]","(39.688, 512.329]",2,0,0
4,0,3,male,35.0,0,0,8.05,S,0,"(25, 40]","(7.854, 10.5]",1,1,0


In [33]:
# Survival rate within each IsMother group: every column is divided by its
# own total, so each column sums to 1.
pd.crosstab(
    titanic_df['Survived'],
    titanic_df['IsMother'],
    normalize='columns',
)

IsMother,0,1
Survived,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0.642157,0.333333
1,0.357843,0.666667


In [34]:
# Interaction feature: sex joined with the age interval label, e.g. "male_(18, 25]".
age_bin_labels = titanic_df['Age_bin'].astype(str)
titanic_df['SexAge_Combo'] = titanic_df['Sex'].str.cat(age_bin_labels, sep="_")

In [35]:
# Preview the new SexAge_Combo column.
titanic_df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,AgeIsMissing,Age_bin,Fare_bin,FamilySize,IsAlone,IsMother,SexAge_Combo
0,0,3,male,22.0,1,0,7.25,S,0,"(18, 25]","(-0.001, 7.854]",2,0,0,"male_(18, 25]"
1,1,1,female,38.0,1,0,71.2833,C,0,"(25, 40]","(39.688, 512.329]",2,0,0,"female_(25, 40]"
2,1,3,female,26.0,0,0,7.925,S,0,"(25, 40]","(7.854, 10.5]",1,1,0,"female_(25, 40]"
3,1,1,female,35.0,1,0,53.1,S,0,"(25, 40]","(39.688, 512.329]",2,0,0,"female_(25, 40]"
4,0,3,male,35.0,0,0,8.05,S,0,"(25, 40]","(7.854, 10.5]",1,1,0,"male_(25, 40]"


In [36]:
# Summarise all columns (original plus engineered) before one-hot encoding.
titanic_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
 #   Column        Non-Null Count  Dtype   
---  ------        --------------  -----   
 0   Survived      891 non-null    int64   
 1   Pclass        891 non-null    int64   
 2   Sex           891 non-null    object  
 3   Age           891 non-null    float64 
 4   SibSp         891 non-null    int64   
 5   Parch         891 non-null    int64   
 6   Fare          891 non-null    float64 
 7   Embarked      891 non-null    object  
 8   AgeIsMissing  891 non-null    int64   
 9   Age_bin       891 non-null    category
 10  Fare_bin      891 non-null    category
 11  FamilySize    891 non-null    int64   
 12  IsAlone       891 non-null    int64   
 13  IsMother      891 non-null    int64   
 14  SexAge_Combo  891 non-null    object  
dtypes: category(2), float64(2), int64(8), object(3)
memory usage: 92.8+ KB


In [37]:
# One-hot encode the passenger class (one indicator column per class value).
Pclass = pd.get_dummies(data=titanic_df['Pclass'], prefix='Pclass')

In [38]:
# One-hot encode Sex (one indicator column per distinct value).
Sex = pd.get_dummies(data=titanic_df['Sex'], prefix='Sex')

In [39]:
# One-hot encode the embarkation port (one indicator column per port).
Embarked = pd.get_dummies(data=titanic_df['Embarked'], prefix='Embarked')

In [40]:
# One-hot encode the age intervals (one indicator column per interval).
Age_bin = pd.get_dummies(data=titanic_df['Age_bin'], prefix='Age_bin')

In [41]:
# One-hot encode the fare quantile intervals (one indicator column per interval).
Fare_bin = pd.get_dummies(data=titanic_df['Fare_bin'], prefix='Fare_bin')

In [42]:
# Treat family size as categorical: one indicator column per distinct size.
FamilySize = pd.get_dummies(data=titanic_df['FamilySize'], prefix='FamilySize')

In [43]:
# One-hot encode the sex/age-interval interaction labels.
SexAge_Combo = pd.get_dummies(data=titanic_df['SexAge_Combo'], prefix='SexAge_Combo')

In [44]:
# Assemble the modelling frame: the target and binary flags taken directly
# from titanic_df, followed by every one-hot encoded block, joined column-wise.
flag_columns = titanic_df[['Survived', 'AgeIsMissing', 'IsAlone', 'IsMother']]
encoded_blocks = [Pclass, Sex, Embarked, Age_bin, Fare_bin, FamilySize, SexAge_Combo]
TrainData = pd.concat([flag_columns, *encoded_blocks], axis=1)

In [45]:
# Preview the first ten rows of the assembled modelling frame.
TrainData.head(10)

Unnamed: 0,Survived,AgeIsMissing,IsAlone,IsMother,Pclass_1,Pclass_2,Pclass_3,Sex_female,Sex_male,Embarked_C,...,"SexAge_Combo_female_(0, 18]","SexAge_Combo_female_(18, 25]","SexAge_Combo_female_(25, 40]","SexAge_Combo_female_(40, 60]","SexAge_Combo_female_(60, 100]","SexAge_Combo_male_(0, 18]","SexAge_Combo_male_(18, 25]","SexAge_Combo_male_(25, 40]","SexAge_Combo_male_(40, 60]","SexAge_Combo_male_(60, 100]"
0,0,0,0,0,0,0,1,0,1,0,...,0,0,0,0,0,0,1,0,0,0
1,1,0,0,0,1,0,0,1,0,1,...,0,0,1,0,0,0,0,0,0,0
2,1,0,1,0,0,0,1,1,0,0,...,0,0,1,0,0,0,0,0,0,0
3,1,0,0,0,1,0,0,1,0,0,...,0,0,1,0,0,0,0,0,0,0
4,0,0,1,0,0,0,1,0,1,0,...,0,0,0,0,0,0,0,1,0,0
5,0,1,1,0,0,0,1,0,1,0,...,0,0,0,0,0,0,0,1,0,0
6,0,0,1,0,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0
7,0,0,0,0,0,0,1,0,1,0,...,0,0,0,0,0,1,0,0,0,0
8,1,0,0,1,0,0,1,1,0,0,...,0,0,1,0,0,0,0,0,0,0
9,1,0,0,0,0,1,0,1,0,1,...,1,0,0,0,0,0,0,0,0,0


In [46]:
# Check dtypes and non-null counts of the assembled modelling frame.
TrainData.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 41 columns):
 #   Column                         Non-Null Count  Dtype
---  ------                         --------------  -----
 0   Survived                       891 non-null    int64
 1   AgeIsMissing                   891 non-null    int64
 2   IsAlone                        891 non-null    int64
 3   IsMother                       891 non-null    int64
 4   Pclass_1                       891 non-null    uint8
 5   Pclass_2                       891 non-null    uint8
 6   Pclass_3                       891 non-null    uint8
 7   Sex_female                     891 non-null    uint8
 8   Sex_male                       891 non-null    uint8
 9   Embarked_C                     891 non-null    uint8
 10  Embarked_Q                     891 non-null    uint8
 11  Embarked_S                     891 non-null    uint8
 12  Age_bin_(0, 18]                891 non-null    uint8
 13  Age_bin_(18, 25]    

In [47]:
# Next, split the prepared data into a feature matrix and a target vector for training.

In [48]:
# Feature matrix: every column of TrainData except the Survived target.
TrainData_X = TrainData.drop(columns=['Survived'])

In [49]:
# Preview the feature matrix (Survived removed).
TrainData_X.head()

Unnamed: 0,AgeIsMissing,IsAlone,IsMother,Pclass_1,Pclass_2,Pclass_3,Sex_female,Sex_male,Embarked_C,Embarked_Q,...,"SexAge_Combo_female_(0, 18]","SexAge_Combo_female_(18, 25]","SexAge_Combo_female_(25, 40]","SexAge_Combo_female_(40, 60]","SexAge_Combo_female_(60, 100]","SexAge_Combo_male_(0, 18]","SexAge_Combo_male_(18, 25]","SexAge_Combo_male_(25, 40]","SexAge_Combo_male_(40, 60]","SexAge_Combo_male_(60, 100]"
0,0,0,0,0,0,1,0,1,0,0,...,0,0,0,0,0,0,1,0,0,0
1,0,0,0,1,0,0,1,0,1,0,...,0,0,1,0,0,0,0,0,0,0
2,0,1,0,0,0,1,1,0,0,0,...,0,0,1,0,0,0,0,0,0,0
3,0,0,0,1,0,0,1,0,0,0,...,0,0,1,0,0,0,0,0,0,0
4,0,1,0,0,0,1,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0


In [50]:
# Target vector: the Survived column.
TrainData_y = TrainData['Survived']

In [51]:
# Display the target vector.
TrainData_y

0      0
1      1
2      1
3      1
4      0
      ..
886    0
887    1
888    0
889    1
890    0
Name: Survived, Length: 891, dtype: int64

In [52]:
# Hold out 30% of the rows for evaluation; the fixed random_state makes
# the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    TrainData_X,
    TrainData_y,
    test_size=0.3,
    random_state=123456,
)

In [53]:
# Logistic regression classifier using the liblinear solver; all other
# hyperparameters are left at their library defaults.
lr = LogisticRegression(solver='liblinear')

In [54]:
# Fit the classifier on the training portion only.
lr.fit(X=X_train, y=y_train)

LogisticRegression(solver='liblinear')

In [55]:
# Predict class labels for the held-out rows.
y_test_pred = lr.predict(X=X_test)

In [56]:
# Per-class precision, recall and F1 on the held-out rows.
# print() is required here because the report is a pre-formatted string.
report_text = classification_report(y_true=y_test, y_pred=y_test_pred)
print(report_text)

              precision    recall  f1-score   support

           0       0.82      0.81      0.81       166
           1       0.70      0.72      0.71       102

    accuracy                           0.77       268
   macro avg       0.76      0.76      0.76       268
weighted avg       0.77      0.77      0.77       268



In [57]:
# We now have a trained model that can be used to make predictions.