In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tarfile
import urllib.request
from pathlib import Path

In [2]:
def load_titanic_data():
    tarball_path = Path("datasets/titanic.tgz")
    if not tarball_path.is_file():
        Path("datasets").mkdir(parents=True, exist_ok=True)
        url = "https://homl.info/titanic.tgz"
        urllib.request.urlretrieve(url, tarball_path)
        with tarfile.open(tarball_path) as tarball_file:
            tarball_file.extractall(path="datasets")
    return pd.read_csv(Path("datasets/titanic/train.csv"))

In [3]:
ttnc = load_titanic_data()

In [4]:
ttnc.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [5]:
classdata = ttnc[["Pclass", "Survived"]]
class1y = classdata[(classdata["Pclass"]==1) & (classdata["Survived"]==1)]
class2y = classdata[(classdata["Pclass"]==2) & (classdata["Survived"]==1)]
class3y = classdata[(classdata["Pclass"]==3) & (classdata["Survived"]==1)]
class1n = classdata[(classdata["Pclass"]==1) & (classdata["Survived"]==0)]
class2n = classdata[(classdata["Pclass"]==2) & (classdata["Survived"]==0)]
class3n = classdata[(classdata["Pclass"]==3) & (classdata["Survived"]==0)]
print(class1n.shape[0]/classdata.shape[0]*100)
print(class2n.shape[0]/classdata.shape[0]*100)
print(class3n.shape[0]/classdata.shape[0]*100)

8.978675645342312
10.886644219977553
41.75084175084175


In [7]:
short = ["Mr.", "Mrs.", "Miss.", "Dr.", "Capt.", "Col.", "Rev.", "Mlle.", "Major.", "Mme.", "Don.", "Lady.", "Sir.", "Countess.", "Master.", "Ms.", "Jonkheer."]
subs ={
    0:"mr",
    1:"mrs",
    2:"miss",
    3:"mr",
    4:"mr",
    5:"mr",
    6:"mr",
    7:"mrs",
    8:"mr",
    9:"mrs",
    10:"mr",
    11:"mrs",
    12:"mr",
    13:"mrs",
    14:"mass",
    15:"mass",
    16:"mr"
}
def replace_with_subs(x):
    for i, s in enumerate(short):
        if s in x:
            return subs[i]
    return x


ttnc_copy['Name'] = ttnc_copy['Name'].apply(replace_with_subs)

In [8]:
import numpy as np
ttnc_num = ttnc.select_dtypes(include=[np.number])
corr_matrix = ttnc_num.corr()
corr_matrix

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
PassengerId,1.0,-0.005007,-0.035144,0.036847,-0.057527,-0.001652,0.012658
Survived,-0.005007,1.0,-0.338481,-0.077221,-0.035322,0.081629,0.257307
Pclass,-0.035144,-0.338481,1.0,-0.369226,0.083081,0.018443,-0.5495
Age,0.036847,-0.077221,-0.369226,1.0,-0.308247,-0.189119,0.096066
SibSp,-0.057527,-0.035322,0.083081,-0.308247,1.0,0.414838,0.159651
Parch,-0.001652,0.081629,0.018443,-0.189119,0.414838,1.0,0.216225
Fare,0.012658,0.257307,-0.5495,0.096066,0.159651,0.216225,1.0


In [6]:
ttnc_copy = ttnc

In [9]:
ttnc_copy.drop(['Pclass', 'PassengerId', 'Ticket', 'Cabin', 'Embarked'], axis=1, inplace=True)

In [10]:
ttnc_copy

Unnamed: 0,Survived,Name,Sex,Age,SibSp,Parch,Fare
0,0,mr,male,22.0,1,0,7.2500
1,1,mrs,female,38.0,1,0,71.2833
2,1,miss,female,26.0,0,0,7.9250
3,1,mrs,female,35.0,1,0,53.1000
4,0,mr,male,35.0,0,0,8.0500
...,...,...,...,...,...,...,...
886,0,mr,male,27.0,0,0,13.0000
887,1,miss,female,19.0,0,0,30.0000
888,0,miss,female,,1,2,23.4500
889,1,mr,male,26.0,0,0,30.0000


In [11]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer, make_column_selector

cat_pipeline = make_pipeline(
    OneHotEncoder(handle_unknown="ignore")
)

attribs = ["Name", "Sex"]
preprocess = ColumnTransformer(
    [
         ("encoder", OneHotEncoder(sparse_output=False, handle_unknown='ignore'), make_column_selector(dtype_include=object)),
    ]
)
ttnc_transformed = pd.DataFrame(preprocess.fit_transform(ttnc_copy), columns=preprocess.get_feature_names_out(), index=ttnc_copy.index)
ttnc_transformed

Unnamed: 0,encoder__Name_mass,encoder__Name_miss,encoder__Name_mr,encoder__Name_mrs,encoder__Sex_female,encoder__Sex_male
0,0.0,0.0,1.0,0.0,0.0,1.0
1,0.0,0.0,0.0,1.0,1.0,0.0
2,0.0,1.0,0.0,0.0,1.0,0.0
3,0.0,0.0,0.0,1.0,1.0,0.0
4,0.0,0.0,1.0,0.0,0.0,1.0
...,...,...,...,...,...,...
886,0.0,0.0,1.0,0.0,0.0,1.0
887,0.0,1.0,0.0,0.0,1.0,0.0
888,0.0,1.0,0.0,0.0,1.0,0.0
889,0.0,0.0,1.0,0.0,0.0,1.0


In [12]:
ttnc_copy.drop(["Name", "Sex"], axis=1, inplace=True)
ttnc_copy = pd.concat([ttnc_copy, ttnc_transformed], axis=1)
ttnc_copy

Unnamed: 0,Survived,Age,SibSp,Parch,Fare,encoder__Name_mass,encoder__Name_miss,encoder__Name_mr,encoder__Name_mrs,encoder__Sex_female,encoder__Sex_male
0,0,22.0,1,0,7.2500,0.0,0.0,1.0,0.0,0.0,1.0
1,1,38.0,1,0,71.2833,0.0,0.0,0.0,1.0,1.0,0.0
2,1,26.0,0,0,7.9250,0.0,1.0,0.0,0.0,1.0,0.0
3,1,35.0,1,0,53.1000,0.0,0.0,0.0,1.0,1.0,0.0
4,0,35.0,0,0,8.0500,0.0,0.0,1.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...
886,0,27.0,0,0,13.0000,0.0,0.0,1.0,0.0,0.0,1.0
887,1,19.0,0,0,30.0000,0.0,1.0,0.0,0.0,1.0,0.0
888,0,,1,2,23.4500,0.0,1.0,0.0,0.0,1.0,0.0
889,1,26.0,0,0,30.0000,0.0,0.0,1.0,0.0,0.0,1.0


In [None]:
ttnc_copy

Unnamed: 0,Survived,Age,SibSp,Parch,Fare,encoder__Name_mass,encoder__Name_miss,encoder__Name_mr,encoder__Name_mrs,encoder__Sex_female,encoder__Sex_male
0,0,22.0,1,0,7.2500,0.0,0.0,1.0,0.0,0.0,1.0
1,1,38.0,1,0,71.2833,0.0,0.0,0.0,1.0,1.0,0.0
2,1,26.0,0,0,7.9250,0.0,1.0,0.0,0.0,1.0,0.0
3,1,35.0,1,0,53.1000,0.0,0.0,0.0,1.0,1.0,0.0
4,0,35.0,0,0,8.0500,0.0,0.0,1.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...
886,0,27.0,0,0,13.0000,0.0,0.0,1.0,0.0,0.0,1.0
887,1,19.0,0,0,30.0000,0.0,1.0,0.0,0.0,1.0,0.0
888,0,,1,2,23.4500,0.0,1.0,0.0,0.0,1.0,0.0
889,1,26.0,0,0,30.0000,0.0,0.0,1.0,0.0,0.0,1.0


In [13]:
ttnc_copy1 = ttnc_copy
ttnc_copy

Unnamed: 0,Survived,Age,SibSp,Parch,Fare,encoder__Name_mass,encoder__Name_miss,encoder__Name_mr,encoder__Name_mrs,encoder__Sex_female,encoder__Sex_male
0,0,22.0,1,0,7.2500,0.0,0.0,1.0,0.0,0.0,1.0
1,1,38.0,1,0,71.2833,0.0,0.0,0.0,1.0,1.0,0.0
2,1,26.0,0,0,7.9250,0.0,1.0,0.0,0.0,1.0,0.0
3,1,35.0,1,0,53.1000,0.0,0.0,0.0,1.0,1.0,0.0
4,0,35.0,0,0,8.0500,0.0,0.0,1.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...
886,0,27.0,0,0,13.0000,0.0,0.0,1.0,0.0,0.0,1.0
887,1,19.0,0,0,30.0000,0.0,1.0,0.0,0.0,1.0,0.0
888,0,,1,2,23.4500,0.0,1.0,0.0,0.0,1.0,0.0
889,1,26.0,0,0,30.0000,0.0,0.0,1.0,0.0,0.0,1.0


In [14]:
ttnc_copy['Age'].fillna(0, inplace=True)

In [15]:
from sklearn.pipeline import make_pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils.validation import check_array, check_is_fitted
import numpy as np

class CustomTransform(BaseEstimator, TransformerMixin):
  def __init__(self, with_mean=True):
      self.with_mean = with_mean
      self.mr, self.mrs, self.mass, self.miss = [], [], [], []

  def fit(self, X, y=None):
      self.n_features_in_ = X.shape[1]
      X = check_array(X)
      for i in range(X.shape[0]):
        if(X[i][5]==1):
          self.mass.append(X[i][1])
        elif(X[i][6]==1):
          self.miss.append(X[i][1])
        elif(X[i][7]==1):
          self.mr.append(X[i][1])
        elif(X[i][8]==1):
          self.mrs.append(X[i][1])
      self.mr = np.array(self.mr)
      self.mrs = np.array(self.mrs)
      self.miss = np.array(self.miss)
      self.mass = np.array(self.mass)
      self.mr_med = np.median(self.mr)
      self.mrs_med = np.median(self.mrs)
      self.miss_med = np.median(self.miss)
      self.mass_med = np.median(self.mass)
      return self

  def transform(self, X):
      check_is_fitted(self)
      X = check_array(X)
      assert self.n_features_in_ == X.shape[1]
      for i in range(X.shape[0]):
        if(X[i][1]==0.0 and X[i][5]==1):
          X[i][1] = self.mass_med
        elif(X[i][1]==0.0 and X[i][6]==1):
          X[i][1] = self.miss_med
        elif(X[i][1]==0.0 and X[i][7]==1):
          X[i][1] = self.mr_med
        elif(X[i][1]==0.0 and X[i][8]==1):
          X[i][1] = self.mrs_med
      return X


In [None]:
transformer = CustomTransform()
transformed_df = transformer.fit_transform(ttnc_copy)
transformed_df

array([[ 0., 22.,  1., ...,  0.,  0.,  1.],
       [ 1., 38.,  1., ...,  1.,  1.,  0.],
       [ 1., 26.,  0., ...,  0.,  1.,  0.],
       ...,
       [ 0., 18.,  1., ...,  0.,  1.,  0.],
       [ 1., 26.,  0., ...,  0.,  0.,  1.],
       [ 0., 32.,  0., ...,  0.,  0.,  1.]])

In [16]:
colmns = ttnc_copy.columns
indxs = ttnc_copy.index
trnsf = CustomTransform()
testmat = trnsf.fit_transform(ttnc_copy1)
testmat

array([[ 0., 22.,  1., ...,  0.,  0.,  1.],
       [ 1., 38.,  1., ...,  1.,  1.,  0.],
       [ 1., 26.,  0., ...,  0.,  1.,  0.],
       ...,
       [ 0., 18.,  1., ...,  0.,  1.,  0.],
       [ 1., 26.,  0., ...,  0.,  0.,  1.],
       [ 0., 32.,  0., ...,  0.,  0.,  1.]])

In [17]:
transformed_ttnc = pd.DataFrame(testmat, columns=colmns, index=indxs)
transformed_ttnc

Unnamed: 0,Survived,Age,SibSp,Parch,Fare,encoder__Name_mass,encoder__Name_miss,encoder__Name_mr,encoder__Name_mrs,encoder__Sex_female,encoder__Sex_male
0,0.0,22.0,1.0,0.0,7.2500,0.0,0.0,1.0,0.0,0.0,1.0
1,1.0,38.0,1.0,0.0,71.2833,0.0,0.0,0.0,1.0,1.0,0.0
2,1.0,26.0,0.0,0.0,7.9250,0.0,1.0,0.0,0.0,1.0,0.0
3,1.0,35.0,1.0,0.0,53.1000,0.0,0.0,0.0,1.0,1.0,0.0
4,0.0,35.0,0.0,0.0,8.0500,0.0,0.0,1.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...
886,0.0,27.0,0.0,0.0,13.0000,0.0,0.0,1.0,0.0,0.0,1.0
887,1.0,19.0,0.0,0.0,30.0000,0.0,1.0,0.0,0.0,1.0,0.0
888,0.0,18.0,1.0,2.0,23.4500,0.0,1.0,0.0,0.0,1.0,0.0
889,1.0,26.0,0.0,0.0,30.0000,0.0,0.0,1.0,0.0,0.0,1.0


In [18]:
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import precision_score, recall_score
target_lbl = ttnc_copy['Survived']
transformed_ttnc.drop('Survived', axis=1, inplace=True)
sgd_clf = SGDClassifier()
train_pred = cross_val_predict(sgd_clf, transformed_ttnc, target_lbl, cv=3)
print(precision_score(target_lbl, train_pred))
print(recall_score(target_lbl, train_pred))

0.6719745222929936
0.6169590643274854
