In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.linear_model import LogisticRegression

X_train = pd.read_csv("titanic/train.csv")
X_test = pd.read_csv("titanic/test.csv")

In [2]:
X_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
y_train = X_train["Survived"]
del X_train["Survived"]

# y_test = X_test["Survived"]
# del X_test["Survived"]

In [4]:
dataInfo = X_train.dtypes
dataInfo

PassengerId      int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

In [5]:
num_columns = []
for i in range(len(dataInfo)):
    if dataInfo.values[i] in [np.int64, np.float64]:
        num_columns.append(dataInfo.index[i])
print(num_columns)

X_train_numerical = X_train[num_columns]
X_test_numerical = X_test[num_columns]
X_train_numerical

['PassengerId', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare']


Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare
0,1,3,22.0,1,0,7.2500
1,2,1,38.0,1,0,71.2833
2,3,3,26.0,0,0,7.9250
3,4,1,35.0,1,0,53.1000
4,5,3,35.0,0,0,8.0500
...,...,...,...,...,...,...
886,887,2,27.0,0,0,13.0000
887,888,1,19.0,0,0,30.0000
888,889,3,,1,2,23.4500
889,890,1,26.0,0,0,30.0000


In [6]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

simp_imp = SimpleImputer(strategy="median")
mmScaler = MinMaxScaler()
log_reg = LogisticRegression()
X_train_imp = simp_imp.fit_transform(X_train_numerical)
X_train_mm = mmScaler.fit_transform(X_train_imp)
print(X_train_mm)

[[0.         1.         0.27117366 0.125      0.         0.01415106]
 [0.0011236  0.         0.4722292  0.125      0.         0.13913574]
 [0.00224719 1.         0.32143755 0.         0.         0.01546857]
 ...
 [0.99775281 1.         0.34656949 0.125      0.33333333 0.04577135]
 [0.9988764  0.         0.32143755 0.         0.         0.0585561 ]
 [1.         1.         0.39683338 0.         0.         0.01512699]]


In [7]:
from sklearn.base import BaseEstimator
from sklearn.base import TransformerMixin 

class cabinClassTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        adaptCabinVec = np.vectorize(self.adaptCabin)
        X_new_cabin = adaptCabinVec(X)
        return list(X_new_cabin.reshape(-1,1))#sklearn has an issue with np arrays here
    
    def adaptCabin(self, cabin):
        if pd.isnull(cabin):
            return "0"
        return cabin[0]

In [None]:
# #Ordinal encoding
# categories = ["0","T","G","F","E","D","C","B","A"]
# from sklearn.preprocessing import OrdinalEncoder
# ord_enc = OrdinalEncoder(categories=[categories])#categories seed to be of shape (n_features,)
# X_train["CabinClassEnc"] = ord_enc.fit_transform(X_train["CabinClass"].values.reshape(-1,1))
# del X_train["Cabin"]
# del X_train["CabinClass"]
# X_train.head()

In [9]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.pipeline import Pipeline


#for the numerical columns
num_pipe_transform = Pipeline([
                    ("simp_imp",SimpleImputer(strategy="median")),
                    ("mm_scaler", MinMaxScaler())#last estimator only calls fit
])

#just for testing, no need to call it here (that's why it's in comments)
# num_pipe_transform.fit(X_train_numerical)
# num_pipe_transform.transform(X_train_numerical)


cabin_cats = ["0","T","G","F","E","D","C","B","A"]

#calling our self build estimator from the cell above (cabinClassTransformer) + cabin_cats(line above)
cabin_pipeline = Pipeline([
                    ("cabin_rename",cabinClassTransformer()),
                    ("cabin_ord_transform", OrdinalEncoder(categories=[cabin_cats]))#(n_features,)
])

#for the text columns: fill missing values by SimpleImputer using OneHotEncoder()
text_pipeline = Pipeline([
                    ("constant_impute",SimpleImputer(strategy="constant",fill_value="0")), #constant: fill constantly with 0
                    ("one_hot_enc", OneHotEncoder())
])

cabin_column = ["Cabin"]
text_columns = ["Sex","Embarked"]

#combining numerical and text columns
full_pipeline = ColumnTransformer([
         ("num", num_pipe_transform, num_columns),
         ("cabin", cabin_pipeline, cabin_column),
        ("text", text_pipeline, text_columns)
     ])

#"name" column has to be dropped because there is no real use for it in computations
if "Name" in X_train.columns:
    X_train = X_train.drop(labels=["Name","Ticket"],axis=1)
    X_test = X_test.drop(labels=["Name","Ticket"],axis=1)

#transform the newly created full_pipeline
X_train_transformed = full_pipeline.fit_transform(X_train)
log_reg = LogisticRegression()
log_reg.fit(X_train_transformed,y_train)
print("Training score:",log_reg.score(X_train_transformed,y_train))
print("Testing score", log_reg.predict(full_pipeline.transform(X_test)))

Training score: 0.7968574635241302
Testing score [0 0 0 0 1 0 1 0 1 0 0 0 1 0 1 1 0 0 1 1 0 0 1 1 1 0 1 0 0 0 0 0 0 1 1 0 1
 1 0 0 0 0 0 1 1 0 1 0 1 1 1 0 1 1 0 0 0 0 0 1 0 0 0 1 1 1 1 0 1 1 1 0 1 1
 1 1 0 1 0 1 0 0 0 0 0 0 1 1 1 0 1 0 1 0 0 0 1 0 1 0 1 0 0 0 1 0 0 0 0 0 0
 1 1 1 1 0 0 1 1 1 1 0 1 0 0 1 0 1 0 0 0 0 1 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0
 0 0 1 0 0 1 0 0 1 1 0 1 1 0 1 0 0 1 1 0 1 1 0 0 0 0 0 1 1 0 1 1 0 0 1 0 1
 0 1 0 0 0 0 0 0 0 0 0 1 1 0 1 1 0 0 1 0 0 1 0 1 0 0 0 0 1 0 0 1 0 1 0 1 0
 1 0 1 1 0 1 0 0 0 1 0 0 1 0 0 0 1 1 1 1 0 0 0 0 1 0 1 1 1 0 1 0 0 0 0 0 1
 0 0 0 1 1 0 0 0 0 1 0 1 0 1 1 0 1 0 0 0 0 1 0 1 1 1 0 0 1 0 0 0 1 0 0 0 0
 1 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 1 0 1 0 0 0 1 1 0
 1 0 1 0 0 0 0 0 0 0 1 0 1 0 1 0 1 1 0 0 0 1 0 1 0 0 1 0 1 1 0 1 0 0 1 1 0
 0 1 0 0 1 1 1 0 0 0 0 0 1 1 0 1 0 0 0 0 1 1 0 0 0 1 0 1 0 0 1 0 1 0 0 0 0
 0 1 1 1 1 1 0 1 0 0 0]


In [10]:
# ord_enc.fit_transform(X_train["Embarked"].values.reshape(-1,1))
# ord_enc.fit_transform(X_train["Embarked"].fillna(0).values.reshape(-1,1))
# #We defined categories in intialization

ord_enc_gen = OrdinalEncoder()
# #When categories aren't given the order isn't known and is determined through comparisons
# ord_enc_gen.fit_transform(X_train["Embarked"].fillna(0).values.reshape(-1,1))#int vs string comparison error
ord_enc_gen.fit_transform(X_train["Embarked"].fillna("0").values.reshape(-1,1))
ord_enc_gen.categories_

[array(['0', 'C', 'Q', 'S'], dtype=object)]

In [12]:
#One hot encoding
one_hot_enc = OneHotEncoder()#sparse=True
transformed = one_hot_enc.fit_transform(X_train[["Sex","Embarked"]].fillna("0"))
print(transformed)#by default returns a sparse array
print(one_hot_enc.categories_)
oneHotDF = pd.DataFrame(transformed.toarray(),columns = one_hot_enc.get_feature_names_out())
print(oneHotDF)
X_train = pd.concat([X_train,oneHotDF],axis=1)
del X_train["Sex"]
del X_train["Embarked"]
X_train

  (0, 1)	1.0
  (0, 5)	1.0
  (1, 0)	1.0
  (1, 3)	1.0
  (2, 0)	1.0
  (2, 5)	1.0
  (3, 0)	1.0
  (3, 5)	1.0
  (4, 1)	1.0
  (4, 5)	1.0
  (5, 1)	1.0
  (5, 4)	1.0
  (6, 1)	1.0
  (6, 5)	1.0
  (7, 1)	1.0
  (7, 5)	1.0
  (8, 0)	1.0
  (8, 5)	1.0
  (9, 0)	1.0
  (9, 3)	1.0
  (10, 0)	1.0
  (10, 5)	1.0
  (11, 0)	1.0
  (11, 5)	1.0
  (12, 1)	1.0
  :	:
  (878, 5)	1.0
  (879, 0)	1.0
  (879, 3)	1.0
  (880, 0)	1.0
  (880, 5)	1.0
  (881, 1)	1.0
  (881, 5)	1.0
  (882, 0)	1.0
  (882, 5)	1.0
  (883, 1)	1.0
  (883, 5)	1.0
  (884, 1)	1.0
  (884, 5)	1.0
  (885, 0)	1.0
  (885, 4)	1.0
  (886, 1)	1.0
  (886, 5)	1.0
  (887, 0)	1.0
  (887, 5)	1.0
  (888, 0)	1.0
  (888, 5)	1.0
  (889, 1)	1.0
  (889, 3)	1.0
  (890, 1)	1.0
  (890, 4)	1.0
[array(['female', 'male'], dtype=object), array(['0', 'C', 'Q', 'S'], dtype=object)]
     Sex_female  Sex_male  Embarked_0  Embarked_C  Embarked_Q  Embarked_S
0           0.0       1.0         0.0         0.0         0.0         1.0
1           1.0       0.0         0.0         1.0       

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare,Cabin,Sex_female,Sex_male,Embarked_0,Embarked_C,Embarked_Q,Embarked_S
0,1,3,22.0,1,0,7.2500,,0.0,1.0,0.0,0.0,0.0,1.0
1,2,1,38.0,1,0,71.2833,C85,1.0,0.0,0.0,1.0,0.0,0.0
2,3,3,26.0,0,0,7.9250,,1.0,0.0,0.0,0.0,0.0,1.0
3,4,1,35.0,1,0,53.1000,C123,1.0,0.0,0.0,0.0,0.0,1.0
4,5,3,35.0,0,0,8.0500,,0.0,1.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,2,27.0,0,0,13.0000,,0.0,1.0,0.0,0.0,0.0,1.0
887,888,1,19.0,0,0,30.0000,B42,1.0,0.0,0.0,0.0,0.0,1.0
888,889,3,,1,2,23.4500,,1.0,0.0,0.0,0.0,0.0,1.0
889,890,1,26.0,0,0,30.0000,C148,0.0,1.0,0.0,1.0,0.0,0.0


In [13]:
# one_hot_enc.transform(X_train[["Embarked","Sex"]].fillna("0"))
# one_hot_enc.transform(X_train[["Sex","Embarked"]].fillna("1"))

In [14]:
X_train = X_train.drop(labels=["Name","Ticket"],axis=1)
print(X_train.info())
X_train.head()

KeyError: "['Name', 'Ticket'] not found in axis"

In [15]:
from sklearn.impute import SimpleImputer
sim_imp = SimpleImputer(strategy="median")
X_train = sim_imp.fit_transform(X_train)

ValueError: Cannot use median strategy with non-numeric data:
could not convert string to float: 'C85'

In [16]:
log_reg = LogisticRegression()
log_reg.fit(X_train,y_train)
log_reg.score(X_train,y_train)#We need to transform our test dataset too

ValueError: could not convert string to float: 'C85'

In [None]:
#embeddings