In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.linear_model import LogisticRegression

X_train = pd.read_csv("titanic/train.csv")
X_test = pd.read_csv("titanic/test.csv")

In [2]:
X_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
y_train = X_train["Survived"]
del X_train["Survived"]

# y_test = X_test["Survived"]
# del X_test["Survived"]

In [4]:
#show all datatypes as a prliminary analysis
dataInfo = X_train.dtypes
dataInfo

PassengerId      int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

In [5]:
#loop that filters only numerical columns (int + float)
numColumns = []
for i in range(len(dataInfo)): #len: len of our dataframe
    if dataInfo.values[i] in [np.int64, np.float64]: #iterate over int + float
        numColumns.append(dataInfo.index[i]) #add int+float to the new column using index
print(numColumns)

X_train_numerical = X_train[numColumns] #apply the new column to the trainig set
X_test_numerical = X_test[numColumns] #apply the new column to the test set
X_train_numerical

['PassengerId', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare']


Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare
0,1,3,22.0,1,0,7.2500
1,2,1,38.0,1,0,71.2833
2,3,3,26.0,0,0,7.9250
3,4,1,35.0,1,0,53.1000
4,5,3,35.0,0,0,8.0500
...,...,...,...,...,...,...
886,887,2,27.0,0,0,13.0000
887,888,1,19.0,0,0,30.0000
888,889,3,,1,2,23.4500
889,890,1,26.0,0,0,30.0000


In [6]:
#MORE COMPLICATED:

#trying different transformations
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

#defining imputer (fills missing values with median) and scaler
simp_imp = SimpleImputer(strategy="median")
mmScaler = MinMaxScaler()

#variable for our logistic regression
log_reg = LogisticRegression()

#simple imputer
X_train_imp = simp_imp.fit_transform(X_train_numerical)

#MinMaxScaler on SimpleImputer
X_train_mm = mmScaler.fit_transform(X_train_imp)
print(X_train_mm)

[[0.         1.         0.27117366 0.125      0.         0.01415106]
 [0.0011236  0.         0.4722292  0.125      0.         0.13913574]
 [0.00224719 1.         0.32143755 0.         0.         0.01546857]
 ...
 [0.99775281 1.         0.34656949 0.125      0.33333333 0.04577135]
 [0.9988764  0.         0.32143755 0.         0.         0.0585561 ]
 [1.         1.         0.39683338 0.         0.         0.01512699]]


In [20]:
#EASIER (than in the row before, despite it does exactly the same thing)
#BUT NOT RECOMMENDED: DON'T USE THE LOGISTIC REGRESSION IN A PIPELINE

from sklearn.pipeline import Pipeline

#(name, estimator)
# All but last estimator need to have transform method
num_pipe_model = Pipeline([
                    ("simp_imp",SimpleImputer(strategy="median")), #simp_imp: name it whatever you want
                    ("mm_scaler", MinMaxScaler()),
                    ("log_reg", LogisticRegression())#last estimator only calls fit
])

#pipeline has the methods of our last estimator (fit, predict, ...), but can also be fit, transform, fit_transform...
#calling .fit() on pipeline runs fit_transform() on all steps (imputer + scaler) except the last (logistic regression), where it just runs fit

num_pipe_model.fit(X_train_numerical, y_train)
num_pipe_model.predict(X_test_numerical)

#following step not working because the logistic regression does NOT have a transform method:
#num_pipe_model.transform(X_test_numerical)

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0,
       0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0,
       0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0,
       1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0,

In [23]:
#BEST SOLUTION (now without the logistic regression in the pipeline):

num_pipe_transform = Pipeline([
                    ("simp_imp",SimpleImputer(strategy="median")),
                    ("mm_scaler", MinMaxScaler())#last estimator only calls fit
])

#NOW TRANSFORM IS WORKING
num_pipe_transform.fit(X_train_numerical)
num_pipe_transform.transform(X_train_numerical)

#shorter than the 2 lines before because it combines fit AND transform
#num_pipe_transform.fit_transform(X_train_numerical)


#can also be used on the testing data:
#num_pipe_transform.transform(X_test_numerical)


#result: transformed numerical data set

array([[0.        , 1.        , 0.27117366, 0.125     , 0.        ,
        0.01415106],
       [0.0011236 , 0.        , 0.4722292 , 0.125     , 0.        ,
        0.13913574],
       [0.00224719, 1.        , 0.32143755, 0.        , 0.        ,
        0.01546857],
       ...,
       [0.99775281, 1.        , 0.34656949, 0.125     , 0.33333333,
        0.04577135],
       [0.9988764 , 0.        , 0.32143755, 0.        , 0.        ,
        0.0585561 ],
       [1.        , 1.        , 0.39683338, 0.        , 0.        ,
        0.01512699]])

In [24]:
#data preparation
def getCabinClass(cabin):
    if pd.isnull(cabin):
        return 0
    return cabin[0]
X_train["CabinClass"] = X_train["Cabin"].apply(getCabinClass)
print("CabinClass:",X_train["CabinClass"].unique())

CabinClass: [0 'C' 'E' 'G' 'D' 'A' 'B' 'F' 'T']


In [26]:
#Ordinal encoding
categories = [0,"T","G","F","E","D","C","B","A"]
from sklearn.preprocessing import OrdinalEncoder
ord_enc = OrdinalEncoder(categories=[categories])#categories seed to be of shape (n_features,)
X_train["CabinClassEnc"] = ord_enc.fit_transform(X_train["CabinClass"].values.reshape(-1,1))
del X_train["Cabin"]
del X_train["CabinClass"]
X_train.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,CabinClassEnc
0,1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,S,0.0
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C,6.0
2,3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,S,0.0
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,S,6.0
4,5,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,S,0.0


In [27]:
# ord_enc.fit_transform(X_train["Embarked"].values.reshape(-1,1))
# ord_enc.fit_transform(X_train["Embarked"].fillna(0).values.reshape(-1,1))
# #We defined categories in intialization

ord_enc_gen = OrdinalEncoder()
# #When categories aren't given the order isn't known and is determined through comparisons
# ord_enc_gen.fit_transform(X_train["Embarked"].fillna(0).values.reshape(-1,1))#int vs string comparison error
ord_enc_gen.fit_transform(X_train["Embarked"].fillna("0").values.reshape(-1,1))
ord_enc_gen.categories_

[array(['0', 'C', 'Q', 'S'], dtype=object)]

In [29]:
#One hot encoding
from sklearn.preprocessing import OneHotEncoder
one_hot_enc = OneHotEncoder()#sparse=True
transformed = one_hot_enc.fit_transform(X_train[["Sex","Embarked"]].fillna("0"))
print(transformed)#by default returns a sparse array
print(one_hot_enc.categories_)
oneHotDF = pd.DataFrame(transformed.toarray(),columns = one_hot_enc.get_feature_names_out())
print(oneHotDF)
X_train = pd.concat([X_train,oneHotDF],axis=1)
del X_train["Sex"]
del X_train["Embarked"]
X_train

  (0, 1)	1.0
  (0, 5)	1.0
  (1, 0)	1.0
  (1, 3)	1.0
  (2, 0)	1.0
  (2, 5)	1.0
  (3, 0)	1.0
  (3, 5)	1.0
  (4, 1)	1.0
  (4, 5)	1.0
  (5, 1)	1.0
  (5, 4)	1.0
  (6, 1)	1.0
  (6, 5)	1.0
  (7, 1)	1.0
  (7, 5)	1.0
  (8, 0)	1.0
  (8, 5)	1.0
  (9, 0)	1.0
  (9, 3)	1.0
  (10, 0)	1.0
  (10, 5)	1.0
  (11, 0)	1.0
  (11, 5)	1.0
  (12, 1)	1.0
  :	:
  (878, 5)	1.0
  (879, 0)	1.0
  (879, 3)	1.0
  (880, 0)	1.0
  (880, 5)	1.0
  (881, 1)	1.0
  (881, 5)	1.0
  (882, 0)	1.0
  (882, 5)	1.0
  (883, 1)	1.0
  (883, 5)	1.0
  (884, 1)	1.0
  (884, 5)	1.0
  (885, 0)	1.0
  (885, 4)	1.0
  (886, 1)	1.0
  (886, 5)	1.0
  (887, 0)	1.0
  (887, 5)	1.0
  (888, 0)	1.0
  (888, 5)	1.0
  (889, 1)	1.0
  (889, 3)	1.0
  (890, 1)	1.0
  (890, 4)	1.0
[array(['female', 'male'], dtype=object), array(['0', 'C', 'Q', 'S'], dtype=object)]
     Sex_female  Sex_male  Embarked_0  Embarked_C  Embarked_Q  Embarked_S
0           0.0       1.0         0.0         0.0         0.0         1.0
1           1.0       0.0         0.0         1.0       

Unnamed: 0,PassengerId,Pclass,Name,Age,SibSp,Parch,Ticket,Fare,CabinClassEnc,Sex_female,Sex_male,Embarked_0,Embarked_C,Embarked_Q,Embarked_S
0,1,3,"Braund, Mr. Owen Harris",22.0,1,0,A/5 21171,7.2500,0.0,0.0,1.0,0.0,0.0,0.0,1.0
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.0,1,0,PC 17599,71.2833,6.0,1.0,0.0,0.0,1.0,0.0,0.0
2,3,3,"Heikkinen, Miss. Laina",26.0,0,0,STON/O2. 3101282,7.9250,0.0,1.0,0.0,0.0,0.0,0.0,1.0
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.0,1,0,113803,53.1000,6.0,1.0,0.0,0.0,0.0,0.0,1.0
4,5,3,"Allen, Mr. William Henry",35.0,0,0,373450,8.0500,0.0,0.0,1.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,2,"Montvila, Rev. Juozas",27.0,0,0,211536,13.0000,0.0,0.0,1.0,0.0,0.0,0.0,1.0
887,888,1,"Graham, Miss. Margaret Edith",19.0,0,0,112053,30.0000,7.0,1.0,0.0,0.0,0.0,0.0,1.0
888,889,3,"Johnston, Miss. Catherine Helen ""Carrie""",,1,2,W./C. 6607,23.4500,0.0,1.0,0.0,0.0,0.0,0.0,1.0
889,890,1,"Behr, Mr. Karl Howell",26.0,0,0,111369,30.0000,6.0,0.0,1.0,0.0,1.0,0.0,0.0


In [31]:
# one_hot_enc.transform(X_train[["Embarked","Sex"]].fillna("0"))
# one_hot_enc.transform(X_train[["Sex","Embarked"]].fillna("1"))

In [30]:
X_train = X_train.drop(labels=["Name","Ticket"],axis=1)
print(X_train.info())
X_train.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 13 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   PassengerId    891 non-null    int64  
 1   Pclass         891 non-null    int64  
 2   Age            714 non-null    float64
 3   SibSp          891 non-null    int64  
 4   Parch          891 non-null    int64  
 5   Fare           891 non-null    float64
 6   CabinClassEnc  891 non-null    float64
 7   Sex_female     891 non-null    float64
 8   Sex_male       891 non-null    float64
 9   Embarked_0     891 non-null    float64
 10  Embarked_C     891 non-null    float64
 11  Embarked_Q     891 non-null    float64
 12  Embarked_S     891 non-null    float64
dtypes: float64(9), int64(4)
memory usage: 90.6 KB
None


Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare,CabinClassEnc,Sex_female,Sex_male,Embarked_0,Embarked_C,Embarked_Q,Embarked_S
0,1,3,22.0,1,0,7.25,0.0,0.0,1.0,0.0,0.0,0.0,1.0
1,2,1,38.0,1,0,71.2833,6.0,1.0,0.0,0.0,1.0,0.0,0.0
2,3,3,26.0,0,0,7.925,0.0,1.0,0.0,0.0,0.0,0.0,1.0
3,4,1,35.0,1,0,53.1,6.0,1.0,0.0,0.0,0.0,0.0,1.0
4,5,3,35.0,0,0,8.05,0.0,0.0,1.0,0.0,0.0,0.0,1.0


In [32]:
from sklearn.impute import SimpleImputer
sim_imp = SimpleImputer(strategy="median")
X_train = sim_imp.fit_transform(X_train)

In [33]:
log_reg = LogisticRegression()
log_reg.fit(X_train,y_train)
# log_reg.score(X_train,y_train)#We need to transform our test dataset too

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
#embeddings