In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [2]:
titanic_train_data = pd.read_csv("train.csv")

In [3]:
titanic_train_data.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [4]:
titanic_train_data = titanic_train_data.drop(columns=["Cabin", "Name", "Ticket"])

In [5]:
train_set, test_set = train_test_split(
    titanic_train_data,
    test_size=0.2,
    random_state=0,
    stratify=titanic_train_data["Pclass"],
)

In [6]:
y_train = train_set["Survived"]
X_train = train_set.drop(columns=["Survived"])

In [7]:
colums_with_missing_values = X_train.columns[X_train.isnull().any()]

In [8]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

numerical_imputer = SimpleImputer(strategy="median")
object_imputer = SimpleImputer(strategy="most_frequent")
oneHotEncoder = OneHotEncoder(
    handle_unknown="ignore", sparse_output=False, drop="first"
)

In [9]:
def getLogPlusOne(fare):
    return np.log(fare + 1)

In [10]:
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import Pipeline

fare_transformer = Pipeline(
    steps=[
        (
            "log_plus_1",
            FunctionTransformer(
                getLogPlusOne, validate=True, feature_names_out="one-to-one"
            ),
        ),
        ("standard_scaler", StandardScaler()),
    ]
)

In [11]:
pipeline_numerical_imputer = Pipeline(
    steps=[("numerical_imputer", numerical_imputer), ("scaler", StandardScaler())]
)
pipeline_object_imputer = Pipeline(
    steps=[("object_imputer", object_imputer), ("one_hot_encode", oneHotEncoder)]
)
numerical_standard_pipeline = Pipeline(steps=[("scaler", StandardScaler())])
object_standard_pipeline = Pipeline(steps=[("one_hot_encoder", oneHotEncoder)])

In [12]:
X_train.columns

Index(['PassengerId', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare',
       'Embarked'],
      dtype='object')

In [13]:
from sklearn.compose import ColumnTransformer

columnTransformer = ColumnTransformer(
    [
        ("fare_transformer", fare_transformer, ["Fare"]),
        ("numerical_with_missing_vals", pipeline_numerical_imputer, ["Age"]),
        ("object_with_missing_vals", pipeline_object_imputer, ["Embarked"]),
        ("standard_object", object_standard_pipeline, ["Sex"]),
        ("standard_numeric", numerical_standard_pipeline, ["SibSp", "Parch"]),
    ],
    remainder="passthrough",
)

In [14]:
columnTransformer = columnTransformer.fit(X_train)

In [15]:
X_train_transformed = columnTransformer.transform(X_train)

In [16]:
X_train_transformed_df = pd.DataFrame(
    X_train_transformed,
    index=X_train,
    columns=columnTransformer.get_feature_names_out(),
)

In [17]:
y_test = test_set["Survived"]
X_test = test_set.drop(columns=["Survived"])

In [18]:
X_test_transformed = columnTransformer.transform(X_test)

Making sure output has same informatio

# Logistic regrssion

In [19]:
from sklearn.linear_model import LogisticRegression
logesticRegrssion = LogisticRegression(random_state= 0)


Logistic regression returns warning about not able to converge

 /opt/homebrew/anaconda3/envs/kerasenv/lib/python3.9/site-packages/sklearn/linear_model/_logistic.py:465: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(

In [20]:
logesticRegrssion.fit(X_train_transformed, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


# Naive Bais

In [21]:
from sklearn.naive_bayes import GaussianNB
naiveBaisGausian = GaussianNB()

In [22]:
naiveBaisGausian.fit(X_train_transformed, y_train)

In [23]:
y_train_predict = naiveBaisGausian.predict(X_train_transformed)
print(
    np.concatenate(
        (
            y_train_predict.reshape(len(y_train_predict), 1),
            y_train.to_numpy().reshape(len(y_train), 1),
        ),
        axis=1,
    )
)

[[1 1]
 [0 0]
 [1 1]
 ...
 [0 0]
 [1 1]
 [1 1]]


In [24]:
y_test_predict = naiveBaisGausian.predict(X_test_transformed)
print(
    np.concatenate(
        (
            y_train_predict.reshape(len(y_train_predict), 1),
            y_train.to_numpy().reshape(len(y_train), 1),
        ),
        axis=1,
    )
)

[[1 1]
 [0 0]
 [1 1]
 ...
 [0 0]
 [1 1]
 [1 1]]


In [25]:
from sklearn.metrics import confusion_matrix, accuracy_score
print(confusion_matrix(y_test, y_test_predict, normalize='all'))
print(accuracy_score(y_test, y_test_predict, normalize=True))

[[0.4972067  0.17877095]
 [0.08938547 0.23463687]]
0.7318435754189944


# Kernal SVM

In [26]:
from sklearn.svm import SVC
svmKernal = SVC(kernel='rbf', random_state= 0)
svmKernal.fit(X_train_transformed, y_train)

In [27]:
y_test_predict = svmKernal.predict(X_test_transformed)
print(
    np.concatenate(
        (
            y_test_predict.reshape(len(y_test_predict), 1),
            y_test.to_numpy().reshape(len(y_test), 1),
        ),
        axis=1,
    )
)

[[0 0]
 [0 0]
 [0 1]
 [0 0]
 [0 0]
 [0 0]
 [0 1]
 [0 1]
 [0 0]
 [0 0]
 [0 1]
 [0 0]
 [0 0]
 [0 1]
 [0 0]
 [0 0]
 [0 1]
 [0 1]
 [0 0]
 [0 1]
 [0 0]
 [0 0]
 [0 0]
 [0 1]
 [0 1]
 [0 0]
 [0 0]
 [0 0]
 [0 1]
 [0 0]
 [0 0]
 [0 1]
 [0 1]
 [0 0]
 [0 0]
 [0 1]
 [0 1]
 [0 0]
 [0 0]
 [0 0]
 [0 1]
 [0 0]
 [0 1]
 [0 0]
 [0 1]
 [0 1]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 1]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 1]
 [0 1]
 [0 0]
 [0 0]
 [0 0]
 [0 1]
 [0 1]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 1]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 1]
 [0 0]
 [0 1]
 [0 0]
 [0 0]
 [0 1]
 [0 1]
 [0 0]
 [0 0]
 [0 1]
 [0 0]
 [0 0]
 [0 0]
 [0 1]
 [0 0]
 [0 0]
 [0 0]
 [0 1]
 [0 0]
 [0 0]
 [0 0]
 [0 1]
 [0 0]
 [0 0]
 [0 0]
 [0 1]
 [0 0]
 [0 0]
 [0 1]
 [0 1]
 [0 1]
 [0 1]
 [0 0]
 [0 1]
 [0 0]
 [0 0]
 [0 1]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 1]
 [0 1]
 [0 0]
 [0 0]
 [0 0]
 [0 1]
 [0 1]
 [0 1]
 [0 0]
 [0 0]
 [0 0]
 [0 1]
 [0 1]
 [0 1]
 [0 0]
 [0 0]
 [0 0]
 [0 1]
 [0 1]
 [0 0]
 [0 1]
 [0 0]
 [0 0]
 [0 0]
 [0 1]
 [0 0]
 [0 0]

In [28]:
from sklearn.metrics import confusion_matrix, accuracy_score
print(confusion_matrix(y_test, y_test_predict, normalize='all'))
print(accuracy_score(y_test, y_test_predict, normalize=True))

[[0.67597765 0.        ]
 [0.32402235 0.        ]]
0.6759776536312849


# KNN classifier

In [29]:
from sklearn.neighbors import KNeighborsClassifier

kNeighborsClassifier = KNeighborsClassifier(p=2)
kNeighborsClassifier.fit(X_train_transformed, y_train)

In [30]:
y_test_predict = kNeighborsClassifier.predict(X_test_transformed)

In [31]:
from sklearn.metrics import confusion_matrix, accuracy_score
print(confusion_matrix(y_test, y_test_predict, normalize='all'))
print(accuracy_score(y_test, y_test_predict, normalize=True))

[[0.53072626 0.1452514 ]
 [0.2122905  0.11173184]]
0.6424581005586593


# Random Forest

In [32]:
from sklearn.ensemble import RandomForestClassifier

randomForest = RandomForestClassifier(n_estimators=50, criterion="entropy", random_state=0)

In [33]:
randomForest.fit(X_train_transformed, y_train)
y_test_predict = randomForest.predict(X_test_transformed)

In [34]:
print(confusion_matrix(y_test, y_test_predict, normalize='all'))
print(accuracy_score(y_test, y_test_predict, normalize=True))

[[0.60893855 0.06703911]
 [0.10055866 0.22346369]]
0.8324022346368715
