In [28]:
import pandas as pd 
import numpy as np
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [29]:
df = pd.read_csv('train.csv')
df.drop('Cabin', axis=1, inplace=True)

In [31]:
df.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Embarked         2
dtype: int64

In [32]:
df['Age'] = df.groupby(['Pclass', 'Sex'])['Age'].transform(lambda x: x.fillna(x.median()))

In [35]:
df['Fare'] = df.groupby(['Pclass'])['Fare'].transform(lambda x: x.fillna(x.mean()))

In [48]:
df['Embarked'].fillna(df['Embarked'].mode()[0], inplace=True)

In [49]:
df.isna().sum()

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       0
FamilySize     0
dtype: int64

In [50]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,FamilySize
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,S,2
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C,2
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,S,1
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,S,2
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,S,1


In [51]:
df['FamilySize'] = df['SibSp'] + df['Parch'] + 1
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,FamilySize
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,S,2
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C,2
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,S,1
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,S,2
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,S,1


In [52]:
df['Pclass'].unique()
df['Embarked'].unique()

array(['S', 'C', 'Q'], dtype=object)

In [53]:
le = LabelEncoder()
df['Sex'] = le.fit_transform(df['Sex'])
df['Embarked'] = le.fit_transform(df['Embarked'])
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,FamilySize
0,1,0,3,"Braund, Mr. Owen Harris",1,22.0,1,0,A/5 21171,7.25,2,2
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38.0,1,0,PC 17599,71.2833,0,2
2,3,1,3,"Heikkinen, Miss. Laina",0,26.0,0,0,STON/O2. 3101282,7.925,2,1
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,35.0,1,0,113803,53.1,2,2
4,5,0,3,"Allen, Mr. William Henry",1,35.0,0,0,373450,8.05,2,1


In [54]:
x = df.drop(['Survived', 'PassengerId', 'Name', 'Ticket', 'SibSp', 'Parch'], axis=1)
y = df['Survived']

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

scaler = StandardScaler()
x_train[['Age', 'Fare']] = scaler.fit_transform(x_train[['Age', 'Fare']])
x_test[['Age', 'Fare']] = scaler.transform(x_test[['Age', 'Fare']])

model = LogisticRegression()
model.fit(x_train, y_train)
y_pred = model.predict(x_test)

accuracy = accuracy_score(y_test, y_pred)
CM = confusion_matrix(y_test, y_pred)
CR = classification_report(y_test, y_pred)

print(f"Accuracy : {round(accuracy, 2)}")
print(f"Confusion Matrix : \n{CM}")
print(f"Classification Report : \n{CR}")

Accuracy : 0.82
Confusion Matrix : 
[[92 13]
 [19 55]]
Classification Report : 
              precision    recall  f1-score   support

           0       0.83      0.88      0.85       105
           1       0.81      0.74      0.77        74

    accuracy                           0.82       179
   macro avg       0.82      0.81      0.81       179
weighted avg       0.82      0.82      0.82       179



In [55]:
from sklearn.neighbors import KNeighborsClassifier

x = df.drop(['Survived', 'PassengerId', 'Name', 'Ticket', 'SibSp', 'Parch'], axis=1)
y = df['Survived']

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

scaler = StandardScaler()
x_train[['Age', 'Fare']] = scaler.fit_transform(x_train[['Age', 'Fare']])
x_test[['Age', 'Fare']] = scaler.transform(x_test[['Age', 'Fare']])

model = KNeighborsClassifier(n_neighbors=5)
model.fit(x_train, y_train)
y_pred = model.predict(x_test)

accuracy = accuracy_score(y_test, y_pred)
CM = confusion_matrix(y_test, y_pred)
CR = classification_report(y_test, y_pred)

print(f"Accuracy : {round(accuracy, 2)}")
print(f"Confusion Matrix : \n{CM}")
print(f"Classification Report : \n{CR}")

Accuracy : 0.8
Confusion Matrix : 
[[90 15]
 [20 54]]
Classification Report : 
              precision    recall  f1-score   support

           0       0.82      0.86      0.84       105
           1       0.78      0.73      0.76        74

    accuracy                           0.80       179
   macro avg       0.80      0.79      0.80       179
weighted avg       0.80      0.80      0.80       179



In [56]:
from sklearn.ensemble import RandomForestClassifier

x = df.drop(['Survived', 'PassengerId', 'Name', 'Ticket', 'SibSp', 'Parch'], axis=1)
y = df['Survived']

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# scaler = StandardScaler()
# x_train[['Age', 'Fare']] = scaler.fit_transform(x_train[['Age', 'Fare']])
# x_test[['Age', 'Fare']] = scaler.transform(x_test[['Age', 'Fare']])

model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(x_train, y_train)
y_pred = model.predict(x_test)

accuracy = accuracy_score(y_test, y_pred)
CM = confusion_matrix(y_test, y_pred)
CR = classification_report(y_test, y_pred)

print(f"Accuracy : {round(accuracy, 2)}")
print(f"Confusion Matrix : \n{CM}")
print(f"Classification Report : \n{CR}")

Accuracy : 0.84
Confusion Matrix : 
[[92 13]
 [16 58]]
Classification Report : 
              precision    recall  f1-score   support

           0       0.85      0.88      0.86       105
           1       0.82      0.78      0.80        74

    accuracy                           0.84       179
   macro avg       0.83      0.83      0.83       179
weighted avg       0.84      0.84      0.84       179



In [57]:
from sklearn.pipeline import Pipeline

models = {
    "Logistic Regression": Pipeline([
        ("scaler", StandardScaler()),
        ("model", LogisticRegression(max_iter=1000))
    ]),
    "KNN": Pipeline([
        ("scaler", StandardScaler()),
        ("model", KNeighborsClassifier(n_neighbors=5))
    ]),
    "Random Forest": RandomForestClassifier(
        n_estimators=100,
        random_state=42
    )
}
    
for name, model in models.items():
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)
    
    accuracy = accuracy_score(y_test, y_pred)
    CM = confusion_matrix(y_test, y_pred)
    CR = classification_report(y_test, y_pred)
    
    print(f"{name} - Accuracy : {round(accuracy, 2)}")
    # print(f"{name} - Confusion Matrix : \n{CM}")
    # print(f"{name} - Classification Report : \n{CR}")

    import joblib
    joblib.dump(models["Random Forest"], 'titanic_random_forest_model.pkl')

Logistic Regression - Accuracy : 0.82
KNN - Accuracy : 0.79
Random Forest - Accuracy : 0.84
