In [None]:
# TITANIC SURVIVAL ANALYSIS

In [None]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


In [None]:

# Read the training CSV into a DataFrame.
# NOTE(review): this is an absolute path — the notebook only runs where this
# exact file exists.
df = pd.read_csv(filepath_or_buffer="/content/train(1).csv")
# Preview the first five rows as the cell output.
df.head(n=5)


In [None]:

# Print a concise summary of the frame: column names, non-null counts and dtypes.
df.info()


In [None]:

# Number of missing values in each column.
df.isna().sum()


In [None]:

# Fill missing values by assigning the result back to the column.
# Calling `df[col].fillna(..., inplace=True)` acts on a temporary Series:
# pandas 2.x emits a FutureWarning for that pattern and, with Copy-on-Write
# enabled, it leaves `df` unchanged.
# NOTE(review): the median and mode are computed over every row, i.e. before the
# train/test split that happens further down, so held-out rows influence the
# imputed values. Consider imputing inside the preprocessing step instead.
df['Age'] = df['Age'].fillna(df['Age'].median())
df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])

# Remove the 'Cabin' column. Rebinding `df` (instead of inplace=True) together
# with errors='ignore' lets this cell be re-run without raising KeyError.
df = df.drop(columns=['Cabin'], errors='ignore')


In [None]:

# Bar chart of row counts for each value of 'Survived'.
# Uses an explicit Axes so the title and label attach to this figure only.
fig, ax = plt.subplots()
sns.countplot(data=df, x='Survived', ax=ax)
ax.set(title='Passenger count by Survived', ylabel='Number of passengers')
plt.show()


In [None]:

# Bar chart of row counts per 'Sex', split by 'Survived' via the hue.
# Uses an explicit Axes so the title and label attach to this figure only.
fig, ax = plt.subplots()
sns.countplot(data=df, x='Sex', hue='Survived', ax=ax)
ax.set(title='Passenger count by Sex and Survived', ylabel='Number of passengers')
plt.show()


In [None]:

# Bar chart of row counts per 'Pclass', split by 'Survived' via the hue.
# Uses an explicit Axes so the title and label attach to this figure only.
fig, ax = plt.subplots()
sns.countplot(data=df, x='Pclass', hue='Survived', ax=ax)
ax.set(title='Passenger count by Pclass and Survived', ylabel='Number of passengers')
plt.show()


In [None]:

# Box plot of 'Age' for each value of 'Survived'.
# Uses an explicit Axes so the title attaches to this figure only.
fig, ax = plt.subplots()
sns.boxplot(data=df, x='Survived', y='Age', ax=ax)
ax.set(title='Age distribution by Survived')
plt.show()


In [None]:

# Remove columns that are not used as model features below.
# Rebinding `df` (instead of inplace=True) together with errors='ignore' keeps
# this cell idempotent: re-running it no longer raises KeyError once the
# columns are already gone.
df = df.drop(columns=['PassengerId', 'Name', 'Ticket'], errors='ignore')


In [None]:

# Target vector: the 'Survived' column.
y = df['Survived']
# Feature matrix: every remaining column.
X = df.drop(columns=['Survived'])


In [None]:

# Hold out 20% of the rows for evaluation. stratify=y keeps the class balance of
# the target the same in both parts; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)


In [None]:

# Columns routed to each branch of the preprocessing step.
numerical_features = ['Age', 'Fare', 'Pclass', 'SibSp', 'Parch']
categorical_features = ['Sex', 'Embarked']

# Standardize the numeric columns and one-hot encode the categorical ones.
# handle_unknown='ignore' makes categories unseen during fit encode as all
# zeros instead of raising at transform time.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
    ]
)


In [None]:

# Fit the preprocessor on the training rows only, then reuse the fitted
# transformer unchanged on the held-out rows.
X_train_p = preprocessor.fit_transform(X=X_train)
X_test_p = preprocessor.transform(X=X_test)


In [None]:

# Logistic regression: fit on the preprocessed training rows.
# `fit` returns the estimator itself, so `lr` is the fitted model.
lr = LogisticRegression(max_iter=1000).fit(X_train_p, y_train)
y_pred_lr = lr.predict(X_test_p)
# Accuracy on the held-out rows, shown as the cell output.
accuracy_score(y_true=y_test, y_pred=y_pred_lr)


In [None]:

# Decision tree with capped depth and a minimum leaf size.
# `fit` returns the estimator itself, so `dt` is the fitted model.
dt = DecisionTreeClassifier(
    max_depth=6,
    min_samples_leaf=20,
    random_state=42,
).fit(X_train_p, y_train)
y_pred_dt = dt.predict(X_test_p)
# Accuracy on the held-out rows, shown as the cell output.
accuracy_score(y_true=y_test, y_pred=y_pred_dt)


In [None]:

# Random forest of 300 trees, each limited to depth 8.
# `fit` returns the estimator itself, so `rf` is the fitted model.
rf = RandomForestClassifier(
    n_estimators=300,
    max_depth=8,
    random_state=42,
).fit(X_train_p, y_train)
y_pred_rf = rf.predict(X_test_p)
# Accuracy on the held-out rows, shown as the cell output.
accuracy_score(y_true=y_test, y_pred=y_pred_rf)
