# 🚢 Titanic Survival Prediction

This project uses the Titanic dataset to build predictive models that estimate whether a passenger survived. We apply two machine learning models, Logistic Regression and Random Forest, and walk through data preprocessing, feature engineering, and model evaluation.

## 📥 Load Data

In [None]:
import pandas as pd

# Read both CSV files from the working directory into DataFrames.
train, test = (pd.read_csv(path) for path in ('train.csv', 'test.csv'))

# Report (rows, columns) for each frame, then preview the first training rows.
for label, frame in (("Train shape:", train), ("Test shape:", test)):
    print(label, frame.shape)
train.head()

## 🧹 Data Overview and Missing Values

In [None]:
# Print the column summary, then count the missing values in each column.
train.info()
train.isna().sum()

## 📊 Exploratory Data Analysis

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Passenger counts per 'Sex' value, with bars split by the 'Survived' column.
sex_ax = sns.countplot(x='Sex', hue='Survived', data=train)
sex_ax.set_title('Survival Count by Gender')
plt.show()

In [None]:
# Passenger counts per 'Pclass' value, with bars split by the 'Survived' column.
pclass_ax = sns.countplot(x='Pclass', hue='Survived', data=train)
pclass_ax.set_title('Survival Count by Passenger Class')
plt.show()

In [None]:
# 30-bin histogram of 'Age' with a KDE overlay, coloured by the 'Survived' column.
age_ax = sns.histplot(x='Age', hue='Survived', data=train, bins=30, kde=True)
age_ax.set_title('Age Distribution by Survival')
plt.show()

## 🛠️ Data Preprocessing

In [None]:
# Encode 'Sex' as a numeric feature (male -> 0, female -> 1).
sex_codes = {'male': 0, 'female': 1}
train['Sex'] = train['Sex'].map(sex_codes)
test['Sex'] = test['Sex'].map(sex_codes)

# Fill missing values.
# Every imputation statistic is computed on the training set and reused for the
# test set, so both frames are preprocessed identically and nothing is learned
# from the test data.
age_median = train['Age'].median()
fare_median = train['Fare'].median()
embarked_mode = train['Embarked'].mode()[0]

train['Age'] = train['Age'].fillna(age_median)
test['Age'] = test['Age'].fillna(age_median)
train['Embarked'] = train['Embarked'].fillna(embarked_mode)
test['Embarked'] = test['Embarked'].fillna(embarked_mode)
test['Fare'] = test['Fare'].fillna(fare_median)

# Drop unused columns
columns_to_drop = ['Cabin', 'Ticket', 'Name']
train = train.drop(columns=columns_to_drop)
test = test.drop(columns=columns_to_drop)

# One-hot encoding.
# The training frame defines which 'Embarked_*' indicator columns exist (its
# first category is dropped as the baseline). The test frame is encoded in full
# and then reindexed to exactly those indicators: extra ones are discarded and
# missing ones are added as all-zero columns. This keeps the two frames
# column-compatible even when a port is absent from one of the splits.
train = pd.get_dummies(train, columns=['Embarked'], drop_first=True)
embarked_columns = [col for col in train.columns if col.startswith('Embarked_')]

test_encoded = pd.get_dummies(test, columns=['Embarked'])
passthrough_columns = [
    col for col in test_encoded.columns if not col.startswith('Embarked_')
]
test = test_encoded.reindex(
    columns=passthrough_columns + embarked_columns, fill_value=0
)


## 🤖 Logistic Regression

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Split the training frame into target and features, and select the same
# feature columns from the test frame. Any remaining NaN becomes 0.
y = train['Survived']
X = train.drop(columns='Survived').fillna(0)
X_test = test[X.columns].fillna(0)

# Hold out 20% of the labelled rows for validation (fixed seed).
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# fit() returns the estimator itself, so construction and training can be chained.
model = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Score the model on the held-out rows.
y_pred = model.predict(X_val)
accuracy = accuracy_score(y_val, y_pred)
print(f'Logistic Regression Accuracy: {accuracy:.4f}')

In [None]:
# Feature Importance
import numpy as np
# Pair every feature with its fitted coefficient (first row of coef_) and the
# coefficient's magnitude, then list the features from largest magnitude down.
coefficients = model.coef_[0]
importance = pd.DataFrame(
    {
        'Feature': X.columns,
        'Coefficient': coefficients,
        'Abs_Coefficient': np.abs(coefficients),
    }
)
importance.sort_values(by='Abs_Coefficient', ascending=False)

## 🌲 Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Train a 100-tree random forest (fixed seed) on the same split and score it on
# the held-out validation rows.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42).fit(
    X_train, y_train
)
rf_pred = rf_model.predict(X_val)
rf_accuracy = accuracy_score(y_val, rf_pred)
print(f'Random Forest Accuracy: {rf_accuracy:.4f}')

In [None]:
# Tabulate the forest's per-feature importances, highest first.
rf_importance = pd.DataFrame(
    {'Feature': X.columns, 'Importance': rf_model.feature_importances_}
)
rf_importance = rf_importance.sort_values(by='Importance', ascending=False)
rf_importance

## 🧾 Final Model (without PassengerId) and Submission File

In [None]:
# Remove PassengerId from the feature matrices.
# errors='ignore' keeps this cell re-runnable: X is rebound here, so on a second
# run the column is already gone and a plain drop would raise KeyError.
X = X.drop(columns='PassengerId', errors='ignore')
X_test_final = X_test.drop(columns='PassengerId', errors='ignore')

# Same 80/20 split and seed as before, now on the reduced feature set.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Train the final random forest and report its accuracy on the validation rows.
final_model = RandomForestClassifier(n_estimators=100, random_state=42)
final_model.fit(X_train, y_train)
val_pred = final_model.predict(X_val)
final_accuracy = accuracy_score(y_val, val_pred)
print(f'Final Random Forest Accuracy (no PassengerId): {final_accuracy:.4f}')

In [None]:
# Re-read the raw test file to get the PassengerId column as it appears on disk.
original_test = pd.read_csv('test.csv')

# Predict with the final model and attach the predictions to the ids.
test_predictions = final_model.predict(X_test_final)
submission = original_test[['PassengerId']].assign(Survived=test_predictions)

# Write the two-column file without the index, then preview it.
submission.to_csv('titanic_submission.csv', index=False)
submission.head()