---
### MACHINE LEARNING DATA SCIENCE PROJECT
#### Titanic Survival Prediction
---

In [None]:

# 1️⃣ Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Apply the white-grid theme to every plot in this notebook.
# `sns.set` is only an alias of `set_theme` that seaborn may remove in a future
# release, so call the preferred function directly.
sns.set_theme(style="whitegrid")

In [2]:
# Sanity check: reaching this line means the import cell above ran without raising.
print("Libraries Imported Successfully")

Libraries Imported Successfully


In [None]:
# Load seaborn's example "titanic" dataset and keep the raw frame in `titanic`.
titanic = sns.load_dataset(name='titanic')

# All later cleaning works on `df`, a deep copy, so `titanic` stays unmodified.
df = titanic.copy(deep=True)

# Preview the first five rows.
df.head(n=5)


In [None]:
print("Dataset Info:\n")
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would add a stray "None" line after the report.
df.info()

print("\nMissing Values:\n", df.isnull().sum())

# Simple plots
# Explicit figure/axes objects keep the plot independent of pyplot's global state.
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(x='survived', data=df, ax=ax)
ax.set_title("Survival Count")
plt.show()

In [None]:
# Drop columns not needed.
# Derive `df` from the untouched `titanic` frame rather than dropping in place, so
# re-running this cell does not raise KeyError on columns that are already gone.
df = titanic.drop(columns=['deck', 'embark_town', 'alive', 'who', 'adult_male', 'class'])

# Fill missing values.
# Assign the result back to the column: calling fillna(..., inplace=True) on
# `df[col]` is chained assignment, which pandas warns about and which does not
# update `df` once Copy-on-Write is enabled.
df['age'] = df['age'].fillna(df['age'].median())
df['embarked'] = df['embarked'].fillna(df['embarked'].mode()[0])

# Encode categorical features.
# Use one encoder per column and keep each in `encoders`, so every mapping can be
# inverted later. `le` still ends up bound to the last fitted encoder, as before.
encoders = {}
for col in ['sex', 'embarked']:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    encoders[col] = le

# Features and target
X = df.drop(columns='survived')
y = df['survived']

# Feature scaling
# NOTE(review): the scaler (and the median/mode imputation above) is fit on every
# row, i.e. before the train/test split performed in a later cell, so statistics
# from the held-out rows influence preprocessing. Consider fitting these on the
# training rows only.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, random_state=42, test_size=0.2)

In [None]:
# Random forest with 100 trees; the fixed seed makes training repeatable.
model = RandomForestClassifier(
    random_state=42,
    n_estimators=100,
)

# Fit on the training portion only; the fitted estimator is the cell's output.
model.fit(X=X_train, y=y_train)

In [None]:
# Predict labels for the held-out rows.
y_pred = model.predict(X_test)

# Each metric is computed and reported in turn; the printed order is
# accuracy, confusion matrix, classification report.
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

conf_matrix = confusion_matrix(y_test, y_pred)
print("\nConfusion Matrix:\n", conf_matrix)

report = classification_report(y_test, y_pred)
print("\nClassification Report:\n", report)

In [None]:
# Pair each importance score from the fitted forest with its column name in `X`,
# ordered from most to least important.
feat_importances = pd.Series(
    model.feature_importances_,
    index=X.columns,
).sort_values(ascending=False)

# Horizontal bar chart: importance on the x-axis, feature name on the y-axis.
fig, ax = plt.subplots(figsize=(8, 5))
sns.barplot(x=feat_importances, y=feat_importances.index, ax=ax)
ax.set_title("Feature Importance")
plt.show()

# 9️⃣ Conclusion
# We now have a complete ML pipeline:
# - Data loading and cleaning
# - Exploratory Data Analysis (EDA)
# - Feature encoding and scaling
# - Model training
# - Evaluation
# - Feature importance visualization