# Titanic: Machine Learning from Disaster

## Import Required Libraries

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix

## Loading the Data

In [None]:
# Read the training and test CSV files from the current working directory.
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

# Explore the training data
print(train_df.head())
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
train_df.info()

## Data Preprocessing

In [None]:
# Imputing missing values
# Missing ages are replaced by the median age. The fitted imputer is kept in
# `impute_age` so the test set can be filled with the same value later.
# NOTE(review): the imputer and the scaler below are fit on the full training
# frame, i.e. before the train/validation split performed later in the file,
# so validation rows influence the fitted statistics.
impute_age = SimpleImputer(strategy="median")
train_df['Age'] = impute_age.fit_transform(train_df[['Age']])

# Collapse 'Cabin' into a 0/1 flag: 1 when a cabin value is present.
train_df['Cabin'] = train_df['Cabin'].notna().astype(int)

# Converting categorical variables
train_df['Sex'] = train_df['Sex'].map({'male': 0, 'female': 1})

# Fill and one-hot encode 'Embarked' only when the column exists. Both steps
# sit under the same guard so that a missing column produces the message
# below instead of a KeyError from get_dummies.
if 'Embarked' in train_df.columns:
    most_common_port = train_df['Embarked'].mode()[0]
    train_df['Embarked'] = train_df['Embarked'].fillna(most_common_port)
    train_df = pd.get_dummies(train_df, columns=['Embarked'])
else:
    print("'Embarked' column not found in the dataset.")

# Feature engineering
# FamilySize counts accompanying relatives (siblings/spouses + parents/children);
# IsAlone is 1 when that count is zero.
train_df['FamilySize'] = train_df['SibSp'] + train_df['Parch']
train_df['IsAlone'] = (train_df['FamilySize'] == 0).astype(int)

# Dropping unnecessary features
# SibSp and Parch are dropped after being summed into FamilySize above.
train_df = train_df.drop(columns=['Ticket', 'Name', 'SibSp', 'Parch'])

# Scaling numerical features
# The fitted scaler is kept in `scaler` so the test set is transformed with
# the training mean and standard deviation.
scaler = StandardScaler()
train_df[['Age', 'Fare', 'FamilySize']] = scaler.fit_transform(train_df[['Age', 'Fare', 'FamilySize']])
train_df

## Exploratory Data Analysis

In [None]:
# Set the aesthetics for the plots
sns.set(style="whitegrid")

# Analyzing the Overall Survival Rate
overall_survival_rate = train_df['Survived'].mean()

# Age, Fare and FamilySize were standardized during preprocessing, so plotting
# train_df directly would label the axes with z-scores. Plot from a copy whose
# scaled columns are converted back to their original units; train_df itself
# is left untouched for model training.
scaled_cols = ['Age', 'Fare', 'FamilySize']
plot_df = train_df.copy()
plot_df[scaled_cols] = scaler.inverse_transform(train_df[scaled_cols])
# Remove floating-point round-off so family sizes form clean integer categories.
plot_df['FamilySize'] = plot_df['FamilySize'].round().astype(int)

# Creating plots to analyze feature correlations with survival
fig, axes = plt.subplots(nrows=3, ncols=2, figsize=(15, 15))

# Pclass vs Survival
sns.barplot(x='Pclass', y='Survived', data=plot_df, ax=axes[0, 0])
axes[0, 0].set_title('Pclass vs Survival')

# Sex vs Survival (0 = male, 1 = female after the earlier mapping)
sns.barplot(x='Sex', y='Survived', data=plot_df, ax=axes[0, 1])
axes[0, 1].set_title('Sex vs Survival')

# Age vs Survival
sns.histplot(data=plot_df, x='Age', hue='Survived', multiple='stack', bins=30, ax=axes[1, 0])
axes[1, 0].set_title('Age vs Survival')

# Fare vs Survival
sns.histplot(data=plot_df, x='Fare', hue='Survived', multiple='stack', bins=40, ax=axes[1, 1])
axes[1, 1].set_title('Fare vs Survival')

# FamilySize vs Survival
sns.barplot(x='FamilySize', y='Survived', data=plot_df, ax=axes[2, 0])
axes[2, 0].set_title('FamilySize vs Survival')

# IsAlone vs Survival
sns.barplot(x='IsAlone', y='Survived', data=plot_df, ax=axes[2, 1])
axes[2, 1].set_title('IsAlone vs Survival')

plt.tight_layout()
plt.show()

print("Overall Survival Rate on Training Data:", overall_survival_rate)

## Model Training

In [None]:
# Split the frame into predictors and target; the target and the passenger
# identifier are both excluded from the feature matrix.
non_feature_cols = ['Survived', 'PassengerId']
X = train_df.drop(columns=non_feature_cols)
y = train_df['Survived']

# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit a 100-tree random forest on the training portion, seeded for
# reproducibility.
random_forest_model = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
)
random_forest_model.fit(X_train, y_train)

## Model Evaluation

In [None]:
# Score the fitted model on the held-out rows.
y_pred = random_forest_model.predict(X_test)

# Each metric compares the held-out labels against the predictions.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

# Report the scalar metrics one per line, then the confusion matrix.
for label, score in (
    ("Accuracy:", accuracy),
    ("Precision:", precision),
    ("Recall:", recall),
):
    print(label, score)
print("Confusion Matrix:\n", conf_matrix)

## Preparing the Test Data for Predictions

In [None]:
# Apply the same preprocessing to the test data
# Ages are filled with the imputer already fitted on the training data.
test_df['Age'] = impute_age.transform(test_df[['Age']])
# Missing fares are filled with the test set's own median fare.
test_df['Fare'] = test_df['Fare'].fillna(test_df['Fare'].median())
test_df['Cabin'] = test_df['Cabin'].notna().astype(int)
test_df['Sex'] = test_df['Sex'].map({'male': 0, 'female': 1})

# One-hot encode 'Embarked' the same way as the training data, i.e. without
# drop_first. Dropping the first level here would remove a dummy column the
# model was trained on; it would then be re-added as all zeros below, so every
# passenger from that port would be encoded as having no port at all.
if 'Embarked' in test_df.columns:
    test_df = pd.get_dummies(test_df, columns=['Embarked'])

# Create 'FamilySize' and 'IsAlone' features, mirroring the training step.
# FamilySize is required by the scaler below, so it is computed unconditionally.
test_df['FamilySize'] = test_df['SibSp'] + test_df['Parch']
test_df['IsAlone'] = (test_df['FamilySize'] == 0).astype(int)

# Drop unnecessary columns; errors='ignore' skips any that are already absent.
test_df = test_df.drop(columns=['Ticket', 'Name', 'SibSp', 'Parch'], errors='ignore')

# Scale numerical features with the scaler fitted on the training data.
test_df[['Age', 'Fare', 'FamilySize']] = scaler.transform(test_df[['Age', 'Fare', 'FamilySize']])

# Align with the training features in one step: columns the model expects but
# the test frame lacks are added filled with 0, columns the model never saw
# (such as PassengerId) are discarded, and the order matches X_train.
test_df = test_df.reindex(columns=X_train.columns, fill_value=0)
test_df

## Making Predictions on the Test Set

In [None]:
# Making predictions on the test dataset
test_predictions = random_forest_model.predict(test_df)

# By this point test_df contains only the model's feature columns and carries
# the default positional index from read_csv, so `index + 1` would number the
# rows 1..n instead of using the real passenger IDs (the standard Titanic
# test file does not start at 1). Reload the IDs from the source file; the
# preprocessing never drops or reorders rows, so they line up row-for-row
# with the predictions.
passenger_ids = pd.read_csv('test.csv', usecols=['PassengerId'])['PassengerId']

# Creating a DataFrame for the predictions
prediction_df = pd.DataFrame({
    'PassengerId': passenger_ids.to_numpy(),
    'Survived': test_predictions
})

# Displaying the first few predictions
prediction_df.head(20)