In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found
for root, _, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Data Exploration

In [None]:
# Load the labelled training split and preview its first rows
train_csv_path = "/kaggle/input/titanic/train.csv"
train_data = pd.read_csv(train_csv_path)
train_data.head()

In [None]:
# Load the test split and preview its first rows
test_csv_path = "/kaggle/input/titanic/test.csv"
test_data = pd.read_csv(test_csv_path)
test_data.head()

In [None]:
# Print the column dtypes and non-null counts of the freshly loaded training frame
train_data.info()

In [None]:
# Summary statistics from describe(), transposed so each described column is a row
train_data.describe().T

In [None]:
# (rows, columns) of the training frame before any cleaning
train_data.shape

# Data Cleaning

In [None]:
# Count the missing entries in each column of the training frame
missing_values = train_data.isna().sum()
print(missing_values)

In [None]:
# Count the missing entries in each column of the test frame
missing_values = test_data.isna().sum()
print(missing_values)

In [None]:
# Impute missing 'Age' values with each frame's own median.
# The filled Series is assigned back to the column rather than calling
# fillna(..., inplace=True) on `df['Age']`: that form is an in-place call on a
# chained selection, which pandas warns about and which does not update the
# parent frame when Copy-on-Write is enabled.
train_data['Age'] = train_data['Age'].fillna(train_data['Age'].median())
test_data['Age'] = test_data['Age'].fillna(test_data['Age'].median())

In [None]:
# Impute missing 'Fare' values in the test frame with its most frequent fare.
# Assign the result back instead of using inplace=True on a chained selection,
# which does not reliably write through to the parent frame.
test_data['Fare'] = test_data['Fare'].fillna(test_data['Fare'].mode()[0])

In [None]:
# Impute missing 'Embarked' values in the training frame with the most frequent port.
# Assign the result back instead of using inplace=True on a chained selection,
# which does not reliably write through to the parent frame.
train_data['Embarked'] = train_data['Embarked'].fillna(train_data['Embarked'].mode()[0])

In [None]:
# Remove the columns that are not used as model inputs from both frames
columns_to_drop = ['PassengerId', 'Name', 'Ticket', 'Cabin']
for frame in (train_data, test_data):
    frame.drop(columns=columns_to_drop, inplace=True)

In [None]:
# Re-inspect dtypes and non-null counts after imputation and column removal
train_data.info()

In [None]:
# Re-count missing entries per training column after the cleaning steps
missing_values = train_data.isna().sum()
print(missing_values)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Box plots of 'Age' and 'Fare', one figure per column.
# `orient` is deliberately not passed: with only `x` supplied seaborn draws the
# box along the x axis and warns that a vertical orientation is ignored.
for column in ('Age', 'Fare'):
    plt.figure(figsize=(8, 6))
    sns.boxplot(x=column, data=train_data)
    plt.title(f'Box Plot of {column}')
    plt.xlabel(column)
    plt.show()

In [None]:
# --- 'Age': fences placed 3 * IQR beyond the quartiles ---
age_values = train_data['Age']
Q1, Q3 = age_values.quantile(0.25), age_values.quantile(0.75)
IQR = Q3 - Q1
lower_bound, upper_bound = Q1 - 3 * IQR, Q3 + 3 * IQR

# Rows whose 'Age' lies strictly outside the fences
outliers_age = train_data[age_values.lt(lower_bound) | age_values.gt(upper_bound)]

# --- 'Fare': same rule ---
# Q1 / Q3 / IQR / lower_bound / upper_bound are overwritten here, so after this
# cell they describe 'Fare'; the capping step in the next cell reads upper_bound.
fare_values = train_data['Fare']
Q1, Q3 = fare_values.quantile(0.25), fare_values.quantile(0.75)
IQR = Q3 - Q1
lower_bound, upper_bound = Q1 - 3 * IQR, Q3 + 3 * IQR
outliers_fare = train_data[fare_values.lt(lower_bound) | fare_values.gt(upper_bound)]

In [None]:
# Drop the rows flagged as extreme 'Age' outliers. The explicit .copy() makes
# train_data an independent frame, so the column assignments that follow are
# not writes into a slice of the original (SettingWithCopyWarning).
train_data = train_data[~train_data.index.isin(outliers_age.index)].copy()

# Cap extreme 'Fare' values. `upper_bound` is the 'Fare' fence: it is the last
# value assigned to that name in the outlier-detection cell.
train_data['Fare'] = train_data['Fare'].clip(upper=upper_bound)

# Apply the same train-derived ceiling to the test fares so both frames go
# through identical preprocessing before binning and scaling.
test_data['Fare'] = test_data['Fare'].clip(upper=upper_bound)

In [None]:
# Box plots of 'Age' and 'Fare' after outlier handling, one figure per column.
# `orient` is deliberately not passed: with only `x` supplied seaborn draws the
# box along the x axis and warns that a vertical orientation is ignored.
for column in ('Age', 'Fare'):
    plt.figure(figsize=(8, 6))
    sns.boxplot(x=column, data=train_data)
    plt.title(f'Box Plot of {column}')
    plt.xlabel(column)
    plt.show()

# Data Visualization

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Bar chart of the number of passengers in each 'Survived' class
plt.figure(figsize=(6, 4))
survival_ax = sns.countplot(x='Survived', data=train_data)
survival_ax.set_title('Distribution of Survival')
survival_ax.set_xlabel('Survived')
survival_ax.set_ylabel('Count')
plt.show()

In [None]:
# 2x2 grid of histograms (20 bins, KDE overlay) for the numeric columns
plt.figure(figsize=(16, 8))

for position, column in enumerate(['Age', 'Fare', 'SibSp', 'Parch'], start=1):
    plt.subplot(2, 2, position)
    sns.histplot(data=train_data, x=column, bins=20, kde=True)
    plt.title(f'{column} Distribution')
    plt.xlabel(column)
    plt.ylabel('Frequency')

plt.tight_layout()
plt.show()

In [None]:
# Side-by-side count plots for the categorical columns: (column, panel title)
count_panels = [
    ('Sex', 'Gender Distribution'),
    ('Pclass', 'Passenger Class Distribution'),
    ('Embarked', 'Embarked Distribution'),
]

plt.figure(figsize=(18, 4))

for position, (column, panel_title) in enumerate(count_panels, start=1):
    plt.subplot(1, 3, position)
    sns.countplot(data=train_data, x=column)
    plt.title(panel_title)
    plt.xlabel(column)
    plt.ylabel('Count')

plt.tight_layout()
plt.show()

In [None]:
# One-hot encode the categorical columns into a separate frame (train_data itself is left untouched)
categorical_columns = ['Sex', 'Pclass', 'Embarked']
train_data_encoded = pd.get_dummies(train_data, columns=categorical_columns)

In [None]:
# Pairwise correlation between all columns of the one-hot encoded frame
# (DataFrame.corr() with its default arguments)
correlation_matrix = train_data_encoded.corr()

In [None]:
# Render the correlation matrix as a colour-coded heatmap
plt.figure(figsize=(10, 8))
heatmap_ax = sns.heatmap(correlation_matrix, cmap='coolwarm')
heatmap_ax.set_title('Correlation Heatmap')
plt.show()

In [None]:
# Correlation of each feature with the target variable 'Survived'.
# The target's own entry (its self-correlation) is removed by label: a
# positional slice such as [:-1] would instead discard whichever column happens
# to be last in the encoded frame and keep 'Survived' itself in the plot.
correlation_with_outcome = correlation_matrix['Survived'].drop('Survived').sort_values()

# Bar plot of the sorted correlations
plt.figure(figsize=(10, 6))
correlation_with_outcome.plot(kind='bar', color='skyblue')
plt.title('Correlation with Survival (Target Variable)')
plt.xlabel('Features')
plt.ylabel('Correlation')
plt.xticks(rotation=45)
plt.show()

# Feature Engineering

In [None]:
# Preview the cleaned training frame before new features are added
train_data.head()

In [None]:
# 'FamilySize' = siblings/spouses aboard + parents/children aboard, added to both frames
for frame in (train_data, test_data):
    frame['FamilySize'] = frame['SibSp'] + frame['Parch']

In [None]:
# Bin edges for 'Age' and the label attached to each resulting interval
age_bins = [0, 18, 30, 50, 80]
age_labels = ['0-18', '19-30', '31-50', '51-80']

# Add the binned 'AgeCategory' column to both frames using the same edges
for frame in (train_data, test_data):
    frame['AgeCategory'] = pd.cut(frame['Age'], bins=age_bins, labels=age_labels)

In [None]:
# Bin edges for 'Fare' and the label attached to each resulting interval
fare_bins = [0, 20, 50, 100, 200, 300, 600]
fare_labels = ['0-20', '21-50', '51-100', '101-200', '201-300', '301-600']

# Create a new column 'FareCategory' with fare bins.
# include_lowest=True closes the first interval on the left: pd.cut bins are
# right-closed by default, so without it a fare of exactly 0 falls outside
# every bin and becomes NaN (and then an all-zero row after one-hot encoding).
train_data['FareCategory'] = pd.cut(
    train_data['Fare'], bins=fare_bins, labels=fare_labels, include_lowest=True
)
test_data['FareCategory'] = pd.cut(
    test_data['Fare'], bins=fare_bins, labels=fare_labels, include_lowest=True
)

In [None]:
# Replace the categorical columns with one-hot indicator columns in both frames
dummy_columns = ['AgeCategory', 'FareCategory', 'Sex', 'Embarked']
train_data = pd.get_dummies(train_data, columns=dummy_columns)
test_data = pd.get_dummies(test_data, columns=dummy_columns)

In [None]:
# Interaction feature: passenger age multiplied by ticket class, added to both frames
for frame in (train_data, test_data):
    frame['AgeClass'] = frame['Age'] * frame['Pclass']

In [None]:
# Preview the training frame after feature engineering and encoding
train_data.head()

In [None]:
# Preview the test frame after the same feature engineering and encoding
test_data.head()

In [None]:
# (rows, columns) of the training frame after feature engineering
train_data.shape

In [None]:
# Column dtypes and non-null counts after feature engineering
train_data.info()

In [None]:
from sklearn.preprocessing import StandardScaler
import numpy as np

# Log-transform 'Fare' BEFORE standardising (np.log1p handles zero fares).
# The order matters: standardised values are centred on 0 and can be <= -1,
# where log1p returns -inf or NaN. Raw fares are expected to be non-negative,
# so the transform is well defined at this point.
train_data['Fare'] = np.log1p(train_data['Fare'])
test_data['Fare'] = np.log1p(test_data['Fare'])

# Initialize the scaler
scaler = StandardScaler()

# Standardise 'Age' and 'Fare'. The scaler is fitted on the training frame only
# and then re-used for the test frame via transform(): re-fitting on the test
# frame would put its values on a different scale from the one the model is
# trained on.
train_data[['Age', 'Fare']] = scaler.fit_transform(train_data[['Age', 'Fare']])
test_data[['Age', 'Fare']] = scaler.transform(test_data[['Age', 'Fare']])

# Data Splitting

In [None]:
from sklearn.model_selection import train_test_split

# Separate the target column from the feature columns
y = train_data['Survived']
X = train_data.drop(columns='Survived')

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Report the size of each partition
print("Training set shape (X_train, y_train):", X_train.shape, y_train.shape)
print("Testing set shape (X_test, y_test):", X_test.shape, y_test.shape)

# Modeling using Random Forest

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Hyperparameter grid explored by the search (3 * 4 * 3 * 3 = 108 combinations)
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# 5-fold cross-validated grid search scored on accuracy.
# n_jobs=-1 runs the candidate fits in parallel on all available cores; the
# selected model is unchanged because the forest's random_state is fixed.
grid_search = GridSearchCV(
    RandomForestClassifier(random_state=42),
    param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
)

# Perform the grid search
grid_search.fit(X_train, y_train)

# Best hyperparameters and the corresponding model.
# GridSearchCV uses refit=True by default, so best_estimator_ has already been
# retrained on the whole of X_train with the best parameters; fitting it a
# second time would only repeat that work.
best_params = grid_search.best_params_
best_model_rf = grid_search.best_estimator_
best_model_rf

In [None]:
# Score the tuned Random Forest on the held-out split
y_pred_rf_best = best_model_rf.predict(X_test)

# Accuracy, confusion matrix and per-class report for those predictions
accuracy_rf_best = accuracy_score(y_test, y_pred_rf_best)
confusion_matrix_rf_best = confusion_matrix(y_test, y_pred_rf_best)
classification_report_rf_best = classification_report(y_test, y_pred_rf_best)

# Print the evaluation results for the tuned model
print("Random Forest Accuracy with Tuned Hyperparameters:", accuracy_rf_best)
for heading, result in (
    ("Confusion Matrix:", confusion_matrix_rf_best),
    ("Classification Report:", classification_report_rf_best),
):
    print(heading)
    print(result)

# Submission

In [None]:
# Predict on the competition test frame.
# The columns are selected by name in the exact order the model was trained on,
# so a column that is missing after the separate one-hot encoding fails loudly
# with a KeyError and a differing column order cannot reach the model.
test_predictions = best_model_rf.predict(test_data[X_train.columns])

In [None]:
# 'PassengerId' was dropped from test_data during cleaning, so re-read the
# original test CSV and take the identifier column from it
original_test_path = "/kaggle/input/titanic/test.csv"
test = pd.read_csv(original_test_path)
test_ids = test.loc[:, 'PassengerId']

In [None]:
# Pair each passenger id with its predicted label and write the submission file
submission_columns = {
    'PassengerId': test_ids,
    'Survived': test_predictions,
}
submission_df = pd.DataFrame(submission_columns)
submission_df.to_csv('submission.csv', index=False)
print("Your submission was successfully saved!")