In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Load the train dataset
train_data_path = r"C:\Users\kunta\Downloads\titanic\train.csv"  # replace with your actual train dataset path
train_df = pd.read_csv(train_data_path)

# Data Exploration
print(train_df.head())
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() would emit an extra "None" line.
train_df.info()
print(train_df.describe())

# Data Cleaning
# Handling missing values.
# The filled column is assigned back instead of calling
# `train_df[col].fillna(..., inplace=True)`: that chained in-place form acts on
# a temporary Series, is deprecated in pandas 2.x, and leaves the DataFrame
# unchanged once Copy-on-Write is enabled.
train_df['Age'] = train_df['Age'].fillna(train_df['Age'].median())
train_df['Embarked'] = train_df['Embarked'].fillna(train_df['Embarked'].mode()[0])
train_df = train_df.drop(columns=['Cabin'])  # Dropping Cabin due to many missing values

# Converting categorical features to numerical
# Sex becomes integer codes; Embarked becomes one-hot columns with the first
# category dropped to avoid a redundant column.
train_df['Sex'] = LabelEncoder().fit_transform(train_df['Sex'])
train_df = pd.get_dummies(train_df, columns=['Embarked'], drop_first=True)

# Feature Engineering
# FamilySize counts relatives travelling with the passenger (siblings/spouses
# plus parents/children); IsAlone flags passengers with none.
train_df['FamilySize'] = train_df['SibSp'] + train_df['Parch']
train_df['IsAlone'] = (train_df['FamilySize'] == 0).astype(int)
# Name and Ticket are not used as features; SibSp and Parch are now
# summarised by FamilySize.
train_df = train_df.drop(columns=['Name', 'Ticket', 'SibSp', 'Parch'])

# Hold-out split
# The target is the Survived column; the features are every remaining column
# except the PassengerId row identifier. 20% of the rows are kept aside for
# validation, with a fixed seed so the split is reproducible.
y = train_df['Survived']
X = train_df.drop(columns=['PassengerId', 'Survived'])
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Feature scaling
# The scaler learns its statistics from the training split only and is then
# applied unchanged to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_val_scaled = scaler.transform(X_val)

# Model building
# Exhaustive search over 3 * 4 * 3 * 3 = 108 random-forest configurations,
# each scored with 5-fold cross-validation on the training split.
param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
clf = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(
    estimator=clf,
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
    verbose=2,
).fit(X_train_scaled, y_train)

# Model selected by the grid search
best_clf = grid_search.best_estimator_

# Evaluation on the held-out validation split
y_val_pred = best_clf.predict(X_val_scaled)
val_accuracy = accuracy_score(y_val, y_val_pred)
print(f"Accuracy: {val_accuracy}")
print(classification_report(y_val, y_val_pred))

# Load the test dataset
test_data_path = r"C:\Users\kunta\Downloads\titanic\test.csv"  # replace with your actual test dataset path
test_df = pd.read_csv(test_data_path)

# Data Cleaning on test set
# Missing values are imputed with the *training* medians so the test rows are
# prepared with the same statistics the model was trained on. The filled
# column is assigned back rather than using the chained
# `test_df[col].fillna(..., inplace=True)` form, which is deprecated in
# pandas 2.x and has no effect once Copy-on-Write is enabled.
test_df['Age'] = test_df['Age'].fillna(train_df['Age'].median())
test_df['Fare'] = test_df['Fare'].fillna(train_df['Fare'].median())  # Handle missing Fare value if present
test_df = test_df.drop(columns=['Cabin'])

# Converting categorical features to numerical
# Use a fixed mapping instead of re-fitting a LabelEncoder on the test data:
# a re-fit encoder derives its codes from whatever values the test file
# happens to contain, which may not match training. LabelEncoder numbers
# classes in sorted order, so on the training data 'female' -> 0, 'male' -> 1.
# NOTE(review): assumes Sex only takes the values 'female' and 'male'; any
# other value becomes NaN here - confirm against the data.
test_df['Sex'] = test_df['Sex'].map({'female': 0, 'male': 1})
test_df = pd.get_dummies(test_df, columns=['Embarked'], drop_first=True)

# Feature Engineering on test set
test_df['FamilySize'] = test_df['SibSp'] + test_df['Parch']
test_df['IsAlone'] = (test_df['FamilySize'] == 0).astype(int)
test_df = test_df.drop(columns=['Name', 'Ticket', 'SibSp', 'Parch'])

# Aligning the test set with the train set
# get_dummies only creates columns for categories present in the test file, so
# the result is reindexed to the training feature columns (same names, same
# order); a dummy column absent from the test data is filled with 0.
X_test = test_df.drop(columns=['PassengerId'])
X_test = X_test.reindex(columns=X.columns, fill_value=0)
X_test_scaled = scaler.transform(X_test)

# Predictions on test set
test_predictions = best_clf.predict(X_test_scaled)

# Generating CSV for submission
submission_df = pd.DataFrame({
    'PassengerId': test_df['PassengerId'],
    'Survived': test_predictions
})
submission_df.to_csv('titanic_predictions.csv', index=False)

print("Predictions saved to titanic_predictions.csv")


   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.0500   NaN        S  
<c