In [185]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import RFE
import matplotlib.pyplot as plt
import seaborn as sns

In [186]:
# Load the training data (this cell only reads the CSV; no cleaning happens here).
# NOTE(review): the path is absolute and machine-specific, so the notebook
# cannot be re-run on another machine without editing it.
train_data = pd.read_csv(r'C:\Users\Valmik Belgaonkar\OneDrive\Desktop\IIITB Folder\5th Semester\Machine Learning\ML Lab\Assignment 2\train.csv')

In [187]:
# Display the loaded training frame as the cell output.
train_data

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,332,0,1,"Partner, Mr. Austen",male,45.5,0,0,113043,28.5000,C124,S
1,734,0,2,"Berriman, Mr. William John",male,23.0,0,0,28425,13.0000,,S
2,383,0,3,"Tikkanen, Mr. Juho",male,32.0,0,0,STON/O 2. 3101293,7.9250,,S
3,705,0,3,"Hansen, Mr. Henrik Juul",male,26.0,1,0,350025,7.8542,,S
4,814,0,3,"Andersson, Miss. Ebba Iris Alfrida",female,6.0,4,2,347082,31.2750,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
707,107,1,3,"Salkjelsvik, Miss. Anna Kristine",female,21.0,0,0,343120,7.6500,,S
708,271,0,1,"Cairns, Mr. Alexander",male,,0,0,113798,31.0000,,S
709,861,0,3,"Hansen, Mr. Claus Peter",male,41.0,2,0,350026,14.1083,,S
710,436,1,1,"Carter, Miss. Lucile Polk",female,14.0,1,2,113760,120.0000,B96 B98,S


In [188]:
# Count missing values per column to decide what needs imputing.
train_data.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            140
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          553
Embarked         2
dtype: int64

In [189]:
# Inspect the Age column (one of the columns reported with missing values).
train_data['Age']

0      45.5
1      23.0
2      32.0
3      26.0
4       6.0
       ... 
707    21.0
708     NaN
709    41.0
710    14.0
711    21.0
Name: Age, Length: 712, dtype: float64

In [190]:
# Inspect the Cabin column (the column with the most missing values).
train_data['Cabin']

0         C124
1          NaN
2          NaN
3          NaN
4          NaN
        ...   
707        NaN
708        NaN
709        NaN
710    B96 B98
711        D26
Name: Cabin, Length: 712, dtype: object

In [191]:
# Inspect the Embarked column (a categorical column with a couple of missing values).
train_data['Embarked']

0      S
1      S
2      S
3      S
4      S
      ..
707    S
708    S
709    S
710    S
711    S
Name: Embarked, Length: 712, dtype: object

In [192]:
# Advanced feature engineering.
# FamilySize = SibSp + Parch + 1 (the +1 counts the passenger themselves).
train_data['FamilySize'] = train_data['SibSp'].add(train_data['Parch']).add(1)
# IsAlone is 1 exactly when FamilySize is 1, else 0.
train_data['IsAlone'] = train_data['FamilySize'].eq(1).astype(int)

In [193]:
# Extract the title from Name: the run of letters that follows a space and
# precedes a period. Uncommon titles are folded into 'Rare' and spelling
# variants are mapped onto 'Miss' / 'Mrs'.
title_fixes = {
    **dict.fromkeys(
        ['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'],
        'Rare',
    ),
    **dict.fromkeys(['Mlle', 'Ms'], 'Miss'),
    'Mme': 'Mrs',
}
train_data['Title'] = (
    train_data['Name']
    .str.extract(r' ([A-Za-z]+)\.', expand=False)
    .replace(title_fixes)
)

In [194]:
# Discretise Age into five fixed-width-by-hand ranges and Fare into quartiles.
# NOTE(review): Age is binned here before it is imputed in a later cell, so
# rows with a missing Age get a missing Age_bin — confirm this is intended.
age_edges = [0, 12, 20, 40, 60, 80]
train_data['Age_bin'] = pd.cut(train_data['Age'], bins=age_edges, labels=list(range(5)))
train_data['Fare_bin'] = pd.qcut(train_data['Fare'], q=4, labels=list(range(4)))

In [195]:
# Remove identifier / free-text columns that are not used as model features.
train_data = train_data.drop(columns=['Name', 'Ticket', 'Cabin', 'PassengerId'])

In [196]:
# Separate the target column (Survived) from the feature matrix.
y = train_data['Survived']
X = train_data.drop(columns=['Survived'])

In [197]:
# Median-impute the numeric columns. The fitted imputer is kept under the
# name `imputer` so the same medians can be applied to the test data later.
numeric_cols = ['Age', 'Fare']
imputer = SimpleImputer(strategy='median').fit(X[numeric_cols])
X[numeric_cols] = imputer.transform(X[numeric_cols])

In [198]:
# Replace missing Embarked entries with the most frequent value.
embarked_mode = X['Embarked'].mode().iloc[0]
X['Embarked'] = X['Embarked'].fillna(embarked_mode)

In [199]:
# Encode categoricals: Sex becomes a 0/1 flag, the remaining categorical
# columns are one-hot encoded with the first level of each dropped.
sex_codes = {'male': 0, 'female': 1}
one_hot_cols = ['Embarked', 'Pclass', 'Title', 'Age_bin', 'Fare_bin']
X['Sex'] = X['Sex'].map(sex_codes)
X = pd.get_dummies(X, columns=one_hot_cols, drop_first=True)

In [200]:
# Step 2: hold out 30% of the rows for validation (seeded for reproducibility).
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [201]:
# Step 3: hyperparameter search space for the decision tree
# (5 depths x 3 split sizes x 3 leaf sizes = 45 combinations).
param_grid = dict(
    max_depth=[3, 5, 7, 10, 12],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

In [202]:
dtree = DecisionTreeClassifier(random_state=42)

# Every entry in param_grid is a finite list, so the search space is just the
# product of the list lengths. Asking RandomizedSearchCV for more iterations
# than that makes scikit-learn emit a warning and fall back to the grid size
# (the recorded run shows 45 candidates for n_iter=100), so cap n_iter here.
n_candidates = int(np.prod([len(values) for values in param_grid.values()]))

random_search = RandomizedSearchCV(
    dtree,
    param_distributions=param_grid,
    n_iter=min(100, n_candidates),
    cv=5,
    scoring="accuracy",
    random_state=42,
    n_jobs=-1,
    verbose=1,
)
random_search.fit(X_train, y_train)



Fitting 5 folds for each of 45 candidates, totalling 225 fits


In [203]:
# Keep the best estimator found by the randomized search above.
best_dtree = random_search.best_estimator_

In [204]:
# Score the tuned decision tree on the held-out validation split.
y_pred_dtree = best_dtree.predict(X_val)
dtree_accuracy = accuracy_score(y_true=y_val, y_pred=y_pred_dtree)
print(f'Best Decision Tree Accuracy: {dtree_accuracy:.4f}')

Best Decision Tree Accuracy: 0.8411


In [205]:
# Step 4: Random Forest and XGBoost models.
# Each estimator is constructed and fitted on the training split in one step.
rfc = RandomForestClassifier(n_estimators=200, max_depth=10, random_state=42).fit(X_train, y_train)
xgb = XGBClassifier(eval_metric='logloss', random_state=42).fit(X_train, y_train)

# Validation predictions for both models.
y_pred_rfc = rfc.predict(X_val)
y_pred_xgb = xgb.predict(X_val)

# Validation accuracy, reported in the same order as the models were built.
rfc_accuracy = accuracy_score(y_true=y_val, y_pred=y_pred_rfc)
xgb_accuracy = accuracy_score(y_true=y_val, y_pred=y_pred_xgb)
print(f'Random Forest Accuracy: {rfc_accuracy:.4f}')
print(f'XGBoost Accuracy: {xgb_accuracy:.4f}')

Random Forest Accuracy: 0.8224
XGBoost Accuracy: 0.8131


In [206]:
# Step 5: soft-voting ensemble over the tuned tree, the random forest and
# XGBoost, fitted on the training split.
voting_clf = VotingClassifier(
    estimators=[('dtree', best_dtree), ('rfc', rfc), ('xgb', xgb)],
    voting='soft',
).fit(X_train, y_train)

# Score the ensemble on the validation split.
y_pred_voting = voting_clf.predict(X_val)
voting_accuracy = accuracy_score(y_true=y_val, y_pred=y_pred_voting)
print(f'Voting Classifier Accuracy: {voting_accuracy:.4f}')

Voting Classifier Accuracy: 0.8458


In [207]:
# Load and prepare the test data.
# Every data-dependent statistic used below (imputation medians, fare quartile
# edges, Embarked mode, one-hot column layout) comes from the TRAINING data so
# that each feature means the same thing at prediction time as it did at fit time.
test_data = pd.read_csv(r'C:\Users\Valmik Belgaonkar\OneDrive\Desktop\IIITB Folder\5th Semester\Machine Learning\ML Lab\Assignment 2\test.csv')

# Family features, computed exactly as for the training data.
test_data['FamilySize'] = test_data['SibSp'] + test_data['Parch'] + 1
test_data['IsAlone'] = (test_data['FamilySize'] == 1).astype(int)

# Title extraction and grouping, same rules as for the training data.
test_data['Title'] = test_data['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)
test_data['Title'] = test_data['Title'].replace(['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')
test_data['Title'] = test_data['Title'].replace(['Mlle', 'Ms'], 'Miss')
test_data['Title'] = test_data['Title'].replace('Mme', 'Mrs')

# Age bins use fixed edges, so they are identical for train and test.
test_data['Age_bin'] = pd.cut(test_data['Age'], bins=[0, 12, 20, 40, 60, 80], labels=[0, 1, 2, 3, 4])

# Fare bins must reuse the quartile edges of the TRAINING fares: running qcut
# on the test fares would place the cut points at the test set's own
# quartiles, so the same Fare_bin code would cover a different fare range.
# The outer edges are opened to +/-inf so fares outside the training range
# still land in the first/last bin instead of becoming NaN.
fare_edges = pd.qcut(train_data['Fare'], 4, retbins=True)[1]
fare_edges[0], fare_edges[-1] = -np.inf, np.inf
test_data['Fare_bin'] = pd.cut(test_data['Fare'], bins=fare_edges, labels=[0, 1, 2, 3])

# Drop the columns that were dropped from the training data.
test_data = test_data.drop(['Name', 'Ticket', 'Cabin', 'PassengerId'], axis=1)

# Impute Age/Fare with the medians learned from the training data, and fill
# Embarked with the training mode (not the test set's own mode).
test_data[['Age', 'Fare']] = imputer.transform(test_data[['Age', 'Fare']])
test_data['Embarked'] = test_data['Embarked'].fillna(train_data['Embarked'].mode()[0])

# Convert categorical variables to numeric.
test_data['Sex'] = test_data['Sex'].map({'male': 0, 'female': 1})

# One-hot encode WITHOUT drop_first and then project onto the training
# columns. With drop_first=True the level that gets dropped is whichever one
# sorts first in the test data, which differs from training whenever the test
# set lacks that level. Reindexing keeps exactly the training indicator
# columns in training order, adds all-zero columns for levels absent from the
# test set, and discards any column the model was never trained on.
test_data = pd.get_dummies(test_data, columns=['Embarked', 'Pclass', 'Title', 'Age_bin', 'Fare_bin'])
test_data = test_data.reindex(columns=X.columns, fill_value=0)

In [208]:
# Step 6: Predict on the preprocessed test data using the fitted Voting Classifier
test_predictions = voting_clf.predict(test_data)

In [209]:
# Build the submission file: PassengerId is re-read from the test CSV (it was
# dropped during preprocessing) and paired with the ensemble's predictions.
submission = (
    pd.read_csv(r'C:\Users\Valmik Belgaonkar\OneDrive\Desktop\IIITB Folder\5th Semester\Machine Learning\ML Lab\Assignment 2\test.csv')[['PassengerId']]
    .assign(Survived=test_predictions)
)
submission.to_csv('submission.csv', index=False)

print("Submission file created.")

Submission file created.
