In [17]:
## DECISION TREE CLASSIFIER

import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# Load data
train_data = pd.read_csv('train.csv')

# Extract first letter of 'Cabin' field as a new column
train_data['Cabin'] = train_data['Cabin'].str[0]

# Assign weighted values to 'Sex'
train_data['Sex'] = train_data['Sex'].map({'female': 1, 'male': 0})

# Drop unnecessary columns from X
X = train_data.drop(columns=['Survived', 'Name', 'Embarked', 'Ticket'])

# Identify and handle empty rows
empty_row_indices = X.index[X.isnull().all(axis=1)]  # Rows where all columns are NaN
X.drop(empty_row_indices, inplace=True)

# Calculate overall median age
overall_median_age = X['Age'].median()

# Fill missing 'Age' values based on 'SibSp' and 'Parch', with fallback to overall median
age_median = X.groupby(['SibSp', 'Parch'])['Age'].median()

def fill_age(row):
    if pd.isnull(row['Age']):
        try:
            return age_median.loc[(row['SibSp'], row['Parch'])]
        except KeyError:
            return overall_median_age
    else:
        return row['Age']

X['Age'] = X.apply(fill_age, axis=1)

# Handle any remaining NaN values in 'Age' by filling with overall median
X['Age'] = X['Age'].fillna(overall_median_age)

# Encode 'Cabin' column into numerical format
cabin_encoder = LabelEncoder()
X['Cabin'] = cabin_encoder.fit_transform(X['Cabin'])

# Preprocessing pipeline - The ColumnTransformer applies the MinMaxScaler to the specified numerical columns to standardize them where now min value is 0 and max value is 1.
#The remainder='passthrough' parameter indicates that columns not specified in the transformers should be passed through without any changes.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', MinMaxScaler(), ['Age', 'Fare'])
    ], remainder='passthrough')

# Combine preprocessor and model into a pipeline
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', DecisionTreeClassifier())
])

# Extract the target variable
y = train_data['Survived']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the parameter grid for GridSearchCV
param_grid = {
    'classifier__max_depth': [None, 10, 20, 30, 40, 50],
    'classifier__min_samples_split': [2, 5, 10],
    'classifier__min_samples_leaf': [1, 2, 4],
    'classifier__max_features': ['auto', 'sqrt', 'log2'],
    'classifier__criterion': ['gini', 'entropy']
}

# Initialize GridSearchCV with cross-validation
grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid, 
                           cv=8, scoring='accuracy', verbose=1, n_jobs=-1)

# Fit GridSearchCV to the data
grid_search.fit(X_train, y_train)

# Print the best parameters and best score
print("Best parameters found: ", grid_search.best_params_)
print("Best cross-validation score: {:.2f}".format(grid_search.best_score_))

# Get the best model
best_model = grid_search.best_estimator_

# Evaluate the best model on the test set
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Test set accuracy with best model: {:.2f}".format(accuracy))

# Print confusion matrix and classification report
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("Classification Report:")
print(classification_report(y_test, y_pred))


Fitting 8 folds for each of 324 candidates, totalling 2592 fits
Best parameters found:  {'classifier__criterion': 'gini', 'classifier__max_depth': 20, 'classifier__max_features': 'log2', 'classifier__min_samples_leaf': 4, 'classifier__min_samples_split': 2}
Best cross-validation score: 0.81
Test set accuracy with best model: 0.79
Confusion Matrix:
[[90 15]
 [23 51]]
Classification Report:
              precision    recall  f1-score   support

           0       0.80      0.86      0.83       105
           1       0.77      0.69      0.73        74

    accuracy                           0.79       179
   macro avg       0.78      0.77      0.78       179
weighted avg       0.79      0.79      0.79       179



864 fits failed out of a total of 2592.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
690 fits failed with the following error:
Traceback (most recent call last):
  File "C:\ProgramData\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\ProgramData\anaconda3\Lib\site-packages\sklearn\base.py", line 1474, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\ProgramData\anaconda3\Lib\site-packages\sklearn\pipeline.py", line 475, in fit
    self._final_estimator.fit(Xt, y, **last_step_params["fit"])
  File "C:\ProgramData\anaconda3\Lib\site-packages\sklearn\base.py"

In [18]:
#RANDOM FOREST CLASSIFIER

import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
import joblib  # Import joblib for model persistence

# Load data
train_data = pd.read_csv('train.csv')

# Feature engineering
train_data['Cabin'] = train_data['Cabin'].str[0]
train_data['Sex'] = train_data['Sex'].map({'female': 1, 'male': 0})

X = train_data.drop(columns=['Survived', 'Name', 'Embarked', 'Ticket'])
y = train_data['Survived']

overall_median_age = X['Age'].median()
age_median = X.groupby(['SibSp', 'Parch'])['Age'].median()

def fill_age(row):
    if pd.isnull(row['Age']):
        try:
            return age_median.loc[(row['SibSp'], row['Parch'])]
        except KeyError:
            return overall_median_age
    else:
        return row['Age']

X['Age'] = X.apply(fill_age, axis=1)
X['Age'] = X['Age'].fillna(overall_median_age)

cabin_encoder = LabelEncoder()
X['Cabin'] = cabin_encoder.fit_transform(X['Cabin'])

# Preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('num', MinMaxScaler(), ['Age', 'Fare'])
    ], remainder='passthrough')

# Combine preprocessor and model into a pipeline
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier())
])

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the parameter grid for GridSearchCV
param_grid = {
    'classifier__n_estimators': [100, 200],
    'classifier__max_depth': [None, 10, 20],
    'classifier__min_samples_split': [2, 5],
    'classifier__min_samples_leaf': [1, 2]
}

# Initialize GridSearchCV
grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid, 
                           cv=8, scoring='accuracy', verbose=1, n_jobs=-1)

# Fit GridSearchCV
grid_search.fit(X_train, y_train)

# Print best parameters and score
print("Best parameters found: ", grid_search.best_params_)
print("Best cross-validation score: {:.2f}".format(grid_search.best_score_))

# Get best model
best_model = grid_search.best_estimator_

# Evaluate on test set
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Test set accuracy with best model: {:.2f}".format(accuracy))

# Confusion matrix and classification report
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("Classification Report:")
print(classification_report(y_test, y_pred))

#joblib.dump(best_model, 'titanic-survival-model.joblib')

Fitting 8 folds for each of 24 candidates, totalling 192 fits
Best parameters found:  {'classifier__max_depth': 10, 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 5, 'classifier__n_estimators': 200}
Best cross-validation score: 0.82
Test set accuracy with best model: 0.82
Confusion Matrix:
[[92 13]
 [20 54]]
Classification Report:
              precision    recall  f1-score   support

           0       0.82      0.88      0.85       105
           1       0.81      0.73      0.77        74

    accuracy                           0.82       179
   macro avg       0.81      0.80      0.81       179
weighted avg       0.82      0.82      0.81       179



In [10]:
## RANDOM FOREST CLASSIFIER

import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
import joblib  # Import joblib for model persistence

# Load data
test_data = pd.read_csv('test.csv')

# Load the persisted model to make predictions
model = joblib.load('titanic-survival-model.joblib')

# Input new data points to predict their genre
predictions = model.predict(X)

output = pd.DataFrame({'PassengerId': test_data.PassengerId, 'Survived': predictions})
output.to_csv('submission.csv', index=False)
print("Your submission was successfully saved!")

Your submission was successfully saved!
