In [280]:
## DECISION TREE CLASSIFIER

import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# Load data
train_data = pd.read_csv('train.csv')

# Keep only the first letter of 'Cabin' (overwrites the column in place);
# rows without a cabin value stay NaN here and are encoded further down.
train_data['Cabin'] = train_data['Cabin'].str[0]

# Encode 'Sex' as 1 (female) / 0 (male)
train_data['Sex'] = train_data['Sex'].map({'female': 1, 'male': 0})

# Drop the target and the unused text columns to build the feature frame
X = train_data.drop(columns=['Survived', 'Name', 'Embarked', 'Ticket'])

# Identify and handle empty rows
empty_row_indices = X.index[X.isnull().all(axis=1)]  # Rows where all columns are NaN
# Remove the same rows from train_data as well: the target is read from
# train_data['Survived'] later in this cell, so dropping them from X alone
# would leave features and target with different lengths and make
# train_test_split fail.
train_data = train_data.drop(index=empty_row_indices)
X = X.drop(index=empty_row_indices)

# Calculate overall median age (last-resort value for missing ages)
overall_median_age = X['Age'].median()

# Median age per (SibSp, Parch) group, looked up by fill_age to impute
# missing 'Age' values before falling back to the overall median
age_median = X.groupby(['SibSp', 'Parch'])['Age'].median()

def fill_age(row):
    """Return an age for one passenger row, imputing it when missing.

    Reads the module-level ``age_median`` (median age indexed by
    (SibSp, Parch)) and ``overall_median_age``.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide 'Age', 'SibSp' and 'Parch'.

    Returns
    -------
    The row's own 'Age' when present; otherwise the median age of its
    (SibSp, Parch) group; otherwise ``overall_median_age``.
    """
    if not pd.isnull(row['Age']):
        return row['Age']
    try:
        group_median = age_median.loc[(row['SibSp'], row['Parch'])]
    except KeyError:
        # No such (SibSp, Parch) group at all
        return overall_median_age
    # A group whose ages are all missing has a NaN median and raises no
    # KeyError, so it needs the same fallback as an unknown group.
    if pd.isnull(group_median):
        return overall_median_age
    return group_median

# Impute missing ages row by row with fill_age
X['Age'] = X.apply(fill_age, axis=1)

# Handle any remaining NaN values in 'Age' by filling with overall median.
# The result is assigned back rather than using fillna(..., inplace=True) on
# the column selection: that chained in-place form emits a FutureWarning on
# pandas 2.x and does not update X once Copy-on-Write is enabled.
X['Age'] = X['Age'].fillna(overall_median_age)

# Encode 'Cabin' column into numerical format
# NOTE(review): the encoder and the age medians are fitted on all rows, i.e.
# before the train/test split below, so the held-out rows influence the
# preprocessing. Consider moving these steps into the pipeline.
cabin_encoder = LabelEncoder()
X['Cabin'] = cabin_encoder.fit_transform(X['Cabin'])

# Preprocessing pipeline - the ColumnTransformer applies StandardScaler to the
# listed numerical columns (mean=0, standard deviation=1);
# remainder='passthrough' forwards every other column unchanged.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['Age', 'Fare'])
    ], remainder='passthrough')

# Combine preprocessor and model into a pipeline.
# random_state is fixed so that the search result is reproducible: with
# max_features='sqrt'/'log2' the tree samples candidate features at random.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', DecisionTreeClassifier(random_state=42))
])

# Extract the target variable for exactly the rows that are present in X, so
# features and target stay aligned even if rows were removed from X earlier.
y = train_data.loc[X.index, 'Survived']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the parameter grid for GridSearchCV.
# max_features: 'auto' was an alias of 'sqrt' for classifiers (so it only
# duplicated a candidate) and is rejected by scikit-learn >= 1.3; None
# ("consider all features") takes its place and keeps the grid size unchanged.
param_grid = {
    'classifier__max_depth': [None, 10, 20, 30, 40, 50],
    'classifier__min_samples_split': [2, 5, 10],
    'classifier__min_samples_leaf': [1, 2, 4],
    'classifier__max_features': [None, 'sqrt', 'log2'],
    'classifier__criterion': ['gini', 'entropy']
}

# Initialize GridSearchCV with 8-fold cross-validation on accuracy
grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid,
                           cv=8, scoring='accuracy', verbose=1, n_jobs=-1)

# Fit GridSearchCV to the training portion only
grid_search.fit(X_train, y_train)

# Print the best parameters and best score
print("Best parameters found: ", grid_search.best_params_)
print("Best cross-validation score: {:.2f}".format(grid_search.best_score_))

# Get the best model (refitted on the full training portion by GridSearchCV)
best_model = grid_search.best_estimator_

# Evaluate the best model on the test set
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Test set accuracy with best model: {:.2f}".format(accuracy))

# Print confusion matrix and classification report
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("Classification Report:")
print(classification_report(y_test, y_pred))


Fitting 8 folds for each of 324 candidates, totalling 2592 fits
Best parameters found:  {'classifier__criterion': 'entropy', 'classifier__max_depth': 50, 'classifier__max_features': 'log2', 'classifier__min_samples_leaf': 4, 'classifier__min_samples_split': 10}
Best cross-validation score: 0.82
Test set accuracy with best model: 0.78
Confusion Matrix:
[[88 17]
 [23 51]]
Classification Report:
              precision    recall  f1-score   support

           0       0.79      0.84      0.81       105
           1       0.75      0.69      0.72        74

    accuracy                           0.78       179
   macro avg       0.77      0.76      0.77       179
weighted avg       0.78      0.78      0.77       179



In [296]:
## RANDOM FOREST CLASSIFIER

import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier

# Read the raw training table
train_data = pd.read_csv('train.csv')

# Feature engineering: reduce 'Cabin' to its first character and turn 'Sex'
# into a 1 (female) / 0 (male) flag
train_data = train_data.assign(
    Cabin=train_data['Cabin'].str.get(0),
    Sex=train_data['Sex'].map({'female': 1, 'male': 0}),
)

# Target first, then the feature frame without the target and text columns
y = train_data['Survived']
X = train_data.drop(['Survived', 'Name', 'Embarked', 'Ticket'], axis=1)

# Reference ages for imputation: one median per (SibSp, Parch) group plus the
# median over all rows as a fallback
age_median = X.groupby(by=['SibSp', 'Parch'])['Age'].median()
overall_median_age = X['Age'].median()

def fill_age(row):
    """Return an age for one passenger row, imputing it when missing.

    Reads the module-level ``age_median`` (median age indexed by
    (SibSp, Parch)) and ``overall_median_age``.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide 'Age', 'SibSp' and 'Parch'.

    Returns
    -------
    The row's own 'Age' when present; otherwise the median age of its
    (SibSp, Parch) group; otherwise ``overall_median_age``.
    """
    if not pd.isnull(row['Age']):
        return row['Age']
    try:
        group_median = age_median.loc[(row['SibSp'], row['Parch'])]
    except KeyError:
        # No such (SibSp, Parch) group at all
        return overall_median_age
    # A group whose ages are all missing has a NaN median and raises no
    # KeyError, so it needs the same fallback as an unknown group.
    if pd.isnull(group_median):
        return overall_median_age
    return group_median

# Impute missing ages row by row with fill_age
X['Age'] = X.apply(fill_age, axis=1)
# Fill whatever is still missing with the overall median. Assigned back
# rather than fillna(..., inplace=True) on the column selection: that chained
# in-place form emits a FutureWarning on pandas 2.x and does not update X
# once Copy-on-Write is enabled.
X['Age'] = X['Age'].fillna(overall_median_age)

# Encode 'Cabin' into integer codes
# NOTE(review): the encoder and the age medians are fitted on all rows, i.e.
# before the train/test split below, so the held-out rows influence the
# preprocessing. Consider moving these steps into the pipeline.
cabin_encoder = LabelEncoder()
X['Cabin'] = cabin_encoder.fit_transform(X['Cabin'])

# Preprocessing pipeline: standardize 'Age' and 'Fare', pass the rest through
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['Age', 'Fare'])
    ], remainder='passthrough')

# Combine preprocessor and model into a pipeline.
# random_state is fixed because a random forest is stochastic (bootstrap
# samples and feature sub-sampling); without it every run of this cell can
# select different "best" parameters and report different scores.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))
])

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the parameter grid for GridSearchCV
param_grid = {
    'classifier__n_estimators': [100, 200],
    'classifier__max_depth': [None, 10, 20],
    'classifier__min_samples_split': [2, 5],
    'classifier__min_samples_leaf': [1, 2]
}

# Initialize GridSearchCV with 8-fold cross-validation on accuracy
grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid,
                           cv=8, scoring='accuracy', verbose=1, n_jobs=-1)

# Fit GridSearchCV on the training portion only
grid_search.fit(X_train, y_train)

# Print best parameters and score
print("Best parameters found: ", grid_search.best_params_)
print("Best cross-validation score: {:.2f}".format(grid_search.best_score_))

# Get best model (refitted on the full training portion by GridSearchCV)
best_model = grid_search.best_estimator_

# Evaluate on test set
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Test set accuracy with best model: {:.2f}".format(accuracy))

# Confusion matrix and classification report
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("Classification Report:")
print(classification_report(y_test, y_pred))


Fitting 8 folds for each of 24 candidates, totalling 192 fits
Best parameters found:  {'classifier__max_depth': None, 'classifier__min_samples_leaf': 2, 'classifier__min_samples_split': 2, 'classifier__n_estimators': 200}
Best cross-validation score: 0.83
Test set accuracy with best model: 0.81
Confusion Matrix:
[[91 14]
 [20 54]]
Classification Report:
              precision    recall  f1-score   support

           0       0.82      0.87      0.84       105
           1       0.79      0.73      0.76        74

    accuracy                           0.81       179
   macro avg       0.81      0.80      0.80       179
weighted avg       0.81      0.81      0.81       179



In [312]:
## RANDOM FOREST CLASSIFIER

import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier

# NOTE(review): this cell repeats the previous RANDOM FOREST cell line for
# line; consider deleting one of the two or factoring the shared steps into
# a function.

# Read the raw training table
train_data = pd.read_csv('train.csv')

# Feature engineering: reduce 'Cabin' to its first character and turn 'Sex'
# into a 1 (female) / 0 (male) flag
train_data = train_data.assign(
    Cabin=train_data['Cabin'].str.get(0),
    Sex=train_data['Sex'].map({'female': 1, 'male': 0}),
)

# Target first, then the feature frame without the target and text columns
y = train_data['Survived']
X = train_data.drop(['Survived', 'Name', 'Embarked', 'Ticket'], axis=1)

# Reference ages for imputation: one median per (SibSp, Parch) group plus the
# median over all rows as a fallback
age_median = X.groupby(by=['SibSp', 'Parch'])['Age'].median()
overall_median_age = X['Age'].median()

def fill_age(row):
    """Return an age for one passenger row, imputing it when missing.

    Reads the module-level ``age_median`` (median age indexed by
    (SibSp, Parch)) and ``overall_median_age``.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide 'Age', 'SibSp' and 'Parch'.

    Returns
    -------
    The row's own 'Age' when present; otherwise the median age of its
    (SibSp, Parch) group; otherwise ``overall_median_age``.
    """
    if not pd.isnull(row['Age']):
        return row['Age']
    try:
        group_median = age_median.loc[(row['SibSp'], row['Parch'])]
    except KeyError:
        # No such (SibSp, Parch) group at all
        return overall_median_age
    # A group whose ages are all missing has a NaN median and raises no
    # KeyError, so it needs the same fallback as an unknown group.
    if pd.isnull(group_median):
        return overall_median_age
    return group_median

# Impute missing ages row by row with fill_age
X['Age'] = X.apply(fill_age, axis=1)
# Fill whatever is still missing with the overall median. Assigned back
# rather than fillna(..., inplace=True) on the column selection: that chained
# in-place form emits a FutureWarning on pandas 2.x and does not update X
# once Copy-on-Write is enabled.
X['Age'] = X['Age'].fillna(overall_median_age)

# Encode 'Cabin' into integer codes
# NOTE(review): the encoder and the age medians are fitted on all rows, i.e.
# before the train/test split below, so the held-out rows influence the
# preprocessing. Consider moving these steps into the pipeline.
cabin_encoder = LabelEncoder()
X['Cabin'] = cabin_encoder.fit_transform(X['Cabin'])

# Preprocessing pipeline: standardize 'Age' and 'Fare', pass the rest through.
# NOTE(review): the output saved under this cell is "ValueError: A given
# column is not a column of the dataframe", presumably raised by this
# ColumnTransformer when 'Age' or 'Fare' is absent from the frame it is
# fitted on. The code in this cell keeps both columns in X, so the saved
# output may be stale - re-run on a fresh kernel to confirm.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['Age', 'Fare'])
    ], remainder='passthrough')

# Combine preprocessor and model into a pipeline.
# random_state is fixed because a random forest is stochastic (bootstrap
# samples and feature sub-sampling); without it every run of this cell can
# select different "best" parameters and report different scores.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))
])

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the parameter grid for GridSearchCV
param_grid = {
    'classifier__n_estimators': [100, 200],
    'classifier__max_depth': [None, 10, 20],
    'classifier__min_samples_split': [2, 5],
    'classifier__min_samples_leaf': [1, 2]
}

# Initialize GridSearchCV with 8-fold cross-validation on accuracy
grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid,
                           cv=8, scoring='accuracy', verbose=1, n_jobs=-1)

# Fit GridSearchCV on the training portion only
grid_search.fit(X_train, y_train)

# Print best parameters and score
print("Best parameters found: ", grid_search.best_params_)
print("Best cross-validation score: {:.2f}".format(grid_search.best_score_))

# Get best model (refitted on the full training portion by GridSearchCV)
best_model = grid_search.best_estimator_

# Evaluate on test set
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Test set accuracy with best model: {:.2f}".format(accuracy))

# Confusion matrix and classification report
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("Classification Report:")
print(classification_report(y_test, y_pred))


ValueError: A given column is not a column of the dataframe