In [33]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.pipeline import Pipeline
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

In [34]:
"""Group 15 data science project
DATASET:
(https://www.kaggle.com/competitions/playground-series-s4e11/data)

PROCESS:
1. Load and clean the data
2. EDA - Exploring data
3. Data Pre-processing
4. Creating model
5. Cross-validated training
6. Evaluation of model
"""

'Group 15 data science project\nDATASET:\n(https://www.kaggle.com/competitions/playground-series-s4e11/data)\n\nPROCESS:\n1. Load and clean the data\n2. EDA - Exploring data\n3. Data Pre-processing\n4. Creating model\n5. Cross-validated training\n6. Evaluation of model\n'

In [35]:
"""
1. Loading and cleaning data

TODO:
We need a better way of handling NA values in certain columns - KNN encoding?
Run the DataAnalyser object before preprocessing (if verbose=True) such that it
can present information about the data before we start altering it
"""
# Ordered cleaning steps: every entry maps a step name to the columns it targets.
# The column list of 'handle_na' is left empty.
cleaning_transformations = [
    dict(drop=['id', 'Name', 'City']),
    dict(handle_na=[]),
]

# Define Cleaner Class
class Cleaner(BaseEstimator, TransformerMixin):
    """
    Pipeline step that applies an ordered list of cleaning transformations.

    Parameters
    ----------
    transformations : list of dict
        Each dict maps a transformation name to a list of column names.
        The names handled are 'drop' (remove the listed columns) and
        'handle_na' (replace missing values; its column list is not used).
        Any other name is skipped without an error.
    """
    def __init__(self, transformations):
        self.transformations = transformations

    def fit(self, x, y=None):
        """Nothing is learned from the data; returns ``self`` so the step can sit in a Pipeline."""
        return self

    def transform(self, x):
        """Return a cleaned copy of ``x``; the frame passed in is not modified."""
        x_temp = x.copy()

        for transformation in self.transformations:
            for transformation_type, column_names in transformation.items():
                if transformation_type == 'handle_na':
                    x_temp = self.deal_with_na_values(x_temp)
                elif transformation_type == 'drop':
                    x_temp = self.drop_unneeded_columns(x_temp, column_names)

        return x_temp

    @staticmethod
    def deal_with_na_values(data, verbose=False):
        """
        Return a new frame in which every missing value is replaced by 0.

        The fill is not done in place, so the caller's frame is left untouched
        (the same contract as ``drop_unneeded_columns``). ``verbose`` is accepted
        for compatibility but is currently unused.
        """
        return data.fillna(0)

    @staticmethod
    def drop_unneeded_columns(data, column_names):
        """Return a new frame without ``column_names``; a missing column raises ``KeyError``."""
        return data.drop(columns=column_names)

In [36]:
"""
2. EDA - This is the class that will allow us to display information about the data.
         The idea is that the information we can extract/plot will allow us to tweak our data-preprocessing pipeline
         to achieve better and more generalisable results.

         TODO:
         Assess the overall distribution of 'Depressed' and 'not depressed' individuals in the dataset
         Confirmatory factor analysis - or another method of working out how we can best assess each variable's effect when building
                                        a model which can generalise, and which helps predict which variables affect the overall ability of the model
         Look on Kaggle for methods other users have used
"""

class DataAnalyser:
    """
    Prints exploratory information about a training DataFrame.

    Parameters
    ----------
    base_train_df : pandas.DataFrame
        The frame to inspect. It is stored as-is and only read by the methods below.
    """

    def __init__(self, base_train_df):
        self.base_train_df = base_train_df

    def view_cleaning_analysis(self):
        """Run the checks that are useful before cleaning the data."""
        self.view_unique_columns()

    def view_data_analysis(self):
        """Run the exploratory analysis step."""
        # The method has to be *called*; a bare attribute reference does nothing.
        self.component_analysis()

    # Cleaning analysis functions
    def view_unique_columns(self):
        """Print the unique values found in every column of the frame."""
        print('[] Breakdown of Binary columns values []')
        for column in self.base_train_df.columns:
            print(f'Unique values in column "{column}": {self.base_train_df[column].unique()}')

    # EDA analysis functions
    def component_analysis(self):
        """Placeholder for the component analysis; not implemented yet."""
        ...

# analyser = DataAnalyser(train_df)
# analyser.view_cleaning_analysis()

In [37]:
"""
3. Data Pre-Processing

COLUMNS -
id: Can be dropped
Name: Can be dropped
Gender: binary encoding
age: Normalised
city: Can be dropped
Working professional or Student: Binary encoding
Profession: Frequency-encoding
Academic Pressure: Normalised
Work Pressure: Normalised
CGPA: Normalised
Study Satisfaction: Normalised
Job Satisfaction: Normalised
Sleep Duration: mapped to values and normalised
Dietary Habits: mapped to values and normalised
Degree: Frequency encoding
Suicidal Thoughts: Binary Encoding
Work/Study Hours: Normalised
Financial stress: Normalised
Mental Illness: Binary Encoding
Depression: Target variable (no pre-processing needed)

TODO:

"""

# Ordered preprocessing steps: every entry maps a step name to the columns it targets.
# Each column appears once per step; 'Financial Stress' used to be listed twice under
# 'normalise', which scaled it a second time for no benefit.
preprocessing_transformations = [
    {'binary_encode': ['Gender', 'Have you ever had suicidal thoughts ?',
                       'Family History of Mental Illness',
                       'Working Professional or Student']},
    {'map_sleep_column': ['Sleep Duration']},
    {'map_diet_column': ['Dietary Habits']},
    {'frequency_encode': ['Profession', 'Degree']},
    {'normalise': ['Financial Stress', 'Age', 'Academic Pressure', 'Work Pressure',
                   'CGPA', 'Study Satisfaction', 'Job Satisfaction', 'Work/Study Hours']}
]


class PreProcessor(BaseEstimator, TransformerMixin):
    """
    Pipeline step that encodes and scales the feature columns before modelling.

    Parameters
    ----------
    transformations : list of dict
        Each dict maps a transformation name to the columns it applies to.
        Handled names: 'drop_columns', 'binary_encode', 'normalise',
        'map_sleep_column', 'frequency_encode' and 'map_diet_column'.
        Any other name is skipped.

    Notes
    -----
    ``fit`` / ``fit_transform`` learn the data-dependent statistics (category
    frequencies, modal fill values and min-max ranges) and store them in
    ``learned_stats_``. ``transform`` re-uses them, so unseen data is encoded
    on the same scale as the data the step was fitted on. Calling ``transform``
    on an unfitted instance falls back to computing the statistics from the
    frame being transformed.
    """
    def __init__(self, transformations):
        self.transformations = transformations
        self.mappings = {
            'Gender': {'Male': 0, 'Female': 1},
            'Have you ever had suicidal thoughts ?': {'Yes': 0, 'No': 1},
            'Family History of Mental Illness': {'Yes': 0, 'No': 1},
            'Working Professional or Student': {'Working Professional': 0, 'Student': 1},
        }
        # Raw 'Sleep Duration' answers mapped to a number of hours.
        self.sleep_mapping = {
        "More than 8 hours":9,
        'Less than 5 hours':4,
        '5-6 hours':5.5,
        '7-8 hours':7.5,
        '1-2 hours':1.5,
        '6-8 hours':7,
        '4-6 hours':5,
        '6-7 hours':6.5,
        '10-11 hours':10.5,
        '8-9 hours':8.5,
        '9-11 hours':10,
        '2-3 hours':2.5,
        '3-4 hours':3.5,
        'Moderate':6,
        '4-5 hours':4.5,
        '9-6 hours':7.5,
        '1-3 hours':2,
        '1-6 hours':4,
        '8 hours':8,
        '10-6 hours':8,
        'Unhealthy':3,
        'Work_Study_Hours':6,
        '3-6 hours':4.5,
        '9-5':7,
        '9-5 hours':7,
        }
        # Raw 'Dietary Habits' answers mapped to a numeric code.
        self.diet_mapping ={
            'More Healty':0,
            'Healthy':1,
            'Less than Healthy':2,
            'Less Healthy':2,
            'Moderate':3,
            'Unhealthy':4,
            'No Healthy':4,
        }

    def fit(self, x, y=None):
        """Learn the encoding/scaling statistics from ``x`` (``x`` itself is not modified)."""
        self._run(x.copy(), learn=True)
        return self

    def fit_transform(self, x, y=None, **fit_params):
        """Learn the statistics from ``x`` and return the transformed copy in a single pass."""
        return self._run(x.copy(), learn=True)

    def transform(self, x):
        """Return a transformed copy of ``x`` using the statistics learned during fitting."""
        return self._run(x.copy(), learn=False)

    def _run(self, data, learn):
        """
        Apply the configured steps to ``data`` in order.

        With ``learn=True`` the statistics of every data-dependent step are
        computed from ``data`` as it looks when that step is reached and are
        saved in ``learned_stats_``. With ``learn=False`` the saved statistics
        are re-used; when none are available the step computes them from ``data``.
        """
        learned = {} if learn else getattr(self, 'learned_stats_', None)

        for position, transformation in enumerate(self.transformations):
            for transformation_name, column_names in transformation.items():
                # A column listed twice would otherwise be encoded/scaled twice.
                if isinstance(column_names, (list, tuple)):
                    column_names = list(dict.fromkeys(column_names))
                # Statistics are stored per step so that two steps touching
                # the same column cannot overwrite each other.
                step_key = (position, transformation_name)

                if transformation_name == 'drop_columns':
                    data = self.drop_columns(data, column_names)
                elif transformation_name == 'binary_encode':
                    data = self.encode_columns(data, column_names, self.mappings)
                elif transformation_name == 'normalise':
                    if learn:
                        learned[step_key] = self._fit_scalers(data, column_names)
                    data = self.normalise_columns(
                        data, column_names, self._stats_for(learned, step_key))
                elif transformation_name == 'map_sleep_column':
                    if learn:
                        learned[step_key] = self._learn_fill_values(
                            data, column_names, self.sleep_mapping)
                    data = self.map_sleep_values(
                        data, column_names, self.sleep_mapping,
                        self._stats_for(learned, step_key))
                elif transformation_name == 'frequency_encode':
                    if learn:
                        learned[step_key] = self._learn_frequencies(data, column_names)
                    data = self.frequency_encode(
                        data, column_names, self._stats_for(learned, step_key))
                elif transformation_name == 'map_diet_column':
                    if learn:
                        learned[step_key] = self._learn_fill_values(
                            data, column_names, self.diet_mapping)
                    data = self.map_dietary_value(
                        data, column_names, self.diet_mapping,
                        self._stats_for(learned, step_key))

        if learn:
            self.learned_stats_ = learned
        return data

    @staticmethod
    def _stats_for(learned, step_key):
        """Return the statistics saved for ``step_key``, or ``None`` when there are none."""
        return None if learned is None else learned.get(step_key)

    @staticmethod
    def _mode_or_none(series):
        """Return the first modal value of ``series``, or ``None`` if it has no non-missing value."""
        modes = series.mode()
        return None if modes.empty else modes.iloc[0]

    @staticmethod
    def _learn_fill_values(data, column_names, mapping):
        """Return ``{column: modal mapped value}`` for every column in ``column_names``."""
        return {column: PreProcessor._mode_or_none(data[column].map(mapping))
                for column in column_names}

    @staticmethod
    def _learn_frequencies(data, column_names):
        """Return ``{column: relative frequency of each value}`` for the columns present in ``data``."""
        return {column: data[column].value_counts() / len(data)
                for column in column_names if column in data.columns}

    @staticmethod
    def _fit_scalers(data, column_names):
        """Return ``{column: fitted MinMaxScaler}`` for the columns present in ``data``."""
        return {column: MinMaxScaler().fit(data[[column]])
                for column in column_names if column in data.columns}

    @staticmethod
    def _map_and_fill(data, column_names, mapping, fill_values=None):
        """
        Replace each column by its mapped values and fill unmapped entries.

        The fill value comes from ``fill_values[column]`` when provided,
        otherwise from the most frequent mapped value of the column itself.
        When no fill value exists the unmapped entries stay missing.
        ``data`` is modified in place and returned.
        """
        for column in column_names:
            mapped = data[column].map(mapping)
            if fill_values is not None and column in fill_values:
                fill = fill_values[column]
            else:
                fill = PreProcessor._mode_or_none(mapped)
            data[column] = mapped if fill is None else mapped.fillna(fill)
        return data

    @staticmethod
    def drop_columns(data, column_names):
        """Drop ``column_names`` from ``data`` in place and return it."""
        data.drop(columns=column_names, inplace=True)
        return data

    @staticmethod
    def encode_columns(data, column_names, mappings):
        """
        Replace the values of each listed column using ``mappings[column]``.

        Columns without an entry in ``mappings`` are left alone; values without
        an entry in the column's mapping become missing. ``data`` is modified
        in place and returned.
        """
        for column in column_names:
            if column in mappings:
                data[column] = data[column].map(mappings[column])
        return data

    @staticmethod
    def map_sleep_values(data, column_names, mapping, fill_values=None):
        """Map sleep-duration answers to numbers; see ``_map_and_fill`` for the fill rule."""
        return PreProcessor._map_and_fill(data, column_names, mapping, fill_values)

    @staticmethod
    def map_dietary_value(data, column_names, mapping, fill_values=None):
        """Map dietary-habit answers to numbers; see ``_map_and_fill`` for the fill rule."""
        return PreProcessor._map_and_fill(data, column_names, mapping, fill_values)

    @staticmethod
    def frequency_encode(data, column_names, frequencies=None):
        """
        Replace each value by its relative frequency.

        ``frequencies`` optionally supplies a pre-computed ``{column: Series}``
        table; without it the frequencies are taken from ``data`` itself.
        Values absent from a supplied table are encoded as 0.0, while missing
        inputs stay missing. ``data`` is modified in place and returned.
        """
        for column in column_names:
            if column in data.columns:
                if frequencies is not None and column in frequencies:
                    freq = frequencies[column]
                else:
                    freq = data[column].value_counts() / len(data)
                encoded = data[column].map(freq)
                unseen = encoded.isna() & data[column].notna()
                data[column] = encoded.mask(unseen, 0.0)
        return data

    @staticmethod
    def one_hot_encode(data, column_names):
        """Replace each listed column by its one-hot indicator columns and return the new frame."""
        for column in column_names:
            if column in data.columns:
                one_hot = pd.get_dummies(data[column], prefix=column)
                data = pd.concat([data, one_hot], axis=1)
                data.drop(columns=[column], inplace=True)
        return data

    @staticmethod
    def normalise_columns(data, column_names, scalers=None):
        """
        Min-max scale each listed column that exists in ``data``.

        ``scalers`` optionally supplies already fitted ``{column: MinMaxScaler}``
        objects; without it a scaler is fitted on the column being scaled.
        ``data`` is modified in place and returned.
        """
        for column in column_names:
            if column in data.columns:
                # MinMaxScaler expects 2D input, hence the double brackets.
                if scalers is not None and column in scalers:
                    data[column] = scalers[column].transform(data[[column]])
                else:
                    data[column] = MinMaxScaler().fit_transform(data[[column]])
        return data

# data_pipeline = Pipeline([
#      ('cleaning', Cleaner(cleaning_transformations)),
#      ('preprocessing', PreProcessor(preprocessing_transformations))
# ])

# Apply the transformations to the data
# processed_data = data_pipeline.transform(train_df)

# # View the processed data
# print(processed_data.head())


In [38]:
"""
4. Creating Model / Training / Evaluating

TODO:
save TP, TN, FP, FN values for each iteration, save all values and average them
"""

# Hyper-parameters for every candidate classifier
logistic_regression_params = dict(max_iter=1000, random_state=42)
decision_tree_params = dict(criterion='gini', max_depth=3, random_state=42)
random_forest_params = dict(n_estimators=100, max_depth=5, random_state=42)
svm_params = dict(kernel='linear', C=1.0)
knn_params = dict(n_neighbors=5)


# Candidate estimators, keyed by the name under which their results are reported
models = dict(
    Logistic_regression=LogisticRegression(**logistic_regression_params),
    decision_tree=DecisionTreeClassifier(**decision_tree_params),
    random_forest=RandomForestClassifier(**random_forest_params),
)
# models.append({'SVM': SVC(**svm_params)})
# models.append({'K-Nearest Neighbors': KNeighborsClassifier(**knn_params)})

class Trainer(BaseEstimator, TransformerMixin):
    """
    Final pipeline step that cross-validates every candidate model.

    Parameters
    ----------
    models : dict
        Maps a display name to an estimator exposing ``fit`` and ``predict``.
    n_splits : int, default=5
        Number of stratified folds.

    Attributes
    ----------
    results : dict
        ``{model_name: {'accuracy', 'tp%', 'tn%', 'fp%', 'fn%'}}``, each value
        averaged over the folds. Despite the ``%`` in the key names, the
        confusion-matrix entries are stored as fractions of the fold size (0-1).
    """

    def __init__(self, models, n_splits=5):
        self.models = models
        self.n_splits = n_splits
        self.results = {}
        self.results_analyser = ResultsAnalyser()

    def fit(self, x, y):
        """
        Run stratified k-fold cross-validation for each model and store the averages.

        ``x`` and ``y`` are indexed with ``.iloc``, so they must be pandas objects.
        Raises ``ValueError`` when ``y`` does not contain exactly two classes,
        because the tp/tn/fp/fn breakdown is only defined for a binary target.
        """
        labels = np.unique(y)
        if len(labels) != 2:
            raise ValueError(
                f'Trainer expects a binary target but found {len(labels)} class(es)')

        skf = StratifiedKFold(n_splits=self.n_splits, shuffle=True, random_state=42)
        # Start from a clean slate so a refit never reports stale entries.
        self.results = {}

        for model_name, model in self.models.items():
            accuracies = []
            tp_percentages, tn_percentages, fp_percentages, fn_percentages = [], [], [], []
            for train_idx, val_idx in skf.split(x, y):
                x_train, x_val = x.iloc[train_idx], x.iloc[val_idx]
                y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

                model.fit(x_train, y_train)
                y_pred = model.predict(x_val)

                # Passing the labels keeps the matrix 2x2 even when a fold (or the
                # predictions for it) contains a single class; otherwise the
                # four-way unpacking below would fail.
                tn, fp, fn, tp = confusion_matrix(y_val, y_pred, labels=labels).ravel()

                # Total samples in fold
                total = len(y_val)

                # Store each outcome as a fraction of the fold
                tp_percentages.append(tp / total)
                tn_percentages.append(tn / total)
                fp_percentages.append(fp / total)
                fn_percentages.append(fn / total)
                accuracies.append(accuracy_score(y_val, y_pred))

            self.results[model_name] = {
                'accuracy': np.mean(accuracies),
                'tp%': np.mean(tp_percentages),
                'tn%': np.mean(tn_percentages),
                'fp%': np.mean(fp_percentages),
                'fn%': np.mean(fn_percentages)}
        return self

    def transform(self, x):
        """Pass ``x`` through unchanged; this step only evaluates models."""
        return x

    def get_results(self):
        """Print the stored results and return them, so callers can keep working with them."""
        self.results_analyser.view_results(self.results)
        print(self.results)
        return self.results
# results_analyser = ResultsAnalyser(model_results)
# results_analyser.view_results()

In [44]:
'''
5. Analysing results
'''

class ResultsAnalyser:
    """Presents the metrics gathered while training the models."""

    def __init__(self):
        ...

    def view_results(self, results):
        """Print one ``(model_name, metrics)`` tuple per line for the given results dict."""
        for result in results.items():
            print(result)

    @staticmethod
    def create_confusion_matrix(fp, fn, tp, tn):
        """
        Placeholder for building a confusion matrix from the four counts.

        Declared as a static method because it takes no ``self``; without the
        decorator a call on an instance would pass the instance as ``fp`` and
        fail with ``TypeError``. Not implemented yet.
        """
        ...


In [45]:
# Read the training split from the working directory
dataset = pd.read_csv('train.csv')

# Split the label off so that only feature columns flow through the pipeline
target = dataset.pop('Depression')

# Chain the stages: cleaning -> feature encoding/scaling -> cross-validated training
pipeline = Pipeline(steps=[
    ('cleaning', Cleaner(cleaning_transformations)),
    ('preprocessor', PreProcessor(preprocessing_transformations)),
    ('training', Trainer(models)),
])

pipeline.fit(dataset, target)

# Look the trainer step up by name and report what it measured
model_results = pipeline['training'].get_results()

('Logistic_regression', {'accuracy': np.float64(0.9376830135039091), 'tp%': np.float64(0.1462046908315565), 'tn%': np.float64(0.7914783226723525), 'fp%': np.float64(0.026808813077469795), 'fn%': np.float64(0.03550817341862118)})
('decision_tree', {'accuracy': np.float64(0.9100995024875622), 'tp%': np.float64(0.15526652452025586), 'tn%': np.float64(0.7548329779673064), 'fp%': np.float64(0.063454157782516), 'fn%': np.float64(0.02644633972992182)})
('random_forest', {'accuracy': np.float64(0.9275053304904052), 'tp%': np.float64(0.13539445628997868), 'tn%': np.float64(0.7921108742004265), 'fp%': np.float64(0.02617626154939588), 'fn%': np.float64(0.04631840796019901)})
{'Logistic_regression': {'accuracy': np.float64(0.9376830135039091), 'tp%': np.float64(0.1462046908315565), 'tn%': np.float64(0.7914783226723525), 'fp%': np.float64(0.026808813077469795), 'fn%': np.float64(0.03550817341862118)}, 'decision_tree': {'accuracy': np.float64(0.9100995024875622), 'tp%': np.float64(0.1552665245202558