In [1]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

In [2]:
"""Group 15 data science project
DATASET:
(https://www.kaggle.com/competitions/playground-series-s4e11/data)

PROCESS:
1. Load and clean the data
2. EDA - Exploring data
3. Data Pre-processing
4. Creating model
5. Cross-validated training
6. Evaluation of model
"""

'Group 15 data science project\nDATASET:\n(https://www.kaggle.com/competitions/playground-series-s4e11/data)\n\nPROCESS:\n1. Load and clean the data\n2. EDA - Exploring data\n3. Data Pre-processing\n4. Creating model\n5. Cross-validated training\n6. Evaluation of model\n'

In [3]:
"""
1. Loading and cleaning data
"""

# Load the data
train_df = pd.read_csv('data/train.csv')
test_df = pd.read_csv('data/test.csv')

# Removing target Column
target = train_df.pop('Depression')

cleaning_transformations = [
    {'drop': ['id', 'Name', 'City']},
    {'handle_na': []}
]

# Define Cleaner Class
class Cleaner(BaseEstimator, TransformerMixin):
    """
    This class will be used as part of the ML Pipeline for cleaning and handling data
    """
    def __init__(self, transformations):
        self.transformations = transformations
        
    def fit(self, x, y=None):
        return self

    def transform(self, x):
        x_temp = x.copy()
        
        for transformation in self.transformations:
            for transformation_type, column_names in transformation.items():
                if transformation_type == 'handle_na':
                    x_temp = self.deal_with_na_values(x_temp)
                elif transformation_type == 'drop':
                    x_temp = self.drop_unneeded_columns(x_temp, column_names)  
                            
        return x_temp
    
    @staticmethod
    def deal_with_na_values(data, verbose=False):
        data.fillna(0, inplace=True)
        return data
    
    @staticmethod
    def drop_unneeded_columns(data, column_names):
        return data.drop(columns=column_names)

In [4]:
"""
2. EDA - This is the class that will allow us to display information about the data
         The idea is that this information that we can extract/plot will allow us to tweak out data-preprocessing pipeline 
         to achieve better and more generalisable results
"""

class DataAnalyser:
    
    def __init__(self, base_train_df):
        self.base_train_df = base_train_df
        
    def view_basic_analysis(self):
        self.view_unique_columns()
        
    def view_unique_columns(self):
        print('[] Breakdown of Binary columns values []')
        for column in self.base_train_df.columns:
            print(f'Unique values in column "{column}": {self.base_train_df[column].unique()}')
                
analyser = DataAnalyser(train_df)
analyser.view_basic_analysis()

[] Breakdown of Binary columns values []
Unique values in column "id": [     0      1      2 ... 140697 140698 140699]
Unique values in column "Name": ['Aaradhya' 'Vivan' 'Yuvraj' 'Rhea' 'Vani' 'Ritvik' 'Rajveer' 'Aishwarya'
 'Simran' 'Utkarsh' 'Aahana' 'Tejas' 'Aadhya' 'Kiran' 'Aditi' 'Suhani'
 'Jiya' 'Bhavesh' 'Armaan' 'Ishaani' 'Prachi' 'Pratyush' 'Abhinav'
 'Siddhesh' 'Aditya' 'Aarav' 'Asha' 'Kashish' 'Prisha' 'Chhavi' 'Tanmay'
 'Vihaan' 'Shiv' 'Anvi' 'Darsh' 'Samar' 'Raunak' 'Mahi' 'Shaurya' 'Vidya'
 'Jai' 'Ayush' 'Ansh' 'Anand' 'Yashvi' 'Shrey' 'Ritika' 'Mihir' 'Isha'
 'Arjun' 'Rohan' 'Pratham' 'Nirvaan' 'Ishaan' 'Aarya' 'Riya' 'Aariv'
 'Raghavendra' 'Mahika' 'Abhishek' 'Harshil' 'Janvi' 'Kartikeya' 'Shivam'
 'Advait' 'Reyansh' 'Saanvi' 'Ivaan' 'Pallavi' 'Sneha' 'Ayaan' 'Aakash'
 'Raghav' 'Satyam' 'Aarush' 'Vibha' 'Rupal' 'Sanya' 'Mira' 'Rashi' 'Shlok'
 'Harsha' 'Divya' 'Pranav' 'Hrithik' 'Tushar' 'Garima' 'Zoya' 'Kian'
 'Navya' 'Lakshay' 'Kriti' 'Palak' 'Aryan' 'Parth' 'Ishan' '

In [5]:
"""
3. Data Pre-Processing

COLUMNS -
id: Can be dropped
Name: Can be dropped
Gender: binary encoding
age: Temporarily okay but will need normalised
city: Can be dropped
Working professional or Student: Binary encoding
Profession: One-hot-encoding (TEMPORARILY DROPPED)
Academic Pressure: Temporarily okay but will need normalised
Work Pressure: Temporarily okay but will need normalised
CGPA: replace NaN with 0 - also will need normalised
Study Satisfaction: replace NaN with 0 - also will need normalised
Job Satisfaction: replace NaN with 0 - also will need normalised
Sleep Duration: Ordinal Encoding (TEMPORARILY DROPPED)
Dietary Habits: Ordinal Encoding (TEMPORARILY DROPPED)
Degree: One-hot-encoding (TEMPORARILY DROPPED)
Suicidal Thoughts: Binary Encoding
Work/Study Hours: Normalised
Financial stress: Normalised
Mental Illness: Binary Encoding
Depression: Target variable (no pre-processing needed)
"""

preprocessing_transformations = [
    {'binary_encode': ['Gender', 'Have you ever had suicidal thoughts ?',
                       'Family History of Mental Illness',
                       'Working Professional or Student']},
    {'map_sleep_column': ['Sleep Duration']},
    {'map_diet_column': ['Dietary Habits']},
    {'frequency_encode': ['Profession', 'Degree']},
    {'normalise': ['Financial Stress', 'Age', 'Academic Pressure', 'Work Pressure',
                   'CGPA', 'Study Satisfaction', 'Job Satisfaction', 'Work/Study Hours', 'Financial Stress']}
]


class PreProcessor(BaseEstimator, TransformerMixin):
    """
    This class will handle the Pre-Processin of our data before passing to the Model
    
    """
    def __init__(self, transformations):
        self.transformations = transformations
        self.mappings = {
            'Gender': {'Male': 0, 'Female': 1},
            'Have you ever had suicidal thoughts ?': {'Yes': 0, 'No': 1},
            'Family History of Mental Illness': {'Yes': 0, 'No': 1},
            'Working Professional or Student': {'Working Professional': 0, 'Student': 1},
        }
        self.sleep_mapping = {
        "More than 8 hours":9,
        'Less than 5 hours':4,
        '5-6 hours':5.5,
        '7-8 hours':7.5,
        '1-2 hours':1.5,
        '6-8 hours':7,
        '4-6 hours':5,
        '6-7 hours':6.5,
        '10-11 hours':10.5,
        '8-9 hours':8.5,
        '9-11 hours':10,
        '2-3 hours':2.5,
        '3-4 hours':3.5,
        'Moderate':6,
        '4-5 hours':4.5,
        '9-6 hours':7.5,
        '1-3 hours':2,
        '1-6 hours':4,
        '8 hours':8,
        '10-6 hours':8,
        'Unhealthy':3,
        'Work_Study_Hours':6,
        '3-6 hours':4.5,
        '9-5':7,
        '9-5 hours':7,
        }
        self.diet_mapping ={
            'More Healty':0,
            'Healthy':1,
            'Less than Healthy':2,
            'Less Healthy':2,
            'Moderate':3,
            'Unhealthy':4,   
            'No Healthy':4,
        }
        
    def fit(self, x, y=None):
        return self
    
    def transform(self, x):
        x_temp = x.copy()        
        
        for transformation in self.transformations:
            for transformation_name, column_names in transformation.items():
                if transformation_name == 'drop_columns':
                    x_temp = self.drop_columns(x_temp, column_names)
                elif transformation_name == 'binary_encode':
                    x_temp = self.encode_columns(x_temp, column_names, self.mappings)
                elif transformation_name == 'normalise':
                    x_temp = self.normalise_columns(x_temp, column_names) 
                elif transformation_name == 'map_sleep_column':
                    x_temp = self.map_sleep_values(x_temp, column_names, self.sleep_mapping)
                elif transformation_name == 'frequency_encode':
                    x_temp = self.frequency_encode(x_temp, column_names)
                elif transformation_name == 'map_diet_column':
                    x_temp = self.map_dietary_value(x_temp, column_names, self.diet_mapping)
                    
        return x_temp
    
    @staticmethod
    def drop_columns(data, column_names):
        data.drop(columns=column_names, inplace=True)
        return data
    
    @staticmethod
    def encode_columns(data, column_names, mappings):
        for column in column_names:
            if column in mappings:
                data[column] = data[column].map(mappings[column])
        return data
    
    @staticmethod
    def map_sleep_values(data, column_names, mapping):
        for column in column_names:
            data[column] = data[column].map(mapping)
            data[column].fillna(data[column].mode()[0], inplace=True)  # Fill NaNs with the most frequent value
        return data
    
    @staticmethod
    def map_dietary_value(data, column_names, mapping):
        for column in column_names:
            data[column] = data[column].map(mapping)
            data[column].fillna(data[column].mode()[0], inplace=True)  # Fill NaNs with the most frequent value
        return data
        

    @staticmethod
    def frequency_encode(data, column_names):
        for column in column_names:
            if column in data.columns:
                freq = data[column].value_counts() / len(data)
                data[column] = data[column].map(freq)
        return data
    
#     @staticmethod
#     def one_hot_encode(data, column_names):
#         for column in column_names:
#             if column in data.columns:
#                 # Perform one-hot encoding using pd.get_dummies
#                 one_hot = pd.get_dummies(data[column], prefix=column)
#                 # Concatenate the one-hot encoded columns to the original data
#                 data = pd.concat([data, one_hot], axis=1)
#                 # Drop the original column after encoding
#                 data.drop(columns=[column], inplace=True)
#         return data
    
    @staticmethod
    def normalise_columns(data, column_names):
        scaler = MinMaxScaler()  # Initialize MinMaxScaler
        for column in column_names:
            if column in data.columns:
                # Reshape for MinMaxScaler since it expects 2D input
                data[column] = scaler.fit_transform(data[[column]])
        return data
    
data_pipeline = Pipeline([
     ('cleaning', Cleaner(cleaning_transformations)),
     ('preprocessing', PreProcessor(preprocessing_transformations))
])

# Apply the transformations to the data
processed_data = data_pipeline.transform(train_df)

# View the processed data
print(processed_data.head())


   Gender       Age  Working Professional or Student  Academic Pressure  \
0       1  0.738095                                0                0.0   
1       0  0.190476                                0                0.0   
2       0  0.357143                                1                1.0   
3       0  0.095238                                0                0.0   
4       1  0.285714                                0                0.0   

   Work Pressure   CGPA  Study Satisfaction  Job Satisfaction  Sleep Duration  \
0            1.0  0.000                 0.0               0.4             9.0   
1            0.8  0.000                 0.0               0.6             4.0   
2            0.0  0.897                 0.4               0.0             5.5   
3            1.0  0.000                 0.0               0.2             4.0   
4            0.2  0.000                 0.0               0.2             5.5   

   Dietary Habits  ...  Degree_Ritik  Degree_S.Arch  Degree_S.

In [6]:
'''
5. Analysing results
'''

class ResultsAnalyser:
    
    def __init__(self, results):
        self.results = results
        self.process_results()
        
    def process_results(self):
        # for results_set in self.results:
            # ...put through function to build confusin matrix - 
        ...
        
    def view_results(self):
        for result in self.results:
            print(result)

In [7]:
"""
4. Creating Model / Training / Evaluating
"""

# Define model parameters
logistic_regression_params = {'max_iter': 1000, 'random_state': 42}
decision_tree_params = {'criterion': 'gini', 'max_depth': 3, 'random_state': 42}
random_forest_params = {'n_estimators': 100, 'max_depth': 5, 'random_state': 42}
svm_params = {'kernel': 'linear', 'C': 1.0}
knn_params = {'n_neighbors': 5}


# Define models
models = []
models.append({'Logistic_regression': LogisticRegression(**logistic_regression_params)})
models.append({'decision_tree': DecisionTreeClassifier(**decision_tree_params)})
models.append({'Random Forest': RandomForestClassifier(**random_forest_params)})
# models.append({'SVM': SVC(**svm_params)})
# models.append({'K-Nearest Neighbors': KNeighborsClassifier(**knn_params)})

model_results = []
for model_info in models:
    for model_name, model in model_info.items():
      # Define stratified k-fold cross-validation
      n_splits = 5
      skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

      # For tracking scores
      fold = 1
      accuracies = []

      # Cross-validation loop
      for train_index, val_index in skf.split(train_df, target):
          # Split data
          X_train, X_val = train_df.iloc[train_index], train_df.iloc[val_index]
          y_train, y_val = target.iloc[train_index], target.iloc[val_index]
        
          # Define ML Pipeline
          pipeline = Pipeline([
              ('cleaning', Cleaner(cleaning_transformations)),
              ('preprocessing', PreProcessor(preprocessing_transformations)),
              ('classifier', model)
          ])
            
          pipeline.fit(X_train, y_train)

          # Predict on validation set
          y_pred = pipeline.predict(X_val)

          # Evaluate accuracy
          acc = accuracy_score(y_val, y_pred)
          accuracies.append(acc)
            
          print(f'Fold {fold}/5 for model: {model_name}')

          fold += 1
        
      print({model_name: np.mean(accuracies)})

      model_results.append({model_name: np.mean(accuracies)})
        
results_analyser = ResultsAnalyser(model_results)
results_analyser.view_results()

ValueError: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- Degree_29
- Degree_5.56
- Degree_5.61
- Degree_5.88
- Degree_B.03
- ...
Feature names seen at fit time, yet now missing:
- Degree_20
- Degree_24
- Degree_7.06
- Degree_8.56
- Degree_ACA
- ...
