In [27]:
### IMPORTING IMPORTANT LIBRARIES

import kagglehub

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [28]:
# Survey responses on student mental health, read from a CSV in the working directory.
student_health = pd.read_csv('Student Mental health.csv')

# Student performance factors, fetched through kagglehub; `path` is the value
# returned by dataset_download and is used as the directory holding the CSV.
path = kagglehub.dataset_download("lainguyn123/student-performance-factors")
performance_csv = path + '/StudentPerformanceFactors.csv'
student_performance = pd.read_csv(performance_csv)

In [None]:
# Preview the first rows of the performance-factors dataset.
student_performance.head()

In [None]:
# Preview the first rows of the mental-health survey dataset.
student_health.head()

In [None]:
# --- Performance data: derive a CGPA band from the exam score ----------------
# Exam_Score / 25 rescales the score so that 100 corresponds to 4.00; the result
# is cut into the three bands below. Values outside [0, 4.00] match no bin and
# become NaN.
bins = [0, 1.99, 2.99, 4.00]
labels = ['0-1.99', '2.00-2.99', '3.00-4.00']
student_performance['CGPA'] = pd.cut(
    student_performance['Exam_Score'] / 25,
    bins=bins,
    labels=labels,
    include_lowest=True,
)

# --- Survey data: replace the question-style headers with short names --------
student_health.rename(
    columns={
        'What is your CGPA?': 'CGPA',
        'What is your course?': 'Course',
        'Do you have Depression?': 'Depression',
        'Do you have Anxiety?': 'Anxiety',
        'Do you have Panic attack?': 'Panic attack',
        'Your current year of Study': 'Year of Study',
        'Did you seek any specialist for a treatment?': 'Seeked Help',
    },
    inplace=True,
)

# --- Survey data: collapse the five survey CGPA intervals into the same three
# bands used for the performance data ---------------------------------------
interval_mapping = {
    '0 - 1.99': '0-1.99',
    '2.00 - 2.49': '2.00-2.99',
    '2.50 - 2.99': '2.00-2.99',
    '3.00 - 3.49': '3.00-4.00',
    '3.50 - 4.00': '3.00-4.00'
}
# Map each target band onto itself as well, so that re-running this cell on an
# already-normalised column leaves it unchanged instead of turning it into NaN.
interval_mapping.update({band: band for band in labels})

# Strip surrounding whitespace first: a label such as '3.50 - 4.00 ' would
# otherwise miss every key in the mapping and silently become NaN.
student_health['CGPA'] = student_health['CGPA'].str.strip().map(interval_mapping)

# --- Survey data: normalise 'Year of Study' (e.g. 'Year 1' / 'year 1' -> '1') -
student_health['Year of Study'] = (
    student_health['Year of Study']
    .str.lower()
    .str.replace('year ', '', regex=False)
    .str.strip()
)

student_health.head()

In [None]:
from matplotlib import pyplot as plt
import seaborn as sns
# Number of performance rows per CGPA band, drawn as a horizontal bar chart.
cgpa_counts = student_performance.groupby('CGPA').size()
ax = cgpa_counts.plot(kind='barh', color=sns.palettes.mpl_palette('Dark2'))
# Hide the top and right frame lines of the chart.
ax.spines[['top', 'right']].set_visible(False)

In [None]:
from matplotlib import pyplot as plt
import seaborn as sns
# NOTE(review): this draws the same CGPA-band chart as the preceding cell; one
# of the two cells is likely redundant.
band_sizes = student_performance.groupby('CGPA').size()
band_sizes.plot(kind='barh', color=sns.palettes.mpl_palette('Dark2'))
# Hide the top and right frame lines of the chart.
for side in ('top', 'right'):
    plt.gca().spines[side].set_visible(False)

In [None]:
# Inner-join the performance data with the survey data on the shared CGPA band.
# NOTE(review): CGPA takes only three band values, so this is a many-to-many
# join: each performance row is paired with every survey row in the same band.
# The paired rows presumably do not describe the same student - confirm that
# this is the intended way to combine the two sources.
df = student_performance.merge(student_health, how='inner', on='CGPA')

# Preview the joined frame.
df.head()

In [35]:
# Remove the columns listed below, then discard every row that still has a
# missing value.
columns_to_drop = ['Timestamp', 'Choose your gender', 'Gender', 'Course', 'Exam_Score', 'CGPA']

# errors='ignore' keeps the cell re-runnable: with a plain in-place drop, a
# second execution raises KeyError because the columns are already gone.
df = df.drop(columns=columns_to_drop, errors='ignore').dropna()

In [None]:
# Collect the distinct values of every column to see which encodings are
# present before converting dtypes.
unique_vales = df.apply(lambda column: column.unique())
print(unique_vales)

In [None]:
# Columns expected to hold 'Yes' / 'No' answers.
cols_to_convert = ['Depression', 'Anxiety', 'Panic attack', 'Seeked Help', 'Extracurricular_Activities', 'Internet_Access', 'Marital status', 'Learning_Disabilities']

# 'Yes' -> True, 'No' -> False. Series.map turns any value that is not a key
# of this dict into NaN.
mapping = {'Yes': True, 'No': False}

for col in cols_to_convert:
    # Skip columns that are absent, and columns that are already boolean:
    # pushing True/False through the Yes/No mapping would replace every value
    # with NaN, which is what a second execution of this cell used to do.
    if col in df.columns and not pd.api.types.is_bool_dtype(df[col]):
        df[col] = df[col].map(mapping)

df.dtypes

In [None]:
# Features that are converted to pandas' category dtype.
categorical_cols = ['Parental_Involvement', 'Motivation_Level', 'Access_to_Resources', 'Year of Study', 'Distance_from_Home', 'Family_Income', 'Teacher_Quality', 'Peer_Influence', 'Parental_Education_Level']

# Only convert the columns that are actually present in the frame.
for col in (name for name in categorical_cols if name in df.columns):
    df[col] = df[col].astype('category')

df.dtypes

In [None]:
# Encode the school type as a boolean flag (Public -> False, Private -> True)
# and rename the column so that its name says what True means.
# The guard keeps the cell re-runnable: once the column has been renamed, a
# second execution would otherwise raise KeyError on 'School_Type'.
if 'School_Type' in df.columns:
    df['School_Type'] = df['School_Type'].map({'Public': False, 'Private': True})
    df.rename(columns={'School_Type': 'Private_School'}, inplace=True)
df.dtypes

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix

# Cross-tabulate Depression against each of the other boolean columns and draw
# the counts as a heatmap. confusion_matrix is used here as a 2x2 contingency
# table: rows are Depression values, columns are the values of `col`.
boolean_features = ['Anxiety', 'Panic attack', 'Seeked Help', 'Extracurricular_Activities',
                    'Internet_Access', 'Marital status', 'Learning_Disabilities', 'Private_School']

for col in boolean_features:
    # labels=[False, True] pins the matrix to 2x2 in a fixed order, so it always
    # lines up with the hard-coded tick labels below - even when a value never
    # occurs in the data.
    cm = confusion_matrix(df['Depression'], df[col], labels=[False, True])

    # Plotting the confusion matrix
    plt.figure(figsize=(6, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False,
                xticklabels=['False', 'True'], yticklabels=['False', 'True'])
    plt.xlabel(col)
    plt.ylabel('Depression')
    plt.title(f'Confusion Matrix: Depression vs. {col}')

    # Keep the y tick labels horizontal and move them to the right-hand side
    plt.yticks(rotation=0)
    plt.gca().yaxis.tick_right()
    plt.show()

In [None]:
# One grouped count plot per categorical feature, with bars split by Depression.
categorical_features = (
    'Parental_Involvement', 'Motivation_Level', 'Access_to_Resources',
    'Year of Study', 'Distance_from_Home', 'Family_Income',
    'Teacher_Quality', 'Peer_Influence', 'Parental_Education_Level',
)

for col in categorical_features:
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.countplot(data=df, x=col, hue='Depression', ax=ax)
    ax.set_title(f'Depression vs. {col}')
    ax.set_xlabel(col)
    ax.set_ylabel('Count')
    # Slant the category labels so that long names stay readable.
    plt.xticks(rotation=45, ha='right')
    # Re-fit the layout so the rotated labels are not clipped.
    fig.tight_layout()
    plt.show()

In [None]:
# Overlay, for every numeric column, the histogram of the rows without
# depression and the histogram of the rows with depression.
numeric_cols = df.select_dtypes(include=np.number).columns

# Row masks for the two groups; they do not depend on the column being plotted.
without_depression = df['Depression'] == False
with_depression = df['Depression'] == True

for col in numeric_cols:
    plt.figure(figsize=(8, 6))
    plt.hist(df.loc[without_depression, col], alpha=0.5, label='No Depression')
    plt.hist(df.loc[with_depression, col], alpha=0.5, label='Depression')
    plt.title(f'Distribution of {col} by Depression')
    plt.xlabel(col)
    plt.ylabel('Frequency')
    plt.legend()
    plt.show()

In [43]:
# Keep a handle on the full frame, then narrow `df` to the target column
# ('Depression') plus the features used by the models below.
df_original = df
selected_columns = [
    'Depression',
    'Marital status',
    'Parental_Involvement',
    'Access_to_Resources',
    'Year of Study',
    'Teacher_Quality',
    'Peer_Influence',
    'Age',
]
df = df_original.loc[:, selected_columns]

In [44]:
# Converting the categorical values to numerical values
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Factory for "column preprocessing + classifier" pipelines.
def preprocess(df, classifier, **kwargs):
    """Build an unfitted pipeline: column preprocessing followed by a classifier.

    Columns are routed by the dtypes found in ``df``: boolean and numeric
    columns are passed through unchanged, ``category`` columns are one-hot
    encoded. Columns of any other dtype are not assigned to a transformer and
    are therefore dropped by ``ColumnTransformer`` (its default
    ``remainder='drop'``).

    Args:
        df: DataFrame whose column dtypes decide the routing. It is only
            inspected here; nothing is fitted on it.
        classifier: Estimator class (not an instance), e.g.
            ``LogisticRegression``.
        **kwargs: Keyword arguments forwarded to ``classifier``.

    Returns:
        An unfitted ``Pipeline`` with the steps ``'preprocessor'`` and
        ``'classifier'``.
    """
    bool_list = df.select_dtypes(include=['bool']).columns
    cat_list = df.select_dtypes(include=['category']).columns
    # 'number' covers every numeric dtype; listing only int64/float64 silently
    # dropped columns stored as e.g. int32 or float32.
    num_list = df.select_dtypes(include='number').columns

    preprocessor = ColumnTransformer(
        transformers=[
            ('bool', 'passthrough', bool_list),
            # handle_unknown='ignore': a category that did not occur while
            # fitting is encoded as all zeros instead of raising at predict time.
            ('cat', OneHotEncoder(handle_unknown='ignore'), cat_list),
            ('num', 'passthrough', num_list),
        ])

    classifier_instance = classifier(**kwargs)

    model = Pipeline([
        ('preprocessor', preprocessor),
        ('classifier', classifier_instance),
    ])

    return model

In [45]:
from sklearn.model_selection import train_test_split

# Separate the features from the target column.
X = df.drop(columns=['Depression'])
y = df['Depression']

# Hold out 20% of the rows for testing. random_state fixes the shuffle so that
# the split - and every metric computed from it - is the same on each run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Fit a logistic-regression pipeline on the training split.
logreg_model = preprocess(X, LogisticRegression, max_iter=10000)
logreg_model.fit(X_train, y_train)

# Score it on the held-out split.
y_pred = logreg_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")
print(classification_report(y_test, y_pred))

# Heatmap of the confusion matrix: actual classes on the y axis, predicted
# classes on the x axis.
cm = confusion_matrix(y_test, y_pred)
ax = sns.heatmap(cm, annot=True, fmt='d')
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
plt.show()

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Fit a multinomial naive-Bayes pipeline on the training split.
nb_model = preprocess(X, MultinomialNB)
nb_model.fit(X_train, y_train)

# Score it on the held-out split.
y_pred_nb = nb_model.predict(X_test)
accuracy_nb = accuracy_score(y_test, y_pred_nb)
print(f"Naive Bayes Accuracy: {accuracy_nb}")
print(classification_report(y_test, y_pred_nb))

# Heatmap of the confusion matrix: actual classes on the y axis, predicted
# classes on the x axis.
cm_nb = confusion_matrix(y_test, y_pred_nb)
ax = sns.heatmap(cm_nb, annot=True, fmt='d')
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
ax.set_title('Confusion Matrix for Naive Bayes')
plt.show()

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Fit a random-forest pipeline on the training split. random_state fixes the
# forest's internal randomness so that the reported metrics are the same on
# each run.
rf_model = preprocess(X, RandomForestClassifier, n_estimators=200, random_state=42)
rf_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = rf_model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")
print(classification_report(y_test, y_pred))

# Heatmap of the confusion matrix: actual classes on the y axis, predicted
# classes on the x axis.
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(cm, annot=True, fmt='d')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# Fit one KNN pipeline per candidate number of neighbours and record its error
# rate (1 - accuracy) on the test split.
# NOTE(review): K is compared on the same test split that is later used to
# report the final KNN score; consider a validation split or cross-validation.
k_values = range(1, 11)

error_rates = []
for n in k_values:
    # Train the KNN Model
    knn = preprocess(X, KNeighborsClassifier, n_neighbors=n)
    knn.fit(X_train, y_train)

    # Make predictions on the test set
    y_pred = knn.predict(X_test)
    error = 1 - accuracy_score(y_test, y_pred)
    error_rates.append(error)

plt.plot(k_values, error_rates, marker='o')
# The title now names the same quantity as the x axis (K); it used to say "N".
plt.title('Error Rate vs. K Value')
plt.xlabel('K')
plt.ylabel('Error Rate')
plt.show()

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Fit the KNN pipeline with 3 neighbours on the training split.
knn_model = preprocess(X, KNeighborsClassifier, n_neighbors=3)
knn_model.fit(X_train, y_train)

# Score it on the held-out split.
y_pred = knn_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")
print(classification_report(y_test, y_pred))

# Heatmap of the confusion matrix: actual classes on the y axis, predicted
# classes on the x axis.
cm = confusion_matrix(y_test, y_pred)
ax = sns.heatmap(cm, annot=True, fmt='d')
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
plt.show()