## Interactive Visualization

In [None]:
import pandas as pd
from bokeh.layouts import column
from bokeh.models import ColumnDataSource, Select, FactorRange, CustomJS
from bokeh.plotting import figure
from bokeh.io import output_file, save

# Load the survey data and relabel the binary target so plots show readable names.
df = pd.read_csv('student_depression_dataset.csv')
df['Depression'] = df['Depression'].map({0: 'Not Depressed', 1: 'Depressed'})

# Normalise the sleep-duration labels: cast to text, then trim surrounding
# whitespace and any wrapping single quotes.
df['Sleep Duration'] = (
    df['Sleep Duration']
    .astype(str)
    .str.strip()
    .str.strip("'")
)

# Features offered in the interactive selector.
categorical_columns = [
    'Academic Pressure',
    'Study Satisfaction',
    'Sleep Duration',
    'Dietary Habits',
    'Work/Study Hours',
    'Financial Stress',
]

# Order of the two target groups inside every category on the x-axis.
depression_levels = ['Not Depressed', 'Depressed']

# Explicit display order for 'Sleep Duration' (used by make_grouped_data).
study_order = [
    'Less than 5 hours',
    '5-6 hours',
    '7-8 hours',
    'More than 8 hours',
    'Others',
]

# Explicit display order for 'Work/Study Hours': '0.0', '1.0', ..., '12.0'.
work_study_hours_order = [f'{hours}.0' for hours in range(13)]

def make_grouped_data(feature):
    """Build the grouped-bar data for one feature.

    For every category of ``feature`` the share (in percent) of each
    depression level within that category is computed from the module-level
    ``df``.

    Unlike the previous implementation, the shared ``df`` is NOT modified:
    the string casts are applied to local copies only.

    Args:
        feature: Name of a column in ``df``.

    Returns:
        A ``(factors, percents)`` tuple. ``factors`` is a list of
        ``(category, depression_level)`` tuples in display order and
        ``percents`` is the parallel list of percentages. Combinations that
        do not occur in the data get ``0.0``.
    """
    # Local, string-typed view of the two columns; the global frame stays untouched.
    local = pd.DataFrame({
        'category': df[feature].astype(str),
        'status': df['Depression'].astype(str),
    })

    # Rows per (category, status) pair, and each pair's share of its category.
    counts = local.groupby(['category', 'status']).size()
    totals = counts.groupby(level='category').transform('sum')
    percent_lookup = (counts / totals * 100).to_dict()

    # Two features have a fixed display order; the others are sorted as
    # strings (lexicographically).
    if feature == 'Sleep Duration':
        ordered_categories = study_order
    elif feature == 'Work/Study Hours':
        ordered_categories = work_study_hours_order
    else:
        ordered_categories = sorted(counts.index.get_level_values('category').unique())

    factors = [(cat, dep) for cat in ordered_categories for dep in depression_levels]
    # One dictionary lookup per factor instead of re-filtering the counts table.
    percents = [float(percent_lookup.get(key, 0)) for key in factors]

    return factors, percents

# Pre-compute the bar data for every selectable feature so the browser-side
# callback can switch between them without a Python server.
all_data = {
    name: dict(zip(('x', 'percent'), make_grouped_data(name)))
    for name in categorical_columns
}

# The first feature in the list is shown when the page opens.
initial_feature = categorical_columns[0]
source = ColumnDataSource(data=all_data[initial_feature])

plot = figure(
    x_range=FactorRange(*all_data[initial_feature]['x']),
    width=1200,
    height=500,
    title=f"Depression % by {initial_feature}",
    tools="",
    toolbar_location=None,
)
plot.vbar(source=source, x='x', top='percent', width=0.8)
plot.xaxis.major_label_orientation = 1.2
plot.yaxis.axis_label = "% of students per category"

select = Select(title="Select Feature", options=categorical_columns, value=initial_feature)

# JavaScript run in the browser whenever the dropdown value changes: swap the
# data, the x-axis factors and the title to the chosen feature.
switch_feature = CustomJS(
    args=dict(source=source, plot=plot, all_data=all_data),
    code="""
        const feature = cb_obj.value;
        const data = all_data[feature];
        source.data = { x: data.x, percent: data.percent };
        plot.x_range.factors = data.x;
        plot.title.text = `Depression % by ${feature}`;
        source.change.emit();
    """
)
select.js_on_change("value", switch_feature)

# Stack the dropdown above the chart and write a standalone HTML page.
layout = column(select, plot)
output_file("depression_visualization.html")
save(layout)


## Correlation

In [None]:
from scipy.stats import chi2_contingency
import numpy as np
from scipy.stats import pointbiserialr
import matplotlib.pyplot as plt

# Start again from the raw CSV for the correlation analysis.
df = pd.read_csv('student_depression_dataset.csv')

# Features analysed with chi-square / Cramér's V.
categorical_features = ['Sleep Duration', 'Dietary Habits']

# Features analysed with the point-biserial correlation.
numerical_features = [
    'Academic Pressure',
    'Study Satisfaction',
    'Financial Stress',
    'CGPA',
    'Work/Study Hours',
]

# Collects one (feature, absolute strength) row per analysed feature.
correlation_strenght = pd.DataFrame(columns=["Feature", "Correlation Strength"])

def cramers_v(x, y):
    """Return Cramér's V, a measure of association between two categorical variables.

    V is computed as ``sqrt((chi2 / n) / min(rows - 1, cols - 1))`` from the
    contingency table of ``x`` and ``y``.

    Args:
        x: Categorical values (anything ``pd.crosstab`` accepts as an index).
        y: Categorical values of the same length as ``x``.

    Returns:
        Cramér's V as a float. When the contingency table has fewer than two
        rows or fewer than two columns (a constant variable, or no data) the
        statistic is undefined; ``0.0`` is returned instead of dividing by
        zero.

    Note:
        ``chi2_contingency`` is called with its default settings, which
        according to the SciPy documentation apply Yates' continuity
        correction to 2x2 tables.
    """
    contingency = pd.crosstab(x, y)
    n_rows, n_cols = contingency.shape

    # Degenerate table: the denominator below would be zero (or negative for
    # an empty table), so report "no measurable association".
    min_dim = min(n_rows, n_cols) - 1
    if min_dim <= 0:
        return 0.0

    chi2 = chi2_contingency(contingency)[0]
    n = contingency.to_numpy().sum()
    return float(np.sqrt(chi2 / n / min_dim))

# Categorical features: chi-square test of independence plus Cramér's V.
for feature in categorical_features:
    contingency_table = pd.crosstab(df[feature], df['Depression'])
    # Only the statistic and the p-value are reported.
    chi2, p = chi2_contingency(contingency_table)[:2]
    cramers_v_value = cramers_v(df[feature], df['Depression'])
    print(f'Feature: {feature}')
    print(f'Chi-square statistic: {chi2}, p-value: {p}\n')
    print(f"Cramér's V: {cramers_v_value}\n")
    # Append a row at the next free integer label.
    correlation_strenght.loc[len(correlation_strenght)] = [feature, abs(cramers_v_value)]

# Numerical features: point-biserial correlation with the binary target.
for feature in numerical_features:
    # Values that cannot be parsed as numbers become NaN (kept on df, as before).
    df[feature] = pd.to_numeric(df[feature], errors='coerce')
    # Drop incomplete rows for THIS feature only. Reassigning df here would
    # permanently shrink the data, making every later feature's result depend
    # on the missing values of the features processed before it.
    complete = df.dropna(subset=[feature, 'Depression'])
    correlation, p_value = pointbiserialr(complete[feature], complete['Depression'])
    print(f'Feature: {feature}')
    print(f'Correlation coefficient: {correlation}, p-value: {p_value}\n')
    correlation_strenght.loc[len(correlation_strenght)] = [feature, abs(correlation)]

# Horizontal bar chart; ascending sort puts the strongest association at the top.
correlation_strenght = correlation_strenght.sort_values(by="Correlation Strength", ascending=True)
plt.figure(figsize=(10, 6))
plt.barh(correlation_strenght['Feature'], correlation_strenght['Correlation Strength'])
plt.xlabel('Correlation Strength')
plt.title('Correlation Strength with Depression')
plt.tight_layout()
plt.savefig("correlation_strength.png")
