In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.decomposition import PCA
from scipy.stats import ttest_1samp, chi2_contingency
from statsmodels.stats.weightstats import ztest
# --- Data loading and preprocessing ---------------------------------------
data = pd.read_csv('/content/Student_Mental_Stress_and_Coping_Mechanisms.csv')  # Change path if needed
# Drop the identifier column so it is not picked up as a feature below.
data = data.drop('Student ID', axis=1)
categorical_cols = ['Gender', 'Counseling Attendance', 'Stress Coping Mechanisms',
                    'Family Mental Health History', 'Medical Condition']
# Replace each categorical column with integer codes and keep the fitted
# encoder so the codes can be mapped back to the original labels later
# (label_encoders[col].inverse_transform).
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])
    label_encoders[col] = le
# Features are every remaining column except the target.
X = data.drop('Mental Stress Level', axis=1)
y = data['Mental Stress Level']
# Whole-dataset standardisation, kept under its original name because later
# code in this file (the PCA step) reads X_scaled. It is no longer used to
# build the train/test features.
X_scaled = StandardScaler().fit_transform(X)
# Split BEFORE fitting the scaler used for modelling: fitting it on all rows
# would let the test rows' mean/variance leak into the training features.
# Same test_size/random_state as before, so the same rows land in each split.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)  # statistics learned from training rows only
X_test = scaler.transform(X_test_raw)        # test rows reuse the training statistics
# Binarise the target: 1 when the stress level is strictly above 5, else 0.
y_train_binary = (y_train > 5).astype(int)
y_test_binary = (y_test > 5).astype(int)
# Fit three classifiers on the same training split against the binary target.
# Each estimator's fit() returns the estimator itself, so construction and
# fitting are chained.
log_reg = LogisticRegression(random_state=42).fit(X_train, y_train_binary)
rf_clf = RandomForestClassifier(random_state=42).fit(X_train, y_train_binary)
svm_clf = SVC(random_state=42).fit(X_train, y_train_binary)
# Predict on the held-out split, one prediction array per fitted model.
y_pred_logreg, y_pred_rf, y_pred_svm = (
    fitted.predict(X_test) for fitted in (log_reg, rf_clf, svm_clf)
)
def evaluate_model(y_true, y_pred):
    """Score one set of predictions against the true labels.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned with ``y_true``.

    Returns:
        A dict with the keys "Accuracy", "Precision", "Recall" and
        "F1-score", each mapped to the value of the corresponding
        scikit-learn metric called with its default settings.
    """
    scorers = (
        ("Accuracy", accuracy_score),
        ("Precision", precision_score),
        ("Recall", recall_score),
        ("F1-score", f1_score),
    )
    return {label: scorer(y_true, y_pred) for label, scorer in scorers}

# Pair each model's display name with its test-set predictions; insertion
# order determines the column order of the printed table.
predictions_by_model = {
    "Logistic Regression": y_pred_logreg,
    "Random Forest": y_pred_rf,
    "SVM": y_pred_svm,
}
results = {
    model_name: evaluate_model(y_test_binary, model_pred)
    for model_name, model_pred in predictions_by_model.items()
}
# One column per model, one row per metric.
results_table = pd.DataFrame(results)
print("\nModel Evaluation Results:")
print(results_table)
# Rank the random forest's feature importances, labelled by feature column.
importances = rf_clf.feature_importances_
feature_importance = pd.Series(data=importances, index=X.columns)
feature_importance = feature_importance.sort_values(ascending=False)
top_features = feature_importance.head(3)
print("\nTop 3 Important Features:", top_features, sep="\n")
# Project the standardised feature matrix onto its first two principal
# components and report the share of variance each component explains.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)
print("\nExplained Variance Ratio by PCA:", pca.explained_variance_ratio_, sep="\n")
# One-sample t-test: is the mean of the stress-level column different from 5?
t_stat, p_value = ttest_1samp(y, popmean=5)
print("\nT-Test Results:")
print(f"T-statistic = {t_stat:.4f}, p-value = {p_value:.4f}")

# Decide at the 5% significance level.
t_test_conclusion = (
    "Conclusion: Reject null hypothesis → Mean mental stress level is significantly different from 5."
    if p_value < 0.05
    else "Conclusion: Fail to reject null hypothesis → Mean mental stress level is NOT significantly different from 5."
)
print(t_test_conclusion)

# Two-sample z-test on weekly study hours, grouped by the encoded
# 'Counseling Attendance' value.
# NOTE(review): codes 1/0 are assumed to mean attended / did not attend;
# confirm against label_encoders['Counseling Attendance'].classes_.
counseling_yes = data.loc[data['Counseling Attendance'] == 1, 'Study Hours Per Week']
counseling_no = data.loc[data['Counseling Attendance'] == 0, 'Study Hours Per Week']

z_stat, p_value_z = ztest(counseling_yes, counseling_no)

print("\nZ-Test Results:")
print(f"Z-statistic = {z_stat:.4f}, p-value = {p_value_z:.4f}")

# Decide at the 5% significance level.
z_test_conclusion = (
    "Conclusion: Significant difference in study hours between counseling attendees and non-attendees."
    if p_value_z < 0.05
    else "Conclusion: No significant difference in study hours based on counseling attendance."
)
print(z_test_conclusion)

# Chi-square test of independence between the encoded 'Gender' and
# 'Counseling Attendance' columns, based on their cross-tabulated counts.
contingency_table = pd.crosstab(index=data['Gender'], columns=data['Counseling Attendance'])
chi2_stat, p_val_chi, dof, expected = chi2_contingency(contingency_table)

print("\nChi-Square Test Results:")
print(f"Chi2-statistic = {chi2_stat:.4f}, p-value = {p_val_chi:.4f}")

# Decide at the 5% significance level.
chi_square_conclusion = (
    "Conclusion: Significant relationship between Gender and Counseling Attendance."
    if p_val_chi < 0.05
    else "Conclusion: No significant relationship between Gender and Counseling Attendance."
)
print(chi_square_conclusion)




Model Evaluation Results:
           Logistic Regression  Random Forest       SVM
Accuracy              0.480263       0.519737  0.526316
Precision             0.432836       0.471698  0.482143
Recall                0.414286       0.357143  0.385714
F1-score              0.423358       0.406504  0.428571

Top 3 Important Features:
Study Hours Per Week                  0.126116
Physical Exercise (Hours per week)    0.082010
Age                                   0.077096
dtype: float64

Explained Variance Ratio by PCA:
[0.06839345 0.06683797]

T-Test Results:
T-statistic = 3.4489, p-value = 0.0006
Conclusion: Reject null hypothesis → Mean mental stress level is significantly different from 5.

Z-Test Results:
Z-statistic = 1.1762, p-value = 0.2395
Conclusion: No significant difference in study hours based on counseling attendance.

Chi-Square Test Results:
Chi2-statistic = 8.9103, p-value = 0.2592
Conclusion: No significant relationship between Gender and Counseling Attendance.
