In [2]:
import pandas as pd

# Load the dataset
data = pd.read_csv('/content/data/student_lifestyle_dataset.csv')
# Display first few rows
data.head()


Unnamed: 0,Student_ID,Study_Hours_Per_Day,Extracurricular_Hours_Per_Day,Sleep_Hours_Per_Day,Social_Hours_Per_Day,Physical_Activity_Hours_Per_Day,GPA,Stress_Level
0,1,6.9,3.8,8.7,2.8,1.8,2.99,Moderate
1,2,5.3,3.5,8.0,4.2,3.0,2.75,Low
2,3,5.1,3.9,9.2,1.2,4.6,2.67,Low
3,4,6.5,2.1,7.2,1.7,6.5,2.88,Moderate
4,5,8.1,0.6,6.5,2.2,6.6,3.51,High


In [3]:
# Summary statistics
data.describe(include='all')


Unnamed: 0,Student_ID,Study_Hours_Per_Day,Extracurricular_Hours_Per_Day,Sleep_Hours_Per_Day,Social_Hours_Per_Day,Physical_Activity_Hours_Per_Day,GPA,Stress_Level
count,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000
unique,,,,,,,,3
top,,,,,,,,High
freq,,,,,,,,1029
mean,1000.5,7.4758,1.9901,7.50125,2.70455,4.3283,3.11596,
std,577.494589,1.423888,1.155855,1.460949,1.688514,2.51411,0.298674,
min,1.0,5.0,0.0,5.0,0.0,0.0,2.24,
25%,500.75,6.3,1.0,6.2,1.2,2.4,2.9,
50%,1000.5,7.4,2.0,7.5,2.6,4.1,3.11,
75%,1500.25,8.7,3.0,8.8,4.1,6.1,3.33,


In [4]:
# Check for missing values
data.isnull().sum()


Unnamed: 0,0
Student_ID,0
Study_Hours_Per_Day,0
Extracurricular_Hours_Per_Day,0
Sleep_Hours_Per_Day,0
Social_Hours_Per_Day,0
Physical_Activity_Hours_Per_Day,0
GPA,0
Stress_Level,0


In [6]:
from sklearn.preprocessing import LabelEncoder

# Encode 'Stress_Level'
label_encoder = LabelEncoder()
data['Stress_Level'] = label_encoder.fit_transform(data['Stress_Level'])


In [10]:
# Check the column names in the dataset
print(data.columns)


Index(['Student_ID', 'Study_Hours_Per_Day', 'Extracurricular_Hours_Per_Day',
       'Sleep_Hours_Per_Day', 'Social_Hours_Per_Day',
       'Physical_Activity_Hours_Per_Day', 'GPA', 'Stress_Level'],
      dtype='object')


In [11]:
from sklearn.preprocessing import StandardScaler

# Standardize numerical columns
scaler = StandardScaler()
numerical_cols = [
    'Study_Hours_Per_Day',
    'Extracurricular_Hours_Per_Day',
    'Sleep_Hours_Per_Day',
    'Social_Hours_Per_Day',
    'Physical_Activity_Hours_Per_Day',
    'GPA'
]

data[numerical_cols] = scaler.fit_transform(data[numerical_cols])


In [12]:
from sklearn.model_selection import train_test_split

# Define features and target variable
X = data.drop(['Stress_Level'], axis=1)
y = data['Stress_Level']

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


In [13]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# Initialize models
logistic_model = LogisticRegression()
tree_model = DecisionTreeClassifier()
forest_model = RandomForestClassifier()

# Train models
logistic_model.fit(X_train, y_train)
tree_model.fit(X_train, y_train)
forest_model.fit(X_train, y_train)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [14]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

def evaluate_model(model, X_test, y_test):
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')
    cm = confusion_matrix(y_test, y_pred)
    return accuracy, precision, recall, f1, cm

# Evaluate each model
logistic_eval = evaluate_model(logistic_model, X_test, y_test)
tree_eval = evaluate_model(tree_model, X_test, y_test)
forest_eval = evaluate_model(forest_model, X_test, y_test)

# Print results
print("Logistic Regression:", logistic_eval)
print("Decision Tree:", tree_eval)
print("Random Forest:", forest_eval)


Logistic Regression: (0.7816666666666666, 0.7792754834672253, 0.7816666666666666, 0.7802349957011697, array([[278,   5,  33],
       [  0,  71,  25],
       [ 45,  23, 120]]))
Decision Tree: (1.0, 1.0, 1.0, 1.0, array([[316,   0,   0],
       [  0,  96,   0],
       [  0,   0, 188]]))
Random Forest: (1.0, 1.0, 1.0, 1.0, array([[316,   0,   0],
       [  0,  96,   0],
       [  0,   0, 188]]))


In [15]:
from sklearn.model_selection import GridSearchCV

# Example for Random Forest tuning
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30]
}

grid_search = GridSearchCV(estimator=forest_model, param_grid=param_grid, cv=5)
grid_search.fit(X_train, y_train)

# Best parameters and their impact
best_params = grid_search.best_params_
best_score = grid_search.best_score_
print("Best Parameters:", best_params)
print("Best Score:", best_score)


Best Parameters: {'max_depth': None, 'n_estimators': 50}
Best Score: 1.0
