In [22]:
#import modules
import pandas as pd
import numpy as np
import random

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

In [23]:
#get data
def load_data(file):
    data= pd.read_csv(file)
    
    # AVG grade of each student
    grad_mean= (data.G1 + data.G2 + data.G3) / 3
    data['G_Mean'] = grad_mean
    data['G_Mean']= data['G_Mean'].round()

    # Generating USN for each student
    data['USN'] = [random.randint(100, 700) for _ in range(len(data))]

    # Define age groups
    data['age_group'] = pd.cut(data['age'], bins=[14, 16, 19, 21], labels=["15-16", "17-19", "20-21"], right=False)

    # Define grade groups using pd.cut
    bins = [-np.inf, 12, 15, 17, np.inf]
    labels = ['Fail', 'Pass', 'Good', 'Excellent']
    data['grade_group'] = pd.cut(data['G_Mean'], bins=bins, labels=labels, right=False)
    return data

file="D:/trail/data/student-por.csv"
df= load_data(file)

In [24]:
df.head()

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,Walc,health,absences,G1,G2,G3,G_Mean,USN,age_group,grade_group
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,1,3,4,0,11,11,7.0,127,17-19,Fail
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,1,3,2,9,11,11,10.0,517,17-19,Fail
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,3,3,6,12,13,12,12.0,514,15-16,Pass
3,GP,F,15,U,GT3,T,4,2,health,services,...,1,5,0,14,14,14,14.0,427,15-16,Pass
4,GP,F,16,U,GT3,T,3,3,other,other,...,2,5,0,11,13,13,12.0,201,17-19,Pass


In [25]:
# Encode categorical columns using one-hot encoding
categorical_columns = ['school', 'sex', 'address', 'internet', 'age_group']
df_encoded = pd.get_dummies(df, columns=categorical_columns, drop_first=True)

# Display the encoded dataframe
print(df_encoded.head())


   age famsize Pstatus  Medu  Fedu     Mjob      Fjob  reason guardian  \
0   18     GT3       A     4     4  at_home   teacher  course   mother   
1   17     GT3       T     1     1  at_home     other  course   father   
2   15     LE3       T     1     1  at_home     other   other   mother   
3   15     GT3       T     4     2   health  services    home   mother   
4   16     GT3       T     3     3    other     other    home   father   

   traveltime  ...  G2  G3 G_Mean  USN age_group grade_group school_MS  sex_M  \
0           2  ...  11  11    7.0  127     17-19        Fail     False  False   
1           1  ...  11  11   10.0  517     17-19        Fail     False  False   
2           1  ...  13  12   12.0  514     15-16        Pass     False  False   
3           1  ...  14  14   14.0  427     15-16        Pass     False  False   
4           1  ...  13  13   12.0  201     17-19        Pass     False  False   

  address_U  internet_yes  
0      True         False  
1      True 

In [26]:
# Prepare features and target variable
X = df_encoded[['G1', 'G2', 'sex_M', 'address_U', 'internet_yes', 'school_MS', 'age_group_17-19', 'age_group_20-21']]
X.shape

(649, 7)

In [27]:
y = df_encoded['G3']
y.head()

0    11
1    11
2    12
3    14
4    13
Name: G3, dtype: int64

In [28]:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [29]:
# Train the model
model = LogisticRegression()
model.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [30]:
# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)

In [31]:
print(f'R2 Score: {r2}')
print(f'Mean Absolute Error: {mae}')
print(f'Mean Squared Error: {mse}')

R2 Score: 0.7089269677920439
Mean Absolute Error: 1.0846153846153845
Mean Squared Error: 2.8384615384615386


In [32]:
import joblib

#Save the model
joblib.dump(model, 'student_performance_model.pkl')

['spp_model.pkl']

In [33]:
# Save the column names
joblib.dump(X_train.columns, 'model_columns.pkl')

['model_col.pkl']