In [87]:
from google.colab import drive
drive.mount('/content/drive')
import numpy as np
import joblib
import torch
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [88]:
# values
path = "/content/drive/MyDrive/ml/StudentPerformanceFactors.csv"
feature_list = ['Hours_Studied', 'Attendance', 'Extracurricular_Activities', 'Sleep_Hours', 'Tutoring_Sessions', 'Physical_Activity', 'Exam_Score']
test_size = 0.2
val_size = 0.5

In [89]:
def upload_dataset(path, feature_list):
  data = pd.read_csv(path)
  or_df = pd.DataFrame(data)
  df = or_df[feature_list]
  X = df.iloc[:, :-1]
  y = df.iloc[:, -1]
  return X, y

def upload_df(path, feature_list):
  data = pd.read_csv(path)
  or_df = pd.DataFrame(data)
  df = or_df[feature_list]
  return df

def split_dataset(X, y, test_size, val_size):
  X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=test_size, random_state=42)
  X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=val_size, random_state=42)
  return X_train, X_val, X_test, y_train, y_val, y_test

class multiregressionmodel:
  def __init__(self, degree: int):
    self.model = LinearRegression()
    self.poly = PolynomialFeatures(degree)

  def train(self, x, y):
    x = self.poly.fit_transform(x)
    y = y.values.reshape(-1, 1)
    self.model.fit(x, y)

  def predict(self, x):
    x = self.poly.transform(x)
    return self.model.predict(x)

model = joblib.load('/content/drive/MyDrive/ml/polynomial_model.joblib')
X, y = upload_dataset(path, feature_list)
X = pd.get_dummies(X, columns=['Extracurricular_Activities'], drop_first=True)
X_train, X_val, X_test, y_train, y_val, y_test = split_dataset(X, y, test_size, val_size)

In [90]:
# dataset sampling -> 추론할 데이터
df = upload_df(path, feature_list)
sampled_df = df.sample(n=100, random_state=42)
X_infer = sampled_df.iloc[:, :-1]
y_infer = sampled_df.iloc[:, -1]

In [92]:
# 추론 데이터 일부 출력
print(X_infer[:5])
print("=" * 40)
print(y_infer[:5])

      Hours_Studied  Attendance Extracurricular_Activities  Sleep_Hours  \
743              20          71                         No            7   
5551             22          71                        Yes            7   
3442             21          91                        Yes            6   
6571             12          91                        Yes            8   
4204             21          63                        Yes            8   

      Tutoring_Sessions  Physical_Activity  
743                   1                  5  
5551                  2                  2  
3442                  1                  3  
6571                  0                  4  
4204                  2                  5  
743     65
5551    65
3442    71
6571    64
4204    66
Name: Exam_Score, dtype: int64


- infer 데이터에서 특정 feature값이 한 가지만 들어올 수 있음
- infer 데이터의 feature 명을 X_train에 맞춰줌

In [93]:
def infer(model, X_infer, X_train):
  X_features = X_train.columns
  X_infer = pd.get_dummies(X_infer, drop_first=True)
  X_infer = X_infer.reindex(columns=X_features, fill_value=0)
  return model.predict(X_infer)

In [96]:
infer_pred = infer(model, X_infer, X_train)

infer_pred_flat = infer_pred.flatten()

infer_compare = pd.DataFrame({"y_infer": y_infer, "y_true_pred": infer_pred_flat})
infer_compare.head()

Unnamed: 0,y_infer,y_true_pred
743,65,65.214273
5551,65,66.335023
3442,71,69.704608
6571,64,66.554921
4204,66,64.842102


In [97]:
def eval(X_test, y_test, model):
  y_pred = model.predict(X_test).flatten()
  mse = np.mean((y_test - y_pred) ** 2)
  mae = np.mean(np.abs(y_test - y_pred))
  r2 = np.corrcoef(y_test, y_pred)[0, 1] ** 2

  eval_value = torch.tensor([mse, mae, r2])
  return pd.DataFrame(eval_value, index=["MSE", "MAE", "R2"], columns=["value"])

In [100]:
X_features = X_train.columns
X_infer_processed = pd.get_dummies(X_infer, drop_first=True)
X_infer_processed = X_infer_processed.reindex(columns=X_features, fill_value=0)

evaluation_results = eval(X_infer_processed, y_infer, model)
evaluation_results

Unnamed: 0,value
MSE,8.664239
MAE,1.451236
R2,0.525616
