In [3]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from google.colab import drive
drive.mount('/content/drive')

df = pd.read_csv("/content/drive/MyDrive/CS114/StudentPerformanceFactors.csv")
# Drop any missing-value row
df = df.dropna(how="all")

# Make dataset
X = df.drop(["Exam_Score"], axis=1)
y = df['Exam_Score']

# Data Scaling and Non-numerical Value Processing
categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()
numeric_cols = X.select_dtypes(include=[np.number]).columns.tolist()

# Build train and test datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

Mounted at /content/drive


In [4]:
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.preprocessing import PolynomialFeatures, OneHotEncoder, StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.pipeline import Pipeline

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(), categorical_cols),
        ('num', StandardScaler(), numeric_cols)
    ]
)

In [20]:
from sklearn.svm import SVR
pipeline_1 = Pipeline([
    ('preprocessor', preprocessor),
    ('SVR', SVR(C=1.0, epsilon=0.1, kernel='rbf', tol=1e-4))
])
pipeline_1.fit(X_train, y_train)

In [21]:
# Predict on train and test sets
y_train_pred_1 = pipeline_1.predict(X_train)
y_test_pred_1 = pipeline_1.predict(X_test)

# Evaluate the model
train_mse_1 = mean_squared_error(y_train, y_train_pred_1)
test_mse_1 = mean_squared_error(y_test, y_test_pred_1)
train_r2_1 = r2_score(y_train, y_train_pred_1)
test_r2_1 = r2_score(y_test, y_test_pred_1)

In [22]:
print("Training MSE with degree 1:", train_mse_1)
print("Test MSE with degree 1:", test_mse_1)
print("Training R² with degree 1:", train_r2_1)
print("Test R² with degree 1:", test_r2_1)

Training MSE with degree 1: 4.344116139456927
Test MSE with degree 1: 3.342559243301164
Training R² with degree 1: 0.7175653980568448
Test R² with degree 1: 0.7635272652706422


In [10]:
import pandas as pd
import numpy as np
from cvxopt import matrix, solvers

# Triển khai SVR tuyến tính từ đầu
class SVRFromScratch:
    def __init__(self, C=1.0, epsilon=0.1, tol=1e-4):
        self.C = C
        self.epsilon = epsilon
        self.tol = tol
        self.w = None
        s = None
        self.b = None
        self.alphas = None
        self.support_vectors = None

    def fit(self, X, y):
        # Convert X and y to NumPy arrays if they are DataFrames/Series
        if isinstance(X, pd.DataFrame):
            X = X.values  # Convert DataFrame to NumPy array
        if isinstance(y, pd.Series):
            y = y.values  # Convert Series to NumPy array

        n_samples, n_features = X.shape

        # Tính ma trận kernel K = X * X^T (SVR tuyến tính)
        K = np.dot(X, X.T)

        # Xây dựng ma trận P đúng kích thước (2n, 2n)
        P = np.zeros((2 * n_samples, 2 * n_samples))
        for i in range(2 * n_samples):
            for j in range(2 * n_samples):
                if i < n_samples and j < n_samples:
                    P[i, j] = K[i, j]
                elif i < n_samples and j >= n_samples:
                    P[i, j] = -K[i, j - n_samples]
                elif i >= n_samples and j < n_samples:
                    P[i, j] = -K[i - n_samples, j]
                else:
                    P[i, j] = K[i - n_samples, j - n_samples]

        # Chuyển P thành ma trận cvxopt
        P = matrix(P, tc='d')

        # Vector q = -epsilon * 1 - y
        q = -self.epsilon * np.ones(2 * n_samples) - np.hstack([y, -y])
        q = matrix(q, tc='d')

        # Ràng buộc Gx <= h
        G = np.vstack([
            -np.eye(2 * n_samples),  # alpha_i, alpha_i^* >= 0
            np.eye(2 * n_samples)    # alpha_i, alpha_i^* <= C
        ])
        G = matrix(G, tc='d')

        h = np.hstack([
            np.zeros(2 * n_samples),  # alpha_i, alpha_i^* >= 0
            self.C * np.ones(2 * n_samples)  # alpha_i, alpha_i^* <= C
        ])
        h = matrix(h, tc='d')

        # Ràng buộc A^T * x = b (tổng alpha_i - alpha_i^* = 0)
        A = matrix(np.hstack([np.ones(n_samples), -np.ones(n_samples)]), (1, 2 * n_samples), tc='d')
        b = matrix(0.0, tc='d')

        # Giải bài toán QP
        solvers.options['show_progress'] = False
        solution = solvers.qp(P, q, G, h, A, b)
        alphas = np.array(solution['x']).flatten()

        # Tách alpha_i và alpha_i^*
        self.alphas = alphas[:n_samples] - alphas[n_samples:]

        # Tìm support vectors (các alpha_i trong khoảng (tol, C - tol))
        sv_idx = np.where((np.abs(self.alphas) > self.tol) & (np.abs(self.alphas) < self.C - self.tol))[0]
        self.support_vectors = X[sv_idx]
        self.support_alphas = self.alphas[sv_idx]
        self.support_y = y[sv_idx]

        # Tính w
        self.w = np.sum((self.alphas[:, np.newaxis] * X), axis=0)

        # Tính b từ một support vector
        if len(sv_idx) > 0:
            sv = sv_idx[0]
            self.b = y[sv] - np.dot(self.w, X[sv]) - self.epsilon * np.sign(self.alphas[sv])
        else:
            self.b = 0

    def predict(self, X):
        return np.dot(X, self.w) + self.b

In [11]:
pipeline_scratch = Pipeline([
    ('preprocessor', preprocessor),
    ('SVR', SVRFromScratch(C=1.0, epsilon=0.1))
])
pipeline_scratch.fit(X_train, y_train)

In [12]:
y_train_pred_1 = pipeline_scratch.predict(X_train)
y_test_pred_1 = pipeline_scratch.predict(X_test)

# Evaluate the model
train_mse_1 = mean_squared_error(y_train, y_train_pred_1)
test_mse_1 = mean_squared_error(y_test, y_test_pred_1)
train_r2_1 = r2_score(y_train, y_train_pred_1)
test_r2_1 = r2_score(y_test, y_test_pred_1)

# Print evaluation metrics
print("Training MSE with degree 1:", train_mse_1)
print("Test MSE with degree 1:", test_mse_1)
print("Training R² with degree 1:", train_r2_1)
print("Test R² with degree 1:", test_r2_1)

Training MSE with degree 1: 4.396012111427113
Test MSE with degree 1: 3.2383820094158007
Training R² with degree 1: 0.714191358847183
Test R² with degree 1: 0.7708973890591085


