**Data Pre-Processing**

In [5]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer

# Read dataset from csv file
df = pd.read_csv('../kaggle/input/d/lainguyn123/student-performance-factors/StudentPerformanceFactors.csv')

# Drop every row that contains at least one missing value.
# (how="all" would only remove rows in which *every* field is missing, which
# leaves partially filled rows - and their NaNs - in the training data.)
df = df.dropna(how="any")

# Make dataset: every column except the target is a feature
X = df.drop(["Exam_Score"], axis=1)
y = df['Exam_Score']

# Data Scaling and Non-numerical Value Processing:
# split the feature names by dtype so each group can get its own transformer
categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()
numeric_cols = X.select_dtypes(include=[np.number]).columns.tolist()

# Build train and test datasets (80/20 split, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

FileNotFoundError: [Errno 2] No such file or directory: '../kaggle/input/d/lainguyn123/student-performance-factors/StudentPerformanceFactors.csv'

**Trying multiple degrees for Polynomial Regression**

In [4]:
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.preprocessing import PolynomialFeatures, OneHotEncoder, StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.pipeline import Pipeline

# Column-wise preprocessing: one-hot encode the categorical columns and
# standardise the numeric ones.
# handle_unknown='ignore' makes a category that first appears at prediction
# time encode as an all-zero group instead of raising inside predict().
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
        ('num', StandardScaler(), numeric_cols)
    ]
)


# ------------------------------------------------Degrees 1, 2 and 3---------------------------------------------------------
def fit_and_evaluate_polynomial(degree):
    """Fit a polynomial regression of the given degree and report its scores.

    Builds ``preprocessor -> PolynomialFeatures(degree) -> LinearRegression``,
    fits it on ``X_train``/``y_train``, predicts on both splits and prints the
    MSE and R² for each split.

    Parameters
    ----------
    degree : int
        Degree passed to ``PolynomialFeatures``.

    Returns
    -------
    tuple
        ``(pipeline, y_train_pred, y_test_pred,
        train_mse, test_mse, train_r2, test_r2)``.
    """
    model = Pipeline([
        ('preprocessor', preprocessor),
        ('poly', PolynomialFeatures(degree=degree, include_bias=True)),
        ('regressor', LinearRegression())
    ])

    # Train the model
    model.fit(X_train, y_train)

    # Predict on train and test sets
    train_pred = model.predict(X_train)
    test_pred = model.predict(X_test)

    # Evaluate the model
    train_mse = mean_squared_error(y_train, train_pred)
    test_mse = mean_squared_error(y_test, test_pred)
    train_r2 = r2_score(y_train, train_pred)
    test_r2 = r2_score(y_test, test_pred)

    # Print evaluation metrics
    print(f"Training MSE with degree {degree}:", train_mse)
    print(f"Test MSE with degree {degree}:", test_mse)
    print(f"Training R² with degree {degree}:", train_r2)
    print(f"Test R² with degree {degree}:", test_r2)

    return model, train_pred, test_pred, train_mse, test_mse, train_r2, test_r2


# The per-degree names are kept so that later cells can still refer to them.
(pipeline_1, y_train_pred_1, y_test_pred_1,
 train_mse_1, test_mse_1, train_r2_1, test_r2_1) = fit_and_evaluate_polynomial(1)

(pipeline_2, y_train_pred_2, y_test_pred_2,
 train_mse_2, test_mse_2, train_r2_2, test_r2_2) = fit_and_evaluate_polynomial(2)

(pipeline_3, y_train_pred_3, y_test_pred_3,
 train_mse_3, test_mse_3, train_r2_3, test_r2_3) = fit_and_evaluate_polynomial(3)


NameError: name 'categorical_cols' is not defined

**Build model from existing libraries and frameworks**

In [None]:

# Degree-2 polynomial regression assembled entirely from scikit-learn parts.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('poly', PolynomialFeatures(degree=2, include_bias=True)),
    ('regressor', LinearRegression()),
])

# Fit on the training split
pipeline.fit(X_train, y_train)

# Predictions for the training split and the held-out split
y_train_pred, y_test_pred = pipeline.predict(X_train), pipeline.predict(X_test)

# Error and goodness-of-fit for each split
train_mse, test_mse = (
    mean_squared_error(y_train, y_train_pred),
    mean_squared_error(y_test, y_test_pred),
)
train_r2, test_r2 = (
    r2_score(y_train, y_train_pred),
    r2_score(y_test, y_test_pred),
)

# Report the scores in a fixed order
for label, score in (
    ("Training MSE:", train_mse),
    ("Test MSE:", test_mse),
    ("Training R²:", train_r2),
    ("Test R²:", test_r2),
):
    print(label, score)

In [None]:
class OneHotEncoderCustom:
    """Minimal one-hot encoder for the columns of a pandas DataFrame.

    Values are compared as strings. Missing values and values that were not
    seen during ``fit`` are encoded as an all-zero group for that column.

    Attributes
    ----------
    categories_ : dict
        Maps each fitted column name to the sorted list of its string
        categories. Set by ``fit``.
    """

    def fit(self, X):
        """Learn the categories of every column of ``X``.

        Parameters
        ----------
        X : pandas.DataFrame
            Columns to encode.

        Returns
        -------
        OneHotEncoderCustom
            ``self``, to allow chaining.
        """
        self.categories_ = {}
        for col in X.columns:
            # Drop NaN and convert all to string to prevent mixed types
            values = X[col].dropna().astype(str)
            self.categories_[col] = sorted(set(values))
        return self

    def transform(self, X):
        """One-hot encode ``X`` using the categories learned in ``fit``.

        The encoding is done column by column. Iterating over rows with
        ``iterrows`` would upcast each row to a common dtype, so a value could
        stringify differently than it did in ``fit`` (e.g. ``1`` -> ``'1.0'``)
        and never match its category.

        Parameters
        ----------
        X : pandas.DataFrame
            Must contain every column that was present during ``fit``.

        Returns
        -------
        numpy.ndarray
            Integer array of shape ``(len(X), total number of categories)``;
            the column groups follow the order of the fitted columns.
        """
        n_rows = len(X)
        encoded_blocks = []
        for col, categories in self.categories_.items():
            column = X[col]
            # Missing entries must never match a category, not even a literal "nan".
            present = column.notna().to_numpy(dtype=bool)
            labels = column.astype(str).to_numpy(dtype=object)
            known = np.array(categories, dtype=object)
            # Broadcast (n_rows, 1) against (1, n_categories) for all indicators at once.
            matches = (labels[:, None] == known[None, :]) & present[:, None]
            encoded_blocks.append(matches.astype(int))
        if not encoded_blocks:
            return np.zeros((n_rows, 0), dtype=int)
        return np.hstack(encoded_blocks)

    def fit_transform(self, X):
        """Fit on ``X`` and return its encoding."""
        return self.fit(X).transform(X)

class StandardScalerCustom:
    """Standardise DataFrame columns to zero mean and unit variance.

    Attributes
    ----------
    mean_ : numpy.ndarray
        Per-column mean computed in ``fit``.
    scale_ : numpy.ndarray
        Per-column population standard deviation (``ddof=0``) computed in
        ``fit``; a zero deviation is stored as ``1.0``.
    """

    def fit(self, X):
        """Compute the per-column mean and standard deviation of ``X``.

        Parameters
        ----------
        X : pandas.DataFrame
            Numeric columns to scale.

        Returns
        -------
        StandardScalerCustom
            ``self``, to allow chaining.
        """
        self.mean_ = X.mean().to_numpy()
        std = X.std(ddof=0).to_numpy()
        # A constant column has zero deviation; dividing by it would produce
        # NaN/inf. Using 1 instead leaves such a column centred at exactly 0.
        self.scale_ = np.where(std == 0, 1.0, std)
        return self

    def transform(self, X):
        """Return ``(X - mean_) / scale_`` as a NumPy array."""
        return ((X - self.mean_) / self.scale_).to_numpy()

    def fit_transform(self, X):
        """Fit on ``X`` and return its standardised values."""
        return self.fit(X).transform(X)
# === Custom ColumnTransformer ===

class ColumnTransformerCustom:
    """Apply a separate transformer to each group of DataFrame columns.

    ``transformers`` is a sequence of ``(name, transformer, columns)`` triples.
    Each transformer is fitted on ``X[columns]`` and the transformed pieces are
    stacked side by side in the order the triples were given.
    """

    def __init__(self, transformers):
        self.transformers = transformers

    def fit(self, X):
        """Fit every transformer on its own column subset and return ``self``."""
        self.fitted_transformers_ = [
            (label, step.fit(X[cols]), cols)
            for label, step, cols in self.transformers
        ]
        return self

    def transform(self, X):
        """Transform each column subset and concatenate the results horizontally."""
        pieces = [step.transform(X[cols]) for _, step, cols in self.fitted_transformers_]
        return np.hstack(pieces)

    def fit_transform(self, X):
        """Fit on ``X`` and return its transformed matrix."""
        self.fit(X)
        return self.transform(X)

# === Custom Polynomial Features and Linear Regression ===

class PolynomialFeaturesCustom:
    """Expand a 2-D array into all monomials of its columns up to ``degree``.

    The output columns are ordered by increasing degree: an optional constant
    column first (when ``include_bias`` is true), then every combination with
    replacement of the input columns for degree 1, 2, ... ``degree``.
    """

    def __init__(self, degree=2, include_bias=True):
        self.degree = degree
        self.include_bias = include_bias

    def fit(self, X):
        """Enumerate the column-index tuples for ``X.shape[1]`` input features."""
        from itertools import chain, combinations_with_replacement

        self.n_features_in_ = X.shape[1]
        feature_ids = range(self.n_features_in_)
        monomials = chain.from_iterable(
            combinations_with_replacement(feature_ids, order)
            for order in range(1, self.degree + 1)
        )
        # The empty tuple stands for the constant (bias) term.
        bias_term = [()] if self.include_bias else []
        self.combos = bias_term + list(monomials)
        return self

    def transform(self, X):
        """Return the array whose i-th column is the product of the columns in ``combos[i]``."""
        row_count = X.shape[0]
        expanded = np.empty((row_count, len(self.combos)), dtype=X.dtype)
        for position, term in enumerate(self.combos):
            if term:
                expanded[:, position] = np.prod(X[:, term], axis=1)
            else:
                expanded[:, position] = np.ones(row_count)
        return expanded

    def fit_transform(self, X):
        """Fit on ``X`` and return its polynomial expansion."""
        self.fit(X)
        return self.transform(X)

class LinearRegressionCustom:
    """Ordinary least-squares linear regression with an intercept.

    Attributes
    ----------
    theta_ : numpy.ndarray
        Fitted coefficients; ``theta_[0]`` is the intercept and the remaining
        entries follow the column order of ``X``. Set by ``fit``.
    """

    def fit(self, X, y):
        """Fit the coefficients that minimise the squared error on ``(X, y)``.

        Parameters
        ----------
        X : numpy.ndarray
            2-D feature matrix.
        y : array-like
            Target values, one per row of ``X``.

        Returns
        -------
        LinearRegressionCustom
            ``self``, to allow chaining.
        """
        X_b = np.c_[np.ones((X.shape[0], 1)), X]
        targets = np.asarray(y, dtype=float)
        # Solve the least-squares problem on X_b directly instead of forming
        # X_b.T @ X_b: the normal equations square the condition number.
        # lstsq returns the minimum-norm solution, so rank-deficient designs
        # (e.g. duplicated constant columns) are still handled.
        self.theta_, *_ = np.linalg.lstsq(X_b, targets, rcond=None)
        return self

    def predict(self, X):
        """Return the predictions ``[1, X] @ theta_`` for every row of ``X``."""
        X_b = np.c_[np.ones((X.shape[0], 1)), X]
        return X_b @ self.theta_

# === Custom Metrics ===

def mean_squared_error_custom(y_true, y_pred):
    """Return the mean of the squared differences between targets and predictions.

    Both arguments may be any array-like (list, NumPy array, pandas Series).
    They are converted to NumPy arrays first, so values are compared by
    position rather than by pandas index label.
    """
    errors = np.asarray(y_true, dtype=float) - np.asarray(y_pred, dtype=float)
    return np.mean(errors ** 2)

def r2_score_custom(y_true, y_pred):
    """Return the coefficient of determination R² of the predictions.

    Computed as ``1 - SS_res / SS_tot``. Both arguments may be any array-like
    and are compared by position. When the targets are constant ``SS_tot`` is
    zero and the ratio is undefined; in that case 1.0 is returned for a
    perfect prediction and 0.0 otherwise, instead of NaN or -inf.
    """
    actual = np.asarray(y_true, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)
    ss_res = np.sum((actual - predicted) ** 2)
    ss_tot = np.sum((actual - np.mean(actual)) ** 2)
    if ss_tot == 0:
        return 1.0 if ss_res == 0 else 0.0
    return 1 - ss_res / ss_tot

# === Custom Pipeline ===

class PipelineCustom:
    """Chain transformers and a final estimator.

    ``steps`` is a sequence of ``(name, object)`` pairs. Every step but the
    last must provide ``fit_transform`` and ``transform``; the last one must
    provide ``fit(X, y)`` and ``predict(X)``.
    """

    def __init__(self, steps):
        self.steps = steps

    def fit(self, X, y):
        """Fit each transformer in turn, then fit the final estimator on the result.

        The fully transformed training matrix is kept in ``X_transformed_``.
        """
        features = X
        for _, stage in self.steps[:-1]:
            features = stage.fit_transform(features)
        estimator = self.steps[-1][1]
        estimator.fit(features, y)
        self.X_transformed_ = features
        return self

    def predict(self, X):
        """Push ``X`` through the fitted transformers and predict with the final estimator."""
        features = X
        for _, stage in self.steps[:-1]:
            features = stage.transform(features)
        estimator = self.steps[-1][1]
        return estimator.predict(features)

In [2]:
# Same degree-2 model as before, built from the hand-written components.
preprocessor = ColumnTransformerCustom([
    ('cat', OneHotEncoderCustom(), categorical_cols),
    ('num', StandardScalerCustom(), numeric_cols),
])

pipeline = PipelineCustom([
    ('preprocessor', preprocessor),
    ('poly', PolynomialFeaturesCustom(degree=2)),
    ('regressor', LinearRegressionCustom()),
])

# Fit on the training split, then predict both splits
pipeline.fit(X_train, y_train)
y_train_pred, y_test_pred = pipeline.predict(X_train), pipeline.predict(X_test)

# Report each metric on the training split and on the test split
for label, metric, targets, predictions in (
    ("Training MSE:", mean_squared_error_custom, y_train, y_train_pred),
    ("Test MSE:", mean_squared_error_custom, y_test, y_test_pred),
    ("Training R²:", r2_score_custom, y_train, y_train_pred),
    ("Test R²:", r2_score_custom, y_test, y_test_pred),
):
    print(label, metric(targets.values, predictions))


NameError: name 'ColumnTransformerCustom' is not defined