In [13]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# 1. Load the dataset and normalise the column names and all string values
#    (lower-case, spaces replaced by underscores).
df = pd.read_csv('Orig_kaggle_laptops.csv', encoding='ISO-8859-1')
df.columns = df.columns.str.lower().str.replace(" ", "_")

strings = df.dtypes[df.dtypes == 'object'].index.tolist()
for col in strings:
    df[col] = df[col].str.lower().str.replace(" ", "_")

# 2. Keep only the modelling columns and fill missing values with 0
my_columns = ['ram', 'storage', 'screen', 'final_price']
df = df[my_columns].fillna(0)

# 3. Split the data into features (X) and target (y).
#    The regression target is the 'final_price' column.
X = df.drop(columns='final_price')
y = df['final_price']

# Hold out 20% of the rows for testing; seed 9 keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=9
)

# 4. Train a linear regression model with regularization r=0.001
def train_linear_regression_reg(X, y, r=0.001):
    """Fit a linear regression with L2 (ridge) regularisation.

    A column of ones is prepended to ``X`` and the regularised normal
    equations ``(X^T X + r * I) w = X^T y`` are solved for ``w``.  ``r`` is
    added to every diagonal entry, including the one that belongs to the
    bias column, so the intercept is regularised together with the feature
    weights.

    Parameters
    ----------
    X : array-like
        Feature values, one row per sample.  Anything ``np.asarray`` can
        turn into a float array is accepted (ndarray, DataFrame, nested
        lists).
    y : array-like
        Target values, one per row of ``X``.
    r : float, default 0.001
        Regularisation strength added to the diagonal of ``X^T X``.

    Returns
    -------
    w_0 : float
        The bias (intercept) term.
    w : numpy.ndarray
        The remaining weights, one per feature column.

    Raises
    ------
    numpy.linalg.LinAlgError
        If the regularised matrix is singular (possible when ``r`` is 0 and
        the feature columns are linearly dependent).
    ValueError
        If the inputs cannot be converted to float or their shapes do not
        line up.
    """
    # Convert up front so lists / DataFrames / Series behave like ndarrays.
    X = np.asarray(X, dtype=float)
    y = np.asarray(y, dtype=float)

    ones = np.ones(X.shape[0])
    X = np.column_stack([ones, X])

    XTX = X.T.dot(X)
    XTX = XTX + r * np.eye(XTX.shape[0])

    # Solve the linear system directly instead of forming an explicit
    # inverse: it is cheaper and numerically better behaved.
    w = np.linalg.solve(XTX, X.T.dot(y))

    return w[0], w[1:]

# Fit the regularised model on the training split
w_0, w = train_linear_regression_reg(X_train.values, y_train.values, r=0.001)

# 5. Predict on the test data.
#    X_test is rebound to a NumPy array with a leading column of ones so the
#    bias and the feature weights can be applied in a single product.
bias_column = np.ones(X_test.shape[0])
X_test = np.column_stack([bias_column, X_test.values])
full_weights = np.concatenate([[w_0], w])
y_pred = X_test.dot(full_weights)

# 6. Calculate RMSE on the test data
test_mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(test_mse)

print(f'RMSE on the test dataset: {rmse:.2f}')


RMSE on the test dataset: 535.08


0, 1459.18, 904.4100
 1, 1155.93, 905.4700
 2, 1382.78, 905.3700
 3, 993.96, 904.6100
 4, 1596.89, 908.7600
 5, 1205.45, 907.2500
 6, 1662.91, 908.1300
 7, 1409.06, 909.4500
 8, 1213.98, 905.4200
 9, 1352.20, 905.4600

In [16]:
# RMSE values copied by hand from the third column of the table shown above
# (rows 0-9), kept in the same order.
rmse = [
    904.41, 905.47, 905.37, 904.61, 908.76,
    907.25, 908.13, 909.45, 905.42, 905.46,
]
np_rmse = np.array(rmse)

In [17]:
# Spread of the collected RMSE values (population standard deviation, ddof=0)
np_rmse.std()

1.7178652450061456

In [None]:
def train_linear_regression_reg(X, y, r=0.0):
    """Fit a linear regression with optional L2 (ridge) regularisation.

    NOTE: this redefines the function of the same name from an earlier cell
    of this notebook; the only interface difference is the default ``r``
    (0.0 here, 0.001 there).

    A column of ones is prepended to ``X`` and the normal equations
    ``(X^T X + r * I) w = X^T y`` are solved for ``w``.  ``r`` is added to
    every diagonal entry, including the one for the bias column, so the
    intercept is regularised as well.

    Parameters
    ----------
    X : array-like
        Feature values, one row per sample.  Anything ``np.asarray`` can
        turn into a float array is accepted (ndarray, DataFrame, nested
        lists).
    y : array-like
        Target values, one per row of ``X``.
    r : float, default 0.0
        Regularisation strength added to the diagonal of ``X^T X``.  With
        the default of 0 this is ordinary least squares.

    Returns
    -------
    w_0 : float
        The bias (intercept) term.
    w : numpy.ndarray
        The remaining weights, one per feature column.

    Raises
    ------
    numpy.linalg.LinAlgError
        If the matrix to solve is singular (possible when ``r`` is 0 and
        the feature columns are linearly dependent).
    ValueError
        If the inputs cannot be converted to float or their shapes do not
        line up.
    """
    # Convert up front so lists / DataFrames / Series behave like ndarrays.
    X = np.asarray(X, dtype=float)
    y = np.asarray(y, dtype=float)

    ones = np.ones(X.shape[0])
    X = np.column_stack([ones, X])
    XTX = X.T.dot(X)
    XTX = XTX + r * np.eye(XTX.shape[0])
    # Solve the system directly rather than multiplying by an explicit
    # inverse: cheaper and numerically better behaved.
    w = np.linalg.solve(XTX, X.T.dot(y))
    return w[0], w[1:]

def rmse(y, y_pred):
    """Return the root-mean-squared error of ``y_pred`` against ``y``.

    Both arguments must support element-wise subtraction and the result
    must offer ``.mean()`` (e.g. NumPy arrays or pandas Series).  The value
    is rounded to two decimal places before it is returned.
    """
    squared_residuals = (y_pred - y) ** 2
    return np.round(np.sqrt(squared_residuals.mean()), 2)

# Assuming df_train and df_val are already defined with relevant features and target variable.
# Replace 'target_column' with the actual name of your target column
# (the earlier cells of this notebook use 'final_price' as the target).
TARGET_COLUMN = 'target_column'

# Training features (missing values filled with 0) and training target
X_train = df_train.drop(columns=[TARGET_COLUMN]).fillna(0)
y_train = df_train[TARGET_COLUMN]

# Prepare the validation set the same way; y_val is required by the RMSE
# call below and was previously never defined in this cell.
df_val_filled_with_zeros = df_val.fillna(0).drop(columns=[TARGET_COLUMN])
y_val = df_val[TARGET_COLUMN]

# Try each regularisation strength and report the validation RMSE
for r in [0, 0.01, 0.1, 1, 5, 10, 100]:
    w_0, w = train_linear_regression_reg(X_train, y_train, r=r)

    y_pred_val_zeros = w_0 + df_val_filled_with_zeros.dot(w)

    print(f"Regularization {r}: RMSE = {rmse(y_val, y_pred_val_zeros)}")