In [1]:
import pandas as pd
import numpy as np

In [2]:
# Download the laptops dataset CSV into the current working directory.
!wget https://raw.githubusercontent.com/alexeygrigorev/datasets/master/laptops.csv

--2024-10-05 16:19:00--  https://raw.githubusercontent.com/alexeygrigorev/datasets/master/laptops.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.108.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 298573 (292K) [text/plain]
Saving to: ‘laptops.csv’


2024-10-05 16:19:00 (11.5 MB/s) - ‘laptops.csv’ saved [298573/298573]



In [2]:
# Load the downloaded CSV into a DataFrame.
df = pd.read_csv('laptops.csv')

In [3]:
# Normalise column names: lower-case, spaces replaced by underscores,
# so columns can be accessed as attributes (e.g. df.final_price).
df.columns = df.columns.str.lower().str.replace(' ', '_')

In [4]:
# Columns used in the rest of the notebook: three features plus the
# target (final_price).
base = ['ram', 'storage', 'screen', 'final_price']
df_base = df[base]

### Q1

In [18]:
# Number of missing values in each selected column.
df_base.isna().sum()

ram            0
storage        0
screen         4
final_price    0
dtype: int64

### Q2

In [24]:
# Median RAM, taken directly from the column.
df['ram'].median()

16.0

In [25]:
# Same value via the 0.5 quantile.
df['ram'].quantile(q=0.5)

16.0

In [26]:
# Same value again, computed by NumPy on the raw array (50th percentile).
np.percentile(df['ram'].values, 50)

16.0

## Prepare and Split Dataset

In [5]:
n = len(df_base)

# 20% of the rows each for validation and test; training gets the remainder.
n_val = n_test = int(n * 0.2)
n_train = n - n_val - n_test

In [6]:
# Display the three split sizes.
n_val, n_test, n_train

(432, 432, 1296)

In [7]:
# Positional row indices 0..n-1; shuffled in a later cell to randomise the split.
idx = np.arange(n)
idx

array([   0,    1,    2, ..., 2157, 2158, 2159])

In [8]:
# Fix the global NumPy seed so the shuffle (and therefore the split) is
# reproducible; shuffle reorders idx in place.
np.random.seed(42)
np.random.shuffle(idx)

idx

array([2079,  668, 2073, ..., 1130, 1294,  860])

In [9]:
# Carve the shuffled positions into train / validation / test.
# The validation slice is bounded by n_val (not n_test): the two sizes are
# equal here, but using the right name keeps the split correct if the
# proportions are ever changed independently.
df_train = df_base.iloc[idx[:n_train]]
df_val = df_base.iloc[idx[n_train:n_train + n_val]]
df_test = df_base.iloc[idx[n_train + n_val:]]

In [10]:
# Give every split a fresh 0..len-1 index, discarding the shuffled one.
df_train, df_val, df_test = (
    part.reset_index(drop=True) for part in (df_train, df_val, df_test)
)

In [11]:
# Targets on the log1p scale, one array per split.
y_train, y_val, y_test = (
    np.log1p(part.final_price.values) for part in (df_train, df_val, df_test)
)

In [12]:
# Remove the target from the feature frames. Rebinding via drop() (rather
# than an in-place `del`) keeps this cell re-runnable: a second execution
# is a no-op instead of raising KeyError for the already-removed column.
df_train = df_train.drop(columns='final_price', errors='ignore')
df_val = df_val.drop(columns='final_price', errors='ignore')
df_test = df_test.drop(columns='final_price', errors='ignore')

In [13]:
def train_linear_regression(X, y):
    """Fit ordinary least squares via the normal equation.

    A column of ones is prepended to ``X`` so the first fitted weight
    acts as the intercept.

    Parameters:
        X: 2-D feature matrix, one row per sample.
        y: 1-D target vector with one entry per row of ``X``.

    Returns:
        A tuple ``(w0, w)``: the intercept and the array of feature weights.

    Raises:
        numpy.linalg.LinAlgError: if the Gram matrix is singular.
    """
    design = np.column_stack([np.ones(len(X)), X])

    # Normal equation: weights = (A^T A)^-1 A^T y, evaluated left to right.
    gram = np.dot(design.T, design)
    gram_inv = np.linalg.inv(gram)
    weights = np.dot(np.dot(gram_inv, design.T), y)

    bias, coefs = weights[0], weights[1:]
    return bias, coefs

In [14]:
def rmse(y, y_pred):
    """Return the root-mean-squared error between ``y`` and ``y_pred``.

    Both arguments must support element-wise subtraction (e.g. NumPy
    arrays of the same shape).
    """
    residual = y - y_pred
    return np.sqrt((residual * residual).mean())

In [15]:
def prepare_X(df, mean=0):
    """Turn a feature DataFrame into a NumPy matrix with no missing values.

    Parameters:
        df: feature DataFrame; it is not modified.
        mean: value substituted for every missing entry (defaults to 0).

    Returns:
        The filled frame's underlying array (``.values``).
    """
    return df.fillna(mean).values

## Q3

- ### Option 1: zero fill

In [134]:
# Training part: missing feature values are replaced by prepare_X's
# default fill value (0) before fitting.
X_train = prepare_X(df_train)
w0, w = train_linear_regression(X_train, y_train)

# Validation part: same fill value, predictions from the fitted weights.
X_val = prepare_X(df_val)
y_pred = w0 + X_val.dot(w)

# Validation RMSE, displayed rounded to 2 decimals.
score = rmse(y_val, y_pred)
round(score, 2)

0.43

- ### Option 2: mean fill

In [135]:
# Training part: fill missing values with the mean of the `screen` column,
# computed on the training split only.
mean = df_train.screen.mean()
X_train = prepare_X(df_train, mean=mean)
w0, w = train_linear_regression(X_train, y_train)

# Validation part: reuse the training-split mean as the fill value.
X_val = prepare_X(df_val, mean=mean)
y_pred = w0 + X_val.dot(w)

# Validation RMSE, displayed rounded to 2 decimals.
score = rmse(y_val, y_pred)
round(score, 2)

0.43

## Q4

In [16]:
def train_linear_regression_reg(X, y, r=0.001):
    """Fit linear regression via the normal equation with a ridge term.

    A column of ones is prepended to ``X`` for the intercept, and ``r`` is
    added to every diagonal entry of the Gram matrix before inversion (the
    intercept's entry included).

    Parameters:
        X: 2-D feature matrix, one row per sample.
        y: 1-D target vector with one entry per row of ``X``.
        r: amount added to the Gram matrix diagonal.

    Returns:
        A tuple ``(w0, w)``: the intercept and the array of feature weights.
    """
    design = np.column_stack([np.ones(len(X)), X])

    gram = np.dot(design.T, design)
    gram = gram + r * np.eye(gram.shape[0])

    # weights = (A^T A + r I)^-1 A^T y, evaluated left to right.
    gram_inv = np.linalg.inv(gram)
    weights = np.dot(np.dot(gram_inv, design.T), y)

    bias, coefs = weights[0], weights[1:]
    return bias, coefs

In [17]:
# The feature matrices do not depend on r, so build them once instead of
# on every iteration (missing values filled with prepare_X's default, 0).
X_train = prepare_X(df_train)
X_val = prepare_X(df_val)

for r in [0, 0.01, 0.1, 1, 5, 10, 100]:
    # Training part: only the regularisation strength changes per pass.
    w0, w = train_linear_regression_reg(X_train, y_train, r=r)

    # Validation part
    y_pred = w0 + X_val.dot(w)

    score = round(rmse(y_val, y_pred), 2)

    print(r, score)

0 0.43
0.01 0.43
0.1 0.43
1 0.43
5 0.46
10 0.51
100 0.67


Without rounding, the smallest RMSE is obtained with r = 0.01.
After rounding to 2 decimal places, r = 0, 0.01, 0.1 and 1 all tie at 0.43, and r = 0 is chosen among them.

## Q5

In [18]:
# Collect one validation RMSE per shuffle seed to see how much the random
# split alone moves the score. (The list name is left unchanged because the
# following cell reads `rsme_scores`.)
rsme_scores = []

for s in range(10):
    idx = np.arange(n)
    np.random.seed(s)
    np.random.shuffle(idx)

    # The validation slice is bounded by n_val (not n_test); the two sizes
    # are equal here, so the resulting split is unchanged.
    df_train = df_base.iloc[idx[:n_train]].reset_index(drop=True)
    df_val = df_base.iloc[idx[n_train:n_train + n_val]].reset_index(drop=True)
    df_test = df_base.iloc[idx[n_train + n_val:]].reset_index(drop=True)

    # NOTE(review): the target is the raw final_price here, whereas the
    # seed-42 split earlier in the notebook applied np.log1p -- confirm
    # this difference is intended.
    y_train = np.array(df_train.final_price.values)
    y_val = np.array(df_val.final_price.values)
    y_test = np.array(df_test.final_price.values)

    del df_train['final_price']
    del df_val['final_price']
    del df_test['final_price']

    # Training part: zero fill, no regularisation.
    X_train = prepare_X(df_train)
    w0, w = train_linear_regression(X_train, y_train)

    # Validation part
    X_val = prepare_X(df_val)
    y_pred = w0 + X_val.dot(w)

    score = rmse(y_val, y_pred)
    rsme_scores.append(score)
    print(s, score)


0 565.4520868771027
1 636.7985423056726
2 588.9558697907962
3 597.8148920012521
4 571.962791511102
5 573.2383256618949
6 647.3438328407208
7 550.4398184485952
8 587.333503616991
9 576.1017929433108


In [19]:
rsme_scores = np.array(rsme_scores)

# Spread of the validation RMSE across seeds, rounded to 3 decimals.
rsme_scores_std = round(np.std(rsme_scores), 3)

rsme_scores_std

np.float64(29.176)

## Q6

In [20]:
# Final evaluation: reshuffle with seed 9, fit on train + validation
# combined, and report the RMSE on the held-out test split.
idx = np.arange(n)
np.random.seed(9)
np.random.shuffle(idx)

# The validation slice is bounded by n_val (not n_test); the two sizes are
# equal here, so the resulting split is unchanged.
df_train = df_base.iloc[idx[:n_train]].reset_index(drop=True)
df_val = df_base.iloc[idx[n_train:n_train + n_val]].reset_index(drop=True)
df_test = df_base.iloc[idx[n_train + n_val:]].reset_index(drop=True)

# Raw (untransformed) final_price is used as the target in this cell.
y_train = np.array(df_train.final_price.values)
y_val = np.array(df_val.final_price.values)
y_test = np.array(df_test.final_price.values)

del df_train['final_price']
del df_val['final_price']
del df_test['final_price']

# Stack train and validation rows; ignore_index renumbers them 0..len-1.
df_full_train = pd.concat([df_train, df_val], ignore_index=True)

# Training part (targets concatenated in the same order as the rows)
X_full_train = prepare_X(df_full_train)
y_full_train = np.concatenate([y_train, y_val])
w0, w = train_linear_regression_reg(X_full_train, y_full_train, r=0.001)

# Test part
X_test = prepare_X(df_test)
y_pred = w0 + X_test.dot(w)

score = rmse(y_test, y_pred)
score

np.float64(608.609982204956)