# Machine Learning for Regression

In [1175]:
import pandas as pd
import numpy as np

In [1176]:
df = pd.read_csv('laptops.csv')

In [1177]:
df.columns = df.columns.str.lower().str.replace(' ', '_')

In [1178]:
df = df[['ram','storage','screen','final_price']]

In [1179]:
df.head()

Unnamed: 0,ram,storage,screen,final_price
0,8,512,15.6,1009.0
1,8,256,15.6,299.0
2,8,256,15.6,789.0
3,16,1000,15.6,1199.0
4,16,512,15.6,669.01


## Q1

Q1 Missing values

In [1180]:
df.isnull().sum()

ram            0
storage        0
screen         4
final_price    0
dtype: int64

## Q2

Q2 Median for 'ram'

In [1181]:
df['ram'].mean()

np.float64(15.41388888888889)

## Q3

Q3 fillna, media

In [1182]:
df['screen'] = df['screen'].fillna(0)
# df['screen'] = df['screen'].fillna(df['screen'].mean())

Q3 Shuffle

In [1183]:
np.random.seed(9)

idx = np.arange(len(df))
np.random.shuffle(idx)
df_shuffled = df.iloc[idx]

Q3 Split

In [1184]:
val_split = int(len(df)*0.2)
test_split = int(len(df)*0.2)
train_split = len(df) - (val_split + test_split)

# es decir que va de 0 a 1296 (60% de 2160)
df_train = df_shuffled.iloc[:train_split].copy()
# es decir que va de 2160 a 2160+432 (toma un 50% del 20% restante)
df_test = df_shuffled.iloc[train_split:train_split+val_split].copy()
# es decir que va de 2160+432 hasta el resto (toma el 50% del 20% restante del restante)
df_val = df_shuffled.iloc[train_split+val_split:].copy()

In [1185]:
y_train = df_train.final_price.values
y_val = df_val.final_price.values
y_test = df_test.final_price.values

In [1186]:
x_train = df_train.drop(["final_price"], axis=1).values
x_test = df_test.drop(["final_price"], axis=1).values
x_val = df_val.drop(["final_price"], axis=1).values

Q3 Linear regresssion model

In [1187]:
def train_linear_regression(X, y):
    ones = np.ones(X.shape[0])
    X = np.column_stack([ones, X])

    XTX = X.T.dot(X)
    XTX_inv = np.linalg.inv(XTX)
    w = XTX_inv.dot(X.T).dot(y)
    
    return w[0], w[1:]

def rmse(y, y_pred):
    error = y_pred - y
    mse = (error ** 2).mean()
    return round(np.sqrt(mse), 2)

Q3 validation

In [1188]:
w_0, w = train_linear_regression(x_train, y_train)
y_pred = w_0 + x_val.dot(w)

rmse(y_val, y_pred)

# 547.04 fill 0
# 547.32 median

np.float64(614.11)

## Q4

Q4 Regularized LR

In [1189]:
def train_linear_regression_reg(X, y, r=0.0):
    ones = np.ones(X.shape[0])
    X = np.column_stack([ones, X])

    XTX = X.T.dot(X)
    reg = r * np.eye(XTX.shape[0])
    XTX = XTX + reg

    XTX_inv = np.linalg.inv(XTX)
    w = XTX_inv.dot(X.T).dot(y)
    
    return w[0], w[1:]

In [1190]:
hist = {}
for i in [0, 0.01, 0.1, 1, 5, 10, 100]:
    w_0, w = train_linear_regression_reg(x_train, y_train, r=i)
    y_pred = w_0 + x_val.dot(w)
    hist[i] = float(rmse(y_val, y_pred))

In [1191]:
hist
# Best result 1

{0: 614.11,
 0.01: 614.11,
 0.1: 614.07,
 1: 613.8,
 5: 613.35,
 10: 613.31,
 100: 613.89}

## Q5

In [1192]:
# 0 578.24
# 1 570.17
# 2 594.94
# 3 626.17
# 4 611.53
# 5 573.7
# 6 543.51
# 7 631.02
# 8 672.99
# 9 614.11

In [1193]:
seed = [578.24, 570.17, 594.94, 626.17, 611.53, 573.7, 543.51, 631.02, 672.99, 614.11]
np.std(seed)

np.float64(35.42822626099139)

## Q6

In [1194]:
test_split = int(len(df)*0.4)
train_split = len(df) - test_split

df_train = df_shuffled.iloc[:train_split].copy()
df_test = df_shuffled.iloc[train_split:].copy()

y_train = df_train.final_price.values
y_test = df_test.final_price.values

x_train = df_train.drop(["final_price"], axis=1).values
x_test = df_test.drop(["final_price"], axis=1).values

w_0, w = train_linear_regression_reg(x_train, y_train, r=0.001)
y_pred = w_0 + x_val.dot(w)
float(rmse(y_val, y_pred))

# 614.11

614.11