In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load the diamonds dataset.
# read in the data with headers; the first CSV column is used as the row index
diamonds = pd.read_csv('diamonds.csv', index_col=0)

# check the data
diamonds.head()

In [None]:
# Trim extreme values: for every numeric column in turn, keep only the rows
# lying between the 0.5th and 99.5th percentile of that column. Percentiles are
# recomputed on the already-filtered frame, so the trimming is sequential.
diamonds2 = diamonds.copy()

# select_dtypes picks numeric columns explicitly, so bool / category / string
# columns never reach np.percentile (a plain `dtype != 'object'` test lets them through).
for col in diamonds2.select_dtypes(include='number').columns:
    lower, upper = np.percentile(diamonds2[col], [0.5, 99.5])
    # between() is inclusive on both ends, matching the original >= / <= bounds
    diamonds2 = diamonds2[diamonds2[col].between(lower, upper)].reset_index(drop=True)

# a bare expression in the middle of a cell is not displayed, so print it
print(diamonds2.shape)

diamonds2.hist(bins=50, figsize=(20, 15))
plt.show()


In [None]:
from sklearn.model_selection import train_test_split

# split data into training, validation, and test sets
# 20% is held out for test, then 25% of the remaining 80% for validation,
# i.e. roughly a 60/20/20 train/val/test split; random_state fixes the shuffle.
train, test = train_test_split(diamonds2, test_size=0.2, random_state=42)
train, val = train_test_split(train, test_size=0.25, random_state=42)

In [None]:
# columns used as model inputs; 'price' is the regression target
features = ['carat', 'cut', 'color', 'clarity', 'x', 'y','z','table', 'depth']
# build the (inputs, target) pair for each of the three splits
X_train, y_train = train[features], train['price']
X_val, y_val = val[features], val['price']
X_test, y_test = test[features], test['price']


In [None]:
# preprocess the categorical features (one-hot encode)
categorical_features = ['cut', 'color', 'clarity']

# dtype=float keeps every column numeric; bool dummies mixed with float columns
# would turn `.values` into an object array that torch.tensor cannot convert.
X_train = pd.get_dummies(X_train, columns=categorical_features, dtype=float)

# Encode val/test independently, then align them to the training columns:
# a category missing from a split becomes an all-zero column, a category unseen
# in training is dropped, and the column order always matches X_train.
X_val = pd.get_dummies(X_val, columns=categorical_features, dtype=float).reindex(
    columns=X_train.columns, fill_value=0.0)
X_test = pd.get_dummies(X_test, columns=categorical_features, dtype=float).reindex(
    columns=X_train.columns, fill_value=0.0)

# check the shape of the data
X_train.shape, X_val.shape, X_test.shape

In [None]:
# check first five rows of the training feature matrix

X_train.head()

In [None]:
numeric_features = ['carat', 'x', 'y', 'z', 'table', 'depth']

# statistics are taken from the training split only and reused for val/test
means, stds = X_train[numeric_features].mean(), X_train[numeric_features].std()

# standardise the numeric columns of every split: (value - train mean) / train std
X_train[numeric_features] = X_train[numeric_features].sub(means).div(stds)
X_val[numeric_features] = X_val[numeric_features].sub(means).div(stds)
X_test[numeric_features] = X_test[numeric_features].sub(means).div(stds)


In [None]:
# write a MLP model by Pytorch to predict price from the features
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader

# convert X and y to tensors
# to_numpy(dtype='float32') forces a numeric array even if some feature columns
# are bool/uint8 dummies (a mixed-dtype frame would otherwise yield dtype=object).

X_train_tensor = torch.tensor(X_train.to_numpy(dtype='float32'), dtype=torch.float)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.float).reshape(-1, 1)

X_val_tensor = torch.tensor(X_val.to_numpy(dtype='float32'), dtype=torch.float)
y_val_tensor = torch.tensor(y_val.values, dtype=torch.float).reshape(-1, 1)

X_test_tensor = torch.tensor(X_test.to_numpy(dtype='float32'), dtype=torch.float)
# the test targets get their own name; assigning them to y_val_tensor would
# silently pair validation features with test labels in val_ds below
y_test_tensor = torch.tensor(y_test.values, dtype=torch.float).reshape(-1, 1)

# create a dataset and dataloader 

train_ds = TensorDataset(X_train_tensor, y_train_tensor)
train_dl = DataLoader(train_ds, batch_size=32)

val_ds = TensorDataset(X_val_tensor, y_val_tensor)
val_dl = DataLoader(val_ds, batch_size=32)

# MLP model 
class MLP(nn.Module):
    """One-hidden-layer perceptron producing a single regression output.

    Args:
        l1: number of units in the hidden layer.
        in_features: width of the input vectors. Defaults to 26, the value
            previously hard-coded in the first layer.
    """

    def __init__(self, l1, in_features=26) -> None:
        super().__init__()
        self.fc1 = nn.Linear(in_features, l1)
        self.fc2 = nn.Linear(l1, 1)

    def forward(self, x):
        """Apply the hidden layer with ReLU, then the linear output layer."""
        x = F.relu(self.fc1(x))
        return self.fc2(x)

In [None]:
def train(epochs, model, loss_func, opt, train_dl, val_dl, clip_gra = None ,early_stopping=True, gpu=False, patience=5):
    """Train ``model`` and return it together with its validation loss.

    Args:
        epochs: maximum number of passes over ``train_dl``.
        model: the ``nn.Module`` to optimise; it is moved to CUDA when ``gpu``
            is true and to the CPU otherwise.
        loss_func: callable ``loss_func(pred, target)`` returning a scalar tensor.
        opt: optimizer constructed over ``model.parameters()``.
        train_dl: iterable yielding ``(xb, yb)`` training batches.
        val_dl: iterable yielding ``(xb, yb)`` validation batches.
        clip_gra: maximum gradient norm passed to ``clip_grad_norm_``;
            ``None`` disables clipping.
        early_stopping: when true, remember the weights with the lowest
            validation loss and stop after ``patience`` consecutive epochs
            without improvement.
        gpu: run on CUDA when true, on the CPU otherwise.
        patience: number of consecutive non-improving epochs tolerated before
            stopping (only used when ``early_stopping`` is true).

    Returns:
        ``(model, best_loss)``. With early stopping, ``model`` carries the
        weights of the best epoch and ``best_loss`` is that epoch's validation
        loss; otherwise both correspond to the last epoch. The validation loss
        is the sum of the per-batch losses over ``val_dl``.
    """
    device = torch.device('cuda' if gpu else 'cpu')
    # move once, up front; .to() keeps the Parameter objects the optimizer holds
    model = model.to(device)

    tmp = 0            # consecutive epochs without validation improvement
    best_loss = None
    best_state = None  # snapshot of the best weights (a copy, not an alias)
    loss = None

    for epoch in range(epochs):
        model.train()

        for xb, yb in train_dl:
            xb = xb.to(device)
            yb = yb.to(device)

            pred = model(xb)
            loss = loss_func(pred, yb)
            loss.backward()
            if clip_gra is not None:
                torch.nn.utils.clip_grad_norm_(model.parameters(), clip_gra)
            opt.step()
            opt.zero_grad()

        model.eval()
        with torch.no_grad():
            # use the same device as training instead of an unconditional .cuda()
            valid_loss = sum(loss_func(model(xb.to(device)), yb.to(device)) for xb, yb in val_dl)

        # early stopping when validation loss has not improved for `patience`
        # consecutive epochs; the best weights are kept as a detached copy
        if early_stopping:
            if best_loss is None or valid_loss < best_loss:
                best_loss = valid_loss
                # clone the tensors: `best_model = model` would only alias the
                # live model, whose weights keep changing in later epochs
                best_state = {k: v.detach().clone() for k, v in model.state_dict().items()}
                tmp = 0
            else:
                tmp += 1
                if tmp == patience:
                    print(f'Early stopping, epoch: {epoch}')
                    break
        else:
            best_loss = valid_loss

        if epoch % 10 == 0:
            print(f'Epoch {epoch}, training loss: {loss}, validation loss: {valid_loss}, Best loss : {best_loss}, tmp : {tmp}')

    if early_stopping and best_state is not None:
        # restore the weights that produced best_loss
        model.load_state_dict(best_state)

    return model, best_loss

In [None]:

# Sweep the hidden-layer width with SGD (weight decay 0.01, gradient norm
# clipped to 1) and record the validation loss and returned model for each width.
val_losses = [] 
best_models = []
# use CUDA only when it is actually available instead of hard-coding gpu=True
use_gpu = torch.cuda.is_available()
for l1 in [32, 64, 128, 256, 512, 1024]:
    model = MLP(l1)
    loss_func = F.mse_loss
    opt = torch.optim.SGD(model.parameters(), lr=1e-3, weight_decay=0.01)
    best_model, best_loss = train(1000, model, loss_func, opt, train_dl, val_dl, clip_gra=1, early_stopping=True, gpu=use_gpu)
    val_losses.append(best_loss)
    best_models.append(best_model)
    print(f'Validation loss: {best_loss} with {l1} hidden units')


In [None]:

# Sweep the hidden-layer width with Adam (no gradient clipping) and record the
# validation loss and returned model for each width.
# NOTE: this rebinds val_losses / best_models, replacing any earlier sweep's results.
val_losses = [] 
best_models = []
# use CUDA only when it is actually available instead of hard-coding gpu=True
use_gpu = torch.cuda.is_available()
for l1 in [32, 64, 128, 256, 512, 1024]:
    model = MLP(l1)
    loss_func = F.mse_loss
    opt = torch.optim.Adam(model.parameters(), lr=1e-3)
    best_model, best_loss = train(1000, model, loss_func, opt, train_dl, val_dl, early_stopping=True, gpu=use_gpu)
    val_losses.append(best_loss)
    best_models.append(best_model)
    print(f'Validation loss: {best_loss} with {l1} hidden units')

In [None]:
# plot the validation loss for each model
# float() works for 0-dim tensors on any device and for plain numbers, so this
# cell can be re-run after val_losses has already been converted
val_losses = [float(x) for x in val_losses]
plt.plot([32, 64, 128, 256, 512, 1024], val_losses)
plt.xlabel('Hidden units')
plt.ylabel('Validation loss')
plt.show()

In [None]:
# pick the model whose recorded validation loss is the smallest
# (best_models and val_losses are filled in the same order by the sweep loop)
best_model = best_models[np.argmin(val_losses)]

from sklearn.metrics import mean_absolute_percentage_error, mean_squared_error, r2_score, mean_absolute_error
def metrics (y_val, y_pred):
    """Compute regression metrics for predictions against true targets.

    Returns a tuple ``(mae, mape, mse, rmse, r2)``, where ``rmse`` is the
    square root of ``mse``.
    """
    squared_error = mean_squared_error(y_val, y_pred)
    return (
        mean_absolute_error(y_val, y_pred),
        mean_absolute_percentage_error(y_val, y_pred),
        squared_error,
        np.sqrt(squared_error),
        r2_score(y_val, y_pred),
    )

# calculate the metrics for the best model on the validation set
best_model.eval()
# run on whatever device the model lives on rather than assuming CUDA
device = next(best_model.parameters()).device
with torch.no_grad():
    # score the validation split, as stated above, not the training split
    y_pred = best_model(X_val_tensor.to(device)).cpu().numpy()
mae, mape, mse, rmse, r2 = metrics(y_val, y_pred)
print(f'MAE: {mae}, MAPE: {mape}, MSE: {mse}, RMSE: {rmse}, R2: {r2}')

- Dropout?
- Weight decay vs. SGD
- Adam
- Hyperparameter tuning



- MLP with scikit-learn