In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from ktools.utils.data_science_pipeline_settings import DataSciencePipelineSettings
from ktools.preprocessing.i_feature_transformer import IFeatureTransformer
from sklearn.preprocessing import StandardScaler
import functools
import torch
import torch.nn as nn
import torch.nn.functional as F
from copy import deepcopy
from ktools.preprocessing.basic_feature_transformers import ConvertToLower
from collections import OrderedDict

In [2]:
train_csv_path = "/Users/yuwei-1/Documents/projects/Kaggle-tools/data/used_car_prices/train.csv"
test_csv_path = "/Users/yuwei-1/Documents/projects/Kaggle-tools/data/used_car_prices/test.csv"
target_col_name = "price"

In [3]:
settings = DataSciencePipelineSettings(train_csv_path,
                                        test_csv_path,
                                        target_col_name)

In [4]:
from typing import List


class CategorizeFeatures(IFeatureTransformer):
    @staticmethod
    def transform(original_settings : DataSciencePipelineSettings, features : List[str]):
        settings = deepcopy(original_settings)
        settings.combined_df[features] = settings.combined_df[features].astype('object')
        settings.categorical_col_names += features
        return settings

In [5]:
class StandardizeNumerical(IFeatureTransformer):
    @staticmethod
    def transform(original_settings : DataSciencePipelineSettings):
        settings = deepcopy(original_settings)
        numerical_columns = settings.combined_df.select_dtypes(include=['number']).columns.tolist()
        numerical_columns.remove(settings.target_col_name)

        numerical_scaler = StandardScaler()
        settings.combined_df[numerical_columns] = numerical_scaler.fit_transform(settings.combined_df[numerical_columns])
        return settings

In [6]:
class RemoveRareCategories(IFeatureTransformer):
    @staticmethod
    def transform(original_settings : DataSciencePipelineSettings, threshold=40):
        
        global CAT_SIZE
        global CAT_EMB  
        global RARE

        CAT_SIZE = []
        CAT_EMB = []
        RARE = []

        settings = deepcopy(original_settings)
        categorical_columns = settings.combined_df.select_dtypes(include=['object']).columns.tolist()

        for c in categorical_columns:
            settings.combined_df[c], _ = settings.combined_df[c].factorize()
            settings.combined_df[c] -= settings.combined_df[c].min()
            vc = settings.combined_df[c].value_counts()
            
            RARE.append(vc.loc[vc<threshold].index.values )
            n = settings.combined_df[c].nunique()
            mn = settings.combined_df[c].min()
            mx = settings.combined_df[c].max()
            r = len(RARE[-1])
            print(f'{c}: nunique={n}, min={mn}, max={mx}, rare_ct={r}')
            
            CAT_SIZE.append(mx+1 +1)
            CAT_EMB.append( int(np.ceil( np.sqrt(mx+1 +1))) )
            settings.combined_df[c] += 1
            settings.combined_df.loc[settings.combined_df[c].isin(RARE[-1]),c] = 0
        
        return settings

In [7]:
class RemoveCategoriesNotInTrain(IFeatureTransformer):
    @staticmethod
    def transform(original_settings : DataSciencePipelineSettings):
        settings = deepcopy(original_settings)
        train_df, test_df = settings.update()
        
        for c in settings.categorical_col_names:
            A = train_df[c].unique()
            B = test_df[c].unique()
            C = np.setdiff1d(B,A)
            print(f"{c}: Test has label encodes = {C} which are not in train.")
                
            # RELABEL UNSEEN TEST VALUES AS ZERO
            test_df.loc[test_df[c].isin(C), c] = 0

        settings.combined_df = pd.concat([train_df, test_df], keys=['train', 'test'])
        return settings

In [8]:
full_transforms = [ #ConvertToLower.transform,
                    lambda x : CategorizeFeatures.transform(x, features=['model_year']),
                    StandardizeNumerical.transform,
                    RemoveRareCategories.transform,
                    RemoveCategoriesNotInTrain.transform]

full_settings = functools.reduce(lambda acc, func: func(acc), full_transforms, settings)

brand: nunique=57, min=0, max=56, rare_ct=8
model: nunique=1898, min=0, max=1897, rare_ct=551
model_year: nunique=36, min=0, max=35, rare_ct=4
fuel_type: nunique=8, min=0, max=7, rare_ct=1
engine: nunique=1118, min=0, max=1117, rare_ct=308
transmission: nunique=52, min=0, max=51, rare_ct=8
ext_col: nunique=319, min=0, max=318, rare_ct=99
int_col: nunique=156, min=0, max=155, rare_ct=48
accident: nunique=3, min=0, max=2, rare_ct=0
clean_title: nunique=2, min=0, max=1, rare_ct=0
brand: Test has label encodes = [] which are not in train.
model: Test has label encodes = [1898] which are not in train.
fuel_type: Test has label encodes = [] which are not in train.
engine: Test has label encodes = [1118] which are not in train.
transmission: Test has label encodes = [] which are not in train.
ext_col: Test has label encodes = [] which are not in train.
int_col: Test has label encodes = [] which are not in train.
accident: Test has label encodes = [] which are not in train.
clean_title: Test h

In [9]:
train_df, test_df = full_settings.update()

In [10]:
cat_idcs = np.where(full_settings.combined_df.dtypes.to_numpy() == 'int64')[0].tolist()

In [11]:
print(cat_idcs)
print(CAT_EMB)
print(CAT_SIZE)

[0, 1, 2, 4, 5, 6, 7, 8, 9, 10]
[8, 44, 7, 3, 34, 8, 18, 13, 2, 2]
[58, 1899, 37, 9, 1119, 53, 320, 157, 4, 3]


In [12]:
from typing import List


class BasicFeedForwardNetwork(nn.Module):

    def __init__(self,
                 input_dim : int,
                 output_dim : int,
                 categorical_idcs : List[int],
                 categorical_sizes : List[int],
                 categorical_embedding : List[int],
                 activation : str,
                 num_hidden_layers : int = 1,
                 largest_hidden_dim :int = 256,
                 dim_decay : float = 1.0,
                 ):
        super().__init__()
        self._input_dim = input_dim
        self._output_dim = output_dim
        
        self._categorical_idcs = categorical_idcs
        self._categorical_sizes = categorical_sizes
        self._categorical_embedding = categorical_embedding
        self._num_categories = len(categorical_idcs)
        self._activation = activation

        self._expanded_dim = self._input_dim - self._num_categories + sum(self._categorical_embedding)
        self._largest_hidden_dim = largest_hidden_dim
        self._num_hidden_layers = num_hidden_layers
        self._dim_decay = dim_decay
        
        self.embedding_layers = self._create_embedding_layers()
        self.model = self._create_dense_layers()

    def forward(self, x):
        x = self.forward_embeddings(x)
        x = self.model(x)
        return x
      
    def forward_embeddings(self, x):
        inputs = ()
        for i in range(self._input_dim):
            if i in self._categorical_idcs:
                feature = x[:, i].long()
            else:
                feature = x[:, i:i+1]
            inputs += (self.embedding_layers[i](feature),)
        x = torch.cat(inputs, dim=1)
        return x
    
    def _create_dense_layers(self):
        layers = OrderedDict()
        prev_dim = self._expanded_dim
        curr_dim = self._largest_hidden_dim

        for l in range(self._num_hidden_layers):
            layers[f'layer_{l}'] = nn.Linear(prev_dim, curr_dim)
            layers[f'activation_{l}'] = self._get_activation()
            prev_dim = curr_dim
            curr_dim = max(int(curr_dim*self._dim_decay), self._output_dim)
        
        layers['last_layer'] = nn.Linear(prev_dim, self._output_dim)
        model = nn.Sequential(layers)
        return model

    def _create_embedding_layers(self):
        embeddings = []
        for i in range(self._input_dim):
            if i in self._categorical_idcs:
                j = self._categorical_idcs.index(i)
                embeddings += [nn.Embedding(self._categorical_sizes[j], self._categorical_embedding[j])]
            else:
                embeddings += [nn.Identity()]
        return embeddings
    
    def _get_activation(self):
        if self._activation == 'relu':
            return nn.ReLU()
        elif self._activation == 'gelu':
            return nn.GELU()

In [13]:
from torch.utils.data import Dataset, DataLoader

class MyDataset(Dataset):
    def __init__(self, X, y):
        self.X = torch.tensor(X.to_numpy(), dtype=torch.float32)
        self.y = torch.tensor(y.to_numpy(), dtype=torch.float32)

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]
    

def prep_torch_dataset(X, y, batch_size):
    torch_dataset = MyDataset(X, y)
    dataloader = DataLoader(torch_dataset, batch_size=batch_size, shuffle=True)
    
    return dataloader

In [14]:
model = BasicFeedForwardNetwork(11,
                            1,
                            cat_idcs,
                            CAT_SIZE,
                            CAT_EMB,
                            'relu',
                            num_hidden_layers=3)

print(model)

BasicFeedForwardNetwork(
  (model): Sequential(
    (layer_0): Linear(in_features=140, out_features=256, bias=True)
    (activation_0): ReLU()
    (layer_1): Linear(in_features=256, out_features=256, bias=True)
    (activation_1): ReLU()
    (layer_2): Linear(in_features=256, out_features=256, bias=True)
    (activation_2): ReLU()
    (last_layer): Linear(in_features=256, out_features=1, bias=True)
  )
)


In [15]:
def initialize_weights(model):
    for module in model.modules():
        if isinstance(module, nn.Linear):
            nn.init.xavier_uniform_(module.weight)
            if module.bias is not None:
                nn.init.zeros_(module.bias)

In [19]:
def train_torch_nn(X_train, y_train, X_test, y_test, epochs=3, activation='relu'):
    model = BasicFeedForwardNetwork(11,
                                1,
                                cat_idcs,
                                CAT_SIZE,
                                CAT_EMB,
                                activation,
                                largest_hidden_dim=256,
                                dim_decay=1,
                                num_hidden_layers=3)
    initialize_weights(model)
    print(model)
    criterion = torch.nn.MSELoss()  # Binary Cross-Entropy Loss for binary classification
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=3, gamma=0.1)
    BATCH_SIZE = 64
    dataloader = prep_torch_dataset(X_train, y_train, BATCH_SIZE)
    
    print(dataloader)

    # Training loop
    num_epochs = epochs
    for epoch in range(num_epochs):
        model.train()
        cum_loss = 0
        nb = 0
        for batch_features, batch_target in dataloader:
            nb+=1
            # Zero gradients
            optimizer.zero_grad()
            
            # Forward pass: compute predictions
            predictions = model(batch_features).squeeze()  # Model output
            
            # Compute loss
            loss = criterion(predictions, batch_target)
            cum_loss += loss.item()

            # Backward pass and optimization
            loss.backward()
            optimizer.step()

        scheduler.step()
        current_lr = scheduler.get_last_lr()[0]
        print("Current learning rate: ", current_lr)
        print(f'Epoch {epoch+1}, Loss: {np.sqrt(cum_loss/nb)}')

        testx = torch.tensor(X_test.to_numpy(), dtype=torch.float32)
        testy = torch.tensor(y_test.to_numpy(), dtype=torch.float32)

        model.eval()
        oof_pred = model(testx).squeeze()
        print("oof performance: ", torch.sqrt(criterion(oof_pred, testy)))

    return oof_pred.detach().numpy(), model

In [20]:
X,y = train_df.drop(columns='price'), train_df['price']

In [25]:
y.dtypes

dtype('float64')

In [21]:
from sklearn.model_selection import KFold
from sklearn.metrics import root_mean_squared_error

model_list = []
oof_preds = np.zeros(X.shape[0])
kf = KFold(5, shuffle=True, random_state=42)

for i, (train_index, val_index) in enumerate(kf.split(X,y)):
    X_train_fold, X_val_fold = X.iloc[train_index], X.iloc[val_index]
    y_train_fold, y_val_fold = y.iloc[train_index], y.iloc[val_index]

    oof_pred, model = train_torch_nn(X_train_fold, y_train_fold, X_val_fold, y_val_fold)
    model_list += [model]
    oof_preds[val_index] = oof_pred

print("OOF score: ", root_mean_squared_error(y.to_numpy().squeeze(), oof_preds))

BasicFeedForwardNetwork(
  (model): Sequential(
    (layer_0): Linear(in_features=140, out_features=256, bias=True)
    (activation_0): ReLU()
    (layer_1): Linear(in_features=256, out_features=256, bias=True)
    (activation_1): ReLU()
    (layer_2): Linear(in_features=256, out_features=256, bias=True)
    (activation_2): ReLU()
    (last_layer): Linear(in_features=256, out_features=1, bias=True)
  )
)
Current learning rate:  0.001
Epoch 1, Loss: 76487.0156562648
oof performance:  tensor(69369.2578, grad_fn=<SqrtBackward0>)
Current learning rate:  0.001
Epoch 2, Loss: 75025.95014766863
oof performance:  tensor(69311.0078, grad_fn=<SqrtBackward0>)
Current learning rate:  0.0001
Epoch 3, Loss: 74696.63951007155
oof performance:  tensor(68992.5938, grad_fn=<SqrtBackward0>)
BasicFeedForwardNetwork(
  (model): Sequential(
    (layer_0): Linear(in_features=140, out_features=256, bias=True)
    (activation_0): ReLU()
    (layer_1): Linear(in_features=256, out_features=256, bias=True)
    (a

KeyboardInterrupt: 

In [25]:
model_list = []
oof_preds = np.zeros(X.shape[0])
kf = KFold(5, shuffle=True, random_state=42)

for i, (train_index, val_index) in enumerate(kf.split(X,y)):
    X_train_fold, X_val_fold = X.iloc[train_index], X.iloc[val_index]
    y_train_fold, y_val_fold = y.iloc[train_index], y.iloc[val_index]

    oof_pred, model = train_torch_nn(X_train_fold, y_train_fold, X_val_fold, y_val_fold, epochs=5, activation='gelu')
    model_list += [model]
    oof_preds[val_index] = oof_pred

print("OOF score: ", root_mean_squared_error(y.to_numpy().squeeze(), oof_preds))

BasicFeedForwardNetwork(
  (model): Sequential(
    (layer_0): Linear(in_features=140, out_features=256, bias=True)
    (activation_0): GELU(approximate='none')
    (layer_1): Linear(in_features=256, out_features=256, bias=True)
    (activation_1): GELU(approximate='none')
    (layer_2): Linear(in_features=256, out_features=256, bias=True)
    (activation_2): GELU(approximate='none')
    (last_layer): Linear(in_features=256, out_features=1, bias=True)
  )
)
Current learning rate:  0.001
Epoch 1, Loss: 76556.93201581527
oof performance:  tensor(69819.7500, grad_fn=<SqrtBackward0>)
Current learning rate:  0.001
Epoch 2, Loss: 75008.55708973424
oof performance:  tensor(69157.1719, grad_fn=<SqrtBackward0>)
Current learning rate:  0.0001
Epoch 3, Loss: 74552.47022405844
oof performance:  tensor(68957.1875, grad_fn=<SqrtBackward0>)
Current learning rate:  0.0001
Epoch 4, Loss: 74055.85730419285
oof performance:  tensor(68810.0547, grad_fn=<SqrtBackward0>)
Current learning rate:  0.0001
Epoch