In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, models, utils

import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn import preprocessing
from PIL import Image
import time
import os
import copy
from collections import Counter

In [2]:
tabular_data = "../../data/SkidSteer_2019-08.csv"
color = pd.read_csv('../colorfulness/skid_steer_color_score.csv')

df = pd.read_csv(tabular_data, index_col=1)
df['Unique_ID'] = df[['Source','item#']].apply(lambda x: '_'.join(x),axis = 1)
df = df.filter(['Unique_ID','Winning Bid','Hours Final','Age at Sale (bin)','Bucket','Engine','Tires','Transmission'], axis = 1)
df = pd.merge(df, color,on='Unique_ID',how='inner')
df = df.rename(columns={
    'Unique_ID': "unique_id",
    'Hours Final': "hours_final",
    'Winning Bid': "winning_bid",
    'Age at Sale (bin)': "age_at_sale",
    'Bucket': "bucket",
    'Engine': "engine",
    'Tires': "tires",
    'Transmission': "transmission", 
    'socre': "colorfulness_score"
})
# color = pd.read_csv('skid_steer_color_score.csv')
# final_df = pd.merge(new_df, color,on='Unique_ID',how='inner')


### removal
# remove duplicant
duplicated_item = [item for item, count in Counter(df["unique_id"]).items() if count > 1]
df = df[~df['unique_id'].isin(duplicated_item)]

# remove not matched rows
image_item = [img_name.strip(".jpg") for img_name in os.listdir("../../data/images/")]
df = df[df["unique_id"].isin(image_item)]

# remove comma
df["winning_bid"] = df["winning_bid"].str.replace(',', '').astype(int)

# remove special image
df = df[df['unique_id'] != "rbauction_10525632"]

In [3]:
### winning_bid

# log-transform
df["winning_bid"] = np.log(df["winning_bid"])

# min max scale
mm_scaler_price = preprocessing.MinMaxScaler((-1, 1))
df["winning_bid"] = mm_scaler_price.fit_transform(df["winning_bid"].to_numpy().reshape(-1, 1))

In [4]:
### hours_final

# impute nan with median and new binary indicator
df["hours_final"] = df["hours_final"].str.replace(",", "")
df["hours_final"] = df["hours_final"].astype(float)
df.insert(3, column="hours_final_nan", value=df["hours_final"].isna().astype(int))
df.loc[df["hours_final"].isna(), "hours_final"] = df["hours_final"].median(skipna=True)

# log transform
df["hours_final"] = np.log(df["hours_final"])

# normalize
rb_scaler_hour = preprocessing.RobustScaler()
df["hours_final"] = rb_scaler_hour.fit_transform(np.array(df["hours_final"]).reshape(-1, 1))

In [5]:
### age_at_sale

# impute nan with median and new binary indicator
df["age_at_sale"] = df["age_at_sale"].astype(float)
df.insert(5, column="age_at_sale_nan", value=df["age_at_sale"].isna().astype(int))
df.loc[df["age_at_sale"].isna(), "age_at_sale"] = df["age_at_sale"].median(skipna=True)

# normalize
rb_scaler_age = preprocessing.RobustScaler()
df["age_at_sale"] = rb_scaler_age.fit_transform(np.array(df["age_at_sale"]).reshape(-1, 1))

In [6]:
# normalize
rb_scaler_score = preprocessing.RobustScaler()
df["score"] = rb_scaler_score.fit_transform(np.array(df["score"]).reshape(-1, 1))

In [7]:
### bucket
df.insert(7, column="bucket_bin", value=0)
df.loc[
    ~df["bucket"].isna() & 
    df["bucket"].str.contains("bucket", case=False) | 
    df["bucket"].str.contains("bkt", case=False), "bucket_bin"
] = 1

In [8]:
# shuffle and split
np.random.seed(1)
split = [0.7, 0.3]
split0 = round(df.shape[0] * split[0])
# split1 = round(df.shape[0] * (split[0] + split[1]))
df = df.sample(frac=1)
df_train = df.iloc[:split0]
df_val = df.iloc[split0:]

In [9]:
df_train.head()

Unnamed: 0,unique_id,winning_bid,hours_final,hours_final_nan,age_at_sale,age_at_sale_nan,bucket,bucket_bin,engine,tires,transmission,score
5364,ironplanet_1703726,0.105406,-6.732824,0,-0.666667,0,"74"" Wide General Purpose Smooth Edge Bucket",1,,Cushion Tires,,0.886188
3191,PW_DD1289,0.172815,-1.629216,0,-1.0,0,"Kubota 68""W bucket",1,Kubota V3307-CR four cylinder turbo diesel engine,Titan 12-16.5 NHS tires,Two speed hydrostatic transmission,1.599732
5495,rbauction_10239624,0.219707,-0.164164,0,-0.666667,0,hyd Q/C bkt,1,,,,0.224976
2563,PW_H1380,-0.072566,-0.769452,0,-0.333333,0,"80""W bucket",1,"81 HP, Case 3.2L four cylinder turbo diesel en...",12-16.5 tires,Hydrostatic transmission,0.419117
2883,PW_J8873,-0.305592,-0.416323,0,-0.333333,0,,0,John Deere 5030TT001 3.0L turbo diesel engine,12-16.5 tires,Two speed hydrostatic transmission,2.700984


In [10]:
df_val.head()

Unnamed: 0,unique_id,winning_bid,hours_final,hours_final_nan,age_at_sale,age_at_sale_nan,bucket,bucket_bin,engine,tires,transmission,score
4563,bigiron_EN9531,0.164709,-0.503074,0,-0.333333,0,"84"" Bucket",1,"4-Cyl Turbo Diesel Engine, 84 Hp","Some Tires Have Cuts (Pictured), Tires- 14-17....",,0.577357
972,rbauction_10693873,-0.861828,-0.443163,0,1.333333,0,bkt,1,,,,-0.853387
4915,bigiron_BO0203,-0.180158,-0.226316,0,1.333333,0,"Buckets 12"", 64"" Bucket",1,58 Horse Power Isuzu Diesel Engine,"12-16.5 Tires, Spare Tire And Rim",Hydrostat Transmission,-0.8687
5351,ironplanet_1891010,-0.40247,-6.732824,0,-0.333333,0,"72"" Wide General Purpose Smooth Edge Bucket",1,,,,1.81794
2819,PW_J2936,-0.40247,-1.926828,0,-0.333333,0,,0,"Case four cylinder turbo diesel engine, Non-op...",12-16.5 tires,Hydrostatic transmission,-0.326603


In [11]:
df_train.to_csv("./SkidSteer_2019-08_clean_train.csv")
df_val.to_csv("./SkidSteer_2019-08_clean_val.csv")

In [12]:
class skidsteer_dataset(Dataset):
    """Corrosion Detection dataset."""

    def __init__(self, 
                 csv_file, 
                 img_root, 
                 transform=None):
        """
        Args:
            csv_file (string): Path to the csv file with annotations.
            img_root (string): Directory with all the images.
            transform (callable, optional): Optional transform to be applied on a sample (including augmentation).
        """
        self.csv_file = pd.read_csv(csv_file, index_col=0)
        self.img_root = img_root
        self.transform = transform

    def __len__(self):
        return len(self.csv_file)

    def __getitem__(self, idx):
        '''Return one data point with a PIL image and its label.'''
        price = self.csv_file.iloc[idx, 1]
        others = torch.tensor(self.csv_file.iloc[idx, [2, 3, 4, 5, 7, 11]])  # change this when new columns are added

        sample = {'price': price, "others": others}
        return sample

In [13]:
def norm2price(tensor, min_max_scaler):
    array2d = tensor.to("cpu").data.numpy().reshape(-1, 1)
    return np.exp(min_max_scaler.inverse_transform(array2d))

def price_MAE(outputs, prices, min_max_scaler):
    outputs = norm2price(outputs, min_max_scaler)
    prices = norm2price(prices, min_max_scaler)
    mae = np.abs(outputs - prices)
    maep = np.abs(outputs - prices) / prices
    return mae.mean(), mae, maep

In [14]:
class Vanilla(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(5, 32)
        self.fc2 = nn.Linear(32, 1)

    def forward(self, x):
        x = torch.sigmoid(self.fc1(x))
        return self.fc2(x)

In [15]:
class Vanilla_a(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(5, 1)
        
    def forward(self, x):
        return self.fc1(x)

In [16]:
class Vanilla2(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(6, 32)
        self.fc2 = nn.Linear(32, 1)

    def forward(self, x):
        x = torch.sigmoid(self.fc1(x))
        return self.fc2(x)

In [17]:
class Vanilla2_a(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(6, 1)

    def forward(self, x):
        return self.fc1(x)

In [18]:
def train_model(model, va_ver, criterion, optimizer, scheduler, num_epochs=25):
    since = time.time()

    best_model_wts = copy.deepcopy(model.state_dict())
    best_epoch = None
    best_loss = float("Inf")
    best_mae = float("Inf")
    best_mae_list = None
    best_maep_list = None
    all_loss = {x: [] for x in ['train', 'val']}
    all_mae = {x: [] for x in ['train', 'val']}

    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch + 1, num_epochs))
        print('-' * 10)

        # Each epoch has a training and validation phase
        for phase in ['train', 'val']:
            if phase == 'train':
                model.train()  # Set model to training mode
            else:
                model.eval()   # Set model to evaluate mode
            running_loss = 0.0
            running_mae = 0.0
            running_mae_list = []
            running_maep_list = []
            
            # Iterate over data.
            for items in dataloaders[phase]:
                prices = items["price"].to(device)
                if va_ver:
                    others = items["others"].to(device)
                else:
                    others = items["others"][:, :5].to(device)
                optimizer.zero_grad()
                with torch.set_grad_enabled(phase == 'train'):
                    outputs = model(others).squeeze()
                    loss = criterion(outputs, prices)
                    mae, mae_np, maep_np = price_MAE(outputs, prices, mm_scaler_price)
                    if phase == 'train':
                        loss.backward()
                        optimizer.step()
                running_loss += loss.item() * prices.size(0)
                running_mae += mae * prices.size(0)
                running_mae_list += list(mae_np.flatten())
                running_maep_list += list(maep_np.flatten())
            if phase == 'train':
                scheduler.step(running_mae)
            epoch_loss = running_loss / dataset_sizes[phase]
            epoch_mae = running_mae / dataset_sizes[phase]
            all_loss[phase].append(epoch_loss)
            all_mae[phase].append(epoch_mae)
            print('{} Loss: {:.4f}'.format(phase, epoch_loss))
            print('{} MAE: {:.4f}'.format(phase, epoch_mae))
            
            # deep copy the model
            if phase == 'val' and epoch_loss < best_loss:
                best_epoch = epoch + 1
                best_loss = epoch_loss
                best_mae = epoch_mae
                best_mae_list = running_mae_list
                best_maep_list = running_maep_list
                best_model_wts = copy.deepcopy(model.state_dict())
        print()

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(time_elapsed // 60, time_elapsed % 60))
    print('Best val Loss: {:4f} at epoch {}'.format(best_loss, best_epoch))
    print('Best val MAE: {:4f} at epoch {}'.format(best_mae, best_epoch))

    # load best model weights
    print("\nLoad the model weights at the best epoch")
    model.load_state_dict(best_model_wts)
    return model, best_mae_list, all_loss, all_mae

In [19]:
CSV_FILE = {"train": "./SkidSteer_2019-08_clean_train.csv",
            "val": "./SkidSteer_2019-08_clean_val.csv"}
IMG_ROOT = "../../data/images/"
TRANSFORM = {
    'train': transforms.Compose([
        transforms.RandomResizedCrop(224),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ]),
    'val': transforms.Compose([
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ]),
}

datasets = {x: skidsteer_dataset(csv_file=CSV_FILE[x],
                                 img_root=IMG_ROOT,
                                 transform=TRANSFORM[x])
            for x in ["train", "val"]}
dataloaders = {x: DataLoader(datasets[x], 
                             batch_size=16, 
                             shuffle=True, 
                             num_workers=4)
               for x in ["train", "val"]}
dataset_sizes = {x: len(datasets[x]) for x in ["train", "val"]}
num_tabular_features = len(datasets["train"][0]["others"])

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [20]:
torch.manual_seed(0)

model_va = Vanilla_a().to(device)

criterion = nn.MSELoss()
# Observe that all parameters are being optimized
optimizer_ft = optim.SGD(model_va.parameters(), lr=0.01, momentum=0.9)
# Decay LR by a factor of 0.1 every 7 epochs
exp_lr_scheduler = lr_scheduler.StepLR(optimizer_ft, step_size=6, gamma=0.1)

In [21]:
model_va, mae_list_va, all_loss_va, all_mae_va = train_model(model_va, None, criterion, optimizer_ft, exp_lr_scheduler, num_epochs=20)

Epoch 1/20
----------
train Loss: 0.0744
train MAE: 4157.5032
val Loss: 0.0709
val MAE: 3981.4618

Epoch 2/20
----------
train Loss: 0.0676
train MAE: 3990.6204
val Loss: 0.0709
val MAE: 3981.4618

Epoch 3/20
----------
train Loss: 0.0676
train MAE: 3990.6204
val Loss: 0.0709
val MAE: 3981.4619

Epoch 4/20
----------
train Loss: 0.0676
train MAE: 3990.6204
val Loss: 0.0709
val MAE: 3981.4618

Epoch 5/20
----------
train Loss: 0.0676
train MAE: 3990.6204
val Loss: 0.0709
val MAE: 3981.4618

Epoch 6/20
----------


KeyboardInterrupt: 

In [22]:
torch.manual_seed(9)

model_va2 = Vanilla2().to(device)

criterion = nn.MSELoss()
# Observe that all parameters are being optimized
optimizer_ft = optim.SGD(model_va2.parameters(), lr=0.01, momentum=0.9)
# Decay LR by a factor of 0.1 every 7 epochs
exp_lr_scheduler = lr_scheduler.StepLR(optimizer_ft, step_size=6, gamma=0.1)

In [23]:
model_va2, mae_list_va2, all_loss_va2, all_mae_va2 = train_model(model_va2, 2, criterion, optimizer_ft, exp_lr_scheduler, num_epochs=20)

Epoch 1/20
----------
train Loss: 0.0887
train MAE: 4493.6642
val Loss: 0.0715
val MAE: 3972.9073

Epoch 2/20
----------
train Loss: 0.0692
train MAE: 3999.7710
val Loss: 0.0715
val MAE: 3972.9073

Epoch 3/20
----------
train Loss: 0.0692
train MAE: 3999.7710
val Loss: 0.0715
val MAE: 3972.9073

Epoch 4/20
----------
train Loss: 0.0692
train MAE: 3999.7710
val Loss: 0.0715
val MAE: 3972.9073

Epoch 5/20
----------
train Loss: 0.0692
train MAE: 3999.7710
val Loss: 0.0715
val MAE: 3972.9073

Epoch 6/20
----------
train Loss: 0.0692
train MAE: 3999.7710
val Loss: 0.0715
val MAE: 3972.9073

Epoch 7/20
----------
train Loss: 0.0692
train MAE: 3999.7710
val Loss: 0.0715
val MAE: 3972.9073

Epoch 8/20
----------
train Loss: 0.0692
train MAE: 3999.7710
val Loss: 0.0715
val MAE: 3972.9073

Epoch 9/20
----------
train Loss: 0.0692
train MAE: 3999.7710
val Loss: 0.0715
val MAE: 3972.9073

Epoch 10/20
----------
train Loss: 0.0692
train MAE: 3999.7710
val Loss: 0.0715
val MAE: 3972.9073

Epoch 11/