<a href="https://colab.research.google.com/github/thanhnguyen2612/diveintocode-ml/blob/master/ITS_DL_bigdata.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Extract data**

In [None]:
# Mount Google Drive inside the Colab VM; the cells below read their inputs
# (segment_status.zip, segments.csv) from /content/drive/MyDrive.
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [None]:
!mkdir "segment_status"
!unzip "/content/drive/MyDrive/segment_status.zip"

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  inflating: segment_status/2021-01-08/period_6_00.csv  
  inflating: segment_status/2021-01-08/period_6_30.csv  
  inflating: segment_status/2021-01-08/period_7_00.csv  
  inflating: segment_status/2021-01-08/period_7_30.csv  
  inflating: segment_status/2021-01-08/period_8_00.csv  
  inflating: segment_status/2021-01-08/period_8_30.csv  
  inflating: segment_status/2021-01-08/period_9_00.csv  
 extracting: segment_status/2021-01-08/period_9_30.csv  
   creating: segment_status/2021-01-09/
  inflating: segment_status/2021-01-09/period_0_00.csv  
 extracting: segment_status/2021-01-09/period_0_30.csv  
  inflating: segment_status/2021-01-09/period_10_00.csv  
 extracting: segment_status/2021-01-09/period_10_30.csv  
  inflating: segment_status/2021-01-09/period_11_00.csv  
 extracting: segment_status/2021-01-09/period_11_30.csv  
  inflating: segment_status/2021-01-09/period_12_00.csv  
 extracting: segment_status/2021-01

In [None]:
from os import listdir
from os.path import join
import pandas as pd
from sklearn.preprocessing import Normalizer, LabelEncoder, OneHotEncoder, StandardScaler
import datetime
import numpy as np
import time

import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torch import nn, optim

import pickle

# **Handling segment data**

In [None]:
def one_hot_encoding(df, col_name, range):
    """One-hot encode ``df[col_name]`` and place the indicator columns in front of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the column to encode.
    col_name : str
        Column to encode. It is removed from the returned frame.
    range : 'auto' or list of array-like
        Passed unchanged as ``OneHotEncoder(categories=...)``. The parameter name
        shadows the builtin ``range`` inside this function only; it is kept so
        existing callers keep working.

    Returns
    -------
    (pandas.DataFrame, OneHotEncoder)
        The new frame (indicator columns first, then the remaining original
        columns) and the fitted encoder.
    """
    encoder = OneHotEncoder(categories=range)
    transformed = encoder.fit_transform(df[col_name].to_numpy().reshape(-1, 1))

    # The encoder was fitted on a bare array, so its feature names look like
    # "x0_<category>". Split only on the FIRST underscore: categories that
    # themselves contain "_" would otherwise be truncated and collide with each
    # other, yielding duplicate column names.
    new_col_name = [col_name + "_" + name.split("_", 1)[1]
                    for name in encoder.get_feature_names_out()]

    # Reuse df's index so the column-wise concat lines rows up positionally even
    # when df does not carry a default RangeIndex.
    ohe_df = pd.DataFrame(transformed.toarray(), columns=new_col_name, index=df.index)
    new_df = pd.concat([ohe_df, df], axis=1).drop([col_name], axis=1)
    return new_df, encoder

In [None]:
# Static per-segment attributes (one row per road segment), read from Drive.
segment_df = pd.read_csv("/content/drive/MyDrive/segments.csv")

In [None]:
# Number of distinct street_type values, i.e. the width of its one-hot block.
segment_df["street_type"].unique().shape

(53,)

In [None]:
# Peek at the raw segment columns before any preprocessing.
segment_df.head(1)

Unnamed: 0,_id,s_node_id,e_node_id,length,street_id,max_velocity,street_level,street_name,street_type,long_snode,lat_snode,long_enode,lat_enode
0,0,373543511,5468660805,114,31096786,80.0,1,Quốc Lộ 1,trunk,106.60178,10.727718,106.601621,10.726701


In [None]:
# This cell rewrites segment_df in place, so it must only run once per load.
# Fail early with a clear message instead of an opaque KeyError on a second run.
assert "street_type" in segment_df.columns, \
    "segment_df is already preprocessed - reload segments.csv before re-running this cell"

# Standard scaling features
scaling_feature = ["length", "long_snode", "lat_snode", "long_enode", "lat_enode"]
# Keep the fitted scaler (as is done for the encoders below) so the exact same
# scaling can be re-applied to other data later.
segment_scaler = StandardScaler()
segment_df[scaling_feature] = segment_scaler.fit_transform(segment_df[scaling_feature])

# One-hot encoding
# Each call prepends its indicator columns, so the final column order is:
# street_level_* | street_type_* | remaining columns.
segment_df, street_type_encoder = one_hot_encoding(segment_df, "street_type", 'auto')
segment_df, street_level_encoder = one_hot_encoding(segment_df, "street_level", 'auto')

# Drop unnecessary columns
segment_df = segment_df.drop(columns=["s_node_id", "e_node_id", "street_id", "max_velocity", "street_name"])

In [None]:
# Inspect the column layout after scaling and one-hot encoding.
segment_df.head(1)

Unnamed: 0,street_level_1,street_level_2,street_level_3,street_level_4,street_type_apartments,street_type_atm,street_type_bank,street_type_bridge,street_type_bus,street_type_bus.1,street_type_car,street_type_chalet,street_type_church,street_type_cinema,street_type_clothes,street_type_college,street_type_community,street_type_company,street_type_construction,street_type_convenience,street_type_dentist,street_type_department,street_type_dormitory,street_type_dry,street_type_fast,street_type_fire,street_type_fuel,street_type_government,street_type_hospital,street_type_hostel,street_type_house,street_type_industrial,street_type_marketplace,street_type_motorway,street_type_motorway.1,street_type_pedestrian,street_type_pitch,street_type_police,street_type_primary,street_type_primary.1,street_type_residential,street_type_restaurant,street_type_school,street_type_secondary,street_type_secondary.1,street_type_service,street_type_shelter,street_type_swimming,street_type_tertiary,street_type_tertiary.1,street_type_theme,street_type_training,street_type_trunk,street_type_trunk.1,street_type_unclassified,street_type_university,street_type_yes,_id,length,long_snode,lat_snode,long_enode,lat_enode
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0,1.369884,-0.932411,-1.302646,-0.93461,-1.321409


# **Handling segment status data**

In [None]:
# Utility functions

def period_to_number(period):
    """Map a period label to its 1-based half-hour slot of the day.

    "period_0_00" -> 1, "period_0_30" -> 2, ..., "period_23_30" -> 48.
    Any minute part other than "00" selects the second half of the hour.
    """
    _prefix, hour_part, minute_part = period.split('_')
    slot = 2 * int(hour_part) + 1
    if minute_part != '00':
        slot += 1
    return slot

def time_to_period(time):
    """
    Convert a clock time string "HH:MM[:SS]" into a period label.

    00:00:00 -> period_0_00
    01:30:00 -> period_1_30

    The hour loses its leading zero; the minute part is always kept (including
    "00") so the result matches labels such as "period_6_00".
    """
    hour, minute = time.split(":")[:2]

    # int() strips the leading zero of the hour ("06" -> "6").
    return "period_" + str(int(hour)) + "_" + minute

# Peak slots: half-hour periods starting 6h-8h30 and 16h-19h30
def infer_peaks(period):
    """Return the 1-based position of ``period`` among the peak slots, or 0 if it is not a peak slot."""
    peaks = (
        "period_6_00", "period_6_30",
        "period_7_00", "period_7_30",
        "period_8_00", "period_8_30",
        "period_16_00", "period_16_30",
        "period_17_00", "period_17_30",
        "period_18_00", "period_18_30",
        "period_19_00", "period_19_30",
    )
    if period in peaks:
        return peaks.index(period) + 1
    return 0


def infer_holiday(date):
    """Return a 1-based code for the special day matching ``date``, or 0 for an ordinary day.

    ``date`` only needs ``.day`` and ``.month`` attributes; the year is ignored.
    """
    # (day, month) pairs; the code returned is the 1-based position in this tuple
    holidays = (
        (1, 1), (14, 2), (8, 3), (30, 4),
        (1, 5), (1, 6), (2, 9), (20, 10),
        (20, 11), (24, 12), (25, 12), (31, 12),
    )
    day_month = (date.day, date.month)
    for code, holiday in enumerate(holidays, start=1):
        if holiday == day_month:
            return code
    return 0

In [None]:
class Net(nn.Module):
    """Feed-forward classifier over one-hot encoded categorical blocks plus 5 numeric columns.

    ``forward`` expects a 2-D tensor with 188 columns laid out as:
    special_day (13) | peak (15) | month (12) | day (31) | weekday (7) |
    period (48) | street_level (4) | street_type (53) | 5 remaining numeric columns.
    It returns 6 raw scores (logits) per row.
    """

    def __init__(self):
        super().__init__()

        self.build()

    def build(self):
        """Create all layers; attribute names double as the state_dict keys."""
        # One linear projection per one-hot block (acts as a learned embedding).
        self.period_embed = nn.Linear(48, 32)
        self.weekday_embed = nn.Linear(7, 4)
        self.day_embed = nn.Linear(31, 16)
        self.month_embed = nn.Linear(12, 8)
        self.peak_embed = nn.Linear(15, 8)
        self.special_day_embed = nn.Linear(13, 8)
        self.street_type_embed = nn.Linear(53, 32)
        self.street_level_embed = nn.Linear(4, 2)

        # First tower: consumes the concatenated projections (32+4+16+8+8+8+32+2 = 110).
        self.dense_layer_1 = nn.Linear(110, 128)
        self.activation_1 = nn.Tanh()
        self.dense_layer_2 = nn.Linear(128, 64)
        self.activation_2 = nn.Tanh()
        self.dense_layer_3 = nn.Linear(64, 32)
        self.activation_3 = nn.Tanh()

        # Second tower: the 5 remaining numeric columns are appended to the 32-d summary.
        self.dense_layer_4 = nn.Linear(32+5, 32)
        self.activation_4 = nn.LeakyReLU()
        self.dense_layer_5 = nn.Linear(32, 16)
        self.activation_5 = nn.LeakyReLU()
        self.dense_layer_6 = nn.Linear(16, 8)
        self.activation_6 = nn.LeakyReLU()

        # Output head: 6 scores per row, no activation.
        self.dense_layer_final = nn.Linear(8, 6)

    def forward(self, X):
        """Split ``X`` into its column blocks, project each block and classify."""
        special_day = X[:, 0:13]
        peak = X[:, 13:28]
        month = X[:, 28:40]
        day = X[:, 40:71]
        weekday = X[:, 71:78]
        period = X[:, 78:126]
        street_level = X[:, 126:130]
        street_type = X[:, 130:183]
        rest = X[:, 183:]

        projected = torch.cat(
            (
                self.period_embed(period),
                self.weekday_embed(weekday),
                self.day_embed(day),
                self.month_embed(month),
                self.peak_embed(peak),
                self.special_day_embed(special_day),
                self.street_type_embed(street_type),
                self.street_level_embed(street_level),
            ),
            dim=1,
        )

        hidden = self.activation_1(self.dense_layer_1(projected))
        hidden = self.activation_2(self.dense_layer_2(hidden))
        hidden = self.activation_3(self.dense_layer_3(hidden))

        # The numeric columns go first in the concatenation, then the 32-d summary.
        hidden = self.activation_4(self.dense_layer_4(torch.cat((rest, hidden), dim=1)))
        hidden = self.activation_5(self.dense_layer_5(hidden))
        hidden = self.activation_6(self.dense_layer_6(hidden))

        return self.dense_layer_final(hidden)


In [None]:
class TrafficDataset(Dataset):
    """Samples of one half-hour period of one day, joined with the segment features.

    Reads ``segment_status/<date>/<period>.csv`` and inner-joins it with the
    module-level ``segment_df`` on ``segment_id == _id``.

    Attributes
    ----------
    df : pandas.DataFrame
        The (merged) status rows the samples were built from.
    X : numpy.ndarray
        float32 feature matrix, one row per sample; an empty array when there
        are no usable rows.
    y : numpy.ndarray
        Integer class index (0..5 for LOS "A".."F") per sample; an empty array
        when there are no usable rows.
    """

    def __init__(self, period, date):
        csv_path = join("segment_status", date, period + ".csv")
        self.df = pd.read_csv(csv_path, usecols=["segment_id","date","weekday","period","LOS"], parse_dates=["date"])
        self.df = self.df.reset_index(drop=True)

        if not self.df.empty:
            # Inner join: status rows whose segment is missing from segment_df are dropped.
            self.df = pd.merge(left=self.df,right=segment_df,left_on="segment_id",right_on="_id").drop(columns=["segment_id", "_id"])

        # Checked AFTER the merge as well: the join itself can remove every row,
        # and the encoders in build() cannot be fitted on zero samples.
        if self.df.empty:
            self.X = np.array([])
            self.y = np.array([])
        else:
            self.build()

    def build(self):
        """Turn ``self.df`` into the feature matrix ``self.X`` and the label vector ``self.y``."""
        feature = self.df.drop(columns=["LOS"])

        # Infer peak hours and special days
        feature["peak"] = feature["period"].apply(infer_peaks)
        feature["special_day"] = feature["date"].apply(infer_holiday)

        # Extract day and month then drop "date" column
        feature["month"] = feature["date"].dt.month
        feature["day"] = feature["date"].dt.day
        feature = feature.drop(columns=["date"])

        # Period label -> half-hour slot number (1..48)
        feature["period"] = feature["period"].apply(period_to_number)

        # One-hot encoding with fixed category ranges so every file yields the
        # same number of columns. Each call PREPENDS its block, so the final
        # layout is the reverse of the call order:
        # special_day(13) | peak(15) | month(12) | day(31) | weekday(7) | period(48) | segment columns
        feature, period_encoder = one_hot_encoding(feature, "period", [range(1,49)])
        feature, weekday_encoder = one_hot_encoding(feature, "weekday", [range(0,7)])
        feature, day_encoder = one_hot_encoding(feature, "day", [range(1,32)])
        feature, month_encoder = one_hot_encoding(feature, "month", [range(1,13)])
        feature, peak_encoder = one_hot_encoding(feature, "peak", [range(0,15)])
        feature, special_day_encoder = one_hot_encoding(feature, "special_day", [range(0,13)])

        # Label encoding target: fitted on the fixed label set so the mapping
        # A..F -> 0..5 does not depend on which labels occur in this file.
        label_encoder = LabelEncoder()
        label_encoder.fit(["A", "B", "C", "D", "E", "F"])
        target = label_encoder.transform(self.df["LOS"])

        self.X = feature.to_numpy().astype('float32') # (N, 126 + number of segment feature columns)
        self.y = target.astype('long') # (N,)

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return [self.X[idx], self.y[idx]]

In [None]:
# Instantiate the network with freshly initialised weights; it is handed to the Learner below.
model = Net()

In [None]:
class Learner():
    """Trains and validates a model on per-period datasets streamed from ``segment_status/``.

    For every day in the configured date range, each ``*.csv`` file inside
    ``segment_status/<YYYY-MM-DD>/`` is loaded through ``build_dataloader`` and
    consumed batch by batch, so only one period is held in memory at a time.

    Parameters
    ----------
    train_start_date, train_end_date : str
        Inclusive date range (as accepted by ``pandas.date_range``) used for training.
    val_start_date, val_end_date : str
        Inclusive date range used for validation.
    model : torch.nn.Module
        Network to train; moved to CUDA when available, otherwise kept on CPU.
    epochs : int
        Number of passes over the training range.
    batch_size, shuffle_samples_in_batch, num_workers
        Forwarded to ``torch.utils.data.DataLoader``.
    lr : float
        Learning rate of the Adam optimizer.
    save_dir : str
        Directory in which the best checkpoint is written.
    """

    def __init__(self, train_start_date, train_end_date, 
                    val_start_date, val_end_date,
                    model, epochs=100, batch_size=64, shuffle_samples_in_batch=True, num_workers=1, lr=0.01, save_dir="./"):
        
        self.train_start_date = train_start_date
        self.train_end_date = train_end_date

        self.val_start_date = val_start_date
        self.val_end_date = val_end_date

        self.epochs = epochs
        self.batch_size = batch_size
        self.shuffle_samples_in_batch = shuffle_samples_in_batch
        self.num_workers = num_workers

        self.lr = lr

        self.save_dir = save_dir

        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.model = model.to(self.device)


    def build_dataloader(self, period, date):
        """Return a DataLoader for one period of one day, or None when it has no samples."""

        dataset = TrafficDataset(period, date)
        
        if len(dataset) == 0:
            return None
        
        dataloader = DataLoader(dataset, batch_size=self.batch_size, shuffle=self.shuffle_samples_in_batch, num_workers=self.num_workers)

        return dataloader

    def multi_acc(self, y_pred, y_test):
        """Return the percentage of rows whose arg-max score equals ``y_test``, rounded to an integer value.

        ``y_pred`` holds one row of class scores per sample; ``y_test`` holds the
        true class indices. The result is a 0-dim tensor.
        """
        # log_softmax is monotonic per row, so the arg-max of the raw scores is
        # the same class that log_softmax followed by max would select.
        y_pred_tags = torch.argmax(y_pred, dim = 1)
        correct_pred = (y_pred_tags == y_test).float()

        acc = correct_pred.sum() / len(correct_pred)
        
        acc = torch.round(acc * 100)
        
        return acc

    def _iter_dataloaders(self, start_date, end_date):
        """Lazily yield one DataLoader per non-empty period CSV for each day in the inclusive range."""
        for day in pd.date_range(start=start_date, end=end_date):
            date = day.strftime("%Y-%m-%d")
            for file_name in listdir(join("segment_status", date)):
                # Ignore anything that is not a period CSV (e.g. notebook checkpoint folders).
                if not file_name.endswith(".csv"):
                    continue
                period = file_name[:-4]  # strip ".csv"

                dataloader = self.build_dataloader(period, date)
                if dataloader is not None:
                    yield dataloader

    def train_model(self):
        """Run the full train/validate loop and save the weights of the best epoch.

        After every epoch the mean batch loss on the training and validation
        ranges and the validation accuracy are printed. The weights of the epoch
        with the lowest validation loss are written to
        ``<save_dir>/model_best_epoch_<index>.pt``.

        Raises
        ------
        ValueError
            If the validation range yields no batches.
        RuntimeError
            If no epoch improved on the initial validation loss (e.g. ``epochs == 0``),
            so there is nothing to save.
        """
        from copy import deepcopy  # local import: only needed for the checkpoint snapshot

        # Store train, val loss
        train_loss = np.zeros((self.epochs,))
        val_loss = np.zeros((self.epochs,))
        accuracy_list = np.zeros((self.epochs,))
        best_val_loss = np.inf # variable to store best loss
        cur_best_epoch_index = None
        best_state_dict = None

        # Define the optimization
        criterion = nn.CrossEntropyLoss()
        optimizer = optim.Adam(self.model.parameters(), lr=self.lr)

        # Enumerate epochs
        for epoch in range(self.epochs):
            start_time = time.time()

            # Training
            self.model.train()
            train_batch = []
            for train_dataloader in self._iter_dataloaders(self.train_start_date, self.train_end_date):
                for inputs, targets in train_dataloader:
                    inputs = inputs.to(self.device)
                    targets = targets.to(self.device)
                    yhat = self.model(inputs)
                    loss = criterion(yhat, targets)
                    optimizer.zero_grad()
                    loss.backward()
                    optimizer.step()
                    train_batch.append(loss.item())

            # Mean of the per-batch losses; NaN when the range produced no batch.
            train_loss[epoch] = np.mean(train_batch) if train_batch else np.nan

            # Validation
            self.model.eval()
            val_batch = []
            y_preds = []
            y_trues = []
            with torch.no_grad():  # no gradients needed: saves memory and time
                for val_dataloader in self._iter_dataloaders(self.val_start_date, self.val_end_date):
                    for inputs, targets in val_dataloader:
                        inputs = inputs.to(self.device)
                        targets = targets.to(self.device)
                        y_pred = self.model(inputs)
                        loss = criterion(y_pred, targets)
                        val_batch.append(loss.item())

                        y_preds.append(y_pred.cpu())
                        y_trues.append(targets.cpu())

            if not val_batch:
                raise ValueError(
                    f"no validation batches found between {self.val_start_date} and {self.val_end_date}"
                )

            accuracy_list[epoch] = self.multi_acc(torch.cat(y_preds, dim=0), torch.cat(y_trues, dim=0))
            val_loss[epoch] = np.mean(val_batch)

            if val_loss[epoch] < best_val_loss:
                cur_best_epoch_index = epoch
                best_val_loss = val_loss[epoch]
                # state_dict() returns references to the live parameters, which the
                # optimizer keeps updating in place. Deep-copy them so the snapshot
                # really holds this epoch's weights, not the final ones.
                best_state_dict = deepcopy(self.model.state_dict())

            print(f"Epoch: {epoch}, \
                    train loss: {train_loss[epoch]:.4f}, \
                    val loss: {val_loss[epoch]:.4f}, \
                    accuracy: {accuracy_list[epoch]:.4f}, \
                    time: {time.time() - start_time:.4f}s")
        
        if best_state_dict is None:
            raise RuntimeError("no epoch improved on the initial validation loss, so there is no model to save")

        # Save model
        save_file = join(self.save_dir, "model" + "_best_epoch_" + str(cur_best_epoch_index) + ".pt")
        torch.save(best_state_dict, save_file)

        # Print best model
        print(f"best val loss: {val_loss[cur_best_epoch_index]}, accuracy: {accuracy_list[cur_best_epoch_index]}")

In [None]:
# Train on a single day and validate on a different single day.
# NOTE(review): batch_size=88000 presumably lets one period file fit in a single batch - confirm.
learner = Learner(
    train_start_date="2020-07-10",
    train_end_date="2020-07-10",
    val_start_date="2021-02-10",
    val_end_date="2021-02-10",
    model=model,
    epochs=100,
    batch_size=88000,
    lr=0.001,
)
learner.train_model()

Epoch: 0,                     train loss: 1.6970,                     val loss: 1.6222,                     accuracy: 41.0000,                     time: 259.8043s
Epoch: 1,                     train loss: 1.5560,                     val loss: 1.5429,                     accuracy: 41.0000,                     time: 256.6310s
Epoch: 2,                     train loss: 1.5214,                     val loss: 1.4785,                     accuracy: 42.0000,                     time: 254.1881s
Epoch: 3,                     train loss: 1.4638,                     val loss: 1.4018,                     accuracy: 42.0000,                     time: 254.2867s
Epoch: 4,                     train loss: 1.3833,                     val loss: 1.3352,                     accuracy: 42.0000,                     time: 254.7829s
Epoch: 5,                     train loss: 1.2721,                     val loss: 1.2707,                     accuracy: 67.0000,                     time: 256.8549s
Epoch: 6,             