In [None]:
!pip install --quiet transformers

In [None]:
!pip install --quiet pytorch-lightning

In [None]:
import numpy as np
import pandas as pd
import torch
import torch.utils.data as dt
import matplotlib.pyplot as plt
import pytorch_lightning as pl
import wandb
import sys
import os

sys.path.append("..")

from transformers import DistilBertTokenizerFast, DistilBertConfig
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
from pytorch_lightning.loggers import WandbLogger
from scripts.distilbert_reg import DistilBERTRegressor
from scripts.data_module import YelpDataset, YelpDataModule


# Fix the seed through Lightning's seed_everything helper before any data is
# loaded or any model is created further down in this notebook.
pl.seed_everything(seed=42)
# Log in to Weights & Biases; a WandbLogger is created later in this notebook.
wandb.login()

# Load the Dataset

In [None]:
# Each split is stored as two parquet files: a "text" file and a "main" file.
# They are joined on `r_id` in a later cell.
df_train_text = pd.read_parquet("../data/new_data/train_text.parquet.snappy")
df_test_text = pd.read_parquet("../data/new_data/test_text.parquet.snappy")

df_train_main = pd.read_parquet("../data/new_data/train_main.parquet.snappy")
df_test_main = pd.read_parquet("../data/new_data/test_main.parquet.snappy")

# Report the shapes: first the text frames, then a separator, then the main frames.
print(
    f"Shape of the training dataset : {df_train_text.shape}",
    f"Shape of the test dataset : {df_test_text.shape}",
    "-"*20,
    f"Shape of the training dataset : {df_train_main.shape}",
    f"Shape of the test dataset : {df_test_main.shape}",
    sep="\n",
)

In [None]:
def merge_data(df1:pd.DataFrame, df2:pd.DataFrame, on:str, suffixes:tuple=None) -> pd.DataFrame:
    """ Function to merge the dataframe """
  
    if suffixes is None:
        suffixes = ('_x', '_y')
    df_merge = pd.merge(df1, df2, on=on, suffixes=suffixes)
    df_merge = df_merge[['r_text', 'r_useful']]

    return df_merge

# Join the text and main frames of each split on `r_id`; merge_data keeps only
# the `r_text` and `r_useful` columns of the result.
df_train = merge_data(df1=df_train_text, df2=df_train_main, on="r_id", suffixes=('_text', '_main'))
df_test = merge_data(df1=df_test_text, df2=df_test_main, on="r_id", suffixes=('_text', '_main'))

In [None]:
# Preview the first rows of the merged training frame (r_text / r_useful columns).
df_train.head()

# Set Configuration

In [None]:
# Single source of truth for the pretrained checkpoint, so the tokenizer
# (loaded from config["model_name"]) and the DistilBERT config below cannot
# end up pointing at different checkpoints.
MODEL_NAME = "distilbert-base-uncased"

# Hyper-parameters and runtime settings for this notebook. The same dict is
# passed to WandbLogger and to DistilBERTRegressor in later cells; keys without
# a comment are not read in this notebook and are presumably consumed by
# DistilBERTRegressor — confirm against scripts/distilbert_reg.py.
config = {
    "batch_sz":128,                          # batch_size for YelpDataModule; also used for steps_per_epoch
    "lr":1e-4,
    "model_name":MODEL_NAME,                 # checkpoint name used to load the tokenizer
    "max_len":300,                           # passed to YelpDataModule
    "drop": 0.2,
    "clip_val":2,                            # Trainer gradient_clip_val
    "schedule":True,
    "n_epochs" : 10,                         # Trainer max_epochs
    "bert_config":DistilBertConfig.from_pretrained(MODEL_NAME),
    "linear1":128,
    "device": torch.device("cuda" if torch.cuda.is_available() else "cpu"),
    "criterion":torch.nn.MSELoss(),
    "wandb":True,
    "_wandb_kernel":"neuracort",
    "num_workers":4,                         # passed to YelpDataModule
    "weight_decay":1e-6,
    "checkpoint_dir_path":"./checkpoints/",  # ModelCheckpoint dirpath
    "pin_memory":True                        # passed to YelpDataModule
}

In [None]:
# Weights & Biases logger handed to the Trainer later on; the whole config dict
# is passed along with it.
wandb_logger = WandbLogger(
    project="Yelp-Review-Usefulness",
    job_type="train",
    anonymous='allow',
    config=config,
)

# Tokenizer

In [None]:
# Load the fast DistilBERT tokenizer for the checkpoint named in the config.
tokenizer = DistilBertTokenizerFast.from_pretrained(config["model_name"])

# Create DataLoaders

In [None]:
# train_dataset = YelpDataset(df_train['r_text'].values, df_train['r_useful'].values, tokenizer, config['max_len'])
# test_dataset = YelpDataset(df_test['r_text'].values, df_test['r_useful'].values, tokenizer, config['max_len'])

In [None]:
# train_dl = dt.DataLoader(train_dataset, batch_size=config['batch_sz'], shuffle=True, num_workers=4)
# test_dataset = dt.DataLoader(test_dataset, batch_size=config['batch_sz'], shuffle=False, num_workers=4) 

# Cross-Validation

In [None]:
# Number of full batches in the whole training frame (the floor division does
# not count a final partial batch). Stored in `config`, which is later passed
# to DistilBERTRegressor.
# NOTE(review): the cross-validation loop below trains on per-fold splits
# (num_splits=3), so each fold presumably sees fewer batches per epoch than
# this value — confirm how DistilBERTRegressor uses `steps_per_epoch`.
config['steps_per_epoch'] = df_train.shape[0] // config['batch_sz']

In [None]:
# temp_dl = YelpDataModule(df_train, tokenizer, config["max_len"], k=1, num_splits=3, batch_size=config['batch_sz'], num_workers=4)
# temp_dl.setup(stage="cv")
# ttrain_dl = temp_dl.train_dataloader()
# tval_dl = temp_dl.val_dataloader()

# next(iter(ttrain_dl))

In [None]:
# K-fold cross-validation: for every fold build the fold-specific dataloaders,
# create a fresh model and train it with checkpointing and early stopping on
# the validation loss.

# Number of folds; shared by the loop bound and the data module so the two
# cannot drift apart.
NUM_SPLITS = 3

for k in range(NUM_SPLITS):
    print(f'Fold : {k}')

    # Forward the fold index to the data module. It was previously hard-coded
    # to 1, so every iteration trained and validated on the same split.
    # NOTE(review): assumes YelpDataModule treats `k` as a 0-based fold index
    # in [0, num_splits) — confirm against scripts/data_module.py.
    dl = YelpDataModule(
        df_train,
        tokenizer,
        config["max_len"],
        k=k,
        num_splits=NUM_SPLITS,
        batch_size=config['batch_sz'],
        num_workers=config["num_workers"],
        pin_memory=config["pin_memory"],
    )
    dl.setup(stage="cv")
    train_dl = dl.train_dataloader()
    val_dl = dl.val_dataloader()

    # Keep only the best checkpoint (lowest val_loss) of this fold; the fold
    # index is part of the file name so folds do not overwrite each other.
    checkpoint = ModelCheckpoint(
        dirpath=config["checkpoint_dir_path"],
        filename=f"fold_{k}_distilbert_base",
        save_top_k=1,
        verbose=True,
        monitor="val_loss",
        mode="min"
    )

    # Stop the fold early once val_loss has not improved for 2 validation checks.
    early_stop = EarlyStopping(
        monitor="val_loss",
        patience=2,
        verbose=True,
        mode="min"
    )

    # A new model per fold so that no weights carry over between folds.
    model = DistilBERTRegressor(config)
    trainer = pl.Trainer(gpus=-1, deterministic=True, max_epochs=config['n_epochs'], callbacks=[checkpoint, early_stop],
                        precision=16,
                        logger=wandb_logger,
                        gradient_clip_val=config['clip_val']
                        # strategy='ddp' #activate for Multi-GPU
                        )
    trainer.fit(model, train_dataloaders=train_dl, val_dataloaders=val_dl)