In [1]:
import numpy as np
import pandas as pd
import torch
import torch.utils.data as dt
import pytorch_lightning as pl
import os
import sys
import gc

# Put the parent directory on the import path before the `scripts.*` imports
# below run (presumably the notebook lives one level under the project root —
# TODO confirm).
sys.path.append("..")

from sklearn.metrics import mean_squared_error, mean_absolute_error
from bertviz import model_view, head_view, neuron_view
from copy import deepcopy

from scripts.distilbert_reg import DistilBERTRegressor
from scripts.data_module import YelpDataModule, YelpPredictDataModule

from scripts.best_config import CONFIG

# Fix the global seed through Lightning so repeated runs are comparable.
pl.seed_everything(seed=42)

Global seed set to 42


42

In [2]:
def merge_data(df1:pd.DataFrame, df2:pd.DataFrame, on:str, suffixes:tuple=None) -> pd.DataFrame:
    """Join two frames on a shared key and keep only the modelling columns.

    Parameters
    ----------
    df1, df2 : pd.DataFrame
        Left and right frames of the join.
    on : str
        Name of the key column present in both frames.
    suffixes : tuple, optional
        Suffixes applied to overlapping column names; ``('_x', '_y')`` is
        used when nothing is passed.

    Returns
    -------
    pd.DataFrame
        The joined rows restricted to the ``r_text`` and ``r_useful`` columns.
    """
    chosen_suffixes = ('_x', '_y') if suffixes is None else suffixes
    joined = df1.merge(df2, on=on, suffixes=chosen_suffixes)
    return joined[['r_text', 'r_useful']]

In [11]:
# Load the two training tables and join them on the `r_id` column;
# `merge_data` keeps only `r_text` and `r_useful` from the joined result.
df_text = pd.read_parquet("../data/new_data/train_text.parquet.snappy")
df_main = pd.read_parquet("../data/new_data/train_main.parquet.snappy")
df = merge_data(df_text, df_main, "r_id", suffixes=("_text", "_main"))
# Select two rows by position (indices 4 and 10) as the worked examples
# used in the rest of the notebook.
sample_df = df.iloc[[4, 10]]
print(sample_df.head())

                                               r_text  r_useful
4   Always a good experience. Dr Ramsey has been m...         2
10  I see that Steve's Prince of steaks has been i...         3


In [12]:
# Whitespace-delimited word count of each sampled review. The vectorised
# `.str` accessor replaces the per-row lambda and yields NaN for a missing
# text instead of raising AttributeError.
sample_df['r_text'].str.split().str.len()

4      35
10    113
Name: r_text, dtype: int64

In [13]:
# Build the regressor from the project configuration and print its `dbert`
# submodule to inspect the encoder architecture. No checkpoint is loaded in
# this cell.
model = DistilBERTRegressor(CONFIG)
print(model.dbert)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_projector.weight', 'vocab_transform.bias', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


DistilBertModel(
  (embeddings): Embeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (transformer): Transformer(
    (layer): ModuleList(
      (0): TransformerBlock(
        (attention): MultiHeadSelfAttention(
          (dropout): Dropout(p=0.1, inplace=False)
          (q_lin): Linear(in_features=768, out_features=768, bias=True)
          (k_lin): Linear(in_features=768, out_features=768, bias=True)
          (v_lin): Linear(in_features=768, out_features=768, bias=True)
          (out_lin): Linear(in_features=768, out_features=768, bias=True)
        )
        (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (ffn): FFN(
          (dropout): Dropout(p=0.1, inplace=False)
          (lin1): Linear(in_features=768, out_features=3072, bias=True)
          (lin2): Linear(i

In [14]:
def sample_predict_data(data:pd.DataFrame, ckpt_path:str="../scripts/models/dbert.ckpt"):
    """Score ``data`` with the checkpointed regressor and return its encoder.

    Parameters
    ----------
    data : pd.DataFrame
        Rows to score; passed unchanged to ``YelpPredictDataModule``.
    ckpt_path : str, optional
        Lightning checkpoint to restore. Defaults to the path this notebook
        previously hard-coded, so existing calls behave as before.

    Returns
    -------
    preds : np.ndarray
        Predictions for all batches, concatenated, flattened to 1-D and
        rounded with ``np.round``.
    dbert : torch.nn.Module
        The ``dbert`` submodule of a model restored from ``ckpt_path``,
        mapped to CPU and switched to eval mode.
    """
    # Use the GPU with 16-bit precision when CUDA is present (the original
    # settings); otherwise fall back to CPU / 32-bit so the cell still runs.
    use_gpu = torch.cuda.is_available()

    dm = YelpPredictDataModule(data, CONFIG)
    model = DistilBERTRegressor(CONFIG)
    dm.setup()
    trainer = pl.Trainer(
        devices=1,
        accelerator="gpu" if use_gpu else "cpu",
        precision=16 if use_gpu else 32,
    )

    pred_dl = dm.predict_dataloader()
    # The trainer restores the checkpoint weights into `model` before predicting.
    batch_preds = trainer.predict(model=model, dataloaders=pred_dl, ckpt_path=ckpt_path)
    # detach()/cpu() make the NumPy conversion safe whatever device or
    # autograd state the batch outputs come back in.
    preds = np.round(torch.cat(batch_preds).flatten().detach().cpu().numpy())

    # `load_from_checkpoint` is a classmethod: call it on the class rather than
    # on an instance (newer Lightning releases reject the instance call), and
    # map the weights to CPU so the returned encoder accepts CPU inputs.
    restored = DistilBERTRegressor.load_from_checkpoint(ckpt_path, map_location="cpu", config=CONFIG)
    dbert = restored.dbert
    # Inference-only use downstream: disable dropout explicitly.
    dbert.eval()
    return preds, dbert

In [15]:
# Score the two sampled reviews and keep the restored `dbert` encoder
# (`model2`) for the attention visualisations further down.
preds, model2 = sample_predict_data(sample_df)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_projector.weight', 'vocab_transform.bias', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Using 16bit native Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
Restoring states from the checkpoint path at ../scripts/models/dbe

Predicting: 0it [00:00, ?it/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_projector.weight', 'vocab_transform.bias', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [16]:
# Rounded predictions returned by `sample_predict_data` for the sampled rows.
preds

array([2., 2.], dtype=float16)

In [17]:
# Full text of the first sampled review, fetched by position.
sample_df['r_text'].iloc[0]

'Always a good experience. Dr Ramsey has been my doctor since he began practicing. He is easy to talk with and explains things so I understand. So glad to be a patient in this group.'

In [22]:
# Text of the first sampled review, tokenised with the project tokenizer.
sample_text = "Always a good experience. Dr Ramsey has been my doctor since he began practicing. He is easy to talk with and explains things so I understand. So glad to be a patient in this group."
tokenizer = CONFIG['bert']['tokenizer']
inputs = tokenizer.encode(sample_text, return_tensors="pt")

# Inference only: eval() switches dropout off so the attention weights are
# deterministic, and no_grad() avoids building an autograd graph.
model2.eval()
with torch.no_grad():
    # Ask for attentions explicitly so the last output element is the
    # per-layer attention tuple even if the model config does not enable it.
    outputs = model2(inputs, output_attentions=True)
attention = outputs[-1]
tokens = tokenizer.convert_ids_to_tokens(inputs[0])
print(tokens)

['[CLS]', 'always', 'a', 'good', 'experience', '.', 'dr', 'ramsey', 'has', 'been', 'my', 'doctor', 'since', 'he', 'began', 'practicing', '.', 'he', 'is', 'easy', 'to', 'talk', 'with', 'and', 'explains', 'things', 'so', 'i', 'understand', '.', 'so', 'glad', 'to', 'be', 'a', 'patient', 'in', 'this', 'group', '.', '[SEP]']


In [23]:
# Create the output directory for the exported views. `exist_ok=True` makes
# the cell safe to re-run and removes the check-then-create race.
os.makedirs("../html_views/", exist_ok=True)

In [26]:
# Render the bertviz head view as an HTML object and save its markup to disk.
head_view_html = head_view(attention, tokens, html_action='return')
# Write as UTF-8 explicitly: the platform default encoding may not be able to
# represent non-ASCII tokens.
with open("../html_views/head_view.html", "w", encoding="utf-8") as fp:
    fp.write(head_view_html.data)

In [27]:
# Render the bertviz model view as an HTML object and save its markup to disk.
model_view_html = model_view(attention, tokens, html_action='return')
# Write as UTF-8 explicitly: the platform default encoding may not be able to
# represent non-ASCII tokens.
with open("../html_views/model_view.html", "w", encoding="utf-8") as fp:
    fp.write(model_view_html.data)