Introduction


**Table of contents**<a id='toc0_'></a>    
- 1. [Data preparation](#toc1_)    
  - 1.1. [Tokenize sentence and aspect BIO encoding class](#toc1_1_)    
  - 1.2. [Load data](#toc1_2_)    
  - 1.3. [Inspect tagging issues](#toc1_3_)    
  - 1.4. [Merge conflict to negative & remove neutral](#toc1_4_)    
  - 1.5. [Convert df to HuggingFace datasets](#toc1_5_)    
  - 1.6. [Word features](#toc1_6_)    
- 2. [EDA](#toc2_)    
- 3. [Model performance class](#toc3_)    
- 4. [Random forest](#toc4_)    
- 5. [CRF](#toc5_)    
- 6. [Bi-LSTM](#toc6_)    
- 7. [BERT](#toc7_)    
  - 7.1. [Model](#toc7_1_)    
  - 7.2. [Data preparation](#toc7_2_)    
  - 7.3. [Upsampling / downsampling](#toc7_3_)    
  - 7.4. [Model tuning](#toc7_4_)    
  - 7.5. [Error analysis](#toc7_5_)    
    - 7.5.1. [Group by word token](#toc7_5_1_)    
    - 7.5.2. [Group by Tag ID](#toc7_5_2_)    
  - 7.6. [Load saved model](#toc7_6_)    
    - 7.6.1. [Load model manually](#toc7_6_1_)    
    - 7.6.2. [Pipeline](#toc7_6_2_)    

<!-- vscode-jupyter-toc-config
	numbering=true
	anchor=true
	flat=false
	minLevel=1
	maxLevel=6
	/vscode-jupyter-toc-config -->
<!-- THIS CELL WILL BE REPLACED ON TOC UPDATE. DO NOT WRITE YOUR TEXT IN THIS CELL -->

In [1]:
import os 
os.environ['PYTORCH_MPS_HIGH_WATERMARK_RATIO'] = '0.0' # this setting is needed to run NN on my Mac

from datetime import datetime
import re
import pandas as pd
import numpy as np
from collections import Counter
from collections import defaultdict

import plotly.express as px
import plotly.graph_objects as go
# from plotly.subplots import make_subplots
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
# from highlight_text import HighlightText, ax_text, fig_text


from nltk.tokenize import TreebankWordTokenizer

from sklearn.model_selection import train_test_split
# from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from seqeval.metrics import f1_score, classification_report
from sklearn.metrics import f1_score as sklearn_f1_score

# pip install torch==2.2.0 torchtext --index-url https://download.pytorch.org/whl/test/cpu
# pip install torch==2.3.0.dev20240121 # this does not work

# pip3 install --pre torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/nightly/cpu # run this in CLI before running the notebook

# import torch
# if torch.backends.mps.is_available():
#     mps_device = torch.device("mps")
#     x = torch.ones(1, device=mps_device)
#     print (x)
# else:
#     print ("MPS device not found.")

# 1. <a id='toc1_'></a>[Data preparation](#toc0_)

## 1.1. <a id='toc1_1_'></a>[Tokenize sentence and aspect BIO encoding class](#toc0_)

In [2]:
class SentenceToken:
  '''
    SentenceToken

    This class takes care of word tokenize and tagging aspect entities
  '''
  def __init__(self, sentence, aspect_type=None, aspects=None, sentence_id=None):
    
    if sentence_id is not None:
      print(sentence_id)

    self.sentence_id = sentence_id
    self.sentence = sentence.replace(u"\u00A0", " ").replace(u'\xa0',' ') # replace unicode space character
                            
    self.aspect_bio_tags = None
    self.unified_aspect_bio_tags = None
    self.token_span = None
    self.space_pre_token = None

    # Tokenize sentence
    self.__tokenize_sentence(self.sentence)

    if aspect_type == 'dict':
      self.set_aspect_tagging_from_dict(aspects)
    elif aspect_type == 'bio':
      self.set_aspect_bio_tags(aspects)
    elif aspect_type == 'unified bio':
      self.set_aspect_unified_bio_tags(aspects)
  
  def __tokenize_sentence(self, sentence):
    '''
    __tokenize_sentence

    Break sentence into word token span
    '''
    token_span = list(TreebankWordTokenizer().span_tokenize(sentence))
    
    self.token_span = token_span
    self.space_pre_token = [True if sentence[k[0]-1:k[0]] == ' ' else False for i,k in enumerate(token_span)]

  def set_aspect_tagging_from_dict(self, aspects):
    '''
    set_aspect_tagging_from_dict

    Calculate & assign aspect entities to token given an array of aspects (term, start_index, to_index, and polarity)
    '''
    polarity_map = {'positive':'POS'
              ,'negative': 'NEG'
              ,'conflict': 'CON'
              ,'neutral': 'NEU'}
    
    bio_tags = ['O'] * len(self.token_span)
    unified_bio_tags = bio_tags

    for x in aspects:
      if x['term'] != '':
        aspect_from = int(x['from'])
        aspect_to = int(x['to'])
        polarity = '-' + polarity_map[x['polarity']]

        aspect_from_index = [i for i, v in enumerate(self.token_span) if (v[0] <= aspect_from) & (v[1] >= aspect_from)][0]
        aspect_to_index = [i for i, v in enumerate(self.token_span) if (v[0] <= aspect_to) & (v[1] >= aspect_to)][0]
      
        aspect_length = aspect_to_index - aspect_from_index
        bio_tags = bio_tags[:aspect_from_index] + ['B'] + ['I'] * (aspect_length) + bio_tags[aspect_to_index+1:]
        unified_bio_tags = unified_bio_tags[:aspect_from_index] + ['B' + polarity] + ['I'+ polarity] * (aspect_length) + unified_bio_tags[aspect_to_index+1:]

    self.set_aspect_bio_tags(bio_tags)
    self.set_aspect_unified_bio_tags(unified_bio_tags)

  def rebuild_sentence_from_token(self):
    '''
    rebuild_sentence_from_token

    Return sentence built from computed tokens
    '''
    return ''.join([(' ' if self.space_pre_token[i] else '') + self.sentence[k[0]:k[1]] for i, k in enumerate(self.token_span)])
 
  def set_aspect_bio_tags(self, aspect_bio_tags):
    '''
    set_aspect_bio_tags

    Setter method to set aspect_unified_bio_tags and aspect_bio_tags
    '''
    self.aspect_bio_tags = aspect_bio_tags
    self.aspect_unified_bio_tags = aspect_bio_tags

  def set_aspect_unified_bio_tags(self, aspect_unified_bio_tags):
    ''''
    set_aspect_unified_bio_tags
    
    Setter method to set aspect_unified_bio_tags and aspect_bio_tags
    '''
    self.aspect_unified_bio_tags = aspect_unified_bio_tags
    self.aspect_bio_tags = [k[0:1] for k in aspect_unified_bio_tags]

  def get_tokens(self):
    '''
    get_tokens()
    Return an array of sentence word tokens
    '''
    return [self.sentence[k[0]:k[1]] for k in self.token_span]
  
  def check_rebuild_sentence_from_token(self):
    '''
    check_rebuild_sentence_from_token()

    This is a test / debugger function.
    This help validating if we have computed the sentence to token properly and whether we can re-compute the exact sentence from information stored.
    '''
    return re.sub(r'\s+', ' ',self.sentence.strip()) == self.rebuild_sentence_from_token().strip()
  
  def check_rebuild_aspect_terms(self, aspect_dict):
    '''
    check_rebuild_aspect_terms(aspect_dict)

    This is a test / debugger fucntion. 
    This help validate if we have compute the correct aspect terms as given by the aspect dict

    INPUT:
    aspect dict: array of aspect dictionaries in the following format
      [{'term': 'storage', 
       'polarity': 'positive', 
       'from': '14', 
       'to': '21'}]
    '''
    aspect_dict = sorted(aspect_dict, key=lambda d: int(d['from']))
    aspect_input = [k['term'].replace(u"\u00A0", " ").replace(u'\xa0',' ') for k in aspect_dict if k['term'] != '' ]
    aspect_computed = []
    aspect = ''
    
    for i,k in enumerate(self.aspect_bio_tags):
      token = self.sentence[self.token_span[i][0]:self.token_span[i][1]]
      
      if k == 'B':
        if (self.aspect_bio_tags[i-1] == 'B' if i > 0 else False):
          aspect_computed.append(aspect)
        aspect = token
      elif k == 'I':
        aspect += ' ' * ((self.token_span[i][0] -  self.token_span[i-1][1]) if i > 0 else 0) + token
      
      if (aspect != '') & ((k == 'O') or (i == (len(self.aspect_bio_tags) - 1))):
          aspect_computed.append(aspect)
          aspect = ''

    return [aspect_input == aspect_computed, aspect_input, aspect_computed]

  def __str__(self):
    return self.rebuild_sentence_from_token()


## 1.2. <a id='toc1_2_'></a>[Load data](#toc0_)

In [3]:
def reload_Data():
  df_train = pd.read_json('data/laptop/train.json')
  # First, I will need to drop some duplicated data in our training dataset, as identified in the EDA process.
  # We have removed 12 duplicated records in our training dataset
  df_train.drop_duplicates(subset='text', inplace=True)
  print('df_train shape: ', df_train.shape)

  df_val = pd.read_json('data/laptop/validate.json') # This will only be used for the very last step to evaluate how well the model is, but is input now for validating the BIO tagging to ensure the function works properly
  print('df_val shape: ', df_val.shape)

  df_train['sentence_token'] = df_train.apply(lambda x: SentenceToken(x['text'], 'dict', x['aspects']), axis=1)
  df_train['sentence_check'] = df_train.apply(lambda x: x['sentence_token'].check_rebuild_sentence_from_token(), axis=1)
  df_train['aspect_check'] = df_train.apply(lambda x: x['sentence_token'].check_rebuild_aspect_terms(x['aspects']), axis=1)
  df_train['aspect_check_TF'] = df_train.apply(lambda x: x['sentence_token'].check_rebuild_aspect_terms(x['aspects'])[0], axis=1)
  df_train['tokens'] = df_train.apply(lambda x: x['sentence_token'].get_tokens(), axis=1)
  df_train['tags'] = df_train.apply(lambda x: x['sentence_token'].aspect_unified_bio_tags, axis=1)

  df_val['sentence_token'] = df_val.apply(lambda x: SentenceToken(x['text'], 'dict', x['aspects']), axis=1)
  df_val['sentence_check'] = df_val.apply(lambda x: x['sentence_token'].check_rebuild_sentence_from_token(), axis=1)
  df_val['aspect_check'] = df_val.apply(lambda x: x['sentence_token'].check_rebuild_aspect_terms(x['aspects']), axis=1)
  df_val['aspect_check_TF'] = df_val.apply(lambda x: x['sentence_token'].check_rebuild_aspect_terms(x['aspects'])[0], axis=1)
  df_val['tokens'] = df_val.apply(lambda x: x['sentence_token'].get_tokens(), axis=1)
  df_val['tags'] = df_val.apply(lambda x: x['sentence_token'].aspect_unified_bio_tags, axis=1)
  return df_train, df_val

In [4]:
df_train, df_test = reload_Data()
print('# of df_train records having tokenizing issues: ', len(df_train[df_train['sentence_check']==False]))
print('# of df_train records having aspect bio tagging issues: ', len(df_train[df_train['aspect_check_TF']==False]))
print('# of df_test records having tokenizing issues: ', len(df_test[df_test['sentence_check']==False]))
print('# of df_test records having aspect bio tagging issues: ', len(df_test[df_test['aspect_check_TF']==False]))

df_train shape:  (3036, 3)
df_val shape:  (800, 3)
# of df_train records having tokenizing issues:  0
# of df_train records having aspect bio tagging issues:  36
# of df_test records having tokenizing issues:  0
# of df_test records having aspect bio tagging issues:  9


## 1.3. <a id='toc1_3_'></a>[Inspect tagging issues](#toc0_)

In [5]:
print(df_train[df_train['aspect_check_TF']==False].index)

Index([ 125,  140,  220,  293,  374,  375,  431,  612,  656,  834,  922,  924,
        953,  999, 1031, 1374, 1456, 1502, 1631, 1716, 1936, 1958, 2113, 2160,
       2244, 2392, 2502, 2533, 2587, 2606, 2783, 2831, 2842, 2876, 2930, 2940],
      dtype='int64')


In [6]:
num = 2606

print(df_train.loc[num]['aspects'])
print(df_train.loc[num]['text'])
print(df_train.loc[num]['aspect_check'])
print(df_train.loc[num]['tokens'])

[{'term': 'delivery service', 'polarity': 'negative', 'from': '59', 'to': '75'}]
After way too many times sending the thing in for repairs (delivery service was slow, and without the laptop I had no access to the internet, and thus no way of tracking it to find out when I might hope to see my computer again), it finally kicked the bucket after just over 2 years.
[False, ['delivery service'], ['(delivery service']]
['After', 'way', 'too', 'many', 'times', 'sending', 'the', 'thing', 'in', 'for', 'repairs', '(', 'delivery', 'service', 'was', 'slow', ',', 'and', 'without', 'the', 'laptop', 'I', 'had', 'no', 'access', 'to', 'the', 'internet', ',', 'and', 'thus', 'no', 'way', 'of', 'tracking', 'it', 'to', 'find', 'out', 'when', 'I', 'might', 'hope', 'to', 'see', 'my', 'computer', 'again', ')', ',', 'it', 'finally', 'kicked', 'the', 'bucket', 'after', 'just', 'over', '2', 'years', '.']


Most of the tagging issues due to word that are not separated properly from special characters/ punctuations. The issue is unavoidable in practice as reviews may not adherent to perfect grammar.

I have tried to fix this issues to have 100% accuracy with further token breakdown to match the specified aspect tokens, however, this can break some of the standard logics for word tokenizer and further modelling. 

Therefore, I decided to include a whole token where the aspect may start or end, even if the index is in the middle of token, which may results with aspect tokens that could include extra characters than planned. This is the risk we will accept for this approach, and we can perform a cleaning process to remove these extra characters during implementation with actual use cases.

## 1.4. <a id='toc1_4_'></a>[Merge conflict to negative & remove neutral](#toc0_)

In [7]:
df_train['tags'] = df_train['tags'].apply(lambda x: [tag[:2] + 'NEG' if tag[2:] == 'CON' else tag for tag in x ])
df_test['tags'] = df_test['tags'].apply(lambda x: [tag[:2] + 'NEG' if tag[2:] == 'CON' else tag for tag in x])

In [8]:
# df_train['tags'] = df_train['tags'].apply(lambda x: ['O' if tag[2:] == 'NEU' else tag for tag in x])
# df_test['tags'] = df_test['tags'].apply(lambda x: ['O' if tag[2:] == 'NEU' else tag for tag in x])

## 1.5. <a id='toc1_5_'></a>[Convert df to HuggingFace datasets](#toc0_)

In [9]:
import datasets
from datasets import Dataset, DatasetDict, Features, Sequence, Value, ClassLabel

Convert aspect tag to id

In [10]:
# Generate list of unique aspect tags
tags = list(set(sum(df_train['tags'],[])))
tags.sort()

tag2idx = {k:i for i,k in enumerate(tags)}
idx2tag = {i:k for i,k in enumerate(tags)}

# Convert aspect tag text to ids
df_train['tags_idx'] = df_train['tags'].apply(lambda x: [tag2idx[k] for k in x])
df_test['tags_idx'] = df_test['tags'].apply(lambda x: [tag2idx[k] for k in x])

In [11]:
idx2tag

{0: 'B-NEG',
 1: 'B-NEU',
 2: 'B-POS',
 3: 'I-NEG',
 4: 'I-NEU',
 5: 'I-POS',
 6: 'O'}

In [12]:
tags

['B-NEG', 'B-NEU', 'B-POS', 'I-NEG', 'I-NEU', 'I-POS', 'O']

Convert pandas to HuggingFace datasets

In [13]:
# Split train into train & validation set
df_train_ori, df_val = train_test_split(df_train, test_size=0.3, random_state=42,)

In [14]:
# Define dataset features
features = Features({'tokens': Sequence(Value(dtype='string', id=None)),
                    'tags_idx': Sequence(ClassLabel(names=tags))
                    })

tds = Dataset.from_pandas(df_train_ori[['tokens','tags_idx']], features=features, preserve_index=False)
vds = Dataset.from_pandas(df_val[['tokens','tags_idx']], features=features,  preserve_index=False)
tsds = Dataset.from_pandas(df_test[['tokens','tags_idx']], features=features, preserve_index=False)

ds = DatasetDict()

ds['train'] = tds
ds['validation'] = vds
ds['test'] = tsds

print(ds)

DatasetDict({
    train: Dataset({
        features: ['tokens', 'tags_idx'],
        num_rows: 2125
    })
    validation: Dataset({
        features: ['tokens', 'tags_idx'],
        num_rows: 911
    })
    test: Dataset({
        features: ['tokens', 'tags_idx'],
        num_rows: 800
    })
})


Check for total counts per aspect type in each data split

In [15]:
# Convert aspect idx to aspect aspect text
def create_tag_names(batch):
    return {"tags": [ds["train"].features["tags_idx"].feature.int2str(idx) for idx in batch["tags_idx"]]}

ds = ds.map(create_tag_names)

Map:   0%|          | 0/2125 [00:00<?, ? examples/s]

Map:   0%|          | 0/911 [00:00<?, ? examples/s]

Map:   0%|          | 0/800 [00:00<?, ? examples/s]

In [16]:
# Perform aspect type counts per each dataset split
split2freqs = defaultdict(Counter)
for split, dataset in ds.items():
    for row in dataset["tags"]:
        for tag in row:
            if tag.startswith("B"):
                tag_type = tag.split("-")[1]
                split2freqs[split][tag_type] += 1
                
pd.DataFrame.from_dict(split2freqs, orient="index")

Unnamed: 0,NEG,POS,NEU
train,582,675,304
validation,325,312,157
test,144,340,169


## 1.6. <a id='toc1_6_'></a>[Word features](#toc0_)

In [17]:
import nltk
from nltk.corpus import stopwords, opinion_lexicon
import string

# Load positive and negative words from the opinion lexicon
POSITIVE_WORDS = set(opinion_lexicon.positive())
NEGATIVE_WORDS = set(opinion_lexicon.negative())
# Load common stop words in english
EN_STOP_WORDS = set(stopwords.words('english'))

In [18]:
# Function to convert sentences into features
def word2features(sent, word_idx, backward_window_size=5, forward_window_size=5): 
    word = sent[word_idx][0]

    _, pos = zip(*nltk.pos_tag([x[0] for x in sent]))

    tag_sentiment = lambda word: 'POS' if word in POSITIVE_WORDS else 'NEG' if word in NEGATIVE_WORDS else 'NEU'
    features = {
        'word.lower()': word.lower(), # word
        'word.index()': word_idx,
        'word.reverseindex()': len(sent) - 1 - word_idx, # reverse index - nth word from end of sentence
        'word.pos': pos[word_idx],
        'word.opinionlexicon': tag_sentiment(word.lower()),
        'word.isstopword()': word.lower() in EN_STOP_WORDS,
        'word[-3:]': word[-3:], # last 4 char
        'word[-2:]': word[-2:], # last 3 char - in case of -ing, -ion, etc.
        'word.isupper()': word.isupper(), # is the word in upper case
        'word.istitle()': word.istitle(), # is the first letter of the word in upper case
        'word.isdigit()': word.isdigit(), # is the word full of digit
        'word.ispunctuation()': word.lower() in string.punctuation, # is punctuation
    }

    if word_idx > 0:
        for k in range(1, min(backward_window_size, word_idx)+1):
            prev_word = sent[word_idx - k][0]
            prev_pos = pos[word_idx - k]
            
            features.update({
                f'-{k}:word.lower()': prev_word.lower(),
                f'-{k}:word.pos': prev_pos,
                f'-{k}:word.opinionlexicon': tag_sentiment(word.lower()),
                f'-{k}:word.isstopword()': prev_word in EN_STOP_WORDS,
                f'-{k}:word.istitle()': prev_word.istitle(),
                f'-{k}:word.isupper()': prev_word.isupper(),
                f'-{k}:word.ispunctuation()': prev_word.lower() in string.punctuation, # is punctuation
            })
    else:
        features['BOS'] = True  # Beginning of sentence

    if word_idx < len(sent) - 1:
        for k in range(1, min(forward_window_size, len(sent) - word_idx - 1)+1):
            next_word = sent[word_idx + k][0]
            next_pos = pos[word_idx + k]

            features.update({
                f'+{k}:word.lower()': next_word.lower(),
                f'+{k}:word.pos': next_pos,
                f'+{k}:word.opinionlexicon': tag_sentiment(word.lower()),
                f'+{k}:word.isstopword()': next_word in EN_STOP_WORDS,
                f'+{k}:word.istitle()': next_word.istitle(),
                f'+{k}:word.isupper()': next_word.isupper(),
                f'+{k}:word.ispunctuation()': next_word.lower() in string.punctuation, # is punctuation
            })
    else:
        features['EOS'] = True  # End of sentence

    return features

# Function to convert sentences into feature sequences
def sent2features(sent, backward_window_size=5, forward_window_size=5):
    return [word2features(sent, i, backward_window_size, forward_window_size) for i in range(len(sent))]

def get_features(example):
    example['word_features'] = [ sent2features(sent) for sent in example['tokens']]
    return example

In [19]:
ds_features = ds.map(get_features, batched=True)

Map:   0%|          | 0/2125 [00:00<?, ? examples/s]

Map:   0%|          | 0/911 [00:00<?, ? examples/s]

Map:   0%|          | 0/800 [00:00<?, ? examples/s]

# 2. <a id='toc2_'></a>[EDA](#toc0_)

In [20]:
# 

# 3. <a id='toc3_'></a>[Model performance class](#toc0_)

In [21]:
class ModelPerformanceLog:
  
  def __init__(self, y_true):
    self.y_true = y_true
    self.log = {}
    self.bert_epoch_history = pd.DataFrame()

  def add_model_performance(self, model_name, y_pred, model=None): # Model can be path to the model or something like that
    sklearn_f1_score, f1, clf_report = self.__calculate_metrics(y_pred)

    model_perf = {'y_pred': y_pred
                  ,'model': model
                  ,'sklearn_f1_score': sklearn_f1_score.round(2)
                  ,'f1': f1.round(4)
                  ,'report': clf_report
                  }
    
    self.log[model_name] = model_perf

  def __calculate_sklearn_f1(self, y_pred):
    y_pred = sum(y_pred,[])
    y_true = sum(self.y_true,[])
    return sklearn_f1_score(y_true, y_pred, average='macro', zero_division=0)

  def __calculate_metrics(self, y_pred):
    return self.__calculate_sklearn_f1(y_pred), f1_score(self.y_true, y_pred), classification_report(self.y_true, y_pred, zero_division=0)

  def get_logs(self):
    logs_df = pd.DataFrame(list(self.log.values()), index=self.log.keys())
    return logs_df[['sklearn_f1_score', 'f1']]
  
  def add_bert_training_epoch_history(self, model_name, epoch_history_df):
    epoch_history_df['Model'] = model_name
    self.bert_epoch_history = pd.concat([self.bert_epoch_history, epoch_history_df])
    

In [22]:
y_val = ds['validation'].to_pandas()['tags_idx']
y_val = [[idx2tag[tag] for tag in sent] for sent in y_val]

modelPerformanceLog = ModelPerformanceLog(y_val)
# modelPerformanceLog.get_logs()

# 4. <a id='toc4_'></a>[Random forest](#toc0_)

In [23]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier

In [24]:
def features2df(sent, sent_idx):
  sent_df = pd.DataFrame(list(sent))
  sent_df['sentence_idx'] = sent_idx

  return sent_df

In [25]:
def tags2df(tags, sent_idx):
  tags_df = pd.DataFrame({'tags': tags})

  tags_df['sentence_idx'] = sent_idx
  return tags_df

In [26]:
def random_forest_data_prepare(ds, one_hot_encoder=None):
  X_train = ds.to_pandas()['word_features']
  X_train = pd.concat([features2df(sent, i) for i, sent in enumerate(X_train)], ignore_index=True)

  X_train.replace(True,1, inplace=True)
  X_train.replace(False,0, inplace=True)

    # Fill NaN values in object-type columns with a 'missing'
  object_columns = X_train.select_dtypes(include=['object']).columns
  X_train[object_columns] = X_train[object_columns].fillna(value='missing')

  # Fill the rest NaN with -1, since most of missing features are boolean
  X_train = X_train.fillna(-1)

  # We renove all categorical columns that are not POS, or sentiment lexicon + nth_sentence which the number of the sentence we we use to string back the data at the end, but is not needed for training
  drop_cols = [x for x in list(X_train.columns) if re.match('(.*word\.lower\(\))|(.*word\[-\d\:])', x)] # word columns
  X_train = X_train.drop(columns=drop_cols, axis=1)

    # Perform one-hot-encoder on the reamining columns
  if one_hot_encoder is None: # This is to make sure we use the same one-hot-encoder for both train & test split (avoiding data leakage)
    one_hot_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore', drop='first')  # 'drop' parameter is optional, set to 'first' to avoid multicollinearity
    encoded_data = one_hot_encoder.fit_transform(X_train.select_dtypes(include=['object']))

  else: # when we perform data prep for test data, we can reuse the one-hot-encoder used during training data preparation
    encoded_data = one_hot_encoder.transform(X_train.select_dtypes(include=['object']))

  # Re-added one-hot-encoding data back to the main dataframe
  df_encoded = pd.DataFrame(encoded_data, columns=one_hot_encoder.get_feature_names_out(X_train.select_dtypes(include=['object']).columns))
  X_train = pd.concat([X_train, df_encoded], axis=1)

  # Dropped all the categorical features that have already been one-hot-encoded
  X_train.drop(columns=X_train.select_dtypes(include=['object']).columns, inplace=True)

  y_train =  pd.concat([tags2df(sent, i) for i, sent in enumerate(ds.to_pandas()['tags_idx'])], ignore_index=True)# sum(ds.to_pandas()['tags_idx'], [])
  return X_train, y_train, one_hot_encoder


In [27]:
X_train, y_train, one_hot_encoder = random_forest_data_prepare(ds_features['train'])
X_val, y_val, _ = random_forest_data_prepare(ds_features['validation'], one_hot_encoder)



In [28]:
# Initialize the StandardScaler
scaler = StandardScaler()

# Fit and transform the features
X_train_scaled = scaler.fit_transform(X_train.drop('sentence_idx', axis=1))
X_val_scaled = scaler.transform(X_val.drop('sentence_idx', axis=1)) # Double check why we are seeing more columns in test????

In [29]:
rf_classifier = RandomForestClassifier(n_estimators=100, max_depth=10, min_samples_split=5, min_samples_leaf=5, random_state=42)
rf_classifier.fit(X_train_scaled, y_train['tags'])

y_pred_rf = rf_classifier.predict(X_val_scaled)

In [30]:
# Add back to validation set to get sentence id
y_val['predict'] = y_pred_rf 

# Collapse back to the required format
y_pred_rf = y_val.groupby('sentence_idx')['predict'].agg(lambda x: x.tolist())

# Convert to name lablel
y_pred_rf = [[idx2tag[tag] for tag in sent] for sent in y_pred_rf]

modelPerformanceLog.add_model_performance('random forest',y_pred_rf)
modelPerformanceLog.get_logs()

Unnamed: 0,sklearn_f1_score,f1
random forest,0.14,0.0


# 5. <a id='toc5_'></a>[CRF](#toc0_)

In [31]:
from sklearn_crfsuite import CRF

In [32]:
X_train = ds_features['train'].to_pandas()['word_features']
y_train = ds_features['train'].to_pandas()['tags_idx']
y_train = [[idx2tag[tag] for tag in sent] for sent in y_train]

X_val = ds_features['validation'].to_pandas()['word_features']

In [33]:
# Create and train CRF model
crf_model = CRF(algorithm='lbfgs',
                max_iterations=100,
                c1=0.5,
                c2=0.05)

# There is this error existing with this library: 'CRF' object has no attribute 'keep_tempfiles'
# which has not been resolved and we can bypass it using this trick.
try:
  crf_model.fit(X_train, y_train)
except AttributeError:
  pass

In [34]:
y_pred_crf = crf_model.predict(X_val)

In [35]:
modelPerformanceLog.add_model_performance('crf',y_pred_crf)
modelPerformanceLog.get_logs()

Unnamed: 0,sklearn_f1_score,f1
random forest,0.14,0.0
crf,0.14,0.0


# 6. <a id='toc6_'></a>[Bi-LSTM](#toc0_)

In [36]:
# # Author: Robert Guthrie

# import torch
# import torch.autograd as autograd
# import torch.nn as nn
# import torch.optim as optim

# torch.manual_seed(1)

In [37]:
# def argmax(vec):
#     # return the argmax as a python int
#     _, idx = torch.max(vec, 1)
#     return idx.item()


# def prepare_sequence(seq, to_ix):
#     idxs = [to_ix[w] for w in seq]
#     return torch.tensor(idxs, dtype=torch.long)


# # Compute log sum exp in a numerically stable way for the forward algorithm
# def log_sum_exp(vec):
#     max_score = vec[0, argmax(vec)]
#     max_score_broadcast = max_score.view(1, -1).expand(1, vec.size()[1])
#     return max_score + \
#         torch.log(torch.sum(torch.exp(vec - max_score_broadcast)))

In [38]:
# class BiLSTM_CRF(nn.Module):

#     def __init__(self, vocab_size, tag_to_ix, embedding_dim, hidden_dim):
#         super(BiLSTM_CRF, self).__init__()
#         self.embedding_dim = embedding_dim
#         self.hidden_dim = hidden_dim
#         self.vocab_size = vocab_size
#         self.tag_to_ix = tag_to_ix
#         self.tagset_size = len(tag_to_ix)

#         self.word_embeds = nn.Embedding(vocab_size, embedding_dim)
#         self.lstm = nn.LSTM(embedding_dim, hidden_dim // 2,
#                             num_layers=1, bidirectional=True)

#         # Maps the output of the LSTM into tag space.
#         self.hidden2tag = nn.Linear(hidden_dim, self.tagset_size)

#         # Matrix of transition parameters.  Entry i,j is the score of
#         # transitioning *to* i *from* j.
#         self.transitions = nn.Parameter(
#             torch.randn(self.tagset_size, self.tagset_size))

#         # These two statements enforce the constraint that we never transfer
#         # to the start tag and we never transfer from the stop tag
#         self.transitions.data[tag_to_ix[START_TAG], :] = -10000
#         self.transitions.data[:, tag_to_ix[STOP_TAG]] = -10000

#         self.hidden = self.init_hidden()

#     def init_hidden(self):
#         return (torch.randn(2, 1, self.hidden_dim // 2),
#                 torch.randn(2, 1, self.hidden_dim // 2))

#     def _forward_alg(self, feats):
#         # Do the forward algorithm to compute the partition function
#         init_alphas = torch.full((1, self.tagset_size), -10000.)
#         # START_TAG has all of the score.
#         init_alphas[0][self.tag_to_ix[START_TAG]] = 0.

#         # Wrap in a variable so that we will get automatic backprop
#         forward_var = init_alphas

#         # Iterate through the sentence
#         for feat in feats:
#             alphas_t = []  # The forward tensors at this timestep
#             for next_tag in range(self.tagset_size):
#                 # broadcast the emission score: it is the same regardless of
#                 # the previous tag
#                 emit_score = feat[next_tag].view(
#                     1, -1).expand(1, self.tagset_size)
#                 # the ith entry of trans_score is the score of transitioning to
#                 # next_tag from i
#                 trans_score = self.transitions[next_tag].view(1, -1)
#                 # The ith entry of next_tag_var is the value for the
#                 # edge (i -> next_tag) before we do log-sum-exp
#                 next_tag_var = forward_var + trans_score + emit_score
#                 # The forward variable for this tag is log-sum-exp of all the
#                 # scores.
#                 alphas_t.append(log_sum_exp(next_tag_var).view(1))
#             forward_var = torch.cat(alphas_t).view(1, -1)
#         terminal_var = forward_var + self.transitions[self.tag_to_ix[STOP_TAG]]
#         alpha = log_sum_exp(terminal_var)
#         return alpha

#     def _get_lstm_features(self, sentence):
#         self.hidden = self.init_hidden()
#         embeds = self.word_embeds(sentence).view(len(sentence), 1, -1)
#         lstm_out, self.hidden = self.lstm(embeds, self.hidden)
#         lstm_out = lstm_out.view(len(sentence), self.hidden_dim)
#         lstm_feats = self.hidden2tag(lstm_out)
#         return lstm_feats

#     def _score_sentence(self, feats, tags):
#         # Gives the score of a provided tag sequence
#         score = torch.zeros(1)
#         tags = torch.cat([torch.tensor([self.tag_to_ix[START_TAG]], dtype=torch.long), tags])
#         for i, feat in enumerate(feats):
#             score = score + \
#                 self.transitions[tags[i + 1], tags[i]] + feat[tags[i + 1]]
#         score = score + self.transitions[self.tag_to_ix[STOP_TAG], tags[-1]]
#         return score

#     def _viterbi_decode(self, feats):
#         backpointers = []

#         # Initialize the viterbi variables in log space
#         init_vvars = torch.full((1, self.tagset_size), -10000.)
#         init_vvars[0][self.tag_to_ix[START_TAG]] = 0

#         # forward_var at step i holds the viterbi variables for step i-1
#         forward_var = init_vvars
#         for feat in feats:
#             bptrs_t = []  # holds the backpointers for this step
#             viterbivars_t = []  # holds the viterbi variables for this step

#             for next_tag in range(self.tagset_size):
#                 # next_tag_var[i] holds the viterbi variable for tag i at the
#                 # previous step, plus the score of transitioning
#                 # from tag i to next_tag.
#                 # We don't include the emission scores here because the max
#                 # does not depend on them (we add them in below)
#                 next_tag_var = forward_var + self.transitions[next_tag]
#                 best_tag_id = argmax(next_tag_var)
#                 bptrs_t.append(best_tag_id)
#                 viterbivars_t.append(next_tag_var[0][best_tag_id].view(1))
#             # Now add in the emission scores, and assign forward_var to the set
#             # of viterbi variables we just computed
#             forward_var = (torch.cat(viterbivars_t) + feat).view(1, -1)
#             backpointers.append(bptrs_t)

#         # Transition to STOP_TAG
#         terminal_var = forward_var + self.transitions[self.tag_to_ix[STOP_TAG]]
#         best_tag_id = argmax(terminal_var)
#         path_score = terminal_var[0][best_tag_id]

#         # Follow the back pointers to decode the best path.
#         best_path = [best_tag_id]
#         for bptrs_t in reversed(backpointers):
#             best_tag_id = bptrs_t[best_tag_id]
#             best_path.append(best_tag_id)
#         # Pop off the start tag (we dont want to return that to the caller)
#         start = best_path.pop()
#         assert start == self.tag_to_ix[START_TAG]  # Sanity check
#         best_path.reverse()
#         return path_score, best_path

#     def neg_log_likelihood(self, sentence, tags):
#         feats = self._get_lstm_features(sentence)
#         forward_score = self._forward_alg(feats)
#         gold_score = self._score_sentence(feats, tags)
#         return forward_score - gold_score

#     def forward(self, sentence):  # dont confuse this with _forward_alg above.
#         # Get the emission scores from the BiLSTM
#         lstm_feats = self._get_lstm_features(sentence)

#         # Find the best path, given the features.
#         score, tag_seq = self._viterbi_decode(lstm_feats)
#         return score, tag_seq

In [39]:
# START_TAG = "<START>"
# STOP_TAG = "<STOP>"
# EMBEDDING_DIM = 5
# HIDDEN_DIM = 4

# # Make up some training data
# training_data = [(
#     "the wall street journal reported today that apple corporation made money".split(),
#     "B I I I O O O B I O O".split()
# ), (
#     "georgia tech is a university in georgia".split(),
#     "B I O O O O B".split()
# )]

# word_to_ix = {}
# for sentence, tags in training_data:
#     for word in sentence:
#         if word not in word_to_ix:
#             word_to_ix[word] = len(word_to_ix)

# tag_to_ix = {"B": 0, "I": 1, "O": 2, START_TAG: 3, STOP_TAG: 4}

# model = BiLSTM_CRF(len(word_to_ix), tag_to_ix, EMBEDDING_DIM, HIDDEN_DIM)
# optimizer = optim.SGD(model.parameters(), lr=0.01, weight_decay=1e-4)

# # Check predictions before training
# with torch.no_grad():
#     precheck_sent = prepare_sequence(training_data[0][0], word_to_ix)
#     precheck_tags = torch.tensor([tag_to_ix[t] for t in training_data[0][1]], dtype=torch.long)
#     print(model(precheck_sent))

# # Make sure prepare_sequence from earlier in the LSTM section is loaded
# for epoch in range(
#         300):  # again, normally you would NOT do 300 epochs, it is toy data
#     for sentence, tags in training_data:
#         # Step 1. Remember that Pytorch accumulates gradients.
#         # We need to clear them out before each instance
#         model.zero_grad()

#         # Step 2. Get our inputs ready for the network, that is,
#         # turn them into Tensors of word indices.
#         sentence_in = prepare_sequence(sentence, word_to_ix)
#         targets = torch.tensor([tag_to_ix[t] for t in tags], dtype=torch.long)

#         # Step 3. Run our forward pass.
#         loss = model.neg_log_likelihood(sentence_in, targets)

#         # Step 4. Compute the loss, gradients, and update the parameters by
#         # calling optimizer.step()
#         loss.backward()
#         optimizer.step()

# # Check predictions after training
# with torch.no_grad():
#     precheck_sent = prepare_sequence(training_data[0][0], word_to_ix)
#     print(model(precheck_sent))
# # We got it!

# 7. <a id='toc7_'></a>[BERT](#toc0_)

In [40]:

import torch
import torch.nn as nn
from torch.nn.functional import cross_entropy

from transformers import AutoConfig, DistilBertConfig, AutoTokenizer, TrainingArguments, DataCollatorForTokenClassification, Trainer
from transformers.modeling_outputs import TokenClassifierOutput
from transformers.models.distilbert.modeling_distilbert import DistilBertModel
from transformers.models.distilbert.modeling_distilbert import DistilBertPreTrainedModel

device = torch.device('mps') # This is required for Mac
# torch.mps.empty_cache()

2024-02-03 17:23:52.363882: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


## 7.1. <a id='toc7_1_'></a>[Model](#toc0_)

In [41]:
class DistilBertForABSA(DistilBertPreTrainedModel):
  config_class = DistilBertConfig

  def __init__(self, config, architecture, loss_token_weights):
    super().__init__(config)
    # Roberta body
    self.num_labels = config.num_labels
    self.distilbert = DistilBertModel(config)#, add_pooling_layer=False)
    self.architecture = architecture
    self.loss_token_weights = loss_token_weights
    # Classification head
    self.dropout = nn.Dropout(config.dropout)#hidden_dropout_prob)
    

    if architecture == 'additional_linear':
        # Additional layer
        self.additional_linear = nn.Linear(config.hidden_size, 10)
        self.additional_dropout = nn.Dropout(0.5)#hidden_dropout_prob)
        self.classifier = nn.Linear(10, config.num_labels)
        

    elif architecture == 'lstm':
        self.lstm = nn.LSTM(config.hidden_size, 30, bidirectional=True)
        self.additional_dropout = nn.Dropout(0.5)#hidden_dropout_prob)
        self.classifier = nn.Linear(60, config.num_labels)
    else:
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)

    # Load and initialize weights from pretrained
    self.init_weights()

  def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, 
                labels=None, **kwargs):
        
    # Use model body to get encoder representations
    outputs = self.distilbert(input_ids, attention_mask=attention_mask,
                        #    token_type_ids=token_type_ids,
                                **kwargs)
    
    # Apply classifier to encoder representation (model head)
    sequence_output = self.dropout(outputs[0])

    if self.architecture == 'additional_linear':
        # Additional layer
      additional_linear_output = self.additional_linear(sequence_output)
      logit_input = self.additional_dropout(additional_linear_output)
    elif self.architecture == 'lstm':
      lstm_output, _ = self.lstm(sequence_output)
      logit_input = self.additional_dropout(lstm_output)
    else:
      logit_input = sequence_output
    
    logits = self.classifier(logit_input)
    
    # Calculate losses
    loss = None
    if labels is not None:
        if len(self.loss_token_weights) > 0:
          # loss_weights = torch.tensor(self.loss_token_weights, dtype=torch.float16).to(device) # This kills the kernel, not sure why
          loss_fct = nn.CrossEntropyLoss()
        else:
          loss_fct = nn.CrossEntropyLoss()
        loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
        
    # Return model output object
    return TokenClassifierOutput(loss=loss, logits=logits, 
                                     hidden_states=outputs.hidden_states, 
                                     attentions=outputs.attentions)
    

## 7.3. <a id='toc7_3_'></a>[Upsampling / downsampling](#toc0_)

## Downsampling

In [42]:
from random import randint
df_train_ori['aspect_token_counts'] = df_train_ori['tags'].apply(lambda x: sum([1 for tag in x if tag != 'O']))
df_train_ori['select'] = df_train_ori['aspect_token_counts'].apply(lambda x: randint(0,2) if x == 0 else 1)

downsample_df_train = df_train_ori[df_train_ori['select']==1].copy()

In [43]:
import copy

downsample_tds = Dataset.from_pandas(downsample_df_train[['tokens','tags_idx','tags']], features=ds['train'].features, preserve_index=False)

downsample_ds = copy.copy(ds)

downsample_ds['train'] = downsample_tds


print(downsample_ds)
print(ds)

DatasetDict({
    train: Dataset({
        features: ['tokens', 'tags_idx', 'tags'],
        num_rows: 1382
    })
    validation: Dataset({
        features: ['tokens', 'tags_idx', 'tags'],
        num_rows: 911
    })
    test: Dataset({
        features: ['tokens', 'tags_idx', 'tags'],
        num_rows: 800
    })
})
DatasetDict({
    train: Dataset({
        features: ['tokens', 'tags_idx', 'tags'],
        num_rows: 2125
    })
    validation: Dataset({
        features: ['tokens', 'tags_idx', 'tags'],
        num_rows: 911
    })
    test: Dataset({
        features: ['tokens', 'tags_idx', 'tags'],
        num_rows: 800
    })
})


### Upsampling

In [44]:
from math import log,sqrt, ceil

def custom_sampling_based_on_score(df, method):
  # Get stats of the class distribution of the dataset
  labels = sum(df['tags'],[])
  num_tokens = len(labels)
  ent = [label[2:] for label in labels if label != 'O']
  stats = Counter(ent)
  for key in stats:
      #Use frequency instead of count
      stats[key] = stats[key]/num_tokens

  if method not in ['sc','sCR','sCRD','nsCRD']:
    raise ValueError("Unidentified Resampling Method")


  
  # df['score'] = df['tags'].apply(lambda x: sum([ 0 if tag.startswith('O') else 1 for tag in x  ]) + 1)
  
  custom_sampled_df = pd.DataFrame(columns=df.columns)

  # for idx in range(len(df)):
  #     num_samples = df.iloc[idx]['score']
  #     sent = df.iloc[idx].to_dict()

  #     for i in range(num_samples):
  #     # Append the sampled subset to the list
  #       sampled_data.append(sent)
  #       # custom_sampled_df = pd.concat([custom_sampled_df, df.iloc[idx]], ignore_index=True, axis = 0)
  # # Concatenate the sampled subsets to create the final upsampled DataFrame
  # custom_sampled_df = pd.concat([custom_sampled_df, pd.DataFrame(sampled_data)], ignore_index=True)
  
  for sen in range(len(df)):
    sampled_data = [] 
    # Resampling time can at least be 1, which means sentence without 
    # entity will be reserved in the dataset  
    rsp_time = 1
    sen_len = len(df.iloc[sen]['tags'])
    ents = Counter([label[2:] for label in df.iloc[sen]['tags'] if label != 'O'])
          # Pass if there's no entity in a sentence
    
    
    if ents:
      for ent in ents.keys():
        # Resampling method selection and resampling time calculation, 
        # see section 'Resampling Functions' in our paper for details.
        if method == 'sc':
          rsp_time += ents[ent]
        if method == 'sCR' or method == 'sCRD':
          weight = -log(stats[ent],2)
          rsp_time += ents[ent]*weight
        if method == 'nsCRD':
          weight = -log(stats[ent],2)
          rsp_time += sqrt(ents[ent])*weight
        if method == 'sCR':
          rsp_time = sqrt(rsp_time)
        if method == 'sCRD' or method == 'nsCRD':
          rsp_time = rsp_time/sqrt(sen_len)
      # Ceiling to ensure the integrity of resamling time
      rsp_time = ceil(rsp_time) 
    
    for t in range(rsp_time):
      for token in range(sen_len):
        sampled_data.append(df.iloc[sen].to_dict())


    custom_sampled_df = pd.concat([custom_sampled_df, pd.DataFrame(sampled_data)], ignore_index=True)




  return custom_sampled_df

In [45]:
# Use the custom sampling method
sCR_df = custom_sampling_based_on_score(df_train_ori.copy(), 'sCR')

# Check the distribution of scores in the custom sampled DataFrame
# print(custom_sampled_df['entities_score'].value_counts())
print(len(sCR_df))

95286


In [46]:
# Perform aspect type counts per each dataset split
split2freqs = defaultdict(Counter)

for row in df_train_ori["tags"]:
    for tag in row:
        if tag.startswith("B"):
            tag_type = tag.split("-")[1]
        # split2freqs['train'][tag] += 1

for row in sCR_df["tags"]:
    for tag in row:
        if tag.startswith("B"):
            tag_type = tag.split("-")[1]
        # split2freqs['sCR_df'][tag] += 1
                
pd.DataFrame.from_dict(split2freqs, orient="index")

In [47]:
import copy

scr_tds = Dataset.from_pandas(sCR_df[['tokens','tags_idx','tags']], features=ds['train'].features, preserve_index=False)

scr_ds = copy.copy(ds)

scr_ds['train'] = scr_tds


print(scr_ds)
print(ds)

DatasetDict({
    train: Dataset({
        features: ['tokens', 'tags_idx', 'tags'],
        num_rows: 95286
    })
    validation: Dataset({
        features: ['tokens', 'tags_idx', 'tags'],
        num_rows: 911
    })
    test: Dataset({
        features: ['tokens', 'tags_idx', 'tags'],
        num_rows: 800
    })
})
DatasetDict({
    train: Dataset({
        features: ['tokens', 'tags_idx', 'tags'],
        num_rows: 2125
    })
    validation: Dataset({
        features: ['tokens', 'tags_idx', 'tags'],
        num_rows: 911
    })
    test: Dataset({
        features: ['tokens', 'tags_idx', 'tags'],
        num_rows: 800
    })
})


In [48]:
# df_train_ori['score'] = df_train_ori['tags'].apply(lambda x:sum([ 1 for tag in x if tag != 'O']))

In [49]:
# print(len(df_train_ori))
# print(len(df_train_ori[df_train_ori['score']==0]))

In [50]:
# from random import randint
# df_train_ori['select'] = df_train_ori['score'].apply(lambda x: randint(0,2) if x == 0 else 1)

In [51]:
# custom_sampled_df = df_train_ori[df_train_ori['select']==1].copy()

In [52]:
# for row in custom_sampled_df["tags"]:
#     for tag in row:
#         # if tag.startswith("B"):
#         #     tag_type = tag.split("-")[1]
#         split2freqs['down_sample_1_3'][tag] += 1

# pd.DataFrame.from_dict(split2freqs, orient="index")

In [53]:
# tag_counts = pd.DataFrame.from_dict(split2freqs, orient="index")

# tag_counts['total_tag'] = tag_counts.sum(axis=1)
# tag_counts['entity_ratio'] = tag_counts['O'] / tag_counts['total_tag']
# tag_counts

## 7.4. <a id='toc7_4_'></a>[Model tuning](#toc0_)

In [54]:
class ModelTuning:
  def __init__(self, pretrained_model_name, index2tag, tag2index, y_validation=None, modelPerformanceLog=None):
    
    self.pretrained_model_name = pretrained_model_name
    self.index2tag = index2tag
    self.tag2index = tag2index

    self.pretrained_model_config = AutoConfig.from_pretrained(pretrained_model_name, 
                                        num_labels=len(self.index2tag),
                                        id2label=index2tag, label2id=tag2index)
    
    if modelPerformanceLog != None:
      self.modelPerformanceLog = modelPerformanceLog
    else:
      self.modelPerformanceLog = ModelPerformanceLog(y_validation)

  def model_init(self, architecture, loss_token_weights):
    return (DistilBertForABSA
            .from_pretrained(self.pretrained_model_name, config=self.pretrained_model_config, architecture=architecture, loss_token_weights=loss_token_weights)
            .to(device))
  
  def align_predictions(self, predictions, label_ids):
    preds = np.argmax(predictions, axis=2)

    batch_size, seq_len = preds.shape
    labels_list, preds_list = [], []

    for batch_idx in range(batch_size):
        example_labels, example_preds = [], []
        for seq_idx in range(seq_len):
            # Ignore label IDs = -100
            if label_ids[batch_idx, seq_idx] != -100:
                index2tag = self.pretrained_model_config.id2label.copy()
                example_labels.append(index2tag[label_ids[batch_idx][seq_idx]])
                example_preds.append(index2tag[preds[batch_idx][seq_idx]])

        labels_list.append(example_labels)
        preds_list.append(example_preds)

    return preds_list, labels_list

  # Define performance metrics
  def compute_metrics(self, eval_pred):
    y_pred, y_true = self.align_predictions(eval_pred.predictions, 
                                        eval_pred.label_ids)
    return {"f1": f1_score(y_true, y_pred, average='macro')}

  def forward_pass_with_label(self,data_collator, trainer, batch):
    # Convert dict of lists to list of dicts suitable for data collator
    features = [dict(zip(batch, t)) for t in zip(*batch.values())]
    # Pad inputs and labels and put all tensors on device
    batch = data_collator(features)
    input_ids = batch["input_ids"].to(device)
    attention_mask = batch["attention_mask"].to(device)
    labels = batch["labels"].to(device)
    with torch.no_grad():
        # Pass data through model  
        output = trainer.model.to(device)(input_ids, attention_mask)

        # Logit.size: [batch_size, sequence_length, classes]
        # Predict class with largest logit value on classes axis
        predicted_label = torch.argmax(output.logits, axis=-1).cpu().numpy()
    # Calculate loss per token after flattening batch dimension with view
    loss = cross_entropy(output.logits.view(-1, tags.num_classes), 
                          labels.view(-1), reduction="none")
    # Unflatten batch dimension and convert to numpy array
    loss = loss.view(len(input_ids), -1).cpu().numpy()

    return {"loss":loss, "predicted_label": predicted_label}

  def tokenize_and_align_labels(self, tokenizer, examples):
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, 
                                      is_split_into_words=True)
    labels = []
    for idx, label in enumerate(examples["tags_idx"]):
        word_ids = tokenized_inputs.word_ids(batch_index=idx)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None or word_idx == previous_word_idx:
                label_ids.append(-100)
            else:
                label_ids.append(label[word_idx])
            previous_word_idx = word_idx
        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs
  
  def encode_dataset(self, tokenizer, corpus):
    return corpus.map(lambda examples: self.tokenize_and_align_labels(tokenizer, examples), 
                      batched=True, 
                      remove_columns=['tags_idx', 'tokens','tags']
                      )
     
  def fine_tune_model(self, ds, model_name, architecture, num_epochs=6, batch_size=32, weight_decay=0.01, loss_token_weights=None):

    
    # Define tokenizer
    pretrained_tokenizer = AutoTokenizer.from_pretrained(self.pretrained_model_name)

    # Define data collator for data batching 
    data_collator = DataCollatorForTokenClassification(pretrained_tokenizer)

    ds_encoded = self.encode_dataset(pretrained_tokenizer, ds)

    logging_steps = len(ds_encoded['train']) // batch_size
    model_name_ = f"{model_name}_{architecture}_e{num_epochs}_ld{re.sub(r'[^0-9]+','',str(weight_decay))}_b{batch_size}_w{'T' if len(loss_token_weights) > 0 else 'F'}"
    
    print('Training model: ', model_name_)
    training_args = TrainingArguments(output_dir="model/logs/" + model_name_
                                      ,log_level="error"
                                      ,num_train_epochs=num_epochs
                                      ,per_device_train_batch_size=batch_size
                                      ,per_device_eval_batch_size=batch_size
                                      ,evaluation_strategy="epoch"
                                      ,save_steps=1e6
                                      ,weight_decay=weight_decay
                                      ,disable_tqdm=True
                                      ,logging_steps=logging_steps
                                      ,push_to_hub=False)
    
    trainer = Trainer(model_init=lambda: self.model_init(architecture, loss_token_weights)
                      ,args=training_args
                      ,data_collator=data_collator
                      ,compute_metrics=self.compute_metrics
                      ,train_dataset=ds_encoded["train"]
                      ,eval_dataset=ds_encoded["validation"]
                      ,tokenizer=pretrained_tokenizer)
    
    trainer.train()
    # self.trainer = trainer

    trainer_res_df = pd.DataFrame(trainer.state.log_history)[['epoch','loss' ,'eval_loss','eval_f1']]
    trainer_res_df = trainer_res_df.rename(columns={"epoch":"Epoch","loss": "Training Loss", "eval_loss": "Validation Loss", 'eval_f1':'F1'})
    trainer_res_df['Epoch'] = trainer_res_df["Epoch"].apply(lambda x: round(x))
    trainer_res_df['Training Loss'] = trainer_res_df["Training Loss"].ffill()
    trainer_res_df[['Validation Loss', 'F1']] = trainer_res_df[['Validation Loss', 'F1']].bfill().ffill()
    trainer_res_df = trainer_res_df.drop_duplicates()
    self.modelPerformanceLog.add_bert_training_epoch_history(model_name_, trainer_res_df)

    validation_set = ds_encoded["validation"]
    validation_set = validation_set.map(lambda batch: self.forward_pass_with_label(data_collator, trainer, batch), batched=True, batch_size=32)
    df_validation = validation_set.to_pandas()

    # Cleanup & conver id2text 
    index2tag = trainer.model.config.id2label.copy()
    index2tag[-100] = "IGN"
    df_validation["input_tokens"] = df_validation["input_ids"].apply(
        lambda x: pretrained_tokenizer.convert_ids_to_tokens(x))
    df_validation["predicted_label_txt"] = df_validation["predicted_label"].apply(
        lambda x: [index2tag[i] for i in x])
    df_validation["labels_txt"] = df_validation["labels"].apply(
        lambda x: [index2tag[i] for i in x])
    df_validation['loss'] = df_validation.apply(
        lambda x: x['loss'][:len(x['input_ids'])], axis=1) # Remove padding tokens
    df_validation['predicted_label'] = df_validation.apply(
        lambda x: x['predicted_label'][:len(x['input_ids'])], axis=1) # Remove padding tokens
    df_validation['predicted_label_txt'] = df_validation.apply(
        lambda x: x['predicted_label_txt'][:len(x['input_ids'])], axis=1) # Remove padding tokens
    df_validation['labels_txt'] = df_validation.apply(
        lambda x: x['labels_txt'][:len(x['input_ids'])], axis=1) # Remove padding tokens


    # Flatten the outputs
    df_tokens = df_validation.apply(pd.Series.explode)
    df_tokens = df_tokens.query("labels != 'IGN'")
    df_tokens["loss"] = df_tokens["loss"].astype(float).round(2)

    df_validation_metrics = df_validation.copy()

    df_validation_metrics['predicted_label_txt'] = df_validation_metrics.apply(lambda x: [x['predicted_label_txt'][i] for i,k in enumerate(x['labels_txt']) if k != 'IGN' ], axis=1)
    df_validation_metrics[['input_tokens','predicted_label_txt']].to_csv('model/predictions/'+ model_name_ +'.csv')

    self.modelPerformanceLog.add_model_performance(model_name_,df_validation_metrics['predicted_label_txt'])

    trainer.save_model('model/saved/bert_'+ model_name_)
    return model_name_, df_validation, df_tokens

## Start tuning

In [55]:
pretrained_model_name = "distilbert-base-uncased"
tags = ds['train'].features['tags_idx'].feature
index2tag = {idx: tag for idx, tag in enumerate(tags.names)}
tag2index = {tag: idx for idx, tag in enumerate(tags.names)}

y_val = ds['validation'].to_pandas()['tags_idx']
y_val = [[index2tag[tag] for tag in sent] for sent in y_val]

modelTuning = ModelTuning(pretrained_model_name, index2tag, tag2index, y_val, modelPerformanceLog)
# modelTuning.modelPerformanceLog.get_logs()

In [56]:
def run_scenarios(model_idx, scenarios):
  runtime = datetime.now().strftime("%Y%m%d_%H%M")
  
  datasets = {'ori': ds
            ,'down': downsample_ds
            ,'scr': scr_ds
            ,'batch_size': 64
            }

  for scenario in scenarios:
    # Clear cache
    torch.mps.empty_cache()
    os.environ['PYTORCH_MPS_HIGH_WATERMARK_RATIO'] = '0.0' # this setting is needed to run NN on my Mac
    device = torch.device('mps') # This is required for Mac
    
    model_name_input = f"sc_{model_idx}_ds_{scenario['dataset']}"
    print('Start training: ', model_name_input)
    model_name, df_validation_predict, df_tokens_predict = modelTuning.fine_tune_model(datasets[scenario['dataset']], model_name_input, scenario['architecture'], scenario['epoch'],scenario['batch_size'],scenario['weight_decay'],[])
    model_idx += 1

  modelTuning.modelPerformanceLog.get_logs().to_csv(f'model/model_tuning_results_{runtime}.csv')  
  modelTuning.modelPerformanceLog.bert_epoch_history.to_csv(f'model/bert_epoch_history_{runtime}.csv')  

## First BERT

In [57]:
runtime = datetime.now().strftime("%Y%m%d_%H%M")
model_idx = 1

dataset = ds
architecture = 'linear'
epoch = 15
batch_size = 64
weight_decay = 0.1
loss_weights = []

# Clear cache
torch.mps.empty_cache()
os.environ['PYTORCH_MPS_HIGH_WATERMARK_RATIO'] = '0.0' # this setting is needed to run NN on my Mac
device = torch.device('mps') # This is required for Mac

model_name_input = f"sc_{model_idx}_ds_ori"
model_name, df_validation_predict, df_tokens_predict = modelTuning.fine_tune_model(dataset, model_name_input, architecture, epoch,batch_size,weight_decay,loss_weights)

modelTuning.modelPerformanceLog.get_logs().to_csv(f'model/model_tuning_results_{runtime}.csv')  
modelTuning.modelPerformanceLog.bert_epoch_history.to_csv(f'model/bert_epoch_history_{runtime}.csv')  

Map:   0%|          | 0/2125 [00:00<?, ? examples/s]

Map:   0%|          | 0/911 [00:00<?, ? examples/s]

Map:   0%|          | 0/800 [00:00<?, ? examples/s]

Training model:  sc_1_ds_ori_linear_e15_ld01_b64_wF




{'loss': 0.4314, 'learning_rate': 4.6764705882352944e-05, 'epoch': 0.97}
{'eval_loss': 0.23763501644134521, 'eval_f1': 0.13653438514583166, 'eval_runtime': 15.585, 'eval_samples_per_second': 58.454, 'eval_steps_per_second': 0.962, 'epoch': 1.0}
{'loss': 0.168, 'learning_rate': 4.3529411764705885e-05, 'epoch': 1.94}
{'eval_loss': 0.18244996666908264, 'eval_f1': 0.3100483046325188, 'eval_runtime': 16.0764, 'eval_samples_per_second': 56.667, 'eval_steps_per_second': 0.933, 'epoch': 2.0}
{'loss': 0.113, 'learning_rate': 4.029411764705883e-05, 'epoch': 2.91}
{'eval_loss': 0.16173210740089417, 'eval_f1': 0.37333183385814966, 'eval_runtime': 17.4929, 'eval_samples_per_second': 52.078, 'eval_steps_per_second': 0.857, 'epoch': 3.0}
{'loss': 0.0716, 'learning_rate': 3.705882352941177e-05, 'epoch': 3.88}
{'eval_loss': 0.15820477902889252, 'eval_f1': 0.4666225354205377, 'eval_runtime': 16.411, 'eval_samples_per_second': 55.511, 'eval_steps_per_second': 0.914, 'epoch': 4.0}
{'loss': 0.0493, 'learni

Map:   0%|          | 0/911 [00:00<?, ? examples/s]

In [58]:
modelTuning.modelPerformanceLog.get_logs()

Unnamed: 0,sklearn_f1_score,f1
random forest,0.14,0.0
crf,0.14,0.0
sc_1_ds_ori_linear_e15_ld01_b64_wF,0.6,0.5338


In [59]:
modelTuning.modelPerformanceLog.bert_epoch_history

Unnamed: 0,Epoch,Training Loss,Validation Loss,F1,Model
0,1,0.4314,0.237635,0.136534,sc_1_ds_ori_linear_e15_ld01_b64_wF
2,2,0.168,0.18245,0.310048,sc_1_ds_ori_linear_e15_ld01_b64_wF
4,3,0.113,0.161732,0.373332,sc_1_ds_ori_linear_e15_ld01_b64_wF
6,4,0.0716,0.158205,0.466623,sc_1_ds_ori_linear_e15_ld01_b64_wF
8,5,0.0493,0.17056,0.487552,sc_1_ds_ori_linear_e15_ld01_b64_wF
10,6,0.0364,0.175099,0.491567,sc_1_ds_ori_linear_e15_ld01_b64_wF
12,7,0.0263,0.187352,0.498673,sc_1_ds_ori_linear_e15_ld01_b64_wF
14,8,0.0183,0.200961,0.496602,sc_1_ds_ori_linear_e15_ld01_b64_wF
16,9,0.0144,0.196477,0.51219,sc_1_ds_ori_linear_e15_ld01_b64_wF
18,10,0.0098,0.201962,0.504115,sc_1_ds_ori_linear_e15_ld01_b64_wF


## 7.5. <a id='toc7_5_'></a>[Error analysis](#toc0_)

### 7.5.1. <a id='toc7_5_1_'></a>[Group by word token](#toc0_)

In [60]:
# (
#     df_tokens.groupby("input_tokens")[["loss"]]
#     .agg(["count", "mean", "sum"])
#     .droplevel(level=0, axis=1)  # Get rid of multi-level columns
#     .sort_values(by="sum", ascending=False)
#     .reset_index()
#     .round(2)
#     .head(10)
#     .T
# )

In [61]:
# (
#   df_tokens[df_tokens['input_tokens']=='battery'].groupby(["predicted_label", 'labels'])[["loss"]]
#     .agg(["count", "mean", "sum"])
#     .droplevel(level=0, axis=1)  # Get rid of multi-level columns
#     .sort_values(by="sum", ascending=False)
#     .reset_index()
#     .round(2)
#     .head(10)
#     .T
# )

### 7.5.2. <a id='toc7_5_2_'></a>[Group by Tag ID](#toc0_)

In [62]:
# (
#     df_tokens.groupby("labels")[["loss"]] 
#     .agg(["count", "mean", "sum"])
#     .droplevel(level=0, axis=1)
#     .sort_values(by="mean", ascending=False)
#     .reset_index()
#     .round(2)
#     .T
# )

In [63]:
# def plot_confusion_matrix(y_preds, y_true, labels):
#     cm = confusion_matrix(y_true, y_preds, normalize="true")
#     fig, ax = plt.subplots(figsize=(6, 6))
#     disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=labels)
#     disp.plot(cmap="Blues", values_format=".2f", ax=ax, colorbar=False)
#     plt.title("Normalized confusion matrix")
#     plt.show()

In [64]:
# plot_confusion_matrix(df_tokens["labels"], df_tokens["predicted_label"],
#                       list(tag2index.keys()))


In [65]:
# def get_samples(df):
#     for _, row in df.iterrows():
#         labels, preds, tokens, losses = [], [], [], []
#         for i, mask in enumerate(row["attention_mask"]):
#             if i not in {0, len(row["attention_mask"])}:
#                 labels.append(row["labels"][i])
#                 preds.append(row["predicted_label"][i])
#                 tokens.append(row["input_tokens"][i])
#                 losses.append(f"{row['loss'][i]:.2f}")
#         df_tmp = pd.DataFrame({"tokens": tokens, "labels": labels, 
#                                "preds": preds, "losses": losses}).T
#         yield df_tmp

# df_validation["total_loss"] = df_validation["loss"].apply(sum)
# df_tmp = df_validation.sort_values(by="total_loss", ascending=False).head(3)

# for sample in get_samples(df_tmp):
#     display(sample.T)
     

Errors could be from human / annotation errors: United Nations is ORG, not PER, similar to Central African Republic. This can happen as data was annotated using rule based, it is better with human annotations, but mistakes can always occur.

In [66]:
# df_tmp = df_validation.loc[df_validation["input_tokens"].apply(lambda x: u"\u2581(" in x)].head(2)
# for sample in get_samples(df_tmp):
#     display(sample.T)

## Tuning BERT

### Different datasets

In [67]:
model_idx = 10
scenarios = [
            # {'dataset': 'ori'
            #  ,'epoch': 15
            #  ,'architecture': 'linear'
            #  ,'weight_decay': 0.1
            #  ,'batch_size': 64
            # }
            
            {'dataset': 'down'
             ,'epoch': 15
             ,'architecture': 'linear'
             ,'weight_decay': 0.1
             ,'batch_size': 64
            }

            # ,{'dataset': 'scr'
            #  ,'epoch': 15
            #  ,'architecture': 'linear'
            #  ,'weight_decay': 0.1
            #  ,'batch_size': 64
            # }
            ]

run_scenarios(model_idx, scenarios)
modelTuning.modelPerformanceLog.get_logs()

Start training:  sc_10_ds_down


Map:   0%|          | 0/1382 [00:00<?, ? examples/s]

Map:   0%|          | 0/911 [00:00<?, ? examples/s]

Map:   0%|          | 0/800 [00:00<?, ? examples/s]

Training model:  sc_10_ds_down_linear_e15_ld01_b64_wF




{'loss': 0.616, 'learning_rate': 4.681818181818182e-05, 'epoch': 0.95}
{'eval_loss': 0.25522541999816895, 'eval_f1': 0.06150440123042863, 'eval_runtime': 15.1583, 'eval_samples_per_second': 60.099, 'eval_steps_per_second': 0.99, 'epoch': 1.0}
{'loss': 0.2489, 'learning_rate': 4.3636363636363636e-05, 'epoch': 1.91}
{'eval_loss': 0.19782713055610657, 'eval_f1': 0.23462517387580198, 'eval_runtime': 14.6496, 'eval_samples_per_second': 62.186, 'eval_steps_per_second': 1.024, 'epoch': 2.0}
{'loss': 0.1772, 'learning_rate': 4.045454545454546e-05, 'epoch': 2.86}
{'eval_loss': 0.1702330857515335, 'eval_f1': 0.3299829789035698, 'eval_runtime': 16.0862, 'eval_samples_per_second': 56.632, 'eval_steps_per_second': 0.932, 'epoch': 3.0}
{'loss': 0.1229, 'learning_rate': 3.7272727272727276e-05, 'epoch': 3.82}
{'eval_loss': 0.15534688532352448, 'eval_f1': 0.4139983871257707, 'eval_runtime': 14.5215, 'eval_samples_per_second': 62.734, 'eval_steps_per_second': 1.033, 'epoch': 4.0}
{'loss': 0.0871, 'learn

Map:   0%|          | 0/911 [00:00<?, ? examples/s]

Start training:  sc_11_ds_scr


Map:   0%|          | 0/95286 [00:00<?, ? examples/s]

Map:   0%|          | 0/911 [00:00<?, ? examples/s]

Map:   0%|          | 0/800 [00:00<?, ? examples/s]

Training model:  sc_11_ds_scr_linear_e15_ld01_b64_wF




{'loss': 0.0204, 'learning_rate': 4.6668905305574214e-05, 'epoch': 1.0}
{'eval_loss': 0.31874021887779236, 'eval_f1': 0.4706011739540095, 'eval_runtime': 14.4768, 'eval_samples_per_second': 62.928, 'eval_steps_per_second': 1.036, 'epoch': 1.0}


KeyboardInterrupt: 

In [68]:
best_ds = 'down' # 'scr'

### Architecture

In [82]:
model_idx = 10000
scenarios = [
            {'dataset': best_ds
             ,'epoch': 150
             ,'architecture': 'additional_linear'
             ,'weight_decay': 0.1
             ,'batch_size': 64
            }
            
            ,{'dataset': best_ds
             ,'epoch': 300
             ,'architecture': 'lstm'
             ,'weight_decay': 0.1
             ,'batch_size': 64
            }
            ]

run_scenarios(model_idx, scenarios)

Start training:  sc_10000_ds_down


Map:   0%|          | 0/1382 [00:00<?, ? examples/s]

Map:   0%|          | 0/911 [00:00<?, ? examples/s]

Map:   0%|          | 0/800 [00:00<?, ? examples/s]

Training model:  sc_10000_ds_down_additional_linear_e150_ld01_b64_wF




{'loss': 1.6377, 'learning_rate': 4.968181818181818e-05, 'epoch': 0.95}
{'eval_loss': 1.379583477973938, 'eval_f1': 0.0, 'eval_runtime': 14.6425, 'eval_samples_per_second': 62.216, 'eval_steps_per_second': 1.024, 'epoch': 1.0}
{'loss': 1.3649, 'learning_rate': 4.936363636363637e-05, 'epoch': 1.91}
{'eval_loss': 1.2475454807281494, 'eval_f1': 0.0, 'eval_runtime': 16.0435, 'eval_samples_per_second': 56.783, 'eval_steps_per_second': 0.935, 'epoch': 2.0}
{'loss': 1.2707, 'learning_rate': 4.904545454545455e-05, 'epoch': 2.86}
{'eval_loss': 1.142648458480835, 'eval_f1': 0.0, 'eval_runtime': 14.5504, 'eval_samples_per_second': 62.61, 'eval_steps_per_second': 1.031, 'epoch': 3.0}
{'loss': 1.1788, 'learning_rate': 4.872727272727273e-05, 'epoch': 3.82}
{'eval_loss': 1.0339782238006592, 'eval_f1': 0.0, 'eval_runtime': 15.3201, 'eval_samples_per_second': 59.464, 'eval_steps_per_second': 0.979, 'epoch': 4.0}
{'loss': 1.0846, 'learning_rate': 4.840909090909091e-05, 'epoch': 4.77}
{'eval_loss': 0.924

Map:   0%|          | 0/911 [00:00<?, ? examples/s]

Start training:  sc_10001_ds_down


Map:   0%|          | 0/1382 [00:00<?, ? examples/s]

Map:   0%|          | 0/911 [00:00<?, ? examples/s]

Map:   0%|          | 0/800 [00:00<?, ? examples/s]

Training model:  sc_10001_ds_down_lstm_e300_ld01_b64_wF




{'loss': 1.5747, 'learning_rate': 4.9840909090909096e-05, 'epoch': 0.95}
{'eval_loss': 1.305628776550293, 'eval_f1': 0.0, 'eval_runtime': 15.0392, 'eval_samples_per_second': 60.575, 'eval_steps_per_second': 0.997, 'epoch': 1.0}
{'loss': 1.2784, 'learning_rate': 4.968181818181818e-05, 'epoch': 1.91}
{'eval_loss': 1.184409499168396, 'eval_f1': 0.0, 'eval_runtime': 16.0361, 'eval_samples_per_second': 56.809, 'eval_steps_per_second': 0.935, 'epoch': 2.0}
{'loss': 1.1795, 'learning_rate': 4.9522727272727275e-05, 'epoch': 2.86}
{'eval_loss': 1.1006083488464355, 'eval_f1': 0.0, 'eval_runtime': 14.8957, 'eval_samples_per_second': 61.159, 'eval_steps_per_second': 1.007, 'epoch': 3.0}
{'loss': 1.1061, 'learning_rate': 4.936363636363637e-05, 'epoch': 3.82}
{'eval_loss': 1.0270586013793945, 'eval_f1': 0.0, 'eval_runtime': 16.2541, 'eval_samples_per_second': 56.047, 'eval_steps_per_second': 0.923, 'epoch': 4.0}
{'loss': 1.0396, 'learning_rate': 4.920454545454546e-05, 'epoch': 4.77}
{'eval_loss': 0.

Map:   0%|          | 0/911 [00:00<?, ? examples/s]

In [83]:
modelTuning.modelPerformanceLog.get_logs()

Unnamed: 0,sklearn_f1_score,f1
random forest,0.14,0.0
crf,0.14,0.0
sc_1_ds_ori_linear_e15_ld01_b64_wF,0.6,0.5338
sc_10_ds_down_linear_e15_ld01_b64_wF,0.62,0.5491
sc_100_ds_down_additional_linear_e50_ld01_b64_wF,0.42,0.346
sc_101_ds_down_lstm_e70_ld01_b64_wF,0.14,0.0
sc_10000_ds_down_additional_linear_e100_ld05_b512_wF,0.14,0.0
sc_10000_ds_down_additional_linear_e150_ld01_b64_wF,0.57,0.4807
sc_10001_ds_down_lstm_e300_ld01_b64_wF,0.33,0.3333


In [84]:
modelTuning.modelPerformanceLog.bert_epoch_history

Unnamed: 0,Epoch,Training Loss,Validation Loss,F1,Model
0,1,0.4314,0.237635,0.136534,sc_1_ds_ori_linear_e15_ld01_b64_wF
2,2,0.1680,0.182450,0.310048,sc_1_ds_ori_linear_e15_ld01_b64_wF
4,3,0.1130,0.161732,0.373332,sc_1_ds_ori_linear_e15_ld01_b64_wF
6,4,0.0716,0.158205,0.466623,sc_1_ds_ori_linear_e15_ld01_b64_wF
8,5,0.0493,0.170560,0.487552,sc_1_ds_ori_linear_e15_ld01_b64_wF
...,...,...,...,...,...
604,296,0.1373,0.223521,0.286934,sc_10001_ds_down_lstm_e300_ld01_b64_wF
606,297,0.1376,0.223511,0.286754,sc_10001_ds_down_lstm_e300_ld01_b64_wF
608,298,0.1367,0.223524,0.286755,sc_10001_ds_down_lstm_e300_ld01_b64_wF
610,299,0.1387,0.223519,0.286755,sc_10001_ds_down_lstm_e300_ld01_b64_wF


### Tuning learning decay & batch size

In [88]:
model_idx = 100000
scenarios = [
            {'dataset': 'down'
             ,'epoch': 800
             ,'architecture': 'lstm'
             ,'weight_decay': 0.1
             ,'batch_size': 64
            }

            ]

run_scenarios(model_idx, scenarios)

Start training:  sc_100000_ds_down


Map:   0%|          | 0/1382 [00:00<?, ? examples/s]

Map:   0%|          | 0/911 [00:00<?, ? examples/s]

Map:   0%|          | 0/800 [00:00<?, ? examples/s]

Training model:  sc_100000_ds_down_lstm_e800_ld01_b64_wF
{'loss': 1.5746, 'learning_rate': 4.994034090909091e-05, 'epoch': 0.95}
{'eval_loss': 1.3054938316345215, 'eval_f1': 0.0, 'eval_runtime': 16.8179, 'eval_samples_per_second': 54.168, 'eval_steps_per_second': 0.892, 'epoch': 1.0}
{'loss': 1.2782, 'learning_rate': 4.988068181818182e-05, 'epoch': 1.91}
{'eval_loss': 1.1840885877609253, 'eval_f1': 0.0, 'eval_runtime': 18.032, 'eval_samples_per_second': 50.521, 'eval_steps_per_second': 0.832, 'epoch': 2.0}
{'loss': 1.1791, 'learning_rate': 4.982102272727273e-05, 'epoch': 2.86}
{'eval_loss': 1.0998882055282593, 'eval_f1': 0.0, 'eval_runtime': 17.8819, 'eval_samples_per_second': 50.945, 'eval_steps_per_second': 0.839, 'epoch': 3.0}
{'loss': 1.1053, 'learning_rate': 4.976136363636364e-05, 'epoch': 3.82}
{'eval_loss': 1.0259116888046265, 'eval_f1': 0.0, 'eval_runtime': 17.1009, 'eval_samples_per_second': 53.272, 'eval_steps_per_second': 0.877, 'epoch': 4.0}
{'loss': 1.0384, 'learning_rate'

In [None]:
modelTuning.modelPerformanceLog.get_logs()

Unnamed: 0,sklearn_f1_score,f1
random forest,0.14,0.0
crf,0.14,0.0
sc_1_ds_ori_linear_e15_ld01_b64_wF,0.6,0.5338
sc_10_ds_down_linear_e15_ld01_b64_wF,0.62,0.5491
sc_100_ds_down_additional_linear_e50_ld01_b64_wF,0.42,0.346
sc_101_ds_down_lstm_e70_ld01_b64_wF,0.14,0.0
sc_10000_ds_down_additional_linear_e100_ld05_b512_wF,0.14,0.0


In [None]:
modelTuning.modelPerformanceLog.bert_epoch_history

In [85]:
model_idx = 100000
scenarios = [
            {'dataset': 'scr'
             ,'epoch': 10
             ,'architecture': 'linear'
             ,'weight_decay': 0.1
             ,'batch_size': 128
            }
            
            ]

run_scenarios(model_idx, scenarios)

Start training:  sc_100000_ds_scr


Map:   0%|          | 0/95286 [00:00<?, ? examples/s]

Map:   0%|          | 0/911 [00:00<?, ? examples/s]

Map:   0%|          | 0/800 [00:00<?, ? examples/s]

Training model:  sc_100000_ds_scr_linear_e10_ld01_b128_wF




{'loss': 0.0341, 'learning_rate': 4.5006711409395974e-05, 'epoch': 1.0}
{'eval_loss': 0.29800546169281006, 'eval_f1': 0.5056252139297225, 'eval_runtime': 18.0774, 'eval_samples_per_second': 50.394, 'eval_steps_per_second': 0.443, 'epoch': 1.0}
{'loss': 0.0006, 'learning_rate': 4.0013422818791946e-05, 'epoch': 2.0}
{'eval_loss': 0.32127606868743896, 'eval_f1': 0.49374912550738204, 'eval_runtime': 17.0059, 'eval_samples_per_second': 53.57, 'eval_steps_per_second': 0.47, 'epoch': 2.0}
{'loss': 0.0003, 'learning_rate': 3.5020134228187925e-05, 'epoch': 3.0}
{'eval_loss': 0.33480408787727356, 'eval_f1': 0.5111566636092125, 'eval_runtime': 16.9126, 'eval_samples_per_second': 53.865, 'eval_steps_per_second': 0.473, 'epoch': 3.0}
{'loss': 0.0002, 'learning_rate': 3.0026845637583893e-05, 'epoch': 3.99}
{'eval_loss': 0.3487224578857422, 'eval_f1': 0.4916302020721394, 'eval_runtime': 17.1002, 'eval_samples_per_second': 53.274, 'eval_steps_per_second': 0.468, 'epoch': 4.0}
{'loss': 0.0002, 'learnin

Map:   0%|          | 0/911 [00:00<?, ? examples/s]

In [86]:
modelTuning.modelPerformanceLog.get_logs()

Unnamed: 0,sklearn_f1_score,f1
random forest,0.14,0.0
crf,0.14,0.0
sc_1_ds_ori_linear_e15_ld01_b64_wF,0.6,0.5338
sc_10_ds_down_linear_e15_ld01_b64_wF,0.62,0.5491
sc_100_ds_down_additional_linear_e50_ld01_b64_wF,0.42,0.346
sc_101_ds_down_lstm_e70_ld01_b64_wF,0.14,0.0
sc_10000_ds_down_additional_linear_e100_ld05_b512_wF,0.14,0.0
sc_10000_ds_down_additional_linear_e150_ld01_b64_wF,0.57,0.4807
sc_10001_ds_down_lstm_e300_ld01_b64_wF,0.33,0.3333
sc_100000_ds_scr_linear_e10_ld01_b128_wF,0.59,0.5562


## 7.6. <a id='toc7_6_'></a>[Load saved model](#toc0_)

### 7.6.1. <a id='toc7_6_1_'></a>[Load model manually](#toc0_)

In [None]:
# from transformers import AutoModelForTokenClassification

# # Reload the model
# loaded_model = AutoModelForTokenClassification.from_pretrained('model/distilbert-base-uncased-absa-downsample-1-3').to(device) #output_model_dir
# loaded_tokenizer = AutoTokenizer.from_pretrained('model/distilbert-base-uncased-absa-downsample-1-3')


In [None]:
# num = 99
# sample_input = df_train.iloc[num]['text']
# print(sample_input)
# print(df_train.iloc[num]['aspects'])

In [None]:
# # Assuming you have some input data
# input_data = [sample_input]

# # Tokenize and get predictions
# inputs = loaded_tokenizer(input_data, is_split_into_words=True, return_tensors="pt")

# input_ids = inputs["input_ids"].to(device)
# attention_mask = inputs["attention_mask"].to(device)

# # Make predictions
# with torch.no_grad():
#   outputs = loaded_model(input_ids, attention_mask)

# predicted_label_idx = torch.argmax(outputs.logits, axis=-1).cpu().numpy()
# df_res = pd.DataFrame({'predicted_label': predicted_label_idx.tolist(), 
#                       'input_ids': inputs['input_ids'].numpy().tolist()}
#                       )

In [None]:
# index2tag_new = loaded_model.config.id2label.copy()
# index2tag_new[-100] = "IGN"
# df_res["input_tokens"] = df_res["input_ids"].apply(
#     lambda x: loaded_tokenizer.convert_ids_to_tokens(x))
# df_res["predicted_label_text"] = df_res["predicted_label"].apply(
#     lambda x: [index2tag_new[i] for i in x])
# df_res['predicted_label'] = df_res.apply(
#     lambda x: x['predicted_label'][:len(x['input_ids'])], axis=1)
# df_res['predicted_label_text'] = df_res.apply(
#     lambda x: x['predicted_label_text'][:len(x['input_ids'])], axis=1)

# df_res_tokens = df_res.apply(pd.Series.explode)

# df_res_tokens

### 7.6.2. <a id='toc7_6_2_'></a>[Pipeline](#toc0_)

In [None]:
# from transformers import pipeline

# token_classifier = pipeline(
#     "token-classification", model='model/distilbert-base-uncased-absa-downsample-1-3', aggregation_strategy="simple"
# )
# token_classifier(sample_input)