In [None]:
# February 2024
# Resampling and tokenizing text 
# Violeta Berdejo-Espinola

In [None]:
# linting
# !nbqa pylint 1.pre_process_main_text.ipynb

# background theme 
# !jt -t monokai -cellw 90% #grade3

from IPython.display import display, HTML

# stretch the notebook container to 95% of the browser width so wide outputs stay readable
WIDE_CONTAINER_CSS = "<style>.container { width:95% !important; }</style>"
display(HTML(WIDE_CONTAINER_CSS))

import numpy as np
import pandas as pd
import os
import mpu

# read data

In [None]:
# clean corpus

corpus = mpu.io.read('../data/corpus_clean.pickle')
pos = mpu.io.read('../data/pos.pickle')
neg = mpu.io.read('../data/neg_complete.pickle')

# documents are concatenated positives-first; labels are built in the same order
x = pos + neg
y = [1] * len(pos) + [0] * len(neg)

print(len(y))
print(len(neg))

# raw corpus

corpus_raw = mpu.io.read('../data/corpus_raw.pickle')

# The raw corpus reuses the labels of the clean corpus (y_raw = y), so it is
# assumed to be ordered like `x`: positives first, then negatives.
# Deriving the boundary from len(pos) keeps the slices consistent with the labels
# instead of relying on hard-coded corpus sizes.
n_pos = len(pos)
pos_raw = corpus_raw[:n_pos]
neg_raw = corpus_raw[n_pos:]

x_raw = corpus_raw
y_raw = y

# fail early if the raw and clean corpora are out of sync
assert len(x_raw) == len(y_raw), (
    f'raw corpus has {len(x_raw)} documents but there are {len(y_raw)} labels'
)

# report the raw corpus sizes (not the clean ones again)
print(len(x_raw))
print(len(neg_raw))

In [None]:
from sklearn.model_selection import train_test_split

# The clean and raw corpora are split with the same test_size and random_state so
# that both splits select the same documents.
# NOTE(review): the classes are heavily imbalanced; consider passing stratify=y /
# stratify=y_raw so the few positives are spread proportionally across train and
# test. Not changed here because it would alter every downstream artefact.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=42)
x_train_r, x_test_r, y_train_r, y_test_r = train_test_split(x_raw, y_raw, test_size=0.20, random_state=42)

# later cells pair raw-text features with these labels, so both splits must agree
assert list(y_train) == list(y_train_r) and list(y_test) == list(y_test_r), (
    'clean and raw splits are not aligned'
)


from collections import Counter

# class frequencies in the training split
cntt = Counter(y_train)

cntt

# feature extraction

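fit the vectorizer on the training set only, so that train and test matrices share the same columns: https://stackoverflow.com/questions/62812198/valueerror-in-while-predict-where-test-data-is-having-different-shape-of-word-ve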

transformer tokenizer: https://huggingface.co/transformers/v2.11.0/main_classes/tokenizer.html#:~:text=max_length%20


In [None]:
%%time

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Both vectorizers take a one-dimensional collection of strings ~ shape (n,).
# fit_transform learns the vocabulary from the training split and vectorises it;
# transform only projects the test split onto that already-learned vocabulary.

# raw term counts
vect_cv = CountVectorizer()
X_train_cv = vect_cv.fit_transform(x_train)
X_test_cv = vect_cv.transform(x_test)

# tf-idf weighted counts
vect_tfidf = TfidfVectorizer()
X_train_tfidf = vect_tfidf.fit_transform(x_train)
X_test_tfidf = vect_tfidf.transform(x_test)

In [None]:
%%time

from sentence_transformers import SentenceTransformer

# multilingual sentence encoder, applied to the raw (uncleaned) text splits
embed_model = SentenceTransformer('distiluse-base-multilingual-cased-v1')

# one embedding matrix per split, train first and then test
embedding_train, embedding_test = (
    embed_model.encode(documents) for documents in (x_train_r, x_test_r)
)

In [None]:
%%time

from transformers import AutoTokenizer
import torch

tokenizer = AutoTokenizer.from_pretrained('xlm-roberta-base')


def tokenize(batch):
    """Encode a list of texts with the XLM-RoBERTa tokenizer.

    Texts are truncated to at most 512 tokens and padded to a common length.
    The result behaves like a dictionary with two entries, ``input_ids`` and
    ``attention_mask``, both holding PyTorch tensors.
    """
    encoding_options = {
        'padding': True,
        'truncation': True,
        'return_tensors': 'pt',
        'max_length': 512,
    }
    return tokenizer(batch, **encoding_options)


# encode the raw text splits
X_train_xlm, X_test_xlm = tokenize(x_train_r), tokenize(x_test_r)

# labels as tensors, matching the encoded inputs
y_train_xlm, y_test_xlm = torch.tensor(y_train_r), torch.tensor(y_test_r)

# dimensions of the encoded id matrices (train, test)
a = X_train_xlm['input_ids'].shape
b = X_test_xlm['input_ids'].shape

In [None]:
# --- shapes of every feature representation ---

print('document-term matrix\n')
print(f'count based vectors - cv & tfidf:\n {X_train_cv.shape, X_test_cv.shape} \n {X_train_tfidf.shape, X_test_tfidf.shape}\n')
print(f'embedding - sentence transformer:\n {embedding_train.shape, embedding_test.shape}\n')
print(f'embedding - xlm roberta:\n {a} {b}\n') # size is batch_size, n_tokens

# --- transformers: what the tokenizer returned ---

# the tokenizer output is a container of tensors, so check its type explicitly
print(torch.is_tensor(X_train_xlm))
print(type(X_train_xlm))

print(f'xlm vocabulary size: {tokenizer.vocab_size} \nmodel context size: {tokenizer.model_max_length}\nmodel input {tokenizer.model_input_names}\n')
print(f'xlm input ids:\n {X_train_xlm.input_ids}\nxlm attention masks:\n {X_train_xlm.attention_mask}\n')

print('let\'s explore an example:\n')
train_input_ids = X_train_xlm['input_ids']
print(train_input_ids[10].size())

# every sequence is padded to a common length (at most max_length=512 tokens);
# the attention mask is used to ignore the padded positions of the batch
first_document_ids = train_input_ids[0]
print(first_document_ids)
print(tokenizer.convert_ids_to_tokens(first_document_ids))

# class imbalance

The class distribution is highly skewed, so the learning algorithm tends to be
biased towards the majority class, leading to poor predictions for the minority class.

Approaches to deal with imbalanced datasets are:

- undersample the majority class: discards potentially valuable data
- oversample the minority class: can lead to overfitting and increases training time
- weight the loss function: assigns higher importance to the minority class, providing a direct optimisation approach
  (minority sample: $w_j$ will be high; majority sample: $w_j$ will be low)
- synthetic datasets: can be generated to complement the minority class and increase its representation

In [None]:
%%time

# Resample the vectorised training split. Every fit_resample call returns an
# (X_resampled, y_resampled) tuple, so res_cv / res_tfidf are lists of tuples
# ordered like `resampler`: [under-sampled, over-sampled, ADASYN].

from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import ADASYN # generates synthetic samples in regions of the minority class where the class density is low

rus = RandomUnderSampler(random_state=42, sampling_strategy=1)
ros = RandomOverSampler(random_state=42, sampling_strategy='not majority')
ada = ADASYN(random_state=42)

resampler = [rus, ros, ada]

# train set

y_train_arr = np.array(y_train)

# Comprehensions keep the loop variable local. The previous `for x in resampler`
# loops overwrote the dataset variable `x`, which broke re-running the split cell.
res_cv = [sampler.fit_resample(X_train_cv, y_train_arr) for sampler in resampler]

res_tfidf = [sampler.fit_resample(X_train_tfidf, y_train_arr) for sampler in resampler]

# oversample embeddings

# the embeddings were computed from the raw split, so pair them with its labels
embedding_train_ros, y_train_ros = ros.fit_resample(embedding_train, y_train_r)

In [None]:
from collections import Counter

# Only the training embeddings are oversampled, so there is no `embedding_test_ros`;
# report the size of the oversampled training set instead. It is printed because a
# bare expression that is not the last line of a cell produces no output.
print(len(embedding_train_ros))

# class frequencies after random oversampling
counter = Counter(y_train_ros)

counter

In [None]:
import mpu

# Persist every artefact with mpu.io.write. The table maps target path -> object
# and is written in insertion order.
# res_tfidf / res_cv are indexed as [resampler][0 = X, 1 = y], with the
# resamplers ordered rus, ros, ada.
# NOTE(review): only the oversampled embeddings go to ../data/; everything else
# is written to the current working directory - confirm this is intended.

artifacts = {
    # baseline
    'X_train_cv.pickle': X_train_cv,
    'X_test_cv.pickle': X_test_cv,
    'X_train_tfidf.pickle': X_train_tfidf,
    'X_test_tfidf.pickle': X_test_tfidf,
    'embedding_train.pickle': embedding_train,
    'embedding_test.pickle': embedding_test,
    'y_train.pickle': y_train,
    'y_test.pickle': y_test,
    # resampled
    'X_rus_train_tfidf.pickle': res_tfidf[0][0],
    'y_rus_train_tfidf.pickle': res_tfidf[0][1],
    'X_ros_train_tfidf.pickle': res_tfidf[1][0],
    'y_ros_train_tfidf.pickle': res_tfidf[1][1],
    'X_ada_train_tfidf.pickle': res_tfidf[2][0],
    'y_ada_train_tfidf.pickle': res_tfidf[2][1],
    'X_rus_train_cv.pickle': res_cv[0][0],
    'y_rus_train_cv.pickle': res_cv[0][1],
    'X_ros_train_cv.pickle': res_cv[1][0],
    'y_ros_train_cv.pickle': res_cv[1][1],
    'X_ada_train_cv.pickle': res_cv[2][0],
    'y_ada_train_cv.pickle': res_cv[2][1],
    '../data/x_emb_train_ros.pickle': embedding_train_ros,
    '../data/y_emb_train_ros.pickle': y_train_ros,
    # xlm
    'X_train_xlm.pickle': X_train_xlm,
    'X_test_xlm.pickle': X_test_xlm,
    'y_train_xlm.pickle': y_train_xlm,
    'y_test_xlm.pickle': y_test_xlm,
}

for target_path, payload in artifacts.items():
    mpu.io.write(target_path, payload)