In [None]:
# February 2024
# Resampling and extracting text 
# Violeta Berdejo-Espinola

In [5]:
import numpy as np
import os
import mpu

# read data

We read two types of data with two different lengths, totalling four corpuses. <br> 
- title and abstract text with minimal pre-processing (removed only URL) <br>
- title abstract and main text text with minimal pre-processing (removed only URL) <br>
- title and abstract pre-processed text <br>
- title abstract and main re-processed tex

In [6]:
# clean corpus

corpus = mpu.io.read('../data/corpus_clean.pickle')
corpus_long = mpu.io.read('../data/corpus_clean_long.pickle')

x = corpus
x_long = corpus_long
y = [1] * len(corpus[:62]) + [0] * len(corpus[62:])

# raw corpus

corpus_raw = mpu.io.read('../data/corpus_raw.pickle')
corpus_raw_long = mpu.io.read('../data/corpus_raw_long.pickle')

x_raw = corpus_raw
x_raw_long = corpus_raw_long

# split data

In [7]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=42)
x_train_long, x_test_long, y_train_long, y_test_long = train_test_split(x_long, y, test_size=0.20, random_state=42)
x_train_r, x_test_r, y_train_r, y_test_r = train_test_split(x_raw, y, test_size=0.20, random_state=42)
x_train_r_long, x_test_r_long, y_train_r_long, y_test_r_long = train_test_split(x_raw_long, y, test_size=0.20, random_state=42)

# check instances in each class

from collections import Counter

cntt = Counter()

for instance_per_class in y_train_long:
    cntt[instance_per_class] += 1

cntt

Counter({0: 3966, 1: 49})

# feature extraction

https://stackoverflow.com/questions/62812198/valueerror-in-while-predict-where-test-data-is-having-different-shape-of-word-ve

transformer tokenizer: https://huggingface.co/transformers/v2.11.0/main_classes/tokenizer.html#:~:text=max_length%20


# count-based

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer # uses one-dim array of strings ~ shape (n,)
from sklearn.feature_extraction.text import CountVectorizer # returns arrays

vect_tfidf = TfidfVectorizer()
vect_cv = CountVectorizer()

# corpus with title and abstract

X_train_cv = vect_cv.fit_transform(x_train)                    #fit: tokenize & buid vocab (turn object into an estimator) model learns the vectors to which they are used to transform data
X_train_tfidf = vect_tfidf.fit_transform(x_train)              #transform: transforms instances into matrices
X_test_cv = vect_cv.transform(x_test)
X_test_tfidf = vect_tfidf.transform(x_test) 

import mpu

mpu.io.write('X_train_cv.pickle', X_train_cv)
mpu.io.write('X_test_cv.pickle', X_test_cv)

mpu.io.write('X_train_tfidf.pickle', X_train_tfidf)
mpu.io.write('X_test_tfidf.pickle', X_test_tfidf)

mpu.io.write('y_train.pickle', y_train)
mpu.io.write('y_test.pickle', y_test)


In [10]:
# corpus with title abstract and main text 

X_train_cv = vect_cv.fit_transform(x_train_long)                    #fit: tokenize & buid vocab (turn object into an estimator)
X_train_tfidf = vect_tfidf.fit_transform(x_train_long)              #transform: instances into matrices
X_test_cv = vect_cv.transform(x_test_long)
X_test_tfidf = vect_tfidf.transform(x_test_long) 

mpu.io.write('X_train_long_cv.pickle', X_train_cv)
mpu.io.write('X_test_long_cv.pickle', X_test_cv)

mpu.io.write('X_train_long_tfidf.pickle', X_train_tfidf)
mpu.io.write('X_test_long_tfidf.pickle', X_test_tfidf)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 598947 stored elements and shape (1004, 162943)>

# embedding text

output are dense vector that encapsulate semantic meaning

In [8]:
from sentence_transformers import SentenceTransformer
import torch
import platform

print(torch.__version__) # checking pytorch version 2.0 or more
print(platform.mac_ver()) # checking pytorch version for mac - should be arm64
print(torch.backends.mps.is_built())  # checking if mps is available

  from tqdm.autonotebook import tqdm, trange


2.5.0
('14.7', ('', '', ''), 'arm64')
True


In [17]:
model_distil = SentenceTransformer('distiluse-base-multilingual-cased-v1') 

distil_train = model_distil.encode(x_train_r_long)
distil_test = model_distil.encode(x_test_r_long)

mpu.io.write('../data/embedding_train_long.pickle', distil_train)
mpu.io.write('../data/embedding_test_long.pickle', distil_test)

array([[-0.00914845, -0.00346731,  0.01054023, ...,  0.00480202,
         0.03056456,  0.10237923],
       [ 0.01896946,  0.0260251 , -0.00869109, ..., -0.0671379 ,
         0.03614533, -0.01881293],
       [ 0.00585423, -0.06012807, -0.03169392, ...,  0.00432672,
         0.05887518, -0.05666463],
       ...,
       [-0.00157769,  0.08838714,  0.00691193, ..., -0.04349812,
         0.00940177, -0.00767317],
       [-0.00077686, -0.0094168 ,  0.01987838, ...,  0.00946549,
         0.02030043,  0.05779348],
       [-0.02183528,  0.01464673,  0.07375681, ...,  0.01719196,
         0.0200747 , -0.02840189]], dtype=float32)

In [9]:
model_gemma2 = SentenceTransformer("BAAI/bge-multilingual-gemma2", 
                                    model_kwargs={"torch_dtype": torch.float16},
                                    device=torch.device('mps'))

gemma2_train = model_gemma2.encode(x_train_r_long)
gemma2_test = model_gemma2.encode(x_test_r_long)

mpu.io.write('../data/gemma2_train_long.pickle', gemma2_train)
mpu.io.write('../data/gemma2_test_long.pickle', gemma2_test)

Loading checkpoint shards: 100%|██████████| 4/4 [00:32<00:00,  8.14s/it]


array([[ 1.254  ,  0.54   , -2.879  , ..., -0.03717, -0.5786 ,  0.6885 ],
       [-3.59   ,  4.508  , -1.722  , ..., -0.997  ,  3.465  , -0.9683 ],
       [-0.8643 ,  3.86   ,  4.34   , ..., -0.6377 ,  3.936  , -3.604  ],
       ...,
       [-2.873  , -1.105  , -1.051  , ..., -2.984  ,  0.1659 ,  2.074  ],
       [ 0.4363 ,  2.512  ,  2.988  , ...,  2.98   ,  1.547  , -1.72   ],
       [ 1.334  ,  1.533  ,  1.433  , ...,  1.312  ,  5.16   , -2.514  ]],
      dtype=float16)

# pre-trained model as a feature extractor
output is a sequence of token IDs and attention masks

In [None]:
from transformers import AutoTokenizer
import torch

tokenizer = AutoTokenizer.from_pretrained('xlm-roberta-base')
     
def tokenize(batch):
    return tokenizer(batch, padding=True, truncation=True, return_tensors='pt', max_length=512)
    # returns dictionary with two key:value. input ids:tensors and attention mask:tensors both of them contain tensors
    
X_train_xlm = tokenize(x_train_r)                              
X_test_xlm = tokenize(x_test_r)

y_train_xlm = torch.tensor(y_train_r) 
y_test_xlm = torch.tensor(y_test_r) 

a = X_train_xlm['input_ids'].size()
b = X_test_xlm['input_ids'].size()

In [None]:
mpu.io.write('X_train_xlm.pickle', X_train_xlm)
mpu.io.write('X_test_xlm.pickle', X_test_xlm)
mpu.io.write('y_train_xlm.pickle', y_train_xlm)
mpu.io.write('y_test_xlm.pickle', y_test_xlm)

In [None]:
print('document-term matrix\n')
print(f'count based vectors - cv & tfidf:\n {X_train_cv.shape, X_test_cv.shape} \n {X_train_tfidf.shape, X_test_tfidf.shape}\n')
print(f'embedding - sentence transformer:\n {embedding_train.shape, embedding_test.shape}\n')
print(f'embedding - xlm roberta:\n {a} {b}\n') # size is batch_size, n_tokens

# transformers 

print(torch.is_tensor(X_train_xlm))
print(type(X_train_xlm))

print(f'xlm vocabulary size: {tokenizer.vocab_size} \nmodel context size: {tokenizer.model_max_length}\nmodel input {tokenizer.model_input_names}\n')
print(f'xlm input ids:\n {X_train_xlm.input_ids}\nxlm attention masks:\n {X_train_xlm.attention_mask}\n')

print('let\'s explore an example:\n')
print(X_train_xlm['input_ids'][10].size())

# the input sequence of each batch is padded [1] to the maximum sequence length in the batch (model context size)
# the attention mask array is used to ignore the paddded areas of the betch 
print(X_train_xlm['input_ids'][0])
print(tokenizer.convert_ids_to_tokens(X_train_xlm['input_ids'][0]))

# dealing with class imbalance

the distribution of one class is highly skewed so the learning algorithm might 
tend to be biased towards the majority class leading to poor predictions for the minority class

 approaches to deal with imbalanced datasets are:
 
- undersample majority class: discards potentially valuable data
- oversample minority class: can lead to overfitting and increases training times
- weight loss function: assigns higher importance to minority classes providing a direct optimisation approach.
sample: wj will be high
majority sample: wj will be low
- synthetic datasets: could be generated to complement the the minority class and increase its representation

In [None]:
%%time

# resample vectorised x_train and y_train - > returns list of arrays

from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import ADASYN # generates synthetic samples in regions of the minority class where the class density is low

rus = RandomUnderSampler(random_state=42, sampling_strategy=1)
ros = RandomOverSampler(random_state=42, sampling_strategy='not majority')
ada = ADASYN(random_state=42)

resampler = [rus, ros, ada]

# train set

res_cv = []
for x in resampler:
    res_cv.append(x.fit_resample(X_train_cv, np.array(y_train)))
    
res_tfidf = []
for x in resampler:
    res_tfidf.append(x.fit_resample(X_train_tfidf, np.array(y_train)))
    
# oversample embeddings

embedding_train_ros, y_train_ros = ros.fit_resample(distil_train, y_train)

# checking number of instances per class

len(embedding_test_ros)

from collections import Counter
counter = Counter()

for instance_per_class in y_train_ros:
    counter[instance_per_class] +=1
    
counter

In [None]:
# resampled

mpu.io.write('X_rus_train_tfidf.pickle', res_tfidf[0][0])
mpu.io.write('y_rus_train_tfidf.pickle', res_tfidf[0][1])

mpu.io.write('X_ros_train_tfidf.pickle', res_tfidf[1][0])
mpu.io.write('y_ros_train_tfidf.pickle', res_tfidf[1][1])

mpu.io.write('X_ada_train_tfidf.pickle', res_tfidf[2][0])
mpu.io.write('y_ada_train_tfidf.pickle', res_tfidf[2][1])

mpu.io.write('X_rus_train_cv.pickle', res_cv[0][0])
mpu.io.write('y_rus_train_cv.pickle', res_cv[0][1])

mpu.io.write('X_ros_train_cv.pickle', res_cv[1][0])
mpu.io.write('y_ros_train_cv.pickle', res_cv[1][1])

mpu.io.write('X_ada_train_cv.pickle', res_cv[2][0])
mpu.io.write('y_ada_train_cv.pickle', res_cv[2][1])

mpu.io.write('../data/x_emb_train_ros.pickle', embedding_train_ros) 
mpu.io.write('../data/y_emb_train_ros.pickle', y_train_ros)