In [None]:
from IPython.display import display, HTML

# Inject a CSS rule that forces the `.container` element to 100% width, so the
# notebook uses the whole browser window.
full_width_css = HTML("<style>.container { width:100% !important; }</style>")
display(full_width_css)

import mpu
import torch
import transformers
import platform

# Environment sanity report, one value per line, in this order:
#   1. PyTorch version (the author expects 2.0 or newer)
#   2. platform.mac_ver() tuple: macOS release, version info, machine
#      (the author expects the machine field to be arm64)
#   3. transformers version
#   4. whether this PyTorch build was compiled with MPS support; is_built() reports
#      build support only, not whether an MPS device is actually usable
print(
    torch.__version__,
    platform.mac_ver(),
    transformers.__version__,
    torch.backends.mps.is_built(),
    sep='\n',
)
# Pick the best available accelerator instead of hard-coding 'mps', so the notebook
# also runs on machines without Apple's Metal backend: MPS first, then CUDA, then CPU.
if torch.backends.mps.is_available():
    device = torch.device('mps')
elif torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(device)

In [None]:
# read raw corpus
#
# The corpus is stored positives-first: the first N_POS items are labelled 1 and the
# items from N_POS up to (not including) CORPUS_END are labelled 0.
#
# NOTE(security): the file has a .pickle extension and is presumably unpickled on load;
# unpickling can execute arbitrary code, so only read files from a trusted source.

CORPUS_PATH = '../data/corpus_raw.pickle'
N_POS = 62          # number of leading items that belong to the positive class
CORPUS_END = 5020   # exclusive upper bound of the slice used for the negative class

corpus_raw = mpu.io.read(CORPUS_PATH)

pos = corpus_raw[:N_POS]
neg = corpus_raw[N_POS:CORPUS_END]

x = pos + neg
y = [1] * len(pos) + [0] * len(neg)

# Sizes of: labels, samples, negatives, positives.
print(len(y), len(x), len(neg), len(pos))

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the data for testing. The positive class is a small minority of the
# corpus, so the split is stratified on the labels: both partitions keep the same class
# ratio, and the test partition cannot end up with almost no positives by chance.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.20, random_state=42, stratify=y
)

# Per-class sample counts in the training partition, shown as [count, label] pairs.
[[y_train.count(item), item] for item in set(y_train)]

In [None]:
# resample data
#
# Balance the classes by randomly duplicating samples of every class except the
# majority one (sampling_strategy='not majority').

import numpy as np
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import ADASYN

ros = RandomOverSampler(random_state=42, sampling_strategy='not majority')
ada = ADASYN(random_state=42)  # instantiated here but not used in this cell


def _oversample(sampler, samples, labels):
    """Resample `samples`/`labels` with `sampler` and return both as plain lists.

    `fit_resample` expects X shaped (n_samples, n_features), so a flat list of
    documents cannot be passed in directly. The sampler is fitted on a single
    column of row indices instead, and the rows it selected (`sample_indices_`)
    are then picked from the original sequences. Random over-sampling chooses rows
    from the labels and the random state only, so the selection does not depend
    on what X contains.

    Args:
        sampler: a fitted-on-call sampler exposing `fit_resample` and `sample_indices_`.
        samples: sequence of items to resample (any element type).
        labels: sequence of class labels, aligned with `samples`.

    Returns:
        (resampled_samples, resampled_labels) as two lists of equal length.
    """
    row_ids = np.arange(len(samples)).reshape(-1, 1)
    sampler.fit_resample(row_ids, labels)
    chosen = sampler.sample_indices_
    return [samples[i] for i in chosen], [labels[i] for i in chosen]


x_train_ros, y_train_ros = _oversample(ros, x_train, y_train)

# NOTE(review): the held-out split is oversampled as well. Metrics computed on it will
# not reflect the real class distribution - confirm this is intended, or evaluate on
# the untouched x_test / y_test instead.
x_test_ros, y_test_ros = _oversample(ros, x_test, y_test)

In [None]:
from collections import Counter

# Tally how many samples of each class the resampled test labels contain.
# Counter consumes the iterable directly, so no manual counting loop is needed.
counter = Counter(y_test_ros)

counter

In [None]:
from datasets.dataset_dict import DatasetDict
from datasets import Dataset

# Wrap the resampled splits in a DatasetDict with a 'train' and a 'test' entry.
# Each split has a 'text' column (the samples) and a 'label' column (the classes).
# Reference: https://huggingface.co/docs/datasets/en/access

train_split = Dataset.from_dict({'text': x_train_ros, 'label': y_train_ros})
test_split = Dataset.from_dict({'text': x_test_ros, 'label': y_test_ros})

d = DatasetDict({'train': train_split, 'test': test_split})

d

In [None]:
import torch
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('xlm-roberta-base')


def tokenize_text(examples):
    """Tokenize one batch of examples; meant to be used with `map(..., batched=True)`.

    Args:
        examples: the batch passed in by `map`; only its 'text' entry is read.

    Returns:
        The tokenizer output for the batch, truncated to at most 512 tokens and
        padded to the longest sequence within that batch.
    """
    # Plain Python lists are returned on purpose. `map` writes the result back into
    # the Arrow-backed dataset, so building torch tensors and moving them to the
    # accelerator here would only add a device round trip (and tie this cell to the
    # global `device`). Create tensors / move them to the device when batches are
    # fed to the model instead.
    return tokenizer(examples['text'], truncation=True, max_length=512, padding=True)


d_tokenized = d.map(tokenize_text, batched=True)

print(d_tokenized)

In [None]:
# Persist the tokenized DatasetDict so later steps can reuse it without re-tokenizing.
# NOTE(review): the .pickle extension suggests pickle serialization - TODO confirm;
# pickle files should only ever be loaded from trusted sources.
mpu.io.write('../data/d_tokenized_ros.pickle', d_tokenized)

In [None]:
# class imbalance

# Per-class weights from the class frequencies of the full corpus (before splitting
# and resampling): index 0 is 1 - share of negatives, index 1 is 1 - share of
# positives, so the rarer class gets the larger weight.
# NOTE(review): the training split is also oversampled; if these weights are applied
# on top of it, the imbalance would be compensated twice - confirm how they are used.
class_weights = [(1 - (len(neg) / len(x))), (1 - (len(pos) / len(x)))]

# torch.tensor with an explicit dtype replaces the legacy torch.Tensor constructor.
class_weights = torch.tensor(class_weights, dtype=torch.float32).to(device)

# NOTE(review): the tensor is written while it lives on `device`; a tensor saved from
# an accelerator may not load on a machine without that backend - consider saving
# class_weights.cpu() if the file has to be portable.
mpu.io.write('../data/class_weights.pickle', class_weights)

# A bare expression is only displayed when it is the last statement of the cell.
class_weights