In [None]:
# Print GPU info
from tensorflow.python.client import device_lib
device_lib.list_local_devices()

In [None]:
# Install required packages
! pip install --upgrade nltk
! pip install evaluate transformers datasets pandarallel

In [2]:
# Imports
import evaluate
import heapq
import huggingface_hub
import nltk
import numpy as np
import pandas as pd
import pickle
from collections import Counter
from datasets import Dataset, load_dataset
from google.colab import drive
from nltk.util import ngrams
from pandarallel import pandarallel
from pathlib import Path
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, precision_recall_fscore_support
from transformers import AutoModelForSequenceClassification, AutoTokenizer, DataCollatorWithPadding, Trainer, TrainingArguments

In [None]:
# Mount drive
drive.mount('/content/drive', force_remount=True)
!ls "/content/drive/My Drive"

In [4]:
# Set required variables
checkpoint_folder_root = Path('/content/drive/My Drive/DP/checkpoints')
models_folder_root = Path('/content/drive/My Drive/DP/models')
dataset_folder_root = Path('/content/drive/My Drive/DP/datasets/csfd')

train_file_path = dataset_folder_root / 'train_top5_withoutOOC.csv'
test_file_path = dataset_folder_root / 'test_top5_withoutOOC.csv'

checkpoint_folder = checkpoint_folder_root / 'xlm-roberta'

xlmroberta_folder = models_folder_root / 'xlm-roberta'
style_lr_folder = models_folder_root / 'style-lr'
ngrams_lr_folder = models_folder_root / 'ngrams-lr'
final_lr_folder = models_folder_root / 'final-lr'

model_base = 'xlm-roberta-base'

In [5]:
# Ensure all directories exist
dataset_folder_root.mkdir(parents=True, exist_ok=True)
checkpoint_folder.mkdir(parents=True, exist_ok=True)
xlmroberta_folder.mkdir(parents=True, exist_ok=True)
style_lr_folder.mkdir(parents=True, exist_ok=True)
ngrams_lr_folder.mkdir(parents=True, exist_ok=True)
final_lr_folder.mkdir(parents=True, exist_ok=True)

In [9]:
def return_best_bi_grams(text):
  bigrams = ngrams(text,2)

  data = dict(Counter(bigrams))
  list_ngrams = heapq.nlargest(100, data.keys(), key=lambda k: data[k])
  return list_ngrams

def return_best_tri_grams(text):
  trigrams = ngrams(text,3)

  data = dict(Counter(trigrams))
  list_ngrams = heapq.nlargest(100, data.keys(), key=lambda k: data[k])
  return list_ngrams

def find_freq_n_gram_in_txt(text, list_bigram, list_trigram):
  to_ret = []

  num_bigrams = len(Counter(zip(text,text[1:])))
  num_trigrams = len(Counter(zip(text,text[1:],text[2:])))

  for n_gram in list_bigram:
      to_ret.append(text.count(''.join(n_gram))/num_bigrams)

  for n_gram in list_trigram:
      to_ret.append(text.count(''.join(n_gram))/num_trigrams)

  return to_ret

In [10]:
# Load datasets
train_df = pd.read_csv(train_file_path)
test_df = pd.read_csv(test_file_path)

# Split datasets into feature dataset and dataset with labels and text
train_features_df = train_df.drop(columns=['label', 'text'])
test_features_df = test_df.drop(columns=['label', 'text'])
train_df = train_df[['label', 'text']]
test_df = test_df[['label', 'text']]

text = " ".join(pd.concat([train_df, test_df], ignore_index=True)['text'].values)
list_bigram = return_best_bi_grams(text)
list_trigram = return_best_tri_grams(text)

train_ds = Dataset.from_pandas(train_df, split="train")
test_ds = Dataset.from_pandas(test_df, split="test")

train_ds[0]

{'label': 3,
 'text': 'Abych pravdu řekl, příliš jsem si od pátého pokračování této franšízy nesliboval a ve výsledku jsem tak nebyl nemile překvapen. Přes množství různých akčních scén a typických honiček jsem se po většinu času mírně nudil; ne, že by se o mě pokoušel spánek, na to se tam stále něco dělo, problém byl spíš v tom, že mě snímek za celou dobu nedokázal doopravdy zaujmout. Myslel jsem si, že filmu zajistí alespoň jednu pěticípou nostalgická nálada, se kterou jsem najisto počítal, k mému rozčarování ale s nostalgií kalkulovali i tvůrci, kteří celou záležitost prošpikovali odkazy na předchozí díly, a z toho, jak zalidnili děj postavami z Indyho minulosti, usuzuji, že jsou i samotní tvůrci zkušenými archeology. Celkově šlo spíše o vzpomínkový počin, v němž vystupovalo až příliš mnoho starých známých, recyklovala se nejen zápletka s nácky, protože došlo i na dětského hrdinu, nezapomnělo se na různá oku nelahodící lezolata, a kdyby to pomaleji chápajícím divákům nestačilo, dopr

In [11]:
# Define metric function
accuracy = evaluate.load("accuracy")

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='macro')
    acc = accuracy.compute(references=labels, predictions=preds)
    return {
        'accuracy': acc["accuracy"],
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [None]:
# Tokenize dataset
tokenizer = AutoTokenizer.from_pretrained(model_base)

def tokenize_function(examples):
    return tokenizer(examples["text"], truncation=True)

tokenized_train_ds = train_ds.map(tokenize_function, batched=True)
tokenized_test_ds = test_ds.map(tokenize_function, batched=True)

tokenized_train_ds[0]

# Create data collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [24]:
author_names = test_df['label'].unique()
num_of_authors = len(author_names)
id2label = {int(i): str(author_names[i]) for i in range(num_of_authors)}
label2id = {str(author_names[i]): int(i) for i in range(num_of_authors)}

In [None]:
# Create model and training arguments

model = AutoModelForSequenceClassification.from_pretrained(
    model_base, num_labels=num_of_authors, id2label=id2label, label2id=label2id
)

training_args = TrainingArguments(
    output_dir=str(checkpoint_folder),
    learning_rate=2e-5,
    per_device_train_batch_size=12,
    per_device_eval_batch_size=12,
    num_train_epochs=2, # 10
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=1,
    load_best_model_at_end=True,
    push_to_hub=False,
    report_to=["tensorboard"],
)


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_ds,
    eval_dataset=tokenized_test_ds,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [27]:
# Train the model
trainer.train()

# Save the model
tokenizer.save_pretrained(str(xlmroberta_folder))
trainer.save_model(str(xlmroberta_folder))

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.6631,0.253165,0.909888,0.909809,0.912003,0.909937
2,0.1827,0.166228,0.947634,0.947958,0.94838,0.947658


In [45]:
from scipy.special import softmax

In [None]:
output = trainer.predict(tokenized_test_ds)
roberta_test_output = output.predictions

output = trainer.predict(tokenized_train_ds)
roberta_train_output = output.predictions

In [54]:
# Style-based classifier
style_clf = LogisticRegression(random_state=0).fit(train_features_df, train_df['label'])
style_test_pred = style_clf.predict(test_features_df)
style_test_proba = style_clf.predict_proba(test_features_df)
style_train_proba = style_clf.predict_proba(train_features_df)

style_score = accuracy_score(test_df['label'], style_test_pred)
style_f1 = f1_score(test_df['label'], style_test_pred, average='macro')

print("Style accuracy is: ", style_score)
print("Style f1-score is: ", style_f1)

# Save model
filepath = style_lr_folder / 'style_clf.pkl'
with filepath.open("wb") as f:
  pickle.dump(style_clf, f)

(3762,)
2
(3762, 5)
[0.01828129 0.18770114 0.44851867 0.16242188 0.18307702]
Style training done, accuracy is :  0.4864433811802233
Style training done, f1-score is :  0.4568892384825438


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [55]:
# Character N-gram classifier
train_ngrams = train_df['text'].apply(lambda x: find_freq_n_gram_in_txt(x, list_bigram, list_trigram)).values
test_ngrams = test_df['text'].apply(lambda x: find_freq_n_gram_in_txt(x, list_bigram, list_trigram)).values

train_ngrams_df = pd.DataFrame(train_ngrams)[0].apply(lambda x: pd.Series(x))
test_ngrams_df = pd.DataFrame(test_ngrams)[0].apply(lambda x: pd.Series(x))

In [56]:
ngrams_clf = LogisticRegression(random_state=0).fit(train_ngrams_df, train_df['label'])
ngrams_test_pred = ngrams_clf.predict(test_ngrams_df)
ngrams_test_proba = ngrams_clf.predict_proba(test_ngrams_df)
ngrams_train_proba = ngrams_clf.predict_proba(train_ngrams_df)

ngrams_score = accuracy_score(test_df['label'], ngrams_test_pred)
ngrams_f1 = f1_score(test_df['label'], ngrams_test_pred, average='macro')

print("N-grams accuracy is: ", ngrams_score)
print("N-grams f1-score is: ", ngrams_f1)

# Save model
filepath = ngrams_lr_folder / 'ngrams_clf.pkl'
with filepath.open("wb") as f:
  pickle.dump(ngrams_clf, f)

N-grams training done, accuracy is :  0.73471557682084
N-grams training done, f1-score is :  0.7283267184983201


In [57]:
# Combined model
train_combined = np.concatenate([roberta_train_output, style_train_proba, ngrams_train_proba], axis=1)
test_combined = np.concatenate([roberta_test_output, style_test_proba, ngrams_test_proba], axis=1)

final_clf = LogisticRegression(random_state=0).fit(train_combined, train_df['label'])

final_pred = final_clf.predict(test_combined)
final_score = accuracy_score(test_df['label'], final_pred)
final_f1 = f1_score(test_df['label'], final_pred, average='macro')

print("Final accuracy is: ", final_score)
print("Final f1-score is: ", final_f1)

filepath = final_lr_folder / 'final_clf.pkl'
with filepath.open("wb") as f:
  pickle.dump(final_clf, f)

Training done, accuracy is :  0.9516214779372674
Training done, f1-score is :  0.9517676883363408


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
