In [None]:
!pip install transformers datasets evaluate

In [None]:
import dask.dataframe as dd
from transformers import AutoTokenizer
from sklearn.model_selection import train_test_split
from datasets import Dataset
from transformers import DataCollatorWithPadding
import evaluate
import numpy as np
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
import pandas as pd
import torch
from tqdm import tqdm
import os

In [None]:
!pip install gdown --upgrade --no-cache-dir

In [None]:
!gdown https://drive.google.com/uc?id=1-0H_eBwefHytPeI8MvaK9OCeqbJOnkvQ

In [None]:
!gdown https://drive.google.com/uc?id=1-2HnVeEO8xkgDA9H3cOLCUUIG7W2cSOZ

In [None]:
!gdown https://drive.google.com/uc?id=1-2gzBfy_33vZalt4qqL3ltg2wq--LLEM

In [None]:
# Lazily open every CSV in the working directory whose name matches the glob;
# nothing is loaded into memory until `.compute()` is called.
df = dd.read_csv(urlpath="labelled_data_*.csv")

In [None]:
# Materialise the dask frame as a pandas DataFrame and replace the index with
# a fresh 0..n-1 range, so the positional indices used for the train/test
# split line up with `.loc` labels.
df = df.compute().reset_index(drop=True)

In [None]:
# Preview the first five rows of the loaded data.
df.head(n=5)

In [None]:
# Name of the compute device: "cuda" when a GPU is visible to torch, else "cpu".
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [None]:
# Tokenizer matching the DistilBERT checkpoint that is fine-tuned below.
tokenizer = AutoTokenizer.from_pretrained(
    "distilbert-base-uncased",
)

In [None]:
# Stratified 80/20 split over row positions (valid as `.loc` labels because the
# index was reset to 0..n-1 above).
# `random_state` is fixed so the split — and therefore every reported metric —
# is reproducible across Restart & Run All.
train_idx, test_idx = train_test_split(
    list(range(len(df))),
    test_size=0.2,
    stratify=df["label"],
    random_state=42,
)

In [None]:
# Wrap the two pandas subsets as Hugging Face datasets.
# `preserve_index=False` stops the (non-range) pandas index of each subset from
# being stored as an extra `__index_level_0__` column in the dataset.
train_ds = Dataset.from_pandas(
    df.loc[train_idx, :], split="train", preserve_index=False
)
test_ds = Dataset.from_pandas(
    df.loc[test_idx, :], split="test", preserve_index=False
)

In [None]:
def preprocess_function(examples):
    """Tokenize the ``"text"`` field of ``examples`` with the notebook's tokenizer.

    Args:
        examples: mapping with a ``"text"`` entry, as passed by ``Dataset.map``.

    Returns:
        The tokenizer output for that text, truncated (``truncation=True``).
        Padding is not applied here.
    """
    encoded = tokenizer(examples["text"], truncation=True)
    return encoded

In [None]:
# Tokenize both splits. `batched=True` hands the tokenizer whole batches of
# rows instead of one row per call, which produces the same columns far faster.
tokenized_train_ds = train_ds.map(preprocess_function, batched=True)
tokenized_test_ds = test_ds.map(preprocess_function, batched=True)

In [None]:
# Collator that pads the examples of each batch when the batch is assembled
# (the tokenization step above truncates but does not pad).
data_collator = DataCollatorWithPadding(tokenizer)

In [None]:
# Load the five evaluation metrics used by `compute_metrics`, in this order.
accuracy, roc_auc, f1, recall, precision = (
    evaluate.load(metric_name)
    for metric_name in ("accuracy", "roc_auc", "f1", "recall", "precision")
)

In [None]:
def compute_metrics(eval_pred):
    """Compute accuracy, ROC AUC, F1, recall and precision for one evaluation pass.

    Args:
        eval_pred: pair ``(logits, labels)``. ``logits`` is indexed as a 2-D
            array with one column per class; column 1 is taken as the positive
            class, matching ``id2label`` (1 -> "POSITIVE") in this notebook.

    Returns:
        A single dict merging the result dicts of the five metrics.
    """
    logits, labels = eval_pred
    logits = np.asarray(logits, dtype=np.float64)

    # Hard class decisions for the threshold-based metrics.
    predictions = np.argmax(logits, axis=1)

    # ROC AUC is a ranking metric: it needs a continuous score per example.
    # Feeding it the argmax labels collapses the curve to a single operating
    # point, so use the softmax probability of the positive class instead.
    # Subtracting the row max keeps the exponentials numerically stable.
    shifted = logits - logits.max(axis=1, keepdims=True)
    probabilities = np.exp(shifted)
    probabilities /= probabilities.sum(axis=1, keepdims=True)
    positive_scores = probabilities[:, 1]

    acc_score = accuracy.compute(predictions=predictions, references=labels)
    roc_score = roc_auc.compute(prediction_scores=positive_scores, references=labels)
    f1_score = f1.compute(predictions=predictions, references=labels)
    recall_score = recall.compute(predictions=predictions, references=labels)
    precision_score = precision.compute(predictions=predictions, references=labels)
    return {**acc_score, **roc_score, **f1_score, **recall_score, **precision_score}

In [None]:
# Class-id -> human-readable name, and the inverse lookup derived from it.
id2label = {0: "NEGATIVE", 1: "POSITIVE"}
label2id = {name: class_id for class_id, name in id2label.items()}

In [None]:
# DistilBERT with a two-class sequence-classification head; the label maps are
# stored on the model config so predictions can be reported by name.
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=2,
    id2label=id2label,
    label2id=label2id,
)

In [None]:
# Hyper-parameters and bookkeeping for the fine-tuning run.
# NOTE(review): newer transformers releases appear to rename
# `evaluation_strategy` to `eval_strategy`; confirm against the installed version.
training_args = TrainingArguments(
    # Where checkpoints (and, by default, the run logs) are written.
    output_dir="./Fake_Text_Detection",
    # Optimisation.
    num_train_epochs=5,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    # Evaluate and checkpoint once per epoch; the two strategies must match
    # for `load_best_model_at_end` to be usable.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    # Emit a training-log entry every 5000 optimisation steps.
    logging_steps=5000,
)

In [None]:
# Assemble the Trainer from the pieces built above and run fine-tuning.
# The held-out split doubles as the per-epoch evaluation set.
# NOTE(review): newer transformers releases appear to prefer `processing_class`
# over `tokenizer`; confirm against the installed version.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=tokenized_train_ds,
    eval_dataset=tokenized_test_ds,
    compute_metrics=compute_metrics,
)
trainer.train()

In [None]:
%tensorboard --logdir ./Fake_Text_Detection/runs