This program trains a BERT model on 14 stories to identify whether the inputted story is published before or after 2014.

In [1]:
import json, torch
import pandas as pd
from datasets import Dataset
from torch.utils.data import DataLoader
from transformers import BertTokenizerFast
from sklearn.model_selection import train_test_split
from transformers import AutoModelForSequenceClassification, BertForSequenceClassification
from transformers import BertForTokenClassification, TrainingArguments, Trainer, AutoTokenizer

2024-06-07 21:27:14.407966: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# This section builds a dataset where the variable "dataset" is a list of dictionaries in this format: [{"story": storyContent, "year": publicationYear, "label": 1 for after 2014 and 0 for before 2014}]

storyDataJsonPath = "publicationYearStory.json"
with open(storyDataJsonPath, "r") as file:
    storyData = json.load(file)
dataset = [{"story": story, "year": int(year)} for year, story in storyData.items()]
for story in dataset:
    story["label"] = 1 if story["year"] > 2004 else 0
#print(dataset)

In [3]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
def tokenizeStories(stories):
    return tokenizer(stories["story"], padding="max_length", truncation=True, max_length=256)

tokenizedStories = [tokenizeStories(storyItem) for storyItem in dataset]
#print(tokenizedStories)

In [4]:
def formatData(tokenizedStories, label):
    return{
        "input_ids": torch.tensor(tokenizedStories["input_ids"]),
        "attentionMask": torch.tensor(tokenizedStories["attention_mask"]),
        "label": torch.tensor(label)
    }
formattedData = [formatData(tokenizedStories[i], dataset[i]["label"]) for i in range(len(tokenizedStories))]
#print(formattedData)

In [5]:
labels = [terms["label"] for terms in dataset]
trainingData, validationData = train_test_split(formattedData, test_size=0.2, stratify=labels, random_state=2000)
#print(trainingData)

In [6]:
# building a class for PyTorch (could use another code if one wants to use pyarrow):
class StoryDataset(torch.utils.data.Dataset):
    def __init__(self, data):
        self.data=data
    def __len__(self):
        return len(self.data)
    def __getitem__(self, index):
        return self.data[index]

trainingDataset = StoryDataset(trainingData)
validationDataset = StoryDataset(validationData)
#print(trainingDataset)

In [7]:
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
trainingArgument = TrainingArguments(output_dir="./trainingResult")
trainer = Trainer(
    model=model,
    args=trainingArgument,
    train_dataset=trainingDataset,
    eval_dataset=validationDataset,
    tokenizer=tokenizer
)

In [9]:
trainer.train()
trainer.evaluate()
model.save_pretrained('./modelTrained/')
tokenizer.save_pretrained("./modelTrained/")

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  0%|          | 0/6 [00:00<?, ?it/s]

{'train_runtime': 176.565, 'train_samples_per_second': 0.17, 'train_steps_per_second': 0.034, 'train_loss': 0.6874442100524902, 'epoch': 3.0}


  0%|          | 0/1 [00:00<?, ?it/s]

('./modelTrained/tokenizer_config.json',
 './modelTrained/special_tokens_map.json',
 './modelTrained/vocab.txt',
 './modelTrained/added_tokens.json',
 './modelTrained/tokenizer.json')

In [10]:
#training ends above. The code below processes the user input data.
modelPath = "modelTrained"
newModel = BertForSequenceClassification.from_pretrained(modelPath)
newTokenizer = AutoTokenizer.from_pretrained(modelPath)
newModel.eval()

def predictInputYear(story, model, tokenizer, threshold=0.5):
    inputStory = tokenizer(story, return_tensors="pt", padding="max_length", truncation=True, max_length=256)
    operatingDevice = torch.device("cuda" if torch.cuda.is_available() else "cpu") # cuda -> gpu
    model.to(operatingDevice)
    inputStory = {key: value.to(model.device) for key, value in inputStory.items()}
    with torch.no_grad():
        outputs = model(**inputStory)
        logits = outputs.logits
    # getting probabilities
    probabilities = torch.sigmoid(logits).cpu().numpy().flatten()
    
    # Use 0.5 threshold for binary classification
    label = 1 if probabilities[1] > threshold else 0
    if label == 1:
        return {"sentiment": "After 2014", "probability": probabilities[1]}
    else:
        return {"sentiment": "Before 2014", "probability": probabilities[0]}

In [None]:
newStoryFilePath = "" # Story file in txt
with open(newStoryFilePath, "r") as file:
    content = file.read()

storyInput = content
resultPrediction = predictInputYear(storyInput, model, tokenizer)
print("Prediction results", resultPrediction)

Prediction results {'sentiment': 'After 2014', 'probability': 0.6371219}
