## BERT-based fine-tuned NER model (CoNLL-03)

In [None]:
import os
import csv
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
from tqdm import tqdm

# Load the model and tokenizer
# Both are loaded once at module level from the same checkpoint name and are
# read as globals by extract_spo_from_tweet below.
model_name = "dbmdz/bert-large-cased-finetuned-conll03-english"
# Tokenizer matching the checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Token-classification model used by the "ner" pipeline.
model = AutoModelForTokenClassification.from_pretrained(model_name)

# Lazily-built NER pipeline shared by every call to extract_spo_from_tweet.
# Building the pipeline is loop-invariant, so it is done once instead of once
# per tweet.
_ner_pipeline = None


def _get_ner_pipeline():
    """Return the shared NER pipeline, constructing it on first use."""
    global _ner_pipeline
    if _ner_pipeline is None:
        _ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")
    return _ner_pipeline


# Define function to extract entities and predicate
def extract_spo_from_tweet(tweet):
    """Extract a rough subject/predicate/object triple from one tweet.

    The tweet is run through the NER pipeline. The two entities that appear
    earliest in the text are taken as subject and object, and the text lying
    between them is taken as the predicate.

    Args:
        tweet: The tweet text to analyse.

    Returns:
        A dict with the keys "subject", "predicate" and "object". All three
        values are empty strings when fewer than two entities are found.
    """
    entities = _get_ner_pipeline()(tweet)

    # Ensure we have enough entities to extract
    if len(entities) < 2:
        return {"subject": "", "predicate": "", "object": ""}

    # Order entities by position in the text; the first is assumed to be the
    # subject and the second the object.
    first, second = sorted(entities, key=lambda ent: ent['start'])[:2]

    # Slice between the entities using the character offsets reported by the
    # pipeline. The aggregated 'word' is decoded from tokens and need not have
    # the same length as the span in the original text, so 'end' is used
    # rather than start + len(word).
    predicate = tweet[first['end']:second['start']].strip()

    return {"subject": first['word'], "predicate": predicate, "object": second['word']}

# Function to process all tweets in a file and save to CSV
def process_tweets(input_file, output_file):
    """Extract a triple from every tweet in a text file and write a CSV.

    The input is read as one tweet per line; lines that are blank after
    stripping whitespace are skipped. The output starts with a header row
    followed by one row per tweet: the tweet text, the predicate, the first
    entity and the second entity.

    Args:
        input_file: Path of the text file containing one tweet per line.
        output_file: Path of the CSV file to create (overwritten if present).
    """
    # Tweets routinely contain non-ASCII text (emoji, accents), so the
    # encoding is fixed to UTF-8 instead of relying on the platform default.
    with open(input_file, 'r', encoding='utf-8') as infile, \
            open(output_file, 'w', newline='', encoding='utf-8') as outfile:
        writer = csv.writer(outfile)
        writer.writerow(["Tweet", "Predicate", "Entity 1", "Entity 2"])

        # Read everything up front so tqdm knows the total and can show
        # progress/ETA.
        lines = infile.readlines()
        for line in tqdm(lines, desc="Processing tweets"):
            tweet = line.strip()
            if not tweet:
                continue
            spo_triple = extract_spo_from_tweet(tweet)
            writer.writerow([tweet, spo_triple['predicate'], spo_triple['subject'], spo_triple['object']])

# Process .txt files
# NOTE(review): these paths look like a Colab-mounted Google Drive; adjust
# them when running in another environment.
# Input: plain-text file read line by line as tweets.
input_file = "/content/drive/MyDrive/01-Research/Geo-Isa/Coding/evaluation-1600-annotated-tweets.txt"
# Output: CSV with the extracted predicate and entities per tweet.
output_file = "/content/drive/MyDrive/01-Research/Geo-Isa/Coding/BERT_evaluation_1600_annotated_tweets.csv"
# Runs the full extraction as soon as this cell/module is executed.
process_tweets(input_file, output_file)


Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Processing tweets:   0%|          | 0/1672 [00:00<?, ?it/s]Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.
Processing tweets:   0%|          | 1/1672 [00:00<07:09,  3.89it/s]Hardware accelerator e.g. GPU is avai