In [1]:
import pandas as pd
import ast
import os
print(os.getpid())
import json
from sacrebleu.metrics import BLEU
from nltk.util import ngrams
from collections import Counter
import random
import multiprocessing as mp
from tqdm import tqdm

1119


In [2]:
def preprocess_sentences(sentences):
    """
    Tokenize sentences using Hugging Face's Tokenizers for efficiency.
    """
    from transformers import AutoTokenizer
    tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-multilingual-cased", cache_dir="/mntssd/mnt3/shanshanbai/my_storage_from_qian/.cache/huggingface/hub")
    return [tokenizer.tokenize(tweet) for tweet in all_tweets]

# def preprocess_sentences(sentences):
#     """
#     Tokenize sentences using a whitespace tokenizer.
#     """
#     return [sentence.split() for sentence in sentences]

In [3]:
def calculate_ngram_overlap(candidate, references, n=4):
    candidate_ngrams = Counter(ngrams(candidate, n))
    reference_ngrams = Counter()
    for ref in references:
        reference_ngrams.update(ngrams(ref, n))
    overlap = sum((candidate_ngrams & reference_ngrams).values())
    total_ngrams = sum(candidate_ngrams.values())
    return overlap / total_ngrams if total_ngrams > 0 else 0

In [4]:
def precompute_references(tokenized_sentences, sample_size):
    return [
        random.sample(tokenized_sentences[:i] + tokenized_sentences[i + 1:], min(sample_size, len(tokenized_sentences) - 1))
        for i in range(len(tokenized_sentences))
    ]

In [5]:
def calculate_self_bleu_with_precomputed(index, tokenized_sentences, precomputed_references, n=4):
    """
    Calculate Self-BLEU score for a single sentence using precomputed references.
    """
    candidate = tokenized_sentences[index]
    references = precomputed_references[index]
    bleu = BLEU(effective_order=True)
    return bleu.corpus_score([' '.join(candidate)], [[' '.join(ref)] for ref in references]).score

In [6]:
def compute_self_bleu_optimized(tokenized_sentences, precomputed_references, n=4):
    """
    Compute Self-BLEU score in parallel using precomputed references.
    """
    with mp.Pool(processes=min(mp.cpu_count(), 8)) as pool:
        results = list(
            tqdm(
                pool.starmap(
                    calculate_self_bleu_with_precomputed,
                    [(i, tokenized_sentences, precomputed_references, n) for i in range(len(tokenized_sentences))]
                ),
                total=len(tokenized_sentences),
                desc="Calculating Self-BLEU"
            )
        )
    return sum(results) / len(results)

In [12]:
# Load dataset
# df = pd.read_csv('/mntssd/mnt3/shanshanbai/nlpinearthobservation/synthetic_data/data/synthetic.csv')
df = pd.read_csv('/mntssd/mnt3/shanshanbai/my_storage_from_qian/results/generated tweets/real.csv')

# # Undersample the dataset for balanced class distribution
# min_class_size = 3000
# df = (
#     df.groupby('mapped_class')
#     .apply(lambda x: x.sample(n=min_class_size, random_state=42))
#     .reset_index(drop=True)
# )

# print("Class distribution after undersampling:\n", df['mapped_class'].value_counts())

# Process tweets
df['tweets'] = df['tweet_no_url'].apply(ast.literal_eval) # tweet_no_url
all_tweets = [tweet for sublist in df['tweets'] for tweet in sublist]


In [13]:
len(all_tweets)

25747

In [14]:
tokenized_tweets = preprocess_sentences(all_tweets)

In [15]:
# Precompute references
print("Precomputing references...")
precomputed_references = precompute_references(tokenized_tweets, sample_size=1000)

# Compute Self-BLEU
print("Calculating Self-BLEU...")
self_bleu_score = compute_self_bleu_optimized(tokenized_tweets, precomputed_references, n=4)

print(f"Self-BLEU Score: {self_bleu_score:.4f}")


Precomputing references...
Calculating Self-BLEU...


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Self-BLEU Score: 46.7727



