## Importing libraries

In [1]:
import pandas as pd
from collections import Counter

## Train data

*VUA_train.csv*

In [2]:
def metaphor_stats(df):
    total_tokens = len(df)
    total_sentences = df["sentence"].nunique()
    label_counts = df["label"].value_counts()
    vocab_size = df["word"].nunique()
    avg_sentence_length = df.groupby("sentence").size().mean()
    
    metaphor_counts = df[df["label"] == 1].groupby("sentence").size()
    avg_metaphors_per_sentence = metaphor_counts.mean() if not metaphor_counts.empty else 0.0
    
    top_metaphors = df[df["label"] == 1]["word"].value_counts().head(10)
    
    print(f"Total tokens: {total_tokens}\n")
    print(f"Total sentences: {total_sentences}\n")
    print(f"Label distribution:\n{label_counts}\n")
    print(f"Vocabulary size: {vocab_size}\n")
    print(f"Average sentence length: {avg_sentence_length:.2f} tokens\n")
    print(f"Average metaphors per sentence: {avg_metaphors_per_sentence:.2f}\n")
    print("Top metaphorical words:")
    print(top_metaphors)

In [3]:
original_train_data = pd.read_csv("/Users/urtejakubauskaite/Desktop/Metaphors/Final project/Data/Train Data/VUA_train.csv")
metaphor_stats(original_train_data)

Total tokens: 181488

Total sentences: 10912

Label distribution:
label
0    162311
1     19177
Name: count, dtype: int64

Vocabulary size: 17290

Average sentence length: 16.63 tokens

Average metaphors per sentence: 3.18

Top metaphorical words:
word
in       1315
to        774
on        656
with      607
that      510
this      393
from      305
at        300
about     282
have      178
Name: count, dtype: int64


## Original test data

*VUA_test_all.csv*

In [4]:
original_test_data = pd.read_csv("/Users/urtejakubauskaite/Desktop/Metaphors/Final project/Data/Test Data/VUA_test_all.csv")
metaphor_stats(original_test_data)

Total tokens: 58359

Total sentences: 3814

Label distribution:
label
0    51540
1     6819
Name: count, dtype: int64

Vocabulary size: 7763

Average sentence length: 15.30 tokens

Average metaphors per sentence: 3.18

Top metaphorical words:
word
in       410
to       266
that     226
on       210
with     203
this     144
at       127
about    102
from     101
have      76
Name: count, dtype: int64


## Our test data

In [5]:
def test_metaphor_and_function_word_stats(df):
    function_word_pos = {"DET", "ADP", "PRON", "CCONJ", "SCONJ", "AUX", "PART", "INTJ", "SYM"}

    total_tokens = len(df)
    total_sentences = df["sent_id"].nunique()
    label_counts = df["FINAL"].value_counts()
    vocab_size = df["token_text"].str.lower().nunique()
    
    sentence_lengths = df.groupby("sent_id").size()
    avg_sentence_length = sentence_lengths.mean()
    
    metaphor_counts_per_sentence = df[df["FINAL"] == 1].groupby("sent_id").size()
    metaphor_counts_per_sentence = metaphor_counts_per_sentence.reindex(df["sent_id"].unique(), fill_value=0)
    avg_metaphors_per_sentence = metaphor_counts_per_sentence.mean()
    
    top_metaphors = df[df["FINAL"] == 1]["token_text"].value_counts().head(10)
    
    function_word_tokens = df[df["pos"].isin(function_word_pos)]
    num_function_words = len(function_word_tokens)
    percent_function_words = num_function_words / total_tokens * 100
    
    function_words_per_sentence = function_word_tokens.groupby("sent_id").size()
    avg_function_words_per_sentence = function_words_per_sentence.mean()
    
    print(f"Total tokens: {total_tokens}\n")
    print(f"Total sentences: {total_sentences}\n")
    print(f"Label distribution:\n{label_counts}\n")
    print(f"Vocabulary size: {vocab_size}\n")
    print(f"Average sentence length: {avg_sentence_length:.2f} tokens\n")
    print(f"Average metaphors per sentence: {avg_metaphors_per_sentence:.2f}\n")
    print("Top metaphorical words:")
    for word, count in top_metaphors.items():
        print(f"{word}: {count}")
    print(f"\nNumber of function words: {num_function_words}")
    print(f"Percentage of function words: {percent_function_words:.2f}%")
    print(f"Average function words per sentence: {avg_function_words_per_sentence:.2f}")

In [6]:
first = pd.read_excel("/Users/urtejakubauskaite/Desktop/Metaphors/Final project/Data/Test Data/4acb1_annotation_checked.xlsx")
test_metaphor_and_function_word_stats(first)

Total tokens: 487

Total sentences: 19

Label distribution:
FINAL
0    439
1     48
Name: count, dtype: int64

Vocabulary size: 236

Average sentence length: 25.63 tokens

Average metaphors per sentence: 2.53

Top metaphorical words:
team: 3
compounds: 2
ways: 2
undone: 2
saying: 2
release: 1
removal: 1
adding: 1
used: 1
futures: 1

Number of function words: 179
Percentage of function words: 36.76%
Average function words per sentence: 9.42


In [7]:
second = pd.read_excel("/Users/urtejakubauskaite/Desktop/Metaphors/Final project/Data/Test Data/7976c_annotation_checked.xlsx")
test_metaphor_and_function_word_stats(second)

Total tokens: 219

Total sentences: 10

Label distribution:
FINAL
0    185
1     34
Name: count, dtype: int64

Vocabulary size: 145

Average sentence length: 21.90 tokens

Average metaphors per sentence: 3.40

Top metaphorical words:
joining: 1
said: 1
engine: 1
strong: 1
growth: 1
dropped: 1
out: 1
processes: 1
considering: 1
board: 1

Number of function words: 89
Percentage of function words: 40.64%
Average function words per sentence: 8.90


In [8]:
third = pd.read_excel("/Users/urtejakubauskaite/Desktop/Metaphors/Final project/Data/Test Data/58717_annotation_checked.xlsx")
test_metaphor_and_function_word_stats(third)

Total tokens: 304

Total sentences: 16

Label distribution:
FINAL
0    261
1     43
Name: count, dtype: int64

Vocabulary size: 175

Average sentence length: 19.00 tokens

Average metaphors per sentence: 2.69

Top metaphorical words:
green: 4
can: 2
for: 2
social: 2
going: 2
incentives: 1
provides: 1
puts: 1
in: 1
adoption: 1

Number of function words: 118
Percentage of function words: 38.82%
Average function words per sentence: 7.38


In [9]:
fourth = pd.read_excel("/Users/urtejakubauskaite/Desktop/Metaphors/Final project/Data/Test Data/fe9d0_annotation_checked.xlsx")
test_metaphor_and_function_word_stats(fourth)

Total tokens: 220

Total sentences: 13

Label distribution:
FINAL
0    186
1     34
Name: count, dtype: int64

Vocabulary size: 133

Average sentence length: 16.92 tokens

Average metaphors per sentence: 2.62

Top metaphorical words:
talent: 2
area: 2
bright: 1
lining: 1
look: 1
puts: 1
constraint: 1
Access: 1
hub: 1
emerging: 1

Number of function words: 75
Percentage of function words: 34.09%
Average function words per sentence: 5.77


In [10]:
total_tokens = 487 + 219 + 304 + 220
total_sentences = 19 + 10 + 16 + 13
total_metaphors = 48 + 34 + 43 + 34

print(f"Total tokens: {total_tokens}")
print(f"Total sentences: {total_sentences}")
print(f"Total metaphors: {total_metaphors}")

Total tokens: 1230
Total sentences: 58
Total metaphors: 159


## Continued pretraining data

*updated_articles_flattened_records_20241121.xlsx*

In [11]:
pretraining_data = pd.read_excel("/Users/urtejakubauskaite/Desktop/Metaphors/Final project/Data/Continued Pretraining Data/updated_articles_flattened_records_20241121.xlsx")

num_articles = pretraining_data["article_id"].nunique()
avg_article_length = pretraining_data["length"].mean()
total_tokens = pretraining_data["length"].sum()
    
print(f"Number of articles: {num_articles}")
print(f"Average article length (tokens): {avg_article_length:.2f}")
print(f"Total tokens (sum of all article lengths): {total_tokens}")

Number of articles: 1462
Average article length (tokens): 1386.45
Total tokens (sum of all article lengths): 2026996
