# Import packages and data

In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import os
os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0'
if str(os.getcwd()).endswith('BertModel'):
    os.chdir("..")

from BertModel.Analyzer import BertAnalyzer
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import f1_score
from BertModel.BaselineModels import Bow_Baseline_Model, Tfidf_Baseline_Model
import sentencepiece
from BertModel.Sampling import DataSampling
from BertModel.Analyzer import BertAnalyzer
from BertModel.PreTrainedBert import model
from BertModel.PreProcessing import BertDataset, generate_batch
from torch.utils.data import DataLoader

In [None]:
# Column names assigned to the tab-separated paragraph file (it is read
# without using any header row from the file itself).
titles = ['par_id', 'art_id', 'keyword','country_code','text','label']
path = 'dontpatronizeme_pcl.tsv'

# The first four lines of the file are skipped before parsing.
raw_data_orig = pd.read_csv(path, sep='\t', skiprows=4, names=titles)

# Working copy: missing cells become a placeholder string, and the label is
# collapsed to binary (original label > 1 -> 1, everything else -> 0).
# raw_data_orig keeps the untouched labels for later analysis.
raw_data = raw_data_orig.fillna("missing_value")
raw_data['label'] = (raw_data['label'] > 1).astype(int)

In [None]:
# Preview the first rows of the cleaned, binarised frame.
raw_data.head()

# Task 1

In [None]:
# Analyze class distribution
label_counts = raw_data['label'].value_counts()
print(label_counts)

# value_counts() orders rows by frequency, not by label value. Pin the order
# to (0, 1) so each bar is guaranteed to carry the right class name even if
# the patronizing class were ever the majority; a missing class plots as 0.
ordered_counts = label_counts.reindex([0, 1], fill_value=0)

# Plot class distribution
plt.figure(figsize=(4,4))
plt.bar(['Non-Patronizing', 'Patronizing'], ordered_counts.values)

plt.ylabel("Number of lines")
plt.title("Class Label Distribution")
plt.show()

In [None]:
# Tally labels within each keyword group: one row per keyword, one column
# per label value.
keyword_counts = (
    raw_data.groupby("keyword")["label"]
    .value_counts()
    .unstack()
)

# One stacked bar per keyword group.
ax = keyword_counts.plot.bar(stacked=True, figsize=(12,6), colormap="viridis")
ax.set_ylabel("Number of lines")
ax.set_title("Distribution of Class Labels for Each Keyword Group")
ax.legend(title="Label", labels=["Non-Patronizing", "Patronizing"])
plt.xticks(rotation=45)
plt.show()

In [None]:
# The label is 0/1, so its per-keyword mean is the share of patronizing
# texts; sort from the highest share to the lowest.
patronizing_ratio_keyword = (
    raw_data.groupby("keyword")["label"].mean().sort_values(ascending=False)
)

# Bar chart of the shares, in the sorted order.
fig, ax = plt.subplots(figsize=(10,5))
patronizing_ratio_keyword.plot.bar(ax=ax, color="orange")
ax.set_ylabel("Proportion of Patronizing Texts")
ax.set_title("Keywords with Highest to Lowest Patronizing Language")
plt.xticks(rotation=45)
plt.show()

In [None]:
# Tally labels within each country: one row per country code, one column
# per label value.
country_label_counts = (
    raw_data.groupby("country_code")["label"]
    .value_counts()
    .unstack()
)

# One stacked bar per country.
ax = country_label_counts.plot.bar(stacked=True, figsize=(12,6), colormap="viridis")
ax.set_xlabel("Country Code")
ax.set_ylabel("Number of Lines")
ax.set_title("Distribution of Class Labels Across Countries")
ax.legend(title="Label", labels=["Non-Patronizing", "Patronizing"])
plt.xticks(rotation=45)
plt.show()

In [None]:
# Share of patronizing texts per country (mean of the 0/1 label), sorted
# from the highest share to the lowest.
patronizing_ratio = (
    raw_data.groupby("country_code")["label"].mean().sort_values(ascending=False)
)

# Keep the ten countries with the largest share.
top_countries = patronizing_ratio.head(10)

# Bar chart of those ten.
fig, ax = plt.subplots(figsize=(10,5))
top_countries.plot.bar(ax=ax, color="orange")
ax.set_xlabel("Country Code")
ax.set_ylabel("Proportion of Patronizing Texts")
ax.set_title("Top 10 Countries with Highest Patronizing Language")
plt.xticks(rotation=45)
plt.show()

In [None]:
# Visualise text_length distribution, correlation and distribution of labels
# Text length is measured in whitespace-separated tokens.
raw_data["text_length"] = raw_data["text"].apply(lambda x: len(x.split()))
bins = [0, 50, 100, 150, 200, float('inf')]
bin_names = ['1-50', '51-100', '101-150', '151-200', '>200']
# Use right-closed intervals (0, 50], (50, 100], ... so the bucket names are
# accurate: a 50-token text belongs to '1-50', not '51-100'.
# include_lowest keeps zero-length texts in the first bucket instead of
# dropping them as NaN.
raw_data['text_length_bucket'] = pd.cut(
    raw_data['text_length'],
    bins=bins,
    labels=bin_names,
    right=True,
    include_lowest=True,
)

# Count labels within each length bucket. observed=False keeps empty buckets
# in the result and states the categorical-groupby behaviour explicitly.
grouped = (
    raw_data.groupby("text_length_bucket", observed=False)["label"]
    .value_counts()
    .unstack()
)

# Stacked bar chart of label counts per bucket
grouped.plot(kind="bar", stacked=True, figsize=(6,3), colormap="viridis")
plt.xlabel('Text Length Buckets')
plt.ylabel('Number of Lines')
plt.title('Distribution of Class Labels Across Text Length Buckets')
plt.xticks(rotation=45)
plt.legend(title="Label", labels=["Non-Patronizing", "Patronizing"])
plt.show()

# Compute proportion of patronizing texts per text-length bucket
# (mean of the 0/1 label), sorted from highest to lowest.
# NOTE: this rebinds the name `patronizing_ratio` used by the country cell.
patronizing_ratio = (
    raw_data.groupby("text_length_bucket", observed=False)["label"]
    .mean()
    .sort_values(ascending=False)
)

# Plot
plt.figure(figsize=(10,5))
patronizing_ratio.plot(kind="bar", color="orange")
plt.xlabel("text_length_bucket")
plt.ylabel("Proportion of Patronizing Texts")
plt.title("Text Length with Highest to Lowest Patronizing Language")
plt.xticks(rotation=45)
plt.show()


# Correlation between text length and the binary label
correlation = raw_data[["text_length", "label"]].corr().iloc[0,1]
print(f"correlation between text_length and label prediction: {correlation}")

# Task 2a - The final model implementation is in Analyzer.py; this section contains the code that generates dev.txt and test.txt

In [None]:
# Train - dev split
# The two CSVs list the paragraph ids belonging to each split; rows of
# raw_data are selected by membership of their par_id.
train = pd.read_csv("semeval-2022/practice splits/train_semeval_parids-labels.csv")
dev = pd.read_csv("semeval-2022/practice splits/dev_semeval_parids-labels.csv")

in_train = raw_data["par_id"].isin(train['par_id'])
in_dev = raw_data["par_id"].isin(dev['par_id'])

train_df = raw_data.loc[in_train]
dev_df = raw_data.loc[in_dev]

In [None]:
# 1:2 ratio between positive and negative
def downsample(raw_data, seeds = 42):
    """Down-sample non-patronizing rows to at most twice the patronizing rows.

    The sampling is done separately for every value of the ``keyword``
    column: all rows with ``label == 1`` are kept, and ``2 * (number of
    positives)`` rows with ``label == 0`` are drawn without replacement. If a
    keyword has fewer negatives than that, all of its negatives are kept
    instead of raising.

    Args:
        raw_data: DataFrame with at least ``keyword`` and ``label`` columns.
        seeds: Random state used for both the per-keyword sampling and the
            final shuffle, so the result is reproducible.

    Returns:
        A shuffled DataFrame containing the selected rows (original index
        values are preserved). An empty input yields an empty copy.
    """
    dfs = []
    for keyword in raw_data['keyword'].unique():
        in_group = raw_data['keyword'] == keyword
        patro_df = raw_data[in_group & (raw_data['label'] == 1)]
        non_patro_df = raw_data[in_group & (raw_data['label'] == 0)]
        # Cap at the number of available negatives: sampling without
        # replacement cannot draw more rows than exist.
        n_keep = min(2 * len(patro_df), len(non_patro_df))
        selected_non_patro_df = non_patro_df.sample(n=n_keep, random_state=seeds)
        dfs.append(pd.concat([patro_df, selected_non_patro_df]))
    if not dfs:
        # No keywords at all (empty input): pd.concat([]) would raise.
        return raw_data.copy()
    # Shuffle so positives and negatives are interleaved.
    return pd.concat(dfs).sample(frac = 1, random_state=seeds)

# Replace the training split with its down-sampled version (default seed).
train_df = downsample(train_df)

In [None]:
# Fine-tune XLNet on the down-sampled training split. The second argument of
# train() is passed as None here (the Task 3 cell passes a save path there).
xlnet_model = model("xlnet-base-cased")
xlnet_analyzer = BertAnalyzer(model = xlnet_model,
                                batch_size=64,
                                max_seq_len=128,
                                epochs=5,
                                lr=4e-5)
xlnet_analyzer.train(train_df, None)

xlnetmodel = xlnet_analyzer.net
# Switch to inference mode so layers such as dropout behave deterministically.
# NOTE(review): assumes `net` is a torch.nn.Module - confirm in Analyzer.py.
xlnetmodel.eval()

# Batches are produced in dataset order (shuffle=False) so that line i of
# dev.txt corresponds to row i of dev_df.
dev_data = BertDataset.from_data(dev_df)
test_loader = DataLoader(dev_data,
                         batch_size = 64,
                         shuffle = False,
                         num_workers = 4,
                         collate_fn = lambda batch: generate_batch(batch, max_seq_len = 128))
predicted = []
truths = []
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
with torch.no_grad():
    for batch in test_loader:
        input_ids, attn_mask, labels = tuple(i.to(device) for i in batch)
        outputs = xlnetmodel(input_ids, attn_mask).squeeze(dim = 1)
        # Non-negative output -> class 1 (presumably the outputs are raw
        # logits, making this a 0.5 probability threshold - TODO confirm).
        pred = (outputs >=0).int()
        predicted += pred.tolist()
        truths += labels.tolist()

# One prediction per line.
with open("dev.txt", 'w') as file:
    file.writelines(f"{number}\n" for number in predicted)

In [None]:
# Load the saved predictions back, one integer per line.
# NOTE: this rebinds `dev` (previously the dev-id DataFrame) to a list.
with open("dev.txt", "r") as file:
    dev = [int(row.strip()) for row in file]
print(len(dev))

In [None]:
from sklearn import metrics

# F1 of the predictions read from dev.txt against the binary dev labels.
f1 = metrics.f1_score(y_true=dev_df["label"].tolist(), y_pred=dev)
f1

In [None]:
# Official test file, parsed as tab-separated with the column names supplied
# here (no header row is taken from the file).
# NOTE(review): if the file has fewer than six columns pandas leaves the
# trailing ones as NaN - confirm whether 'label' is actually present.
titles = ['par_id', 'art_id', 'keyword','country_code','text','label']
path = 'semeval-2022/TEST/task4_test.tsv'
test_df = pd.read_csv(path, sep='\t', names=titles)

In [None]:
# Switch to inference mode so layers such as dropout behave deterministically.
# NOTE(review): assumes `xlnetmodel` is a torch.nn.Module - confirm in Analyzer.py.
xlnetmodel.eval()

# Batches are produced in dataset order (shuffle=False) so that line i of
# test.txt corresponds to row i of test_df.
test_data = BertDataset.from_data(test_df)
test_loader = DataLoader(test_data,
                         batch_size = 64,
                         shuffle = False,
                         num_workers = 4,
                         collate_fn = lambda batch: generate_batch(batch, max_seq_len = 128))
predicted = []
truths = []
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
with torch.no_grad():
    for batch in test_loader:
        input_ids, attn_mask, labels = tuple(i.to(device) for i in batch)
        outputs = xlnetmodel(input_ids, attn_mask).squeeze(dim = 1)
        # Non-negative output -> class 1 (presumably the outputs are raw
        # logits, making this a 0.5 probability threshold - TODO confirm).
        pred = (outputs >=0).int()
        predicted += pred.tolist()
        truths += labels.tolist()

# One prediction per line.
with open("test.txt", 'w') as file:
    file.writelines(f"{number}\n" for number in predicted)
        

# Task 2b - Hyperparameter tuning: see the grid search below and Scheduler.py

In [None]:
# Train - validation (dev) - test split
# The ids in the "dev" practice-split file are used as the held-out test set
# here; validation data is carved out of the official training ids instead.
train = pd.read_csv("semeval-2022/practice splits/train_semeval_parids-labels.csv")
test = pd.read_csv("semeval-2022/practice splits/dev_semeval_parids-labels.csv")
train_df_official = raw_data.loc[raw_data["par_id"].isin(train['par_id'])]
test_df = raw_data.loc[raw_data["par_id"].isin(test['par_id'])]

# Shuffle once with a fixed seed, then cut 80% / 20% into train / validation.
train_data_shuffled = train_df_official.sample(frac = 1, random_state = 1).reset_index(drop = True)
split_index = int(0.8 * len(train_data_shuffled))
train_df, val_df = (
    train_data_shuffled.iloc[:split_index],
    train_data_shuffled.iloc[split_index:],
)

In [None]:
import itertools

# Running best and the full score table, keyed by (lr, batch_size, max_token_len).
best_f1 = 0
best_params = {}
all_params = {}

learning_rates = [1e-4, 4e-5, 1e-5]  # Standard range for BERT fine-tuning
batch_sizes = [32, 64, 128]  # Adjust based on GPU memory
max_token_lens = [64, 128]  # Typical for BERT fine-tuning

# Grid Search Loop
for lr, batch_size, max_token_len in itertools.product(learning_rates, batch_sizes, max_token_lens):
    # The largest combination is not evaluated (presumably for GPU-memory
    # reasons - TODO confirm). Skip it before loading any weights so no
    # model is created just to be thrown away.
    if batch_size == 128 and max_token_len == 128:
        continue
    print(f"\nTraining with: LR={lr}, Batch Size={batch_size}, max_token_len={max_token_len}")

    # Fresh pretrained model per configuration so runs do not share weights.
    xlnet_model = model("xlnet-base-cased")

    # Define model arguments
    xlnet_analyzer = BertAnalyzer(model = xlnet_model,
                                    batch_size=batch_size,
                                    max_seq_len=max_token_len,
                                    epochs=3,
                                    lr=lr)

    # Train on the down-sampled training part, score on the validation part.
    datasampling = DataSampling()
    data = datasampling.downsample(train_df)

    xlnet_analyzer.train(data)
    f1 = xlnet_analyzer.evaluate(val_df)

    all_params[(lr, batch_size, max_token_len)] = f1
    # Drop the references so the model can be freed before the next run.
    del xlnet_model
    del xlnet_analyzer
    # Check if this is the best model so far
    if f1 > best_f1:
        best_f1 = f1
        best_params = {"learning_rate": lr, "batch_size": batch_size, "max_token_len": max_token_len}

    print(f"F1-score: {f1:.4f}")

# Print best hyperparameters
print("\n Best Hyperparameters:")
print(best_params)
print(f"Best F1-score: {best_f1:.4f}")

# Task 2c - Sampling and augmentation: see Sampling.py and upsample_ratio_test.ipynb

# Task 2d - Baseline models: see below and BaselineModels.py

In [None]:
# Train - validation (dev) - test split
# Same split as in the hyperparameter-tuning section: the "dev" id file acts
# as the test set, and validation rows come out of the training ids.
train = pd.read_csv("semeval-2022/practice splits/train_semeval_parids-labels.csv")
test = pd.read_csv("semeval-2022/practice splits/dev_semeval_parids-labels.csv")

is_train_row = raw_data["par_id"].isin(train['par_id'])
is_test_row = raw_data["par_id"].isin(test['par_id'])
train_df_official = raw_data[is_train_row]
test_df = raw_data[is_test_row]

# Seeded shuffle followed by an 80/20 cut.
train_data_shuffled = train_df_official.sample(frac = 1, random_state = 1).reset_index(drop = True)
split_index = int(0.8 * len(train_data_shuffled))
train_df = train_data_shuffled.iloc[:split_index]
val_df = train_data_shuffled.iloc[split_index:]

In [None]:
# Show full cell contents when frames are displayed.
pd.set_option("display.max_colwidth", None)

# Train each baseline on train_df and test it on test_df, in order:
# bag-of-words first, then TF-IDF. The TF-IDF banner is preceded by a blank
# line to separate the two runs in the output.
baselines = {}
for tag, factory, lead in (
    ("BoW", Bow_Baseline_Model, ""),
    ("TF-IDF", Tfidf_Baseline_Model, "\n"),
):
    print(f"{lead}Training {tag} model...")
    clf = factory()
    clf.train(train_df)
    print(f"Testing {tag} model...")
    baselines[tag] = (clf, clf.test(test_df))

# Expose the fitted models and whatever test() returned under the usual names.
model_bow, f1_scores_bow = baselines["BoW"]
model_tfidf, f1_scores_tfidf = baselines["TF-IDF"]


# Task 3

In [None]:
# Task 3 uses the full official training ids for training and the "dev" id
# file as the test set (no separate validation part here).
train = pd.read_csv("semeval-2022/practice splits/train_semeval_parids-labels.csv")
test = pd.read_csv("semeval-2022/practice splits/dev_semeval_parids-labels.csv")

train_df = raw_data.loc[raw_data["par_id"].isin(train['par_id'])]
test_df = raw_data.loc[raw_data["par_id"].isin(test['par_id'])]

In [None]:
# Down-sample the training split, then fine-tune XLNet on it.
datasampling = DataSampling()
data = datasampling.downsample(train_df)
xlnet_model = model("xlnet-base-cased")
xlnet_analyzer = BertAnalyzer(model=xlnet_model,
                              batch_size=64,
                              max_seq_len=128,
                              epochs=3,
                              lr=4e-05)

# train() receives the checkpoint path (presumably it saves the model there -
# TODO confirm in Analyzer.py). Create the directory up front so that a save
# cannot fail on a fresh checkout where it does not exist yet.
save_dir = "xlnet_analyzer_train_save"
os.makedirs(save_dir, exist_ok=True)
save_path = os.path.join(save_dir, "part3.pth")
xlnet_analyzer.train(data, save_path)

In [None]:
# Score the trained model on the full test split. The result is stored under
# its own name so the sklearn `f1_score` function imported at the top of the
# notebook is not overwritten.
test_f1 = xlnet_analyzer.evaluate(test_df)

In [None]:
# part 3a
# Evaluate separately for every original (pre-binarisation) label value.
# raw_data_orig is left untouched; rows with any missing value are dropped
# only in this local view, so those rows get no original label below and are
# left out of the per-label breakdown.
labelled_orig = raw_data_orig.dropna()
test_str_df = labelled_orig[labelled_orig["par_id"].isin(test['par_id'])]

# Work on an explicit copy: test_df is a filtered view of raw_data, and adding
# a column to it directly triggers pandas' chained-assignment warning.
test_df = test_df.copy()
test_df['original'] = test_str_df['label']  # aligned on the row index

for original_label in sorted(test_df['original'].dropna().unique()):
    original_df = test_df[test_df['original'] == original_label]
    print(f"Original label {original_label}")
    # note: evaluate() method will print out accuracy and f1, as defined in Analyzer
    label_f1 = xlnet_analyzer.evaluate(original_df)
    print(f"original_label {original_label}: {len(original_df)} samples")


In [None]:
# part 3b - input length
# Function to compute text length and bucketize
def get_length_buckets(texts, bucket_size=20):
    """Compute token counts for ``texts`` and assign each to a length bucket.

    Length is the number of whitespace-separated tokens. Buckets are
    half-open intervals ``[b, b + bucket_size)`` labelled ``"b-(b+bucket_size)"``,
    starting at 0.

    Args:
        texts: pandas Series of strings (must be non-empty).
        bucket_size: Width of every bucket, in tokens.

    Returns:
        ``(lengths, length_buckets)``: the integer length of each text and a
        categorical Series with the bucket label of each text, both indexed
        like ``texts``.
    """
    lengths = texts.str.split().apply(len)
    # The top edge must lie strictly above the maximum length: intervals are
    # right-open, so an edge equal to the maximum would leave the longest
    # texts without a bucket (NaN) whenever the maximum is an exact multiple
    # of bucket_size.
    top_edge = (int(lengths.max()) // bucket_size + 1) * bucket_size
    bins = np.arange(0, top_edge + bucket_size, bucket_size)
    bucket_labels = [f"{b}-{b+bucket_size}" for b in bins[:-1]]
    length_buckets = pd.cut(lengths, bins=bins, labels=bucket_labels, right=False)
    return lengths, length_buckets

# Get input lengths and bucket them
# Work on an explicit copy: test_df is a filtered view of raw_data, and adding
# columns to it directly triggers pandas' chained-assignment warning.
test_df = test_df.copy()
test_df["length"], test_df["length_bucket"] = get_length_buckets(test_df["text"])

# Compute performance metrics per length bucket, from the shortest bucket to
# the longest; buckets that contain no test rows are skipped.
for bucket in test_df["length_bucket"].cat.categories:
    subset = test_df[test_df["length_bucket"] == bucket]
    if subset.empty:
        continue
    print(f"Input length bucket {bucket}")
    # Stored under its own name so the imported sklearn `f1_score` function
    # is not overwritten.
    bucket_f1 = xlnet_analyzer.evaluate(subset)


In [None]:
# part 3c - data categories
# Evaluate the model separately on the test rows of every keyword group.
# sort=False keeps the groups in order of first appearance in test_df.
for keyword, keyword_df in test_df.groupby('keyword', sort=False):
    print(f"Keyword {keyword}")
    # Stored under its own name so the imported sklearn `f1_score` function
    # is not overwritten.
    keyword_f1 = xlnet_analyzer.evaluate(keyword_df)