In [None]:
# Install libraries
!pip install -U transformers datasets --quiet

In [None]:
# Import required libraries
from transformers import BertTokenizerFast, BertForSequenceClassification, TrainingArguments, Trainer
from datasets import Dataset
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import pandas as pd

In [None]:
# Upload the CSV manually through google.colab's files.upload().
# The return value is kept in `uploaded`; the cells visible here do not read it.
# NOTE(review): google.colab is presumably only importable inside a Colab runtime.
from google.colab import files
uploaded = files.upload()

In [None]:
import pandas as pd

# Read the policy-statement CSV, keep only the first occurrence of each
# sentence, and renumber the rows from 0.
df = (
    pd.read_csv('Expanded_Policy_Statements.csv')
    .drop_duplicates(subset='sentence', keep='first')
    .reset_index(drop=True)
)

# Quick look at the available columns and the first rows
print(df.columns)
df.head()


In [None]:
# If the values in the label column look like a list, flatten them to their first element.
# Labels read from a CSV are strings, so a list-valued label arrives as the text
# "['tax']" rather than as a Python list; both forms are handled here.
import ast


def _flatten_label(value):
    """Return the first element of a list-like label, or the label unchanged.

    Args:
        value: a raw label; either a plain value, a Python list, or the string
            representation of a non-empty list (e.g. "['tax']").

    Returns:
        The first list element for list-like input, otherwise `value` as given.
        Strings that merely look like a list but cannot be parsed are returned as-is.
    """
    if isinstance(value, str):
        text = value.strip()
        if text.startswith('[') and text.endswith(']'):
            try:
                parsed = ast.literal_eval(text)
            except (ValueError, SyntaxError, TypeError):
                return value  # not a valid list literal; leave the raw text alone
            if isinstance(parsed, list) and parsed:
                value = parsed
    return value[0] if isinstance(value, list) else value


df['label'] = df['label'].apply(_flatten_label)


In [None]:
# Peek at the label column and report the Python type of its entry at index label 0
label_column = df['label']
print(label_column.head())
print(type(label_column[0]))


In [None]:
from sklearn.model_selection import train_test_split


# Raw texts and their labels as plain Python lists
sentences = df['sentence'].tolist()
labels = df['label'].tolist()

# 80/20 split stratified on the labels, with a fixed random_state for repeatability
split_parts = train_test_split(
    sentences,
    labels,
    test_size=0.2,
    stratify=labels,
    random_state=42,
)
train_sentences, test_sentences, train_labels, test_labels = split_parts

# One new DataFrame per split
df_train = pd.DataFrame({'sentence': train_sentences, 'label': train_labels})
df_test = pd.DataFrame({'sentence': test_sentences, 'label': test_labels})

# Report the class distribution of each split
for heading, frame in (
    ("✅ Train set label counts", df_train),
    ("\n✅ Test set label counts", df_test),
):
    print(heading)
    print(frame['label'].value_counts())


In [None]:

from datasets import Dataset

# Fixed mapping between class names and the integer ids used for training
label2id = {'subsidy': 0, 'tax': 1, 'ban': 2, 'other': 3}
id2label = {v: k for k, v in label2id.items()}


def _encode_label(value):
    """Convert one raw label into its integer class id.

    Args:
        value: a class name, a list whose first element is a class name,
            or a class id that has already been encoded.

    Returns:
        The integer class id.

    Raises:
        ValueError: if the label is neither a known class name nor a known id.
    """
    if isinstance(value, list):
        value = value[0]
    try:
        # Already-encoded ids pass through unchanged, so re-running this cell
        # does not fail once the column holds integers.
        if value in id2label:
            return int(value)
        return label2id[value]
    except (KeyError, TypeError):
        raise ValueError(
            f"Unknown label {value!r}; expected one of {sorted(label2id)}"
        ) from None


# Replace the label column of both splits with integer class ids
df_train['label'] = df_train['label'].apply(_encode_label)
df_test['label'] = df_test['label'].apply(_encode_label)

# Transform to HuggingFace Dataset again
dataset_train = Dataset.from_pandas(df_train)
dataset_test = Dataset.from_pandas(df_test)


In [None]:
from transformers import BertTokenizerFast

# Load the pretrained 'bert-base-uncased' tokenizer; it is kept in the global
# `tokenizer` and used by tokenize() below.
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

# Tokenization function
def tokenize(batch):
    """Tokenize the 'sentence' entries of a batch with the global `tokenizer`.

    Padding and truncation are both enabled, so the encodings within one
    call share a common length.

    Args:
        batch: mapping with a 'sentence' entry holding the texts to encode.

    Returns:
        Whatever the tokenizer call returns for those texts.
    """
    texts = batch['sentence']
    encoded = tokenizer(texts, padding=True, truncation=True)
    return encoded

# Tokenize both splits batch-wise
dataset_train = dataset_train.map(tokenize, batched=True)
dataset_test = dataset_test.map(tokenize, batched=True)

# Expose only the model inputs and the label, in PyTorch format
torch_columns = ('input_ids', 'attention_mask', 'label')
for split in (dataset_train, dataset_test):
    split.set_format('torch', columns=list(torch_columns))


In [None]:
from transformers import BertForSequenceClassification, TrainingArguments, Trainer

# Load the pretrained 'bert-base-uncased' model for sequence classification
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=len(label2id),  # one output per class; derived so it cannot drift from the mapping
    id2label=id2label,
    label2id=label2id
)

# Training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    logging_steps=10,
    logging_dir='./logs',
    report_to="none",  # wandb disabled
    # Per-epoch evaluation / checkpoint selection is currently disabled.
    # NOTE(review): recent transformers releases renamed `evaluation_strategy`
    # to `eval_strategy`; use the new name if these options are re-enabled.
    #evaluation_strategy="epoch",                # Evaluate after each epoch
    #save_strategy="epoch",                      # Save after each epoch
    #load_best_model_at_end=True,                # Load the model with the best validation performance
    learning_rate=2e-5,                # small learning rate
    weight_decay=0.01,                 # prevent overfitting
    # Keeps at most one checkpoint on disk. With best-model loading disabled
    # above, this is presumably the most recent one, not the best.
    save_total_limit=1,
)




# Function of the evaluation metrics
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(p):
    """Compute accuracy and weighted precision / recall / F1 for one evaluation pass.

    Args:
        p: prediction object exposing `predictions` (per-class scores; the
            argmax over the last axis is taken as the predicted class id) and
            `label_ids` (the true class ids).

    Returns:
        dict with the keys "accuracy", "f1", "precision" and "recall".
    """
    preds = p.predictions.argmax(-1)
    # zero_division=0 yields the same scores as the default but without an
    # UndefinedMetricWarning when some class is never predicted.
    precision, recall, f1, _ = precision_recall_fscore_support(
        p.label_ids, preds, average='weighted', zero_division=0
    )
    acc = accuracy_score(p.label_ids, preds)
    return {"accuracy": acc, "f1": f1, "precision": precision, "recall": recall}


In [None]:
# Trainer renamed its `tokenizer` argument to `processing_class`; newer
# transformers releases warn about or reject the old name. Inspect the installed
# signature and pass the tokenizer under whichever name it accepts.
import inspect

_trainer_params = inspect.signature(Trainer.__init__).parameters
_tokenizer_kwarg = "processing_class" if "processing_class" in _trainer_params else "tokenizer"

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset_train,
    eval_dataset=dataset_test,
    compute_metrics=compute_metrics,
    **{_tokenizer_kwarg: tokenizer},
)

# Fine-tune the model; the returned training summary is shown as the cell output
trainer.train()




In [None]:
# Persist the model and the tokenizer side by side so they can be reloaded later
save_dir = "./saved_model"
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)


In [None]:
from sklearn.metrics import classification_report
import numpy as np

# Run the trainer over the test split
predictions = trainer.predict(dataset_test)
true_labels = predictions.label_ids
preds = predictions.predictions.argmax(axis=-1)

# Sorted ids of every class that occurs among the true or the predicted labels
used_label_ids = np.union1d(true_labels, preds)

# Class names in the same order as used_label_ids
id2label = {0: 'subsidy', 1: 'tax', 2: 'ban', 3: 'other'}
target_names = [id2label[class_id] for class_id in used_label_ids]

# Classification performance report restricted to the classes in use
report = classification_report(
    true_labels,
    preds,
    labels=used_label_ids,
    target_names=target_names,
)
print(report)


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix

# Confusion matrix over the classes that actually occur
cm = confusion_matrix(true_labels, preds, labels=used_label_ids)

# Tick labels: class names in the same order as the matrix rows / columns
id2label = {0: 'subsidy', 1: 'tax', 2: 'ban', 3: 'other'}
labels_order = [id2label[class_id] for class_id in used_label_ids]

# Draw the matrix as an annotated heatmap on an explicit figure / axes pair
fig, ax = plt.subplots(figsize=(6, 5))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=labels_order,
    yticklabels=labels_order,
    ax=ax,
)
ax.set_xlabel("Predicted Label")
ax.set_ylabel("True Label")
ax.set_title("Confusion Matrix")
fig.tight_layout()
plt.show()


In [None]:
# id2label dictionary
id2label = {0: 'subsidy', 1: 'tax', 2: 'ban', 3: 'other'}

# Test samples and model predictions: show the first 10 examples.
# The count is capped at the data actually available, so a test split with
# fewer than 10 rows no longer raises IndexError.
n_examples = min(10, len(df_test), len(preds))
for i in range(n_examples):
    row = df_test.iloc[i]
    sentence = row['sentence']
    true_label = id2label[row['label']]
    pred_label = id2label[preds[i]]

    print(f"🟢 Sentence: {sentence}")
    print(f"   ✅ True label: {true_label}")
    print(f"   🤖 Predicted: {pred_label}")
    print("-" * 80)
