In [1]:
import numpy as np
import pickle
import json

# Mount Google Drive inside the Colab runtime at /content/drive; the data
# paths used later in this notebook live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# Directory on the mounted Drive that holds the preprocessed arrays and the
# tag <-> id lookup tables.
processed_path = "/content/drive/MyDrive/NLP-POS/data/processed"


def _load_array(name):
    """Return the array stored at `<processed_path>/<name>.npy`."""
    return np.load(f"{processed_path}/{name}.npy")


def _load_pickle(name):
    """Return the object stored at `<processed_path>/<name>.pkl`."""
    # NOTE(review): unpickling can execute arbitrary code, so these files
    # must come from a trusted source (e.g. this project's own preprocessing).
    with open(f"{processed_path}/{name}.pkl", "rb") as handle:
        return pickle.load(handle)


# Inputs (X_*) and labels (y_*) for each split, loaded in a fixed order.
X_train = _load_array("X_train")
y_train = _load_array("y_train")
X_dev = _load_array("X_dev")
y_dev = _load_array("y_dev")
X_test = _load_array("X_test")
y_test = _load_array("y_test")

# Lookup tables between tag names and their integer ids.
tag2id = _load_pickle("tag2id")
id2tag = _load_pickle("id2tag")

# Quick sanity check of the loaded shapes.
print("Train shape:", X_train.shape, y_train.shape)
print("Dev shape:", X_dev.shape)
print("Test shape:", X_test.shape)


Train shape: (14187, 56) (14187, 56)
Dev shape: (1400, 56)
Test shape: (427, 56)


In [5]:
from collections import Counter

# Select every label that is not padding (-1) in one vectorized step. Boolean
# indexing flattens the matrix in row-major order, i.e. the same order a
# row-by-row, token-by-token loop would visit the labels.
real_tag_ids = y_train[y_train != -1]

# Count occurrences of each tag id. `.tolist()` converts to plain Python ints
# so the Counter is built without per-element NumPy scalar overhead; Counter
# keeps first-seen order, so ties in `most_common` resolve the same way as
# with an explicit loop.
tag_counts = Counter(real_tag_ids.tolist())

# The single most common tag id in the training labels, and its tag name.
most_frequent_tag_id = tag_counts.most_common(1)[0][0]
most_frequent_tag = id2tag[most_frequent_tag_id]

print("Most frequent tag:", most_frequent_tag)


Most frequent tag: NOUN


In [6]:
def predict_mft(X, tag_id):
    """Most-frequent-tag baseline: predict the same tag at every position.

    Args:
        X: Padded input array; only its shape and dtype are used.
        tag_id: Tag id assigned to every token position.

    Returns:
        A new array with the shape and dtype of `X`, filled with `tag_id`.
    """
    predictions = np.empty_like(X)
    # Fill every cell with the constant tag, cast to the dtype of X.
    np.copyto(predictions, tag_id, casting="unsafe")
    return predictions

def masked_accuracy(y_true, y_pred, pad_id=-1):
    """Token-level accuracy computed over non-padding positions only.

    Args:
        y_true: Gold label ids; positions equal to `pad_id` are ignored.
        y_pred: Predicted label ids, same shape as `y_true`.
        pad_id: Label value that marks padding in `y_true` (default -1).

    Returns:
        Fraction of non-padding positions where `y_pred` equals `y_true`,
        as a Python float. Returns 0.0 when there are no non-padding
        positions, instead of dividing by zero.

    Raises:
        ValueError: If `y_true` and `y_pred` have different shapes.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    # A shape mismatch would otherwise score only part of the data.
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"Shape mismatch: y_true {y_true.shape} vs y_pred {y_pred.shape}"
        )

    # True where the gold label is a real tag rather than padding.
    valid = y_true != pad_id
    total = int(np.count_nonzero(valid))
    if total == 0:
        return 0.0

    correct = int(np.count_nonzero((y_true == y_pred) & valid))
    return correct / total


In [7]:
# Constant predictions for each split: every position gets the majority tag.
y_train_pred, y_dev_pred, y_test_pred = (
    predict_mft(split, most_frequent_tag_id)
    for split in (X_train, X_dev, X_test)
)

# Accuracy over non-padding tokens for each split.
train_acc = masked_accuracy(y_train, y_train_pred)
dev_acc = masked_accuracy(y_dev, y_dev_pred)
test_acc = masked_accuracy(y_test, y_test_pred)

# Only the last expression of a cell is echoed, so this shows the test score.
test_acc


0.18563343491510667

In [8]:
# Summary of the baseline: the chosen tag, then each split's accuracy
# rounded to four decimal places.
print("Most Frequent Tag:", most_frequent_tag)
for label, score in (
    ("Train Accuracy:", train_acc),
    ("Dev Accuracy:", dev_acc),
    ("Test Accuracy:", test_acc),
):
    print(label, round(score, 4))


Most Frequent Tag: NOUN
Train Accuracy: 0.1798
Dev Accuracy: 0.1775
Test Accuracy: 0.1856
