## Part 4 – Design Challenge (20 points)

# In [None]:
from collections import defaultdict

### 1. Data loading and preprocessing
def read_data(path, is_train=True):
    """Read a corpus with one token per line and blank lines between sentences.

    Args:
        path: Path of the UTF-8 text file to read.
        is_train: When true, every non-blank line is parsed as ``word tag``
            and stored as a ``(word, tag)`` tuple. Otherwise the stripped
            line itself is stored as the token.

    Returns:
        A list of sentences; each sentence is a non-empty list of
        ``(word, tag)`` tuples (``is_train=True``) or of strings.

    Raises:
        ValueError: If ``is_train`` is true and a non-blank line does not
            contain a whitespace-separated tag.
    """
    sentences = []
    sentence = []
    with open(path, 'r', encoding='utf-8') as f:
        for line_no, line in enumerate(f, start=1):
            line = line.strip()
            if not line:
                # A blank line closes the current sentence; consecutive
                # blank lines must not produce empty sentences.
                if sentence:
                    sentences.append(sentence)
                    sentence = []
                continue
            if is_train:
                # Split once from the right: the tag is always the last
                # field, while the token itself may contain whitespace.
                parts = line.rsplit(maxsplit=1)
                if len(parts) != 2:
                    raise ValueError(
                        f"{path}:{line_no}: expected 'word tag', got {line!r}"
                    )
                sentence.append((parts[0], parts[1]))
            else:
                sentence.append(line)
    # The file may not end with a blank line; flush the last sentence.
    if sentence:
        sentences.append(sentence)
    return sentences

# Labelled training sentences: lists of (word, tag) pairs.
train_data = read_data("EN/train")
# Unlabelled development sentences: lists of raw token strings.
dev_data = read_data("EN/dev.in", is_train=False)
# Every distinct tag that occurs anywhere in the training data.
tag_set = {tag for sentence in train_data for _word, tag in sentence}

### 2. Feature extraction
def extract_features(sentence, i, prev_tag, curr_tag):
    """Build the binary feature map for tagging position ``i`` as ``curr_tag``.

    Args:
        sentence: Sequence of token strings.
        i: Index of the token being tagged.
        prev_tag: Tag assigned to position ``i - 1`` (``'<s>'`` at the start).
        curr_tag: Candidate tag for position ``i``.

    Returns:
        A dict mapping feature-name strings to the value ``1``. It always
        contains the tag bias, word/tag and previous-tag/tag features, plus
        capitalization and all-digit features when they apply to the token.
    """
    word = sentence[i]
    features = {
        f"tag:{curr_tag}": 1,
        f"word:{word}|tag:{curr_tag}": 1,
        f"prev_tag:{prev_tag}|tag:{curr_tag}": 1,
    }
    # Slice rather than index so an empty token simply has no
    # capitalization feature instead of raising IndexError.
    if word[:1].isupper():
        features[f"is_cap|tag:{curr_tag}"] = 1
    if word.isdigit():
        features[f"is_digit|tag:{curr_tag}"] = 1
    return features

### 3. Viterbi decoding
def viterbi_decode(sentence, tag_set, weights):
    """Return the highest-scoring tag sequence for ``sentence``.

    The score of a sequence is the sum, over positions, of the weighted
    features produced by ``extract_features`` for each
    (previous tag, current tag) pair.

    Args:
        sentence: Sequence of token strings.
        tag_set: Iterable of candidate tags (must be mutually sortable).
        weights: Mapping from feature name to weight; features that are
            absent count as 0. The mapping is never modified.

    Returns:
        A list of tags, one per token (empty for an empty sentence).

    Raises:
        ValueError: If ``sentence`` is non-empty but ``tag_set`` is empty.
    """
    n = len(sentence)
    if n == 0:
        return []

    # Iterate tags in a fixed order: set iteration order for strings varies
    # between interpreter runs, which would make tie-breaking (and therefore
    # training) non-reproducible.
    tags = sorted(tag_set)
    if not tags:
        raise ValueError("tag_set must not be empty")

    dp = [{} for _ in range(n + 1)]    # dp[i][tag]: best score ending in tag
    back = [{} for _ in range(n + 1)]  # back[i][tag]: predecessor on that path
    dp[0]['<s>'] = 0

    for i in range(n):
        for prev_tag, prev_score in dp[i].items():
            for curr_tag in tags:
                feats = extract_features(sentence, i, prev_tag, curr_tag)
                # Use .get so that probing an unseen feature does not insert
                # a zero entry into a defaultdict (or fail on a plain dict).
                score = sum(weights.get(f, 0.0) * v for f, v in feats.items())
                total = prev_score + score
                if curr_tag not in dp[i + 1] or total > dp[i + 1][curr_tag]:
                    dp[i + 1][curr_tag] = total
                    back[i + 1][curr_tag] = prev_tag

    # Backtrack from the best final tag to the start of the sentence.
    path = []
    last_tag = max(dp[n], key=dp[n].get)
    for i in range(n, 0, -1):
        path.append(last_tag)
        last_tag = back[i][last_tag]
    path.reverse()
    return path

### 4. Training loop
# Module-level feature-weight table, updated in place by train_perceptron and
# read by predict_and_write; features not yet seen default to 0.0.
weights = defaultdict(float)

def train_perceptron(train_data, tag_set, epochs=5):
    """Train the module-level ``weights`` with the structured perceptron.

    For every training sentence the current best tag sequence is decoded;
    if it differs from the gold sequence, ``weights`` is moved towards the
    gold features and away from the predicted features.

    Args:
        train_data: Iterable of sentences, each a list of ``(word, tag)``
            pairs.
        tag_set: Candidate tags passed through to ``viterbi_decode``.
        epochs: Number of passes over ``train_data``.

    Returns:
        None. The module-level ``weights`` mapping is updated in place.
    """
    for _ in range(epochs):
        for sent in train_data:
            words = [w for w, _ in sent]
            gold_tags = [t for _, t in sent]
            pred_tags = viterbi_decode(words, tag_set, weights)
            if pred_tags == gold_tags:
                continue

            gold_prev = pred_prev = '<s>'
            for i, (gold, pred) in enumerate(zip(gold_tags, pred_tags)):
                # The features at position i depend on the (previous,
                # current) tag pair, so an update is needed when EITHER tag
                # differs - a mistake at i-1 also changes the transition
                # feature at i even if the tag at i itself is correct.
                if gold != pred or gold_prev != pred_prev:
                    g_feats = extract_features(words, i, gold_prev, gold)
                    p_feats = extract_features(words, i, pred_prev, pred)
                    # Apply only the net difference so features shared by
                    # both sides do not create spurious zero entries.
                    for f, v in g_feats.items():
                        delta = v - p_feats.get(f, 0)
                        if delta:
                            weights[f] += delta
                    for f, v in p_feats.items():
                        if f not in g_feats:
                            weights[f] -= v
                gold_prev, pred_prev = gold, pred

### 5. Prediction + Write output
def predict_and_write(input_data, output_path, tag_set):
    """Tag every sentence in ``input_data`` and write the result to a file.

    Each token is written as ``word tag`` on its own line, and every
    sentence is followed by one blank line. Decoding uses the module-level
    ``weights``.

    Args:
        input_data: Iterable of sentences, each a list of token strings.
        output_path: Destination file, written as UTF-8 (overwritten).
        tag_set: Candidate tags passed through to ``viterbi_decode``.
    """
    with open(output_path, 'w', encoding='utf-8') as out:
        for tokens in input_data:
            predicted = viterbi_decode(tokens, tag_set, weights)
            out.writelines(
                f"{token} {label}\n" for token, label in zip(tokens, predicted)
            )
            # Blank line marks the end of the sentence.
            out.write("\n")

### 🔁 Train & predict
# Run a single training pass over the training data (overriding the default of
# 5 epochs), then tag the dev sentences and write them to EN/dev.p4.out.
train_perceptron(train_data, tag_set, epochs=1)
predict_and_write(dev_data, "EN/dev.p4.out", tag_set)
