# POS Tagger Trained on the UD Treebank

Adapted from https://github.com/soutsios/pos_tagger_mlp/blob/master/pos_tagger_mlp.ipynb 

# Plot Functions

These functions are useful to visualize the training dynamics of the learning algorithm and the confusion matrix of the trained models.

In [None]:
import pyconll, nltk, datetime, warnings
import numpy as np
from scipy.sparse import vstack
import matplotlib.pyplot as plt
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, f1_score
from sklearn.model_selection import StratifiedKFold, GridSearchCV, learning_curve
from sklearn.exceptions import UndefinedMetricWarning
warnings.filterwarnings(action='ignore', category=UndefinedMetricWarning)
from numpy.random import seed

In [None]:
def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None,
                        n_jobs=None, train_sizes=np.linspace(.1, 1.0, 5)):
    """Plot training and cross-validation accuracy against training-set size.

    The scores come from ``sklearn.model_selection.learning_curve`` called
    with ``scoring='accuracy'``. For each series the mean over the CV folds
    is drawn as a line and +/- one standard deviation as a shaded band.

    Arguments
    ---------
    estimator:   the (unfitted) estimator handed to ``learning_curve``
    title:       title of the figure
    X, y:        feature matrix and labels
    ylim:        optional ``(ymin, ymax)`` for the y axis
    cv:          cross-validation strategy forwarded to ``learning_curve``
    n_jobs:      parallelism forwarded to ``learning_curve``
    train_sizes: training-set sizes forwarded to ``learning_curve``

    Returns the ``matplotlib.pyplot`` module so the caller can ``.show()``.
    """
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Accuracy")

    sizes, fit_scores, cv_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes,
        scoring='accuracy')
    plt.grid()

    # (colour, legend label, mean per size, std per size) for each series.
    series = [
        (colour, label, np.mean(scores, axis=1), np.std(scores, axis=1))
        for colour, label, scores in (
            ("r", "Training score", fit_scores),
            ("g", "Cross-validation score", cv_scores),
        )
    ]

    # Shaded +/- 1 std bands first, then the mean curves.
    for colour, _, mean, std in series:
        plt.fill_between(sizes, mean - std, mean + std, alpha=0.1,
                         color=colour)
    for colour, label, mean, _ in series:
        plt.plot(sizes, mean, 'o-', color=colour, label=label)

    plt.legend(loc="best")
    return plt

In [None]:
def plot_confusion_matrix(f1,
                          cm,
                          target_names,
                          title='Confusion matrix',
                          cmap=None,
                          normalize=True,
                          i=1):
    """
    given a sklearn confusion matrix (cm), make a nice plot

    Arguments
    ---------
    f1:           f1-score to report in the x-axis caption (computed by the caller)

    cm:           confusion matrix from sklearn.metrics.confusion_matrix

    target_names: given classification classes such as [0, 1, 2]
                  the class names, for example: ['high', 'medium', 'low']

    title:        the text to display at the top of the matrix

    cmap:         the gradient of the values displayed from matplotlib.pyplot.cm
                  see http://matplotlib.org/examples/color/colormaps_reference.html
                  plt.get_cmap('jet') or plt.cm.Blues

    normalize:    If False, plot the raw numbers
                  If True, plot the proportions (each row divided by its sum;
                  a row that sums to zero is shown as all zeros)

    i:            unused; kept only so existing callers keep working

    Usage
    -----
    plot_confusion_matrix(f1           = f1,                  # f1-score shown in the caption
                          cm           = cm,                  # confusion matrix created by
                                                              # sklearn.metrics.confusion_matrix
                          normalize    = True,                # show proportions
                          target_names = y_labels_vals,       # list of names of the classes
                          title        = best_estimator_name) # title of graph

    Citation
    --------
    http://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html

    """
    import matplotlib.pyplot as plt
    import numpy as np
    import itertools

    cm = np.asarray(cm)

    # Accuracy is computed on the raw counts, before any normalisation.
    accuracy = np.trace(cm) / float(np.sum(cm))
    misclass = 1 - accuracy
    plt.figure(figsize=(10, 6))
    if cmap is None:
        cmap = plt.get_cmap('Blues')

    # NOTE(review): the heat map is coloured from the raw counts even when the
    # cell text below shows proportions; kept as in the original recipe.
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()

    if target_names is not None:
        tick_marks = np.arange(len(target_names))
        plt.xticks(tick_marks, target_names, rotation=45)
        plt.yticks(tick_marks, target_names)

    if normalize:
        # Guard against classes with no gold examples: dividing by a zero row
        # sum would fill the row with NaN and emit a RuntimeWarning.
        row_totals = cm.sum(axis=1, keepdims=True)
        cm = np.divide(cm.astype('float'), row_totals,
                       out=np.zeros(cm.shape, dtype=float),
                       where=row_totals != 0)

    thresh = cm.max() / 1.5 if normalize else cm.max() / 2
    cell_format = "{:0.4f}" if normalize else "{:,}"
    # Distinct loop names so the `i` parameter is not silently overwritten.
    for row, col in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(col, row, cell_format.format(cm[row, col]),
                 horizontalalignment="center",
                 color="white" if cm[row, col] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label\naccuracy={:0.4f}; misclass={:0.4f}; f1-score={:0.4f}'.format(accuracy, misclass, f1))
    # Lay out after the labels exist so the two-line x label is not clipped.
    plt.tight_layout()

# Load Dataset

In [None]:
# Paths (relative to the notebook's working directory) of the three CoNLL-U
# splits of the UD_English-EWT treebank.
UD_ENGLISH_TRAIN = '../UD_English-EWT/en_ewt-ud-train.conllu'
UD_ENGLISH_DEV = '../UD_English-EWT/en_ewt-ud-dev.conllu'
UD_ENGLISH_TEST = '../UD_English-EWT/en_ewt-ud-test.conllu'

# Preprocessing

In [None]:
def read_conllu(path):
    """Load a CoNLL-U file and return its sentences in two parallel lists.

    Arguments
    ---------
    path: path of the ``.conllu`` file, loaded with ``pyconll.load_from_file``

    Returns
    -------
    (tagged_sentences, original_sentences) where
      * tagged_sentences[k] is a list of ``(form, upos)`` tuples, one per
        token that carries a UPOS tag. Tokens without a UPOS tag are skipped
        and a missing/empty form is replaced by the placeholder ``'*None*'``.
      * original_sentences[k] is the ``text`` attribute of the same sentence.
    """
    data = pyconll.load_from_file(path)
    tagged_sentences = []
    original_sentences = []
    for sentence in data:
        original_sentences.append(sentence.text)
        tagged_sentences.append([
            (token.form if token.form else '*None*', token.upos)
            for token in sentence
            if token.upos
        ])
    return tagged_sentences, original_sentences

Load train, development and test set in the appropriate tagged format, tuple (word, pos-tag)

In [None]:
# Each *_sentences value is a list of sentences, each a list of (word, upos)
# tuples; each *_original value holds the matching sentence texts.
train_sentences, train_original = read_conllu(UD_ENGLISH_TRAIN)
val_sentences, val_original = read_conllu(UD_ENGLISH_DEV)
test_sentences, test_original = read_conllu(UD_ENGLISH_TEST)

Print statistics

In [None]:
# Sentence and token counts per split; tokens are counted by summing the
# sentence lengths instead of materialising a flattened list.
print("Tagged sentences in train set: ", len(train_sentences))
print("Tagged words in train set:", sum(len(sentence) for sentence in train_sentences))
print(40*'=')
print("Tagged sentences in dev set: ", len(val_sentences))
print("Tagged words in dev set:", sum(len(sentence) for sentence in val_sentences))
print(40*'=')
print("Tagged sentences in test set: ", len(test_sentences))
print("Tagged words in test set:", sum(len(sentence) for sentence in test_sentences))

In [None]:
# Show the first training sentence as a list of (word, upos) tuples.
print(train_sentences[0])

In [None]:
def tag_sequence(sentences):
    """Return, for every sentence, the list of its tags (words dropped)."""
    tags_per_sentence = []
    for sentence in sentences:
        tags_per_sentence.append([tag for _, tag in sentence])
    return tags_per_sentence

def text_sequence(sentences):
    """Return, for every sentence, the list of its words (tags dropped)."""
    words_per_sentence = []
    for sentence in sentences:
        words_per_sentence.append([word for word, _ in sentence])
    return words_per_sentence

def id2word(sentences):
    """Map each flat token position in *sentences* to its word form.

    Positions count tokens across all sentences in order (0 is the first
    token of the first sentence), so the keys line up with a flattened
    label or prediction list built from the same sentences.
    """
    # Flatten inline; the result no longer shadows the function's own name.
    words = [word for sentence in sentences for word, _ in sentence]
    return dict(enumerate(words))

def untag(tagged_sentence):
    """Return only the words of a ``[(word, tag), ...]`` sentence."""
    words = []
    for word, _tag in tagged_sentence:
        words.append(word)
    return words

def untag_pos(tagged_sentence):
    """Return only the tags of a ``[(word, tag), ...]`` sentence."""
    tags = []
    for _word, tag in tagged_sentence:
        tags.append(tag)
    return tags

def build_vocab(sentences):
    """Return the sorted list of distinct word forms found in *sentences*."""
    return sorted({word for sentence in sentences for word, _ in sentence})

An important observation: how many terms are in validation set and not found in train set? (This estimates the Out-of-vocabulary rate.)

In [None]:
# Vocabulary (sorted, duplicate-free) of the train and validation splits.
list_1 = build_vocab(train_sentences)
list_2 = build_vocab(val_sentences)
# Validation words unseen in training. A set difference avoids scanning the
# whole training vocabulary once per validation word; because list_2 is sorted
# and duplicate-free, sorting the difference yields the same list as filtering
# list_2 in order.
diff_list = sorted(set(list_2) - set(list_1))
print('Number of terms not found in train set:',len(diff_list))

# Feature Engineering

We use the "classical" machine learning approach: we will train a token classifier model. The classifier gets as input a feature vector describing each token in the sentence. We decide a priori which features are informative to make the tagging decision. In this case, we use a combination of "word shape" features which approximate morphological knowledge. We naturally also include lexical information (the token form itself), and some form of "syntactic knowledge" by adding reference to the previous and next word in each token feature vector.

In [None]:
def features_basic(sentence, index):
    """Build the feature dictionary of the token at *index* in *sentence*.

    sentence: [w1, w2, ...], index: the index of the word

    The features combine the token itself, word-shape cues (capitalisation,
    prefixes/suffixes, hyphen, digits) and the neighbouring tokens; at a
    sentence boundary the missing neighbour contributes empty strings.
    """
    word = sentence[index]
    is_first = index == 0
    is_last = index == len(sentence) - 1
    # An empty string stands in for the neighbour that does not exist.
    prev_word = '' if is_first else sentence[index - 1]
    next_word = '' if is_last else sentence[index + 1]
    rest = word[1:]

    features = {}
    features['nb_terms'] = len(sentence)
    features['word'] = word
    features['word_lower'] = word.lower()
    features['is_first'] = is_first
    features['is_last'] = is_last
    features['is_capitalized'] = word[0].upper() == word[0]
    features['is_all_caps'] = word.upper() == word
    features['is_all_lower'] = word.lower() == word
    features['prefix-1'] = word[0]
    features['prefix-2'] = word[:2]
    features['prefix-3'] = word[:3]
    features['suffix-1'] = word[-1]
    features['suffix-2'] = word[-2:]
    features['suffix-3'] = word[-3:]
    features['i-1_prefix-3'] = prev_word[:3]
    features['i-1_suffix-3'] = prev_word[-3:]
    features['i+1_prefix-3'] = next_word[:3]
    features['i+1_suffix-3'] = next_word[-3:]
    features['prev_word'] = prev_word.lower()
    features['next_word'] = next_word.lower()
    features['has_hyphen'] = '-' in word
    features['is_numeric'] = word.isdigit()
    features['capitals_inside'] = rest.lower() != rest
    return features

In the scikit-learn model, we model a dataset as a pair of two data structures:
* The list of feature dictionaries X (one feature dictionary for each token)
* The list of gold labels y (one tag for each token), which the model learns to predict

In [None]:
def transform_to_dataset(tagged_sentences):
    """Turn tagged sentences into parallel feature and label lists.

    Arguments
    ---------
    tagged_sentences: list of sentences, each a list of (word, tag) tuples

    Returns
    -------
    (X, y): X[k] is a one-element list wrapping the feature dictionary of the
    k-th token (callers unwrap it with ``x[0]``); y[k] is that token's tag.
    """
    X, y = [], []
    for tagged in tagged_sentences:
        # Strip the tags once per sentence rather than once per token.
        words = untag(tagged)
        for index, (_, tag) in enumerate(tagged):
            X.append([features_basic(words, index)])
            y.append(tag)
    return X, y

Given a sentence represented as a list of tokens, return the list of feature dictionaries using our feature encoding method:

In [None]:
def transform_test_sentence(sentence):
    """Return one single-element list ``[feature_dict]`` per token of *sentence*.

    *sentence* is a plain list of tokens (no tags); the wrapping mirrors the
    shape produced by ``transform_to_dataset``.
    """
    return [[features_basic(sentence, position)]
            for position in range(len(sentence))]

## Test untag()

We use untag() to extract raw sentences from the annotated CoNLL dataset. This way we can reproduce a sentence without tags, submit it to the tagger and compare predictions to the gold tags that are provided in the dataset.

In [None]:
# Display the words of the second test sentence with the gold tags removed.
untag(test_sentences[1])

# Dataset Transformation

Before we can apply a generic machine learning algorithm (such as Logistic Regression), we need to encode the dataset into a vectorized format.

We proceed in two steps: feature engineering and vectorization.

For each token, we create a dictionary of features that depend on the sentence from which the token is extracted. 
These features include the word itself, the word before and the word after, letter suffixes and prefixes, etc.

In the scikit-learn approach, before we can use a generic machine learning algorithm, we must then "vectorize" the feature dictionaries into vector encodings.
For example, lexical features are encoded into one-hot vectors whose dimension is the size of the vocabulary.
Note the difference between the method `fit_transform` of the vectorizer, which "learns" how to vectorize features, and `transform` which applies a learned vectorizer to feature dictionaries.  We use `fit_transform` on the training data, and `transform` on the other sections (validation and test).

These vector representations are what is passed to the machine learning algorithm.

In [None]:
def vectorize(train, val, test):
    """Encode the three tagged splits as sparse feature matrices.

    A ``DictVectorizer`` is fitted on the training feature dictionaries only
    and then applied unchanged to the validation and test splits.

    Arguments
    ---------
    train, val, test: lists of tagged sentences ([(word, tag), ...] each)

    Returns
    -------
    (X_train, y_train, X_val, y_val, X_test, y_test, v) where the X_* values
    are the vectorised matrices, the y_* values are the per-token tag lists
    and ``v`` is the fitted vectorizer (needed to encode new sentences).
    """
    print('Feature encoding method')
    print('Vectorizing Dataset...')
    print('Vectorizing train...')
    X_train, y_train = transform_to_dataset(train)
    v = DictVectorizer(sparse=True)
    # transform_to_dataset wraps each feature dict in a list; unwrap it here.
    X_train = v.fit_transform([x[0] for x in X_train])

    print('Vectorizing validation...')
    X_val, y_val = transform_to_dataset(val)
    X_val = v.transform([x[0] for x in X_val])

    print('Vectorizing test...')
    X_test, y_test = transform_to_dataset(test)
    X_test = v.transform([x[0] for x in X_test])

    print('Dataset vectorized.')
    print('Train shape:', X_train.shape)
    print('Validation shape:', X_val.shape)
    print('Test shape:', X_test.shape)

    # The matrices are returned exactly as the vectorizer produced them; the
    # former "compress" step only re-assigned each variable to itself.
    return X_train, y_train, X_val, y_val, X_test, y_test, v
    

# Baseline Tagger

Before we train a "real" machine learning algorithm using scikit-learn, we will repeat the very simple statistical method we discussed in class.
We train and evaluate the Baseline Unigram tagger to compare performance with the tagger we will train next.

In [None]:
# Fallback that assigns 'NOUN' to every token.
default_tagger = nltk.DefaultTagger('NOUN')
# Unigram tagger trained on train + validation; it backs off to the default
# tagger when it has no tag for a word.
unigram_tagger = nltk.UnigramTagger(train_sentences+val_sentences, backoff=default_tagger)

In [None]:
# Flat gold-tag lists (one entry per token) for train+validation and test.
y_train = [tag for sentence in train_sentences + val_sentences for _, tag in sentence]
y_test = [tag for sentence in test_sentences for _, tag in sentence]
# Sorted list of the distinct tags seen in train+validation.
classes = sorted(set(y_train))

In [None]:
def tag_list(tagged_sents):
    """Flatten tagged sentences into a single list of tags."""
    flat_tags = []
    for sent in tagged_sents:
        flat_tags.extend(tag for (word, tag) in sent)
    return flat_tags
def apply_tagger(tagger, corpus):
    """Re-tag every sentence of *corpus* with *tagger*, ignoring its gold tags."""
    retagged = []
    for sent in corpus:
        words = nltk.tag.untag(sent)
        retagged.append(tagger.tag(words))
    return retagged

In [None]:
# Flat list of baseline predictions for the test set, aligned with y_test.
y_pred = tag_list(apply_tagger(unigram_tagger, test_sentences))
print("Accuracy: {0:.4f}".format(unigram_tagger.accuracy(test_sentences)))
# zero_division=1 is passed explicitly for classes whose score is undefined.
print('f1-macro score: {0:.4f}'.format(f1_score(y_test, y_pred, zero_division=1, average='macro')))

In [None]:
# Per-tag precision / recall / f1 of the baseline tagger on the test set.
print(classification_report(y_test, y_pred, zero_division=1, digits=4))

See what errors the Baseline tagger makes:

In [None]:
def check_tagger(tagged_sentence):
    """Print the unigram tagger's output next to the gold tags of a sentence.

    Each line shows the predicted ``(word, tag)`` pair, the gold tag and an
    error marker when the two tags differ.
    """
    predicted_pairs = unigram_tagger.tag(untag(tagged_sentence))
    gold_tags = untag_pos(tagged_sentence)
    for predicted, gold in zip(predicted_pairs, gold_tags):
        note = '<<--- Error!' if predicted[1] != gold else ''
        print(predicted, gold, note)

In [None]:
# Inspect the baseline's predictions on the second test sentence.
check_tagger(test_sentences[1])

It makes mistakes. Unsurprising given its simplistic approach and the small size of the training data.

In [None]:
# Confusion matrix of the baseline predictions (gold tags first, then
# predictions) and the name used in the plot title.
cnf_matrix = confusion_matrix(y_test, y_pred)
name='Baseline'

In [None]:
# Use the same zero_division setting as the f1-macro score printed for the
# baseline above, so the caption of the plot reports the same number.
plot_confusion_matrix(f1_score(y_test, y_pred, zero_division=1, average='macro'), cnf_matrix, target_names=classes, title='Confusion matrix for '+name+' classifier', normalize=False)

# Train a Logistic Regression Model

Let us move to a more serious machine learning model. We will train a Logistic Regression model using our feature extraction function based on our "expertise" in the domain.

We first transform the whole dataset from the CoNLL format into the scikit-learn vectorized encoding using our feature extraction method.

In [None]:
# Vectorise all three splits; `vec` is the vectorizer fitted on the training
# split and is reused later to encode new sentences. This rebinds y_train and
# y_test, which the baseline cells defined earlier.
X_train, y_train, X_val, y_val, X_test, y_test, vec = vectorize(train_sentences, val_sentences, test_sentences)

## Hyper-parameter Tuning

The Logistic Regression algorithm uses a hyper-parameter called C.  We tune the value of this parameter by testing different values on a subset of the training data and observing the impact of the C parameter on selected metrics (accuracy and F1).

Because we will use cross-validation, we can use the full train set (train + validation)

In [None]:
# Stack the validation rows under the training rows and append the validation
# labels in the same order, so row k of X_train still matches y_train[k].
X_train = vstack((X_train, X_val))
y_train = np.append(y_train, y_val, axis=0)

In [None]:
# Shape of the merged feature matrix.
X_train.shape

In [None]:
# Shape of the merged label array; its length should equal the number of rows of X_train.
y_train.shape

In [None]:
def hyper_tuning(train, test, y_train, y_test, scores, estimator, parameters, cv):
    """Grid-search *parameters* for *estimator* once per scoring metric.

    For every entry of *scores* a ``GridSearchCV`` is fitted on
    ``(train, y_train)``; the best parameters, the per-candidate CV scores
    and a classification report on ``(test, y_test)`` are printed.

    Arguments
    ---------
    train, y_train: data used for the cross-validated search
    test, y_test:   held-out data used only for the printed report
    scores:         iterable of scoring metrics (e.g. ['accuracy', 'f1_macro'])
    estimator:      estimator to tune
    parameters:     parameter grid passed to ``GridSearchCV``
    cv:             cross-validation strategy passed to ``GridSearchCV``

    Returns
    -------
    The fitted ``GridSearchCV`` of the LAST metric in *scores* only.

    Raises
    ------
    ValueError: if *scores* is empty (there would be nothing to return).
    """
    print("# Estimator:",estimator)
    clf = None
    for score in scores:
        print("# Tuning hyper-parameters for %s" % score)
        clf = GridSearchCV(estimator, parameters, cv=cv, scoring=score)
        clf.fit(train, y_train)
        print("Best parameters set found on development set:")
        print(clf.best_params_)
        print()
        print("Grid scores on development set:")
        means = clf.cv_results_['mean_test_score']
        stds = clf.cv_results_['std_test_score']
        for mean, std, params in zip(means, stds, clf.cv_results_['params']):
            # The spread is reported as two standard deviations.
            print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))
        print()
        print("Detailed classification report:")
        print("The model is trained on the full development set.")
        print("The scores are computed on the full evaluation set.")
        y_pred = clf.predict(test)
        print(classification_report(y_test, y_pred, digits=4))
        print('Accuracy: {0:0.4f}   f1-score: {1:0.4f} \n'.format(accuracy_score(y_test, y_pred), f1_score(y_test, y_pred, average='macro')))
    if clf is None:
        # Previously this surfaced as an UnboundLocalError on the return.
        raise ValueError("scores must contain at least one scoring metric")
    return clf

In [None]:
# Base estimator whose regularisation strength C is tuned below.
logregr = LogisticRegression(solver='liblinear', random_state=13)
# Cross validation strategy
skf = StratifiedKFold(n_splits=4)
# Scores could also be ['precision', 'recall', ....]
scores = ['accuracy', 'f1_macro']

# Candidate values of C explored by the grid search.
params = [{'C': [0.1, 1, 2, 3, 4, 5, 10, 20, 50]}]
# Full-size search (slow). To run it, uncomment the next line and comment out
# the reduced run at the end of this cell:
#logregr = hyper_tuning(X_train, X_test, y_train, y_test, scores, logregr, params, skf)
# Reduced run on the first 500 training rows and 50 test rows: it only
# demonstrates how the tuning works without a long wait.
skf = StratifiedKFold(n_splits=4)
# NOTE: after this line `logregr` is the fitted GridSearchCV returned by
# hyper_tuning, no longer the bare LogisticRegression.
logregr = hyper_tuning(X_train[:500], X_test[:50], y_train[:500], y_test[:50], scores, logregr, params, skf)

## Training

We can now train the model using the best hyper-parameter selected above.  This takes a few minutes.

In [None]:
# Fit the final model on train + validation and report the wall-clock time.
t_ini = datetime.datetime.now()
print('Training...')
# C=20 is hard-coded here; presumably the value chosen by the full grid
# search above (verify after re-running the tuning).
clf = LogisticRegression(C=20, solver='liblinear', random_state=13)
clf.fit(X_train, y_train)
t_fin = datetime.datetime.now()
print('Training completed in {} seconds'.format((t_fin - t_ini).total_seconds()))

# Evaluation

In [None]:
# Predicted tag for every test token; rebinds the baseline's y_pred.
y_pred = clf.predict(X_test)

In [None]:
# Token-level accuracy and macro-averaged F1 of the LR tagger on the test set.
print("Accuracy: {0:.4f}".format(clf.score(X_test, y_test)))
print('f1-macro score: {0:.4f}'.format(f1_score(y_test, y_pred, average='macro')))

## Classification Report

In [None]:
# Per-tag precision / recall / f1; the predictions are recomputed here, so
# this cell does not depend on the y_pred variable.
print(classification_report(y_test, clf.predict(X_test), digits=4))

## Frequent Types of Mistakes

In [None]:
import collections

# Word form of every test token, keyed by its flat position, so that it lines
# up with the rows of X_test and the entries of y_test / y_pred.
i2w = id2word(test_sentences)
# Count, per word form, how often the prediction differs from the gold tag.
error_counter = collections.Counter(
    i2w[i] for i in range(X_test.shape[0]) if y_pred[i] != y_test[i]
)
# Total number of mis-tagged tokens.
k = sum(error_counter.values())
print('Accuracy: {0:.4f}'.format((len(i2w)-k)/len(i2w)))
print('Total errors/Total words: {}/{}\n'.format(k,len(i2w)))
print('Most common errors:',error_counter.most_common(20))

## Learning Curves

The following diagram illustrates the "training dynamics" of the LR model: how its accuracy changes as the number of training examples grows. Initially, the gap between the training score and the cross-validation score (computed on held-out folds of the training data) is large; as more training examples are used, the gap narrows. This diagram is important to verify we do not have a case of over-fitting - where the model does "very well" on the training data and does not improve on held-out data.  

This computation takes a long time (the model is retrained and evaluated several times to obtain the points of the curve). The rest of the notebook does not depend on it, so you can safely skip it.

In [None]:
#plot_learning_curve(clf, 'Logistic Regression', X_train, y_train, ylim=(0.7, 1.01), cv=5)

## Confusion Matrix

In [None]:
# Recompute the predictions so this cell can run on its own, then plot the
# raw-count confusion matrix with the macro F1 in the caption.
y_pred = clf.predict(X_test)
f1 = f1_score(y_test, y_pred, average='macro')
cnf_matrix = confusion_matrix(y_test, y_pred)
name='Logistic Regression'
# `classes` is the sorted tag list built in the baseline section.
plot_confusion_matrix(f1, cnf_matrix, target_names=classes, title='Confusion matrix for '+name+' classifier', normalize=False)

## Tag an Unknown Sentence

Let's try our trained tagger on new sentences.  To tag a sentence given as a string, we must apply the following steps:
* Tokenize the string into a list of tokens
* Turn each token into a features dictionary (using the features used by our model)
* Turn the list of feature dictionaries into vectors (using scikit-learn vectorization method)
* Pass the resulting matrix (one row vector for each token) to the classifier.

In [None]:
# Download the 'punkt_tab' tokenizer data for NLTK; nltk.word_tokenize is
# used in the cells below.
nltk.download('punkt_tab')

In [None]:
# Step 1: split the raw string into a list of tokens.
tokens = nltk.word_tokenize('Word embeddings provide a dense representation of words and their relative meanings.')
print(tokens)

In [None]:
# Step 2: one single-element list [feature_dict] per token.
X_features = transform_test_sentence(tokens)

In [None]:
# Display the feature dictionaries built for the sentence.
X_features

In [None]:
# Step 3: unwrap each feature dict and encode it with the vectorizer that was
# fitted on the training data (transform only, no re-fitting).
X_vectorized = vec.transform([x[0] for x in X_features])

In [None]:
# The vectorized sentence is a sparse matrix with one row per token and one
# column per vectorized feature.
# For example, if the vocabulary has 1000 unique words, the 'word' feature
# alone contributes 1000 one-hot columns.
# Most of the values in each row are therefore zero.
X_vectorized.shape

In [None]:
# Step 4: predict one tag per row (token).
pred = clf.predict(X_vectorized)

In [None]:
# The predicted tags may be displayed as np.str_ values: np.str_ is the NumPy
# string scalar type, a subclass of str, so they behave like ordinary strings.
print('Here is what our LR tagger predicts for the test sentence:\n',list(zip(tokens, pred)))

Let's turn this process into a prediction function from a sentence encoded as a single string to a list of pairs (token, predicted_tag):

In [None]:
def predict_sentence(sentence):
    """Tag a raw sentence string and return ``[(token, predicted_tag), ...]``.

    The string is tokenized with ``nltk.word_tokenize``, each token is turned
    into a feature dictionary, encoded with the module-level vectorizer
    ``vec`` and classified by the module-level model ``clf``.
    """
    words = nltk.word_tokenize(sentence)
    feature_dicts = [wrapped[0] for wrapped in transform_test_sentence(words)]
    predicted_tags = clf.predict(vec.transform(feature_dicts))
    return list(zip(words, predicted_tags))

In [None]:
# Try the end-to-end prediction function on a long, comma-heavy sentence.
predict_sentence("Let me join the chorus of annoyance over Google's new toolbar , which, as noted in the linked article, commits just about every sin an online marketer could commit, and makes up a few new ones besides.")

# Collect Hard Sentences

Hard sentences are sentences that contain multiple wrongly predicted tags given our classifier.

Write code to collect hard sentences given a classifier clf.

In [None]:
def errors_in_sentence_prediction(clf, tagged_sentence):
    """Count the tagging errors *clf* makes on one gold-tagged sentence.

    The sentence's words are featurised, encoded with the module-level
    vectorizer ``vec`` and classified by *clf*.

    Returns ``(errors, pred)``: the number of tokens whose predicted tag
    differs from the gold tag, and the predicted tags themselves.
    """
    words, gold_tags = [], []
    for word, gold in tagged_sentence:
        words.append(word)
        gold_tags.append(gold)

    feature_dicts = [wrapped[0] for wrapped in transform_test_sentence(words)]
    pred = clf.predict(vec.transform(feature_dicts))

    errors = sum(1 for gold, guess in zip(gold_tags, pred) if gold != guess)
    return errors, pred

In [None]:
# Collect every test sentence with at least one wrong tag as a tuple
# (tagged_sentence, number_of_errors, predicted_tags, index_in_test_set).
hard_sentences = []
idx = 0
for s in test_sentences:
    errors, pred = errors_in_sentence_prediction(clf, s)
    if errors > 0:
        hard_sentences.append((s, errors, pred, idx))
    # idx tracks the position of s within test_sentences.
    idx += 1
print(f'Number of sentences with errors: {len(hard_sentences)} out of {len(test_sentences)}')

Draw a histogram showing how the sentences in the test dataset are distributed in terms of prediction errors per sentence.

In [None]:
# Draw a histogram of the number of errors per sentence
plt.figure(figsize=(10, 6))
plt.hist([x[1] for x in hard_sentences], bins=16, edgecolor='black')
plt.title('Number of errors per sentence')
plt.xlabel('Number of errors')
plt.ylabel('Number of sentences')
plt.grid(axis='y', alpha=0.75)
plt.show()

In [None]:
# Show prediction errors for sentences with more than 5 errors.
for tagged, n_errors, predicted, _ in hard_sentences:
    if n_errors <= 5:
        continue
    print(" ".join(untag(tagged)))
    print('Number of errors:', n_errors)
    # One line per token: C is the correct (gold) tag, P the predicted tag.
    for (word, gold), guess in zip(tagged, predicted):
        if gold != guess:
            print(f'{word:<20}  C: {gold:<12}  P: {guess:<12} **** Error')
        else:
            print(f'{word:<23}  {gold:<12}')
    print(40*'=')


## Error Analysis

1. Identify tokens that are misclassified more than 10 times in the test set. Print the sentences where the errors are predicted (about 100 sentences).
2. Provide a possible reason why these errors are made by the tagger based on your understanding of the knowledge needed to correctly tag these tokens.
3. Based on this error analysis, invent five sentences that are badly tagged. Explain the method you used to create these hard examples.


In [None]:
# Map each mis-tagged word form to the positions (within hard_sentences) of
# the sentences where it was mis-tagged; a position is appended once per
# error, so a sentence can appear several times for the same word.
# NOTE: this rebinds `tokens`, which an earlier cell used for a token list.
tokens = collections.defaultdict(list)
# sen_ids[i] is the space-joined text of hard_sentences[i].
sen_ids = []
for i, s in enumerate(hard_sentences):
    words = [word for word, _ in hard_sentences[i][0]]
    full_s = ' '.join(words)
    sen_ids.append(full_s)
    for j in range(len(s[0])):
        # s[0][j] is the gold (word, tag) pair, s[2][j] the predicted tag.
        if s[0][j][1] != s[2][j]:
            tokens[s[0][j][0]].append(i)

# Distinct sentences that contain at least one frequently mis-tagged word.
sentences = set()
print("Tokens with more than 10 errors:")
for token, lst in tokens.items():
    if len(lst) > 10:
        print(token)
        for index in lst:
            sentences.add(sen_ids[index])

print("Total amount of sentences:", len(sentences))

# The sentences are stored in a set, so their print order is not fixed.
print("Sentences where the errors are predicted:\n")
for s in sentences:
    print(s)

## 2.1


In [None]:
# --- Imports ---
import os
from openai import OpenAI
from pydantic import BaseModel, Field
from enum import Enum
from typing import List, Optional
import time

# Name of the Grok model.
# NOTE(review): tag_sentences_ud below passes the literal "grok-3" to the API
# rather than this variable; confirm which model is intended.
model = 'grok-3-mini'

In [None]:
# --- Define the Universal Dependencies POS Tagset (17 core tags) as an enum ---
class UDPosTag(str, Enum):
    # Closed set of the 17 UD part-of-speech tags. Mixing in `str` makes every
    # member compare equal to its plain tag string (UDPosTag.NOUN == "NOUN").
    # Documented with comments rather than a docstring because a docstring
    # could end up in the schema generated for the structured-output models.
    ADJ = "ADJ"     # adjective
    ADP = "ADP"     # adposition
    ADV = "ADV"     # adverb
    AUX = "AUX"     # auxiliary verb
    CCONJ = "CCONJ" # coordinating conjunction
    DET = "DET"     # determiner
    INTJ = "INTJ"   # interjection
    NOUN = "NOUN"   # noun
    NUM = "NUM"     # numeral
    PART = "PART"   # particle
    PRON = "PRON"   # pronoun
    PROPN = "PROPN" # proper noun
    PUNCT = "PUNCT" # punctuation
    SCONJ = "SCONJ" # subordinating conjunction
    SYM = "SYM"     # symbol
    VERB = "VERB"   # verb
    X = "X"         # other / unknown


In [None]:
# TODO Define more Pydantic models for structured output
class TokenPOS(BaseModel):
    # One token together with its UD tag; pos_tag is restricted to UDPosTag.
    text: str = Field(description="The token text")
    pos_tag: UDPosTag = Field(description="The Universal Dependencies POS tag")

class SentencePOS(BaseModel):
    # One tagged sentence: its tokens in order.
    tokens: List[TokenPOS] = Field(description="List of tokens with their POS tags")

class TaggedSentences(BaseModel):
    """Represents a list of sentences with their tagged tokens."""
    # Top-level structured-output type: tag_sentences_ud passes this class as
    # the response_format of the API call.
    sentences: List[SentencePOS] = Field(description="A list of sentences, each containing tagged tokens.")

# --- Configure the Grok API ---
# Get a key https://console.x.ai/team 
# Use os.environ.get for production environments.
# For Colab/AI Studio, you might use userdata.get
# Example:
# from google.colab import userdata
# GROK_API_KEY = userdata.get('GROK_API_KEY')
# genai.configure(api_key=GROK_API_KEY)

# Make sure to replace "YOUR_API_KEY" with your actual key if running locally
# and not using environment variables or userdata.
def load_env_from_ini(filename):
    """Copy ``KEY=VALUE`` lines from *filename* into ``os.environ``.

    Blank lines and lines starting with ``#`` are ignored, and so are lines
    without an ``=`` (for example an INI section header), which previously
    aborted the whole load. Only the first ``=`` separates key from value;
    surrounding whitespace is stripped from both. Existing environment
    variables with the same name are overwritten.

    Raises ``OSError`` if the file cannot be opened.
    """
    with open(filename, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line or line.startswith('#'):
                continue
            key, separator, value = line.partition('=')
            if not separator:
                continue
            os.environ[key.strip()] = value.strip()


# The ini file is optional: when it is missing or unreadable, fall back to a
# GROK_API_KEY that is already present in the environment instead of
# abandoning the client setup.
try:
    load_env_from_ini("grok_key.ini")
except (OSError, UnicodeDecodeError) as e:
    print(f"Note: could not load grok_key.ini ({e}); relying on the GROK_API_KEY environment variable.")

try:
    api_key = os.environ.get("GROK_API_KEY")
    if not api_key:
        # Fallback or specific instruction for local setup
        # Replace with your actual key if needed, but environment variables are safer
        api_key = "YOUR_API_KEY"
        if api_key == "YOUR_API_KEY":
           print("⚠️ Warning: API key not found in environment variables. Using placeholder.")
           print("   Please set the GROK_API_KEY environment variable or replace 'YOUR_API_KEY' in the code.")

    client = OpenAI(
        api_key=api_key,
        base_url="https://api.x.ai/v1",
    )

except Exception as e:
    # Top-level notebook boundary: report the problem and keep the kernel alive.
    print(f"Error configuring API: {e}")
    print("Please ensure you have a valid API key set.")
    # Depending on the environment, you might want to exit here
    # import sys
    # sys.exit(1)



In [None]:
import json  # Add this import if not already present

def tag_sentences_ud(sentences_json: str) -> Optional[TaggedSentences]:
    """
    Performs POS tagging on the input list of sentences using the Grok API and
    returns the result structured according to the TaggedSentences Pydantic model.

    Args:
        sentences_json: JSON string containing one or more sentences to tag

    Returns:
        The ``parsed`` field of the first completion choice: a TaggedSentences
        object, or presumably None when the response carries no parsed
        content (TODO confirm against the client library).

    Note:
        Errors raised by the API call are not caught here; they propagate to
        the caller.
    """
    # Construct the prompt with JSON input format
    prompt = f"""You are a specialized POS tagger following Universal Dependencies (UD_English-EWT) conventions that outputs precise structured JSON.
    
    I will provide you with a JSON array of sentences to tag. Process each sentence separately.
    
    Tag each token with Universal Dependencies (UD) POS tags:
    ADJ=adjective, ADP=adposition, ADV=adverb, AUX=auxiliary, CCONJ=coordinating conjunction, DET=determiner, 
    INTJ=interjection, NOUN=noun, NUM=numeral, PART=particle, PRON=pronoun, PROPN=proper noun, 
    PUNCT=punctuation, SCONJ=subordinating conjunction, SYM=symbol, VERB=verb, X=other
    
    Rules:
    - Split on whitespace and punctuation (except in URLs, numbers, abbreviations)
    - Split contractions: "don't" → ["Do", "n't"], "it's" → ["It", "'s"]
    - Separate possessives: "Elena's" → ["Elena", "'s"]
    - Split hyphenated compounds: "search-engine" → ["search", "-", "engine"]
    - Keep punctuation as separate tokens
    - Preserve numbers with internal periods/commas (e.g., 3.14, 1,000)
    - Do not merge words except for contractions/clitics
    - if there is a sentence equal to "..." tag it as "PUNCT", don't ignore it.
    - make sure you don't skip any tokens or strings. tag all tokens.
    
    Input JSON array of sentences:
    {sentences_json}
    """

    # The whole prompt, including the input sentences, is sent as a single
    # system message; response_format asks the client to parse the reply into
    # a TaggedSentences instance.
    # NOTE(review): the model name is the literal "grok-3", not the
    # module-level `model` variable ('grok-3-mini'); confirm which is intended.
    completion = client.beta.chat.completions.parse(
        model="grok-3",
        messages=[
            {"role": "system", "content": prompt}
        ],
        response_format=TaggedSentences,
    )
    
    res = completion.choices[0].message.parsed
    return res



In [None]:
# NOTE(review): `sleep_and_retry` and `limits` are not imported in the cells
# shown above; presumably they come from the third-party `ratelimit` package
# (confirm and import them before running this cell).
@sleep_and_retry
@limits(calls=5, period=1)  # 5 calls per second
def rate_limited_tag(sentences_json: str) -> Optional[TaggedSentences]:
    """Rate-limited wrapper around ``tag_sentences_ud`` (same argument and result)."""
    return tag_sentences_ud(sentences_json)

def batch_tag_sentences_ud(sentences: List[str], batch_size: int = 15) -> "List[Optional[TaggedSentences]]":
    """Tag *sentences* in batches, running the batches in parallel threads.

    The sentences are cut into consecutive batches of at most *batch_size*
    items; each batch is serialised as a JSON array and submitted to
    ``rate_limited_tag`` on a pool of 10 worker threads.

    Args:
        sentences: raw sentence strings to tag
        batch_size: maximum number of sentences per request (must be >= 1)

    Returns:
        One entry per batch, in batch order: the result returned for that
        batch, or None when the request raised an exception or returned
        nothing. Callers must therefore be prepared for None entries.

    Raises:
        ValueError: if *batch_size* is smaller than 1.
    """
    # Imported locally so the function does not depend on a file-level import.
    import concurrent.futures

    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    # One JSON array string per batch.
    batches = [
        json.dumps(sentences[start:start + batch_size])
        for start in range(0, len(sentences), batch_size)
    ]

    print(f"Processing {len(batches)} batches with parallel workers...")
    results = [None] * len(batches)

    # Use ThreadPoolExecutor to process batches in parallel
    with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
        # Remember which batch each future belongs to, because futures
        # complete in arbitrary order.
        future_to_batch = {executor.submit(rate_limited_tag, batch_json): i
                           for i, batch_json in enumerate(batches)}

        # Process results as they complete
        for future in concurrent.futures.as_completed(future_to_batch):
            batch_idx = future_to_batch[future]
            try:
                result = future.result()
            except Exception as e:
                # Best effort: a failed batch leaves its slot as None.
                print(f"✗ Error processing batch {batch_idx+1}: {e}")
                continue
            if result:
                results[batch_idx] = result
                print(f"✓ Completed batch {batch_idx+1}/{len(batches)}")
            else:
                print(f"✗ Failed batch {batch_idx+1}/{len(batches)}")

    return results


In [None]:
# Keep the sentences on which the LR tagger made between 1 and 3 errors.
lr_hard_sentences = [s for s in hard_sentences if 1 <= s[1] <= 3]
# Rebuild each sentence as a space-joined string of its gold tokens.
# NOTE: this rebinds `sentences`, used as a set in the error-analysis cell.
sentences = [" ".join(untag(pairs)) for pairs, _, _ , _ in lr_hard_sentences]
# One result per batch of 5 sentences; an entry is None if its batch failed.
results_llm = batch_tag_sentences_ud(sentences, batch_size=5)


In [None]:
# Module-level list, initialised empty; presumably filled during the LLM
# evaluation below (confirm where it is populated).
llm_hard_sentences = []


def evaluate_llm_tagger(results_llm, lr_hard_sentences, batch_size=5):
    """
    Evaluates the LLM tagger results against ground truth, handling potential misalignments.
    
    Args:
        results_llm: List of TaggedSentences results from the LLM
        lr_hard_sentences: List of (sentence, errors, lr_tags, idx) tuples with ground truth
        batch_size: The batch size used when processing sentences
    
    Returns:
        Metrics for the LLM tagger performance
    """
    import collections
    from sklearn.metrics import classification_report, accuracy_score, f1_score, confusion_matrix
    
    # Filter out None results
    results_llm = [r for r in results_llm if r is not None]
    
    # Create batch-aware mapping of results
    batched_sentences = [lr_hard_sentences[i:i+batch_size] for i in range(0, len(lr_hard_sentences), batch_size)]
    
    # Initialize empty lists for predictions and words
    llm_pred = []
    llm_words = []
    sentence_mapping = {}  # Maps original index to result index
    
    print(f"Type of results_llm[0]: {type(results_llm[0]) if results_llm else 'No results'}")
    
    # Process each batch separately
    for batch_idx, batch in enumerate(batched_sentences):
        if batch_idx >= len(results_llm):
            print(f"Missing results for batch {batch_idx}")
            continue
            
        batch_result = results_llm[batch_idx]
        
        # Check if it has sentences attribute
        if hasattr(batch_result, 'sentences'):
            result_sentences = batch_result.sentences
        elif hasattr(batch_result, 'tokens'):
            # Handle single-sentence result case
            result_sentences = [batch_result]
        else:
            print(f"Unknown structure for batch {batch_idx}: {dir(batch_result)}")
            continue
        
        # Check if we got the expected number of sentences
        if len(result_sentences) != len(batch):
            print(f"Warning: Batch {batch_idx} returned {len(result_sentences)} sentences, expected {len(batch)}")
        
        # Map each input sentence to its result (or None if missing)
        for i, (s, errors, lr_tags, idx) in enumerate(batch):
            orig_idx = batch_idx * batch_size + i  # Original index in lr_hard_sentences
            
            if i < len(result_sentences):
                # We have a result for this sentence
                pred = [token.pos_tag for token in result_sentences[i].tokens]
                words = [token.text for token in result_sentences[i].tokens]
                llm_pred.append(pred)
                llm_words.append(words)
                sentence_mapping[orig_idx] = len(llm_pred) - 1
            else:
                # No result for this sentence
                print(f"No prediction available for sentence {orig_idx}")
                # Don't add to llm_pred/llm_words, but record this in mapping
                sentence_mapping[orig_idx] = None
    
    # Only proceed with metrics if we have predictions
    if len(llm_pred) > 0:
        # Initialize counters and lists
        global llm_hard_sentences
        llm_hard_sentences = []
        fixed_by_llm = 0
        new_errors_by_llm = 0
        llm_error_data = []
        mismatches = 0
        
        # For metrics calculation
        all_true_tags = []
        all_pred_tags = []
        all_words = []
        
        for i, (s, errors, lr_tags, idx) in enumerate(lr_hard_sentences):
            # Use the mapping to get the correct prediction index
            if i not in sentence_mapping or sentence_mapping[i] is None:
                print(f"Skipping sentence {i} - no prediction available")
                continue
                
            pred_idx = sentence_mapping[i]
            words = [word for word, _ in s]
            true_tags = [true_tag for _, true_tag in s]
            pred_tags = llm_pred[pred_idx]
            
            # Sanity check: length of tokens should match
            if len(true_tags) != len(pred_tags) or len(pred_tags) != len(words):
                print(f"Length mismatch in sentence {i}: true={len(true_tags)}, pred={len(pred_tags)}")
                # print(llm_words[pred_idx])
                # print(words)
                mismatches += 1
                continue
            
            # Add to the collections for metrics calculations
            all_true_tags.extend([tag.value if hasattr(tag, 'value') else str(tag) for tag in true_tags])
            all_pred_tags.extend([tag.value if hasattr(tag, 'value') else str(tag) for tag in pred_tags])
            all_words.extend(words)


            llm_errors = 0
            for j in range(len(words)):
                if true_tags[j] != pred_tags[j]:
                    llm_errors += 1
                    llm_error_data.append((words[j], true_tags[j], pred_tags[j]))
                    
                lr_wrong = lr_tags[j] != true_tags[j]
                llm_wrong = pred_tags[j] != true_tags[j]
                
                if lr_wrong and not llm_wrong:
                    fixed_by_llm += 1
                if not lr_wrong and llm_wrong:
                    new_errors_by_llm += 1
                    
            if llm_errors > 0:
                llm_hard_sentences.append((s, llm_errors))
        
        # Print comparison metrics
        print(f'Number of sentences with errors (llm): {len(llm_hard_sentences)} out of {len(lr_hard_sentences)}')
        print(f'Number of sentences with errors (lr): {len(lr_hard_sentences)}')
        print(f"✅ Errors fixed by LLM: {fixed_by_llm}")
        print(f"⚠️ New errors made by LLM: {new_errors_by_llm}")
        print(f"❗️ Mismatches in sentence length: {mismatches}")
        
        # Calculate standard metrics
        if len(all_true_tags) > 0:
            print("\n--- LLM Tagger Token-Level Metrics ---")
            llm_accuracy = accuracy_score(all_true_tags, all_pred_tags)
            llm_f1_macro = f1_score(all_true_tags, all_pred_tags, average='macro')
            print(f"Accuracy: {llm_accuracy:.4f}")
            print(f"F1-macro score: {llm_f1_macro:.4f}")
            
            # Generate classification report
            print("\nClassification Report for LLM Tagger:")
            print(classification_report(all_true_tags, all_pred_tags, digits=4))
            
            # Generate confusion matrix
            classes = sorted(list(set(all_true_tags)))
            cnf_matrix = confusion_matrix(all_true_tags, all_pred_tags)
            f1 = f1_score(all_true_tags, all_pred_tags, average='macro')
            plot_confusion_matrix(f1, cnf_matrix, target_names=classes, 
                                title='Confusion matrix for LLM Tagger', normalize=False)
            
            # Count and display most common errors
            error_counter = collections.Counter()
            total_errors = 0
            for i in range(len(all_true_tags)):
                if all_true_tags[i] != all_pred_tags[i]:
                    total_errors += 1
                    word = all_words[i]
                    error_counter[word] += 1
            
            print("\nFrequent Types of Mistakes:")
            print(f'Accuracy: {(len(all_true_tags)-total_errors)/len(all_true_tags):.4f}')
            print(f'Total errors/Total words: {total_errors}/{len(all_true_tags)}\n')
            print('Most common errors:', error_counter.most_common(20))
            
            return {
                'accuracy': llm_accuracy,
                'f1_macro': llm_f1_macro,
                'fixed_by_llm': fixed_by_llm,
                'new_errors_by_llm': new_errors_by_llm,
                'mismatches': mismatches,
                'error_data': llm_error_data,
                'hard_sentences': llm_hard_sentences
            }
        else:
            print("No data available to calculate metrics")
            return None
    else:
        print("No LLM predictions available to calculate metrics")
        return None


In [None]:
import json

import concurrent
from ratelimit import limits, sleep_and_retry 

def tag_pretokenized_sentences(tokenized_sentences_json: str) -> Optional[TaggedSentences]:
    """
    Ask the grok-3 model to POS-tag sentences whose tokenization is already fixed.

    Args:
        tokenized_sentences_json: JSON string containing a list of tokenized sentences,
                                  where each sentence is a list of tokens

    Returns:
        The parsed message of the first completion choice, requested in the
        TaggedSentences response format.

    Note:
        No exception is caught here; any error raised by the client call
        propagates to the caller.
    """
    # The token list is embedded verbatim at the end of the instruction text.
    prompt = f"""You are a specialized POS tagger following Universal Dependencies (UD_English-EWT) conventions.
    
    I will provide you with a JSON array where each element is a pre-tokenized sentence (a list of tokens).
    Your task is ONLY to assign the correct Universal Dependencies POS tag to each token.
    DO NOT modify the tokenization in any way - use exactly the tokens provided.
    
    Tag each token with one of these Universal Dependencies POS tags:
    ADJ=adjective, ADP=adposition, ADV=adverb, AUX=auxiliary, CCONJ=coordinating conjunction, DET=determiner,
    INTJ=interjection, NOUN=noun, NUM=numeral, PART=particle, PRON=pronoun, PROPN=proper noun,
    PUNCT=punctuation, SCONJ=subordinating conjunction, SYM=symbol, VERB=verb, X=other
    
    Important guidelines:
    - DO NOT change, merge, or split any tokens - use exactly the tokens provided
    - If a token is "..." tag it as PUNCT
    - Tag all tokens - do not skip any
    - Contractions like "n't" should be tagged as PART
    - Possessive markers like "'s" should be tagged as PART
    - Proper nouns (names of specific entities) should be tagged as PROPN
    - Auxiliary verbs (be, have, do, will, etc.) should be tagged as AUX
    
    Input JSON array of pre-tokenized sentences:
    {tokenized_sentences_json}
    """

    request_messages = [{"role": "system", "content": prompt}]
    completion = client.beta.chat.completions.parse(
        model="grok-3",
        messages=request_messages,
        response_format=TaggedSentences,
    )

    return completion.choices[0].message.parsed

@sleep_and_retry
@limits(calls=5, period=1)  # 5 calls per second
def rate_limited_tag_pretokenized(tokenized_sentences_json: str) -> Optional[TaggedSentences]:
    """Throttled front door to ``tag_pretokenized_sentences``.

    The decorators cap this wrapper at 5 calls per 1-second period; the
    argument is forwarded unchanged and the tagger's result is returned as-is.
    """
    tagged = tag_pretokenized_sentences(tokenized_sentences_json)
    return tagged

def batch_tag_pretokenized_sentences(tokenized_sentences: List[List[str]], batch_size: int = 15,
                                     max_workers: int = 10) -> List[Optional["TaggedSentences"]]:
    """
    Process pre-tokenized sentences in batches using JSON formatting

    Args:
        tokenized_sentences: List of tokenized sentences, where each sentence is a list of tokens
        batch_size: Number of sentences to process in each batch (must be >= 1)
        max_workers: Size of the thread pool used to issue requests

    Returns:
        One entry per batch, in input order. An entry is the value returned by
        ``rate_limited_tag_pretokenized`` for that batch, or ``None`` when the
        call returned a falsy result or raised (the failure is printed, not
        re-raised).

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    # Local imports: a bare ``import concurrent`` does not load the
    # ``concurrent.futures`` submodule, so import it explicitly here.
    import concurrent.futures
    import json

    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    # Each batch is serialised as a JSON array of token lists.
    batches = [
        json.dumps(tokenized_sentences[start:start + batch_size])
        for start in range(0, len(tokenized_sentences), batch_size)
    ]

    print(f"Processing {len(batches)} batches of pre-tokenized sentences...")
    # Pre-sized so results can be stored by batch index even though futures
    # complete out of order.
    results = [None] * len(batches)

    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        future_to_batch = {
            executor.submit(rate_limited_tag_pretokenized, batch_json): batch_idx
            for batch_idx, batch_json in enumerate(batches)
        }

        for future in concurrent.futures.as_completed(future_to_batch):
            batch_idx = future_to_batch[future]
            try:
                result = future.result()
            except Exception as e:
                # Best effort: one failing batch must not abort the others.
                print(f"✗ Error processing batch {batch_idx+1}: {e}")
                continue

            if result:
                results[batch_idx] = result
                print(f"✓ Completed batch {batch_idx+1}/{len(batches)}")
            else:
                print(f"✗ Failed batch {batch_idx+1}/{len(batches)}")

    return results

# Disabled alternative pipeline: tag the pre-tokenized sentences instead of the
# space-joined strings. Uncomment to replace `results_llm` before evaluating.
# lr_hard_sentences = [s for s in hard_sentences if 1 <= s[1] <= 3]

# # Extract pre-tokenized sentences (using the untag function you already have)
# tokenized_sentences = [untag(pairs) for pairs, _, _, _ in lr_hard_sentences]

# # Process the pre-tokenized sentences
# results_llm = batch_tag_pretokenized_sentences(tokenized_sentences, batch_size=5)

# Evaluate whichever `results_llm` is currently defined; batch_size must match
# the value used when the results were produced (5 in both pipelines above).
metrics = evaluate_llm_tagger(results_llm, lr_hard_sentences, batch_size=5)



In [None]:
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator

fig, ax = plt.subplots(1, 2, figsize=(12, 6))

# One panel per tagger: (axis, per-sentence error counts, panel title).
panels = [
    (ax[0], [x[1] for x in lr_hard_sentences], "LR Tagger Error Histogram"),
    (ax[1], [errors for s, errors in llm_hard_sentences], "LLM Tagger Error Histogram"),
]

for axis, error_counts, panel_title in panels:
    axis.hist(error_counts, bins=range(1, 6), edgecolor='black', rwidth=0.8)
    axis.set_title(panel_title)
    axis.set_xlabel("Number of Errors")
    axis.set_ylabel("Number of Sentences")
    # Error and sentence counts are whole numbers, so force integer ticks.
    axis.xaxis.set_major_locator(MaxNLocator(integer=True))
    axis.yaxis.set_major_locator(MaxNLocator(integer=True))

plt.tight_layout()
plt.show()


## 2.3

In [None]:
from typing import List, Tuple, Dict
from pydantic import BaseModel

# Response schema passed as `response_format` by explain_tagging_errors.
# Documented with comments rather than a docstring on purpose.
# NOTE(review): a pydantic class docstring is presumably emitted as the schema
# description sent to the API - confirm before adding one.
class ErrorExplanation(BaseModel):
    word: str  # the mis-tagged word
    correct_tag: str  # gold tag, as echoed back by the model
    predicted_tag: str  # tag produced by the tagger under analysis
    explanation: str  # short linguistic explanation of the error
    category: str  # one of the category names listed in the prompt

def explain_tagging_errors(
    errors: List[Tuple[str, str, "UDPosTag"]],
    sentence_context: str = "The error word appeared in a sentence. You may assume typical usage.",
    max_errors: int = 20,
    delay: float = 1.0
) -> List[Dict]:
    """
    Uses Grok to explain POS tagging errors.

    Args:
        errors: A list of (word, correct_tag, predicted_tag) tuples, i.e. the
            order produced by ``evaluate_llm_tagger`` for its ``error_data``.
            Each tag may be a plain string or an enum-like object with ``.value``.
        sentence_context: Optional sentence to help Grok understand usage.
        max_errors: Maximum number of errors to process.
        delay: Seconds to wait after each successful request.

    Returns:
        A list of dictionaries with keys: word, correct_tag, predicted_tag, explanation, category.
        Errors whose request or parsing fails are reported on stdout and skipped.
    """
    # Local import so the function does not depend on a later cell's imports.
    import time

    explanations = []

    # The gold tag comes second and the predicted tag third; unpacking them the
    # other way round labels the model's prediction as the "correct" tag.
    for word, correct_tag, predicted_tag in errors[:max_errors]:
        # Accept both enum members and plain strings for either tag.
        correct_label = getattr(correct_tag, "value", correct_tag)
        predicted_label = getattr(predicted_tag, "value", predicted_tag)

        prompt = f"""
    You are a linguistics expert.

    A POS tagging model made the following error:
    - Word: {word}
    - Correct tag: {correct_label}
    - Predicted tag: {predicted_label}
    - Sentence context: {sentence_context}

    Explain in 1–3 sentences why this tagging error likely occurred, using a clear linguistic explanation (e.g., idiom, function word, capitalization confusion, named entity confusion, etc.).

    Then provide a **concise category** that captures the main cause of the error.
    ⚠️ The category must be **one of the following** (choose the best match):

    - Function word misclassification
    - Capitalization
    - Named entity issue
    - Punctuation influence
    - Contextual ambiguity
    - Preposition/Adverb confusion
    - Model bias
    - Idiomatic expression
    - Word frequency bias
    - Tokenization mismatch

    Do **not** create new category names. Only pick from the list above.

    Return a JSON object with:
    - word
    - correct_tag
    - predicted_tag
    - explanation
    - category
    """

        try:
            completion = client.beta.chat.completions.parse(
                model="grok-3",
                messages=[
                    {"role": "system", "content": prompt}
                ],
                response_format=ErrorExplanation,
            )
            parsed: ErrorExplanation = completion.choices[0].message.parsed
            # Prefer the pydantic v2 API; fall back to the v1 spelling.
            if hasattr(parsed, "model_dump"):
                explanations.append(parsed.model_dump())
            else:
                explanations.append(parsed.dict())
            time.sleep(delay)
        except Exception as e:
            # Best effort: report the failure and move on to the next error.
            print(f"Error explaining word '{word}': {e}")
            continue

    return explanations

In [None]:
import json


def _display_category(raw_category: str) -> str:
    """Turn a raw category name into the label shown in the formatted block."""
    parts = raw_category.split("/")

    if len(parts) == 2:
        # e.g. "Preposition/Adverb confusion" -> "Ambiguity (Preposition/Adverb)":
        # keep only the first word after the slash.
        tail_words = parts[1].split()
        if tail_words:
            return f"Ambiguity ({parts[0].strip()}/{tail_words[0]})"
        # Nothing after the slash (e.g. "Odd/"): fall through to the default
        # instead of raising IndexError.
    elif len(parts) == 3:
        # e.g. "A/B/C": only the first two components are reported.
        return f"Ambiguity ({parts[0].strip()}/{parts[1].strip()})"

    return raw_category.capitalize()


def format_explanations_as_json_block(explanations: list) -> str:
    """Render explanation dicts as a ``JSON``-headed, pretty-printed JSON array.

    Args:
        explanations: Dicts with keys ``word``, ``correct_tag``,
            ``predicted_tag``, ``explanation`` and ``category``.

    Returns:
        The string ``"JSON\\n"`` followed by the JSON array (indent 2). In each
        item the word is upper-cased, the explanation is stripped, and a
        slash-separated category is rewritten as ``Ambiguity (X/Y)``; other
        categories are capitalized.
    """
    formatted = [
        {
            "word": e["word"].upper(),
            "correct_tag": e["correct_tag"],
            "predicted_tag": e["predicted_tag"],
            "explanation": e["explanation"].strip(),
            "category": _display_category(e["category"]),
        }
        for e in explanations
    ]

    return "JSON\n" + json.dumps(formatted, indent=2)


In [None]:
# Explain the first 20 recorded LLM tagging errors and print them as a JSON block.
# NOTE(review): `llm_error_data` is not assigned at module level in the cells
# shown here; evaluate_llm_tagger only exposes it as metrics['error_data'].
# Confirm an earlier cell defines it, otherwise this raises NameError.
explanations = explain_tagging_errors(llm_error_data[0:20])
print(format_explanations_as_json_block(explanations))


In [None]:
from collections import Counter

# Frequency of each error category among the collected explanations.
category_counter = Counter(e["category"] for e in explanations)

import matplotlib.pyplot as plt

if category_counter:
    # most_common() already orders by descending count (ties keep first-seen order).
    sorted_items = category_counter.most_common()
    categories, counts = zip(*sorted_items)

    plt.figure(figsize=(10, 6))
    plt.barh(categories, counts, color='skyblue', edgecolor='black')
    plt.xlabel("Number of Errors")
    plt.title("Error Categories in LLM Tagging")
    plt.gca().invert_yaxis()  # Largest on top
    plt.tight_layout()
    plt.show()
else:
    # zip(*[]) cannot be unpacked into two names, so skip the plot entirely
    # when no explanation was produced.
    print("No explanations available - nothing to plot.")

for cat, count in category_counter.most_common():
    print(f"{cat}: {count}")


In [None]:
import time
import random
from typing import List
from pydantic import BaseModel, RootModel
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm

# ✅ Define schemas
# One generated sentence. `sentence` and `tags` are parallel lists;
# generate_batch discards items whose two lists differ in length.
class SyntheticSentence(BaseModel):
    sentence: List[str]  # tokens of the sentence
    tags: List[str]  # POS tag for each token
    categories: List[str]  # ✅ added; overwritten by generate_batch with the requested categories

# Root-level list of SyntheticSentence items; generate_batch passes it as the
# response_format and reads the parsed list from `.root`.
class SyntheticBatch(RootModel[List[SyntheticSentence]]):
    pass

# ✅ Predefined error categories
# Same ten names that the explain_tagging_errors prompt offers as the allowed
# categories; run_parallel_generation samples 3 of them per batch.
error_categories = [
    "Function word misclassification",
    "Capitalization",
    "Named entity issue",
    "Punctuation influence",
    "Contextual ambiguity",
    "Preposition/Adverb confusion",
    "Model bias",
    "Idiomatic expression",
    "Word frequency bias",
    "Tokenization mismatch"
]

# ✅ Build prompt
def build_synthetic_prompt(categories: List[str]) -> str:
    """Compose the generation prompt for the given error categories.

    The categories are listed comma-separated in the opening request; the rest
    of the prompt is a fixed description of the expected JSON output.
    """
    topic_list = ", ".join(categories)
    request_line = (
        "\nGenerate 2 English sentences that demonstrate POS tagging challenges "
        f"involving: {topic_list}.\n"
    )
    # Fixed part of the prompt: output format and an example skeleton.
    output_spec = """
For each sentence, return:
- A list of tokens
- Their corresponding UD POS tags

Return a valid JSON list like:
[
  {
    "sentence": [...],
    "tags": [...]
  },
  ...
]

Only return JSON. Do not explain.
"""
    return request_line + output_spec

# ✅ Request a single batch with retry and category attachment
def generate_batch(categories: List[str], max_retries=3) -> List[SyntheticSentence]:
    """Request one batch of synthetic sentences, retrying on failure.

    Args:
        categories: Error categories the generated sentences should illustrate.
        max_retries: Maximum number of attempts.

    Returns:
        The parsed sentences whose token and tag lists have equal length, each
        with ``categories`` set to its own copy of the requested categories.
        An empty list if every attempt raised.
    """
    prompt = build_synthetic_prompt(categories)

    for attempt in range(max_retries):
        try:
            completion = client.beta.chat.completions.parse(
                model="grok-3",
                messages=[{"role": "system", "content": prompt}],
                response_format=SyntheticBatch,
            )
            results = completion.choices[0].message.parsed.root
            # ✅ attach categories to each result; copy so the sentences do not
            # all share (and possibly mutate) the caller's list object
            for r in results:
                r.categories = list(categories)
            return [r for r in results if len(r.sentence) == len(r.tags)]
        except Exception as e:
            if attempt + 1 >= max_retries:
                # Last attempt: there is nothing left to wait for.
                print(f"[Retry {attempt+1}] Error: {e}")
                break
            # Fixed 2s pause plus up to 2s of jitter before the next attempt.
            wait = 2 + random.uniform(0, 2)
            print(f"[Retry {attempt+1}] Error: {e} — waiting {wait:.1f}s")
            time.sleep(wait)

    print("❌ Failed after retries.")
    return []

# ✅ Run in parallel
def run_parallel_generation(n_batches: int = 200, n_threads: int = 5) -> List[SyntheticSentence]:
    """Generate ``n_batches`` synthetic batches on a pool of ``n_threads`` threads.

    Every batch is requested for 3 categories sampled at random from
    ``error_categories``. The sentences of all batches are returned in one flat
    list, in completion order; a progress bar tracks finished batches.
    """
    collected = []
    with ThreadPoolExecutor(max_workers=n_threads) as pool:
        pending = []
        for _ in range(n_batches):
            chosen_categories = random.sample(error_categories, k=3)
            pending.append(pool.submit(generate_batch, chosen_categories))

        for finished in tqdm(as_completed(pending), total=n_batches):
            collected += finished.result()
    return collected

# ✅ Main execution
# ✅ Main execution
# Generates the synthetic sentences and leaves them in the module-level
# `results`, which the training cell below converts and adds to the train set.
if __name__ == "__main__":
    n_batches = 100   # 100 batches × 2 sentences = ~200
    n_threads = 5     # Adjust as needed

    results = run_parallel_generation(n_batches=n_batches, n_threads=n_threads)

    print(f"\n✅ Done! Generated {len(results)} synthetic sentences.")

    # ✅ Print some examples with categories
    for i, r in enumerate(results[:10]):
        print(f"\n--- Example {i+1} ---")
        print("Sentence: ", " ".join(r.sentence))
        print("Tags:     ", r.tags)
        print("Categories:", r.categories)


In [None]:
def convert_synthetic_to_tagged_sentences(synthetic: List[SyntheticSentence]):
    """Convert synthetic sentences into lists of (token, tag) pairs.

    Each item's ``sentence`` and ``tags`` lists are zipped position by
    position; the result has one list of pairs per input item.
    """
    tagged_sentences = []
    for item in synthetic:
        pairs = list(zip(item.sentence, item.tags))
        tagged_sentences.append(pairs)
    return tagged_sentences


# Train a second LR tagger on the original training set augmented with the
# synthetic sentences held in `results` (produced by the generation cell).
if __name__ == "__main__":
    # Generate synthetic data
    # (generation is disabled below, so these two settings are unused here)
    n_batches = 100
    n_threads = 5
    # print("Generating synthetic data...")
    # results = run_parallel_generation(n_batches=n_batches, n_threads=n_threads)

    # print(f"\n Done! Generated {len(results)} synthetic sentences.")
    # for i, r in enumerate(results[:5]):
    #     print(f"\n--- Example {i+1} ---")
    #     print("Sentence: ", " ".join(r.sentence))
    #     print("Tags:     ", r.tags)
    #     print("Categories:", r.categories)

    # Convert to training format and combine
    # The synthetic sentences are repeated 3 times in the augmented set.
    synthetic_tagged = convert_synthetic_to_tagged_sentences(results)
    train_augmented = train_sentences + synthetic_tagged * 3 
    # NOTE(review): no seed is set in this cell, so the shuffle order depends
    # on the current state of the `random` module.
    random.shuffle(train_augmented)

    # Vectorize with your existing function
    print("Vectorizing data...")
    X_train_synth, y_train_synth, X_val_synth, y_val_synth, X_test_synth, y_test_synth, vectorizer_synth = vectorize(train_augmented, val_sentences, test_sentences)

    # Train the LR tagger
    print("Training Logistic Regression...")
    t_ini = datetime.datetime.now()
    clf_synth = LogisticRegression(C=20, solver='liblinear', random_state=13)
    clf_synth.fit(X_train_synth, y_train_synth)
    t_fin = datetime.datetime.now()
    print(f"Training completed in {(t_fin - t_ini).total_seconds():.2f} seconds.")


In [None]:

# Refit the baseline tagger and collect test-set predictions from both models.
clf.fit(X_train, y_train)
y_pred_orig = clf.predict(X_test)
y_pred_synth = clf_synth.predict(X_test_synth)

# Per-token outcome buckets; each record is (index, true, pred_orig, pred_synth).
fixed_by_synth = []       # baseline wrong, augmented model right
regressed_by_synth = []   # baseline right, augmented model wrong
unchanged_errors = []     # both models wrong

for i, (true, pred_orig, pred_synth) in enumerate(zip(y_test_synth, y_pred_orig, y_pred_synth)):
    orig_ok = pred_orig == true
    synth_ok = pred_synth == true
    if orig_ok and synth_ok:
        continue  # both correct: nothing to record

    record = (i, true, pred_orig, pred_synth)
    if synth_ok:
        fixed_by_synth.append(record)
    elif orig_ok:
        regressed_by_synth.append(record)
    else:
        unchanged_errors.append(record)

print("Contrastive Error Analysis")
print(f"Fixed errors by synthetic model: {len(fixed_by_synth)}")
print(f"New errors introduced by synthetic model: {len(regressed_by_synth)}")
print(f"Unchanged errors (both models wrong): {len(unchanged_errors)}")

# Full per-class report for each model against the same gold labels.
for heading, predictions in (
    ("\n--- Original Model ---", y_pred_orig),
    ("\n--- Synthetic-Augmented Model ---", y_pred_synth),
):
    print(heading)
    print(classification_report(y_test_synth, predictions, digits=3))


## 3

In [None]:
# Compare LLM tagging of the original text against tagging of the
# space-joined gold tokens, on a fixed random sample of test sentences.
sentences_failed_with_original = []
total_errors = 0
random.seed(42)
# Sample at most 30 sentences; random.sample raises if asked for more than exist.
sample_indices = random.sample(range(len(test_sentences)), min(30, len(test_sentences)))

for i in sample_indices:
    sentence = test_sentences[i]
    true_tags = untag_pos(sentence)
    errors_org = 0
    errors_tok = 0

    tokenized_res = tag_sentences_ud(" ".join(untag(sentence)))
    original_res = tag_sentences_ud(test_original[i])

    # The tagger is declared to return Optional[...]; skip the sentence rather
    # than crash on `.sentences` when a call yields nothing.
    if tokenized_res is None or original_res is None:
        print(f"No tagging result for sentence {i} - skipping")
        continue

    tokenized_tags = [[token.pos_tag.value for token in tagged.tokens] for tagged in tokenized_res.sentences]
    tokenized_words = [[token.text for token in tagged.tokens] for tagged in tokenized_res.sentences]
    original_tags = [[token.pos_tag.value for token in tagged.tokens] for tagged in original_res.sentences]
    original_words = [[token.text for token in tagged.tokens] for tagged in original_res.sentences]
    print(f"tokenized: {tokenized_words}")
    print(f"original: {original_words}")
    print(f"true: {untag(sentence)}")

    # Only the first returned sentence is compared; treat "no sentences" as an
    # empty tag list so the length checks and the message below cannot raise.
    first_original = original_tags[0] if original_tags else []
    first_tokenized = tokenized_tags[0] if tokenized_tags else []

    if not true_tags or len(true_tags) != len(first_original) or len(true_tags) != len(first_tokenized):
        print(f"Length mismatch in sentence {i}: true={len(true_tags)}, original={len(first_original)}, tokenized={len(first_tokenized)}")
        # Segmentation failed on the original text only: count it as a
        # sentence that the pre-tokenized input would have handled.
        if len(true_tags) != len(first_original) and len(true_tags) == len(first_tokenized):
            sentences_failed_with_original.append(sentence)
        continue

    for gold, from_original, from_tokenized in zip(true_tags, first_original, first_tokenized):
        if gold != from_original:
            errors_org += 1
        if gold != from_tokenized:
            errors_tok += 1

    if errors_org > 0 and errors_tok == 0:
        sentences_failed_with_original.append(sentence)
    else:
        print(f"original: {errors_org} tokenized: {errors_tok}")
    if errors_org > 0:
        total_errors += 1


print(f"Sentences that failed with the original text but not with the tokenized version: {len(sentences_failed_with_original)}")
print(f"Performance on original text: {total_errors} errors out of {len(sample_indices)} sentences")


In [None]:
from typing import List, Optional
from sklearn.metrics import precision_recall_fscore_support

# --- Step 1: Grok-based segmentation ---
from pydantic import BaseModel  # already a dependency of this notebook


class SegmentedTokens(BaseModel):
    # Wrapper schema: the structured-output helper needs a model class as
    # response_format, a bare List[str] is rejected.
    tokens: List[str]


def segment_sentence_ud(text: str) -> Optional[List[str]]:
    """
    Segments a raw sentence into UD-style tokens using Grok.

    Args:
        text: The raw sentence to segment.

    Returns:
        The list of tokens, or None when the request fails or yields no
        parsed result (the error is printed).
    """
    prompt = f"""
You are a linguist. Segment the following English sentence into word tokens 
according to the Universal Dependencies (UD) English guidelines used for CoNLL POS tagging.

Do not include POS tags. Just return a JSON list of tokens.
Split contractions like "can't" into ["ca", "n't"], punctuation as separate tokens, etc.

Sentence:
{text}
"""
    try:
        response = client.beta.chat.completions.parse(
            model="grok-3",
            messages=[{"role": "system", "content": prompt}],
            response_format=SegmentedTokens,
        )
        parsed = response.choices[0].message.parsed
    except Exception as e:
        print("✗ Error:", e)
        return None

    if parsed is None:
        return None
    return list(parsed.tokens)

# --- Step 2: Evaluation metrics ---
def compute_segmentation_metrics(gold_lists, pred_lists):
    """Compute token-level precision/recall/F1 and sentence-level exact match.

    Tokens are compared per sentence as multisets, so a token that occurs
    several times in a sentence is counted once per occurrence. Token
    positions are not taken into account.

    Args:
        gold_lists: One list of gold tokens per sentence.
        pred_lists: One list of predicted tokens per sentence, aligned with
            ``gold_lists``.

    Returns:
        ``(precision, recall, f1, exact)`` as floats; all zeros when no
        sentence is given. The metrics are also printed.
    """
    from collections import Counter

    tp = 0
    fp = 0
    fn = 0
    exact_match = 0

    for gold, pred in zip(gold_lists, pred_lists):
        gold_counts = Counter(gold)
        pred_counts = Counter(pred)

        # Multiset intersection/difference keeps repeated tokens distinct.
        tp += sum((gold_counts & pred_counts).values())
        fp += sum((pred_counts - gold_counts).values())
        fn += sum((gold_counts - pred_counts).values())

        if gold == pred:
            exact_match += 1

    # The small epsilon avoids division by zero when a count is empty.
    precision = tp / (tp + fp + 1e-6)
    recall = tp / (tp + fn + 1e-6)
    f1 = 2 * precision * recall / (precision + recall + 1e-6)
    # Guard the one denominator that has no epsilon.
    exact = exact_match / len(gold_lists) if len(gold_lists) else 0.0

    print("📊 Segmentation Metrics:")
    print(f"  Precision:     {precision:.3f}")
    print(f"  Recall:        {recall:.3f}")
    print(f"  F1 Score:      {f1:.3f}")
    print(f"  Exact match:   {exact:.3f}")

    return precision, recall, f1, exact

# --- Step 3: Evaluation loop ---
def evaluate_segmenter(sentences_failed_with_original: List[List[tuple]]):
    """Run the Grok segmenter over tagged sentences and score it.

    For each sentence the gold tokens are joined with single spaces, sent to
    ``segment_sentence_ud``, and the returned tokens are paired with the gold
    ones. Sentences for which the segmenter returns ``None`` are left out.
    Mismatching segmentations are printed for inspection.

    Returns:
        Whatever ``compute_segmentation_metrics`` returns for the collected
        gold/predicted token lists.
    """
    gold_tokens_all, pred_tokens_all = [], []

    for tagged_sentence in sentences_failed_with_original:
        gold_tokens = [token for token, _tag in tagged_sentence]
        raw_text = " ".join(gold_tokens)
        pred_tokens = segment_sentence_ud(raw_text)

        if pred_tokens is not None:
            gold_tokens_all.append(gold_tokens)
            pred_tokens_all.append(pred_tokens)

            # Optional: inspect
            if pred_tokens != gold_tokens:
                print("\n⚠️ Mismatch example:")
                print("Raw:     ", raw_text)
                print("Gold:    ", gold_tokens)
                print("Predicted:", pred_tokens)

            time.sleep(0.2)  # be gentle with Grok

    return compute_segmentation_metrics(gold_tokens_all, pred_tokens_all)

# Score the segmenter on the sentences collected by the comparison cell above
# (those that failed on the original text but not on the tokenized version).
evaluate_segmenter(sentences_failed_with_original)
