In [1]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction import DictVectorizer
import pandas as pd
import sys
import csv
import pandas as pd
import sys
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import classification_report
from joblib import dump
from collections import Counter
from gensim.models import KeyedVectors

# Make sure the NLTK resources used for POS tagging ('averaged_perceptron_tagger')
# and WordNet lookups ('wordnet') are available locally before they are needed.
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\teode\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\teode\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:


def get_wordnet_pos(treebank_tag):
    """Translate a Treebank POS tag into the matching WordNet POS constant.

    The decision is made on the tag's leading letter: 'J' -> adjective,
    'V' -> verb, 'N' -> noun, 'R' -> adverb. Every other tag (including the
    empty string) is treated as a noun.

    Args:
        treebank_tag: Treebank-style tag string, e.g. 'NNP' or 'VBZ'.

    Returns:
        One of wordnet.ADJ, wordnet.VERB, wordnet.NOUN or wordnet.ADV.
    """
    # (leading letter, name of the wordnet attribute to return); the attribute
    # is only looked up once a prefix actually matches.
    prefix_table = (
        ('J', 'ADJ'),
        ('V', 'VERB'),
        ('N', 'NOUN'),
        ('R', 'ADV'),
    )
    for leading_letter, pos_attr in prefix_table:
        if treebank_tag.startswith(leading_letter):
            return getattr(wordnet, pos_attr)
    # No known prefix matched: fall back to NOUN.
    return wordnet.NOUN

# Embedding models already loaded in this session, keyed by file path. Loading
# the word2vec binary is expensive, so it is read from disk at most once per
# path. The cached model stays in memory until this dict is cleared.
_WORD_VECTOR_CACHE = {}

# Names of the nine feature columns, in the order they appear on each input
# line; the tenth column is the gold label.
_FEATURE_COLUMNS = (
    'token',
    'preceding_token',
    'next_token',
    'lemma',
    'capitalization',
    'word_shape',
    'word_length',
    'pos_tag',
    'chunk_label',
)


def _load_word_vectors(path):
    """Return the word2vec KeyedVectors stored at `path`, loading them on first use only."""
    if path not in _WORD_VECTOR_CACHE:
        _WORD_VECTOR_CACHE[path] = KeyedVectors.load_word2vec_format(path, binary=True)
    return _WORD_VECTOR_CACHE[path]


def extract_features_and_labels(file_path, include_embeddings=True, word_vectors=None,
                                embeddings_path="embeddings/GoogleNews-vectors-negative300.bin.gz"):
    """Read a pre-processed CoNLL file into feature dicts and gold labels.

    Each usable line holds exactly 10 whitespace-separated columns: token,
    preceding token, next token, lemma, capitalization, word shape, word
    length, POS tag, chunk label and gold label. Lines with any other number
    of columns (blank lines included) are skipped.

    Args:
        file_path: Path of the UTF-8 encoded input file.
        include_embeddings: When True, one `emb_<i>` feature per embedding
            dimension is added for every token; tokens missing from the
            embedding vocabulary get zeros. When False, no embedding model
            is loaded at all.
        word_vectors: Optional pre-loaded embedding lookup supporting
            `token in word_vectors` and `word_vectors[token]`. When omitted
            (and embeddings are requested) the model at `embeddings_path` is
            loaded and cached for later calls.
        embeddings_path: Location of the binary word2vec file used when
            `word_vectors` is not supplied.

    Returns:
        (data, targets): a list of feature dicts (all symbolic features are
        strings) and the parallel list of gold labels.
    """
    data = []
    targets = []

    zero_vector = None
    if include_embeddings:
        if word_vectors is None:
            word_vectors = _load_word_vectors(embeddings_path)
        # Size the out-of-vocabulary vector from the model; fall back to 300
        # for lookups that do not expose `vector_size`.
        zero_vector = [0] * getattr(word_vectors, 'vector_size', 300)

    with open(file_path, 'r', encoding='utf8') as infile:
        for line in infile:
            components = line.rstrip('\n').split()
            if len(components) != 10:
                continue

            gold_label = components[-1]
            feature_dict = dict(zip(_FEATURE_COLUMNS, components[:-1]))

            if include_embeddings:
                token = feature_dict['token']
                # Get embedding if available, else use the zero vector
                embedding = word_vectors[token] if token in word_vectors else zero_vector
                feature_dict.update(
                    (f'emb_{i}', emb) for i, emb in enumerate(embedding)
                )

            data.append(feature_dict)
            targets.append(gold_label)

    return data, targets

def create_classifier(train_features, train_targets, model_name):
    """Vectorize the training features and fit the requested classifier.

    Args:
        train_features: List of feature dicts, one per training token.
        train_targets: Gold labels parallel to `train_features`.
        model_name: 'logreg' (LogisticRegression, max_iter=10000, C=0.1),
            'NB' (MultinomialNB) or 'SVM' (SVC with probability=True).

    Returns:
        (model, vec): the fitted classifier and the fitted DictVectorizer
        that must be used to transform any data passed to the model.

    Raises:
        ValueError: If `model_name` is not one of the supported names. This
            is checked before any vectorization so a typo fails immediately.

    NOTE(review): scikit-learn documents MultinomialNB as expecting
    non-negative feature values; if the feature dicts carry raw word-embedding
    values, 'NB' may be rejected at fit time. Confirm against the data used.
    """
    # Pick the estimator first: an unsupported name should not cost a full
    # pass over the training data.
    if model_name == 'logreg':
        model = LogisticRegression(max_iter=10000, C=0.1)
    elif model_name == 'NB':
        model = MultinomialNB()
    elif model_name == 'SVM':
        model = SVC(probability=True)
    else:
        raise ValueError(f"Unsupported model type: {model_name}")

    vec = DictVectorizer()
    features_vectorized = vec.fit_transform(train_features)

    model.fit(features_vectorized, train_targets)
    return model, vec

def classify_and_evaluate(model, vec, test_features, test_labels):
    """Predict labels for `test_features`, print a classification report, return the predictions.

    Args:
        model: Fitted classifier exposing `predict`.
        vec: Fitted vectorizer exposing `transform`.
        test_features: Feature dicts to classify.
        test_labels: Gold labels parallel to `test_features`.

    Returns:
        Whatever `model.predict` returns for the vectorized features.
    """
    predicted_labels = model.predict(vec.transform(test_features))
    report = classification_report(test_labels, predicted_labels)
    print(report)
    return predicted_labels


    

   

    



In [3]:
# Input files handed to extract_features_and_labels (one token per line,
# 10 whitespace-separated columns, gold label last). Both paths are relative
# to the working directory.
training_file = "data/pre.conll2003.train.conll"  # used to fit the models
dev_file = "data/pre.conll2003.dev.conll"  # used to evaluate the models

In [5]:
# Extract features and labels from the training and development files.
# include_embeddings=True adds the emb_<i> word-embedding features to every
# token's feature dict on top of the symbolic features.
print("extracting train")
train_features, train_labels = extract_features_and_labels(training_file, include_embeddings=True)
print("extracting dev")
dev_features, dev_labels = extract_features_and_labels(dev_file, include_embeddings=True)

extracting train
extracting dev


In [5]:
# Train and evaluate models: fit each classifier on the training split, print
# its classification report on the dev split, then persist the fitted model,
# its vectorizer and the dev predictions.
for model_name in ['logreg', 'NB', 'SVM']:
    print(f"Running {model_name}")
    model, vec = create_classifier(train_features, train_labels, model_name)
    predictions = classify_and_evaluate(model, vec, dev_features, dev_labels)

    # Save the model and vectorizer (both are needed to classify new data)
    model_filename = f"models/{model_name}_model.joblib"
    vec_filename = f"models/{model_name}_vectorizer.joblib"
    dump(model, model_filename)
    dump(vec, vec_filename)

    # Output predictions, one "token<TAB>predicted label" line per dev token.
    # Only the token is written: formatting the whole feature dict would put
    # its repr (every emb_* value included) on each line. The file is opened
    # as utf8 explicitly, matching how the input was read, instead of relying
    # on the platform default encoding.
    output_file = f"data/output_{model_name}.conll"
    with open(output_file, 'w', encoding='utf8') as outfile:
        for feature, prediction in zip(dev_features, predictions):
            outfile.write(f"{feature['token']}\t{prediction}\n")

Running logreg
              precision    recall  f1-score   support

       B-LOC       0.81      0.80      0.80      1827
      B-MISC       0.83      0.68      0.75       914
       B-ORG       0.69      0.65      0.67      1335
       B-PER       0.78      0.77      0.77      1818
       I-LOC       0.81      0.61      0.70       257
      I-MISC       0.92      0.46      0.61       342
       I-ORG       0.71      0.58      0.64       748
       I-PER       0.65      0.94      0.77      1294
           O       0.95      0.97      0.96      5209

    accuracy                           0.83     13744
   macro avg       0.79      0.72      0.74     13744
weighted avg       0.83      0.83      0.82     13744

Running NB
              precision    recall  f1-score   support

       B-LOC       0.69      0.89      0.78      1827
      B-MISC       0.88      0.75      0.81       914
       B-ORG       0.67      0.77      0.72      1335
       B-PER       0.83      0.79      0.81      181

In [6]:
# Train and evaluate models
# SVM-only run: fit on the training split, print the classification report on
# the dev split, then persist the model, its vectorizer and the predictions.
for model_name in ['SVM']:
    print(f"Running {model_name}")
    model, vec = create_classifier(train_features, train_labels, model_name)
    predictions = classify_and_evaluate(model, vec, dev_features, dev_labels)

    # Save the model and vectorizer (both are needed to classify new data)
    model_filename = f"models/{model_name}_model.joblib"
    vec_filename = f"models/{model_name}_vectorizer.joblib"
    dump(model, model_filename)
    dump(vec, vec_filename)

    # Output predictions, one "token<TAB>predicted label" line per dev token.
    # Only the token is written: formatting the whole feature dict would put
    # its repr (every emb_* value included) on each line. The file is opened
    # as utf8 explicitly, matching how the input was read, instead of relying
    # on the platform default encoding.
    output_file = f"data/output_{model_name}_embedded.conll"
    with open(output_file, 'w', encoding='utf8') as outfile:
        for feature, prediction in zip(dev_features, predictions):
            outfile.write(f"{feature['token']}\t{prediction}\n")

Running SVM
              precision    recall  f1-score   support

       B-LOC       0.89      0.87      0.88      1827
      B-MISC       0.87      0.81      0.84       914
       B-ORG       0.78      0.79      0.78      1335
       B-PER       0.90      0.89      0.89      1818
       I-LOC       0.78      0.71      0.74       257
      I-MISC       0.83      0.61      0.70       342
       I-ORG       0.72      0.63      0.67       748
       I-PER       0.83      0.92      0.87      1294
           O       0.96      0.99      0.98      5209

    accuracy                           0.89     13744
   macro avg       0.84      0.80      0.82     13744
weighted avg       0.89      0.89      0.89     13744

