In [2]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction import DictVectorizer
import pandas as pd
import sys
import csv
import pandas as pd
import sys
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import classification_report
from joblib import dump
from collections import Counter
from gensim.models import KeyedVectors

nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\teode\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\teode\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:


def get_wordnet_pos(treebank_tag):
    """ Convert Treebank tags to WordNet tags """
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN  # Default to NOUN

def extract_features_and_labels(file_path, include_embeddings=True):
    data = []
    targets = []
    lemmatizer = WordNetLemmatizer()
    word_vectors = KeyedVectors.load_word2vec_format("embeddings/GoogleNews-vectors-negative300.bin.gz", binary=True)

    with open(file_path, 'r', encoding='utf8') as infile:
        for line in infile:
            components = line.rstrip('\n').split()
            if len(components) == 10:
                token, preceding_token, next_token, lemma, capitalization, word_shape, word_length, pos_tag, chunk_label, gold_label = components

                # Prepare embeddings only if needed
                embedding_features = {}
                if include_embeddings:
                    # Get embedding if available, else use a zero vector
                    embedding = word_vectors[token] if token in word_vectors else [0]*300
                    embedding_features = {f'emb_{i}': emb for i, emb in enumerate(embedding)}

                feature_dict = {
                    'token': token,
                    'preceding_token': preceding_token,
                    'next_token': next_token,
                    'lemma': lemma,
                    'capitalization': capitalization,
                    'word_shape': word_shape,
                    'word_length': str(word_length),
                    'pos_tag': pos_tag,
                    'chunk_label': chunk_label
                }

                # Add embedding features if they are to be included
                feature_dict.update(embedding_features)

                data.append(feature_dict)
                targets.append(gold_label)

    return data, targets

def create_classifier(train_features, train_targets, parameters):
    vec = DictVectorizer()
    features_vectorized = vec.fit_transform(train_features)
    model = SVC(parameters)
    return model, vec

from sklearn.metrics import f1_score

def classify_and_evaluate(model, vec, features, labels):
    # Transform the features using the vectorizer
    features_vectorized = vec.transform(features)

    # Make predictions
    predictions = model.predict(features_vectorized)

    # Calculate F1-score
    f1 = f1_score(labels, predictions, average='weighted')  # or choose another appropriate average method
    return f1


    

   

    



In [4]:
import random
def create_smaller_dataset(input_file, output_file, num_samples):
    # Initialize an empty list to store valid lines
    valid_lines = []

    # Read the CoNLL-2003 file line by line and filter valid lines
    with open(input_file, 'r', encoding='utf-8') as file:
        for line in file:
            fields = line.strip().split('\t')
            if len(fields) == 10:  # Expecting 10 fields
                valid_lines.append(line)

    # Shuffle the valid lines
    random.shuffle(valid_lines)

    # Select the specified number of samples
    smaller_lines = valid_lines[:num_samples]

    # Save the smaller dataset to a new file
    with open(output_file, 'w', encoding='utf-8') as outfile:
        outfile.writelines(smaller_lines)

In [5]:

input_file = 'data/pre.conll2003.train.conll'
output_file = 'data/pre.conll2003.train_small.conll'
num_samples = 10000


create_smaller_dataset(input_file, output_file, num_samples)

In [6]:

input_file = 'data/pre.conll2003.dev.conll'
output_file = 'data/pre.conll2003.dev_small.conll'
num_samples = 10000


create_smaller_dataset(input_file, output_file, num_samples)

In [11]:
training_file = "data/pre.conll2003.train_small.conll"
dev_file = "data/pre.conll2003.dev_small.conll"

In [12]:
# Extract features and labels from the training and development files
print("extracting train")
train_features, train_labels = extract_features_and_labels(training_file, include_embeddings=False)
print("extracting dev")
dev_features, dev_labels = extract_features_and_labels(dev_file, include_embeddings=False)

extracting train
extracting dev


In [13]:
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import ParameterGrid

def create_classifier(features, labels, parameters):
    # Vectorize features
    vec = DictVectorizer()
    vec_features = vec.fit_transform(features)

    # Create and train the SVC model
    model = SVC(**parameters)
    model.fit(vec_features, labels)

    return model, vec

# Define the parameter grid for SVC
param_grid = {
    'C': [0.1, 1, 10, 100],
    'gamma': [0.001, 0.01, 0.1, 1],
    'kernel': ['linear', 'rbf', 'poly'],
    'class_weight': [None, 'balanced']
}


feature_list = ['OriginalWord', 'PreviousWord', 'NextWord', 'Lemma', 'Capitalization', 
                    'WordShape', 'WordLength', 'POS', 'ChunkTag', 'NamedEntityTag']

best_score = 0
best_parameters = None

parameter_combinations = list(ParameterGrid(param_grid))

# Iterate over each parameter combination
for parameters in parameter_combinations:
    print(f"Running with parameters: {parameters}")
    model, vec = create_classifier(train_features, train_labels, parameters)
    score = classify_and_evaluate(model, vec, dev_features, dev_labels)
    print(f"score: {score}")
    # Ensure that 'score' is numeric
    if not isinstance(score, (int, float)):
        raise ValueError(f"The returned score is not numeric: {score}")

    if score > best_score:
        best_score = score
        best_parameters = parameters

print(f"Best parameters: {best_parameters} with a score of {best_score}")

Running with parameters: {'C': 0.1, 'class_weight': None, 'gamma': 0.001, 'kernel': 'linear'}
score: 0.7413394765488079
Running with parameters: {'C': 0.1, 'class_weight': None, 'gamma': 0.001, 'kernel': 'rbf'}
score: 0.21213159797541575
Running with parameters: {'C': 0.1, 'class_weight': None, 'gamma': 0.001, 'kernel': 'poly'}
score: 0.21213159797541575
Running with parameters: {'C': 0.1, 'class_weight': None, 'gamma': 0.01, 'kernel': 'linear'}
score: 0.7413394765488079
Running with parameters: {'C': 0.1, 'class_weight': None, 'gamma': 0.01, 'kernel': 'rbf'}
score: 0.453911167206041
Running with parameters: {'C': 0.1, 'class_weight': None, 'gamma': 0.01, 'kernel': 'poly'}
score: 0.21213159797541575
Running with parameters: {'C': 0.1, 'class_weight': None, 'gamma': 0.1, 'kernel': 'linear'}
score: 0.7413394765488079
Running with parameters: {'C': 0.1, 'class_weight': None, 'gamma': 0.1, 'kernel': 'rbf'}
score: 0.5518822070916575
Running with parameters: {'C': 0.1, 'class_weight': None, 