In [94]:
import pandas as pd
from sklearn.svm import SVC
from sklearn.metrics import f1_score
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import train_test_split

# Cache of loaded embedding models keyed by file path, so the embedding file
# is read from disk at most once per process instead of once per call.
_WORD_VECTOR_CACHE = {}

_DEFAULT_EMBEDDINGS_PATH = "embeddings/GoogleNews-vectors-negative300.bin.gz"


def _load_word_vectors(path=_DEFAULT_EMBEDDINGS_PATH):
    """Return the word2vec model stored at *path*, loading it on first use only."""
    if path not in _WORD_VECTOR_CACHE:
        # NOTE(review): `KeyedVectors` is not imported in this block; it is
        # expected to already be in scope, exactly as the original code assumed.
        _WORD_VECTOR_CACHE[path] = KeyedVectors.load_word2vec_format(path, binary=True)
    return _WORD_VECTOR_CACHE[path]


def extract_features_and_labels(file_path, include_embeddings=True, omit_feature=None, word_vectors=None):
    """Read a preprocessed CoNLL-style file into feature dicts and gold labels.

    Each line is split on whitespace; only lines with exactly ten columns
    (token, previous token, next token, lemma, capitalization, word shape,
    word length, POS tag, chunk tag, gold label) are used, all others are
    skipped silently.

    Args:
        file_path: Path of the UTF-8 encoded input file.
        include_embeddings: When true, add one ``emb_<i>`` feature per
            embedding dimension for every token.
        omit_feature: Name of a feature key to leave out (e.g. ``'POS'``).
            The special value ``'embeddings'`` drops all ``emb_<i>`` features.
        word_vectors: Optional preloaded embedding lookup supporting
            ``token in word_vectors`` and ``word_vectors[token]``. When
            omitted, the default word2vec file is loaded lazily and cached.

    Returns:
        A ``(data, targets)`` tuple: a list of feature dicts and the list of
        gold labels, aligned by index.
    """
    print(f"Extracting features and labels from {file_path}, omitting feature: {omit_feature}")
    data = []
    targets = []

    # Only touch the embedding model when its features will actually be used;
    # loading it is pure overhead for the no-embedding configurations.
    use_embeddings = include_embeddings and omit_feature != 'embeddings'
    zero_embedding = {}
    if use_embeddings:
        if word_vectors is None:
            word_vectors = _load_word_vectors()
        # Fall back to 300 dimensions when the lookup does not expose its size.
        dimensions = getattr(word_vectors, 'vector_size', 300)
        # Out-of-vocabulary tokens get an all-zero vector.
        zero_embedding = {f'emb_{i}': 0 for i in range(dimensions)}

    with open(file_path, 'r', encoding='utf8') as infile:
        for line in infile:
            components = line.rstrip('\n').split()
            if len(components) != 10:
                continue
            (token, preceding_token, next_token, lemma, capitalization,
             word_shape, word_length, pos_tag, chunk_label, gold_label) = components

            feature_dict = {
                'OriginalWord': token,
                'PreviousWord': preceding_token,
                'NextWord': next_token,
                'Lemma': lemma,
                'Capitalization': capitalization,
                'WordShape': word_shape,
                'WordLength': str(word_length),
                'POS': pos_tag,
                'ChunkTag': chunk_label,
            }

            # Remove omitted feature, if any (no-op for None or unknown names).
            feature_dict.pop(omit_feature, None)

            if use_embeddings:
                if token in word_vectors:
                    feature_dict.update(
                        (f'emb_{i}', emb) for i, emb in enumerate(word_vectors[token])
                    )
                else:
                    feature_dict.update(zero_embedding)

            data.append(feature_dict)
            targets.append(gold_label)

    print(f"Extracted {len(data)} data points.")

    return data, targets

# Train and evaluate the SVM model
def train_and_evaluate_svm(train_features, train_labels, dev_features, dev_labels):
    """Fit an SVM on the training features and score it on the dev set.

    The feature dicts are vectorized with a ``DictVectorizer`` that is fitted
    on the training data only and then applied to the dev data.

    Args:
        train_features: Feature dicts for the training examples.
        train_labels: Gold labels aligned with ``train_features``.
        dev_features: Feature dicts for the dev examples.
        dev_labels: Gold labels aligned with ``dev_features``.

    Returns:
        The weighted-average F1 score of the dev-set predictions.
    """
    print("Training and evaluating SVM...")

    vectorizer = DictVectorizer()
    train_matrix = vectorizer.fit_transform(train_features)
    dev_matrix = vectorizer.transform(dev_features)

    # NOTE: scikit-learn documents `gamma` as the coefficient for the rbf,
    # poly and sigmoid kernels; it is kept here unchanged alongside the
    # linear kernel.
    svm_params = {
        "C": 1,
        "gamma": 0.001,
        "kernel": 'linear',
        "class_weight": "balanced",
    }
    classifier = SVC(**svm_params)
    classifier.fit(train_matrix, train_labels)

    dev_predictions = classifier.predict(dev_matrix)
    score = f1_score(dev_labels, dev_predictions, average='weighted')
    print(f"Model evaluation completed. F1 Score: {score}")
    return score


# Load your datasets
train_file_path = "data/pre.conll2003.train_small.conll"
dev_file_path = "data/pre.conll2003.dev_small.conll"

BASELINE_KEY = "Baseline (All Features)"

# Establish baseline with all features
print("\n----- Establishing Baseline (All Features Included) -----")
train_features, train_labels = extract_features_and_labels(train_file_path)
dev_features, dev_labels = extract_features_and_labels(dev_file_path)
baseline_f1 = train_and_evaluate_svm(train_features, train_labels, dev_features, dev_labels)
print(f"Baseline F1 Score (All Features): {baseline_f1}")

# Perform ablation study: retrain once per feature with that feature removed.
f1_scores = {BASELINE_KEY: baseline_f1}

# NOTE(review): `features` is not defined in this cell; it is presumably the
# list of feature names to ablate, defined earlier in the notebook — confirm.
for feature_to_omit in features:
    print(f"\n----- Ablation study for feature: {feature_to_omit} -----")
    train_features, train_labels = extract_features_and_labels(train_file_path, omit_feature=feature_to_omit)
    dev_features, dev_labels = extract_features_and_labels(dev_file_path, omit_feature=feature_to_omit)

    f1 = train_and_evaluate_svm(train_features, train_labels, dev_features, dev_labels)
    f1_scores[feature_to_omit] = f1

# Analyzing and Ranking Features (highest F1 first, baseline included for reference)
sorted_features = sorted(f1_scores.items(), key=lambda x: x[1], reverse=True)

print("\nFeature Ablation Study Results:")
for feature, f1 in sorted_features:
    print(f"Feature: {feature}, F1 Score: {f1}")

# Identify the most and least important features. The scores are F1 values
# obtained *without* the named feature, so the feature whose removal yields
# the LOWEST F1 is the most important one, and the one whose removal yields
# the HIGHEST F1 is the least important. The baseline is not a feature and
# must not take part in this ranking.
ablation_scores = {name: score for name, score in f1_scores.items() if name != BASELINE_KEY}

most_important_feature = None
least_important_feature = None
if ablation_scores:
    most_important_feature = min(ablation_scores, key=ablation_scores.get)
    least_important_feature = max(ablation_scores, key=ablation_scores.get)

    print(f"\nMost Important Feature: {most_important_feature}")
    print(f"Least Important Feature: {least_important_feature}")
else:
    print("\nNo features were ablated; cannot rank feature importance.")


----- Establishing Baseline (All Features Included) -----
Extracting features and labels from data/pre.conll2003.train_small.conll, omitting feature: None
Extracted 10000 data points.
Extracting features and labels from data/pre.conll2003.dev_small.conll, omitting feature: None
Extracted 10000 data points.
Training and evaluating SVM...
Model evaluation completed. F1 Score: 0.8693131600069762
Baseline F1 Score (All Features): 0.8693131600069762

----- Ablation study for feature: OriginalWord -----
Extracting features and labels from data/pre.conll2003.train_small.conll, omitting feature: OriginalWord
Extracted 10000 data points.
Extracting features and labels from data/pre.conll2003.dev_small.conll, omitting feature: OriginalWord
Extracted 10000 data points.
Training and evaluating SVM...
Model evaluation completed. F1 Score: 0.8675746246721072

----- Ablation study for feature: PreviousWord -----
Extracting features and labels from data/pre.conll2003.train_small.conll, omitting featu