# COMP34812 Natural Language Understanding Coursework: low-key lemmatising and stemming


## Install required packages

In [None]:
!pip install pandas nltk numpy matplotlib scikit-learn sentencepiece tokenizers



In [None]:
import pandas as pd
import regex as re  # third-party `regex` package bound to the name `re` (not the stdlib module)
import numpy as np
import nltk
import os
# Fetch the NLTK resources used by the preprocessing cell further down.
# 'punkt' / 'punkt_tab': tokenizer data, presumably what nltk.word_tokenize needs (verify for your NLTK version).
nltk.download('punkt')
# 'stopwords': read through nltk.corpus.stopwords.words('english').
nltk.download('stopwords')
# 'wordnet': presumably the data behind nltk.stem.WordNetLemmatizer.
nltk.download('wordnet')
nltk.download('punkt_tab')
from nltk.corpus import stopwords
from tokenizers import ByteLevelBPETokenizer


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\zaccu\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\zaccu\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Download and unpack the GloVe 6B vectors once, using only the standard library so the
# cell also works where the `wget` / `unzip` executables are not installed (e.g. Windows).
import urllib.request
import zipfile

GLOVE_DIR = 'glove_embeddings'
GLOVE_ZIP = 'glove.6B.zip'
GLOVE_URL = 'https://nlp.stanford.edu/data/glove.6B.zip'

if not os.path.exists(GLOVE_DIR):
    if not os.path.exists(GLOVE_ZIP):
        # Download to a temporary name first so an interrupted transfer is never
        # mistaken for a complete archive on the next run.
        partial_zip = GLOVE_ZIP + '.part'
        urllib.request.urlretrieve(GLOVE_URL, partial_zip)
        os.replace(partial_zip, GLOVE_ZIP)
    with zipfile.ZipFile(GLOVE_ZIP) as archive:
        archive.extractall(GLOVE_DIR)

## Load dataset

In [None]:
# Read the development split from 'dev.csv' in the working directory.
# NOTE(review): the preview shows an 'Unnamed: 0' column, i.e. the CSV appears to carry a
# saved index column that is loaded here as ordinary data.
dev_set = pd.read_csv('dev.csv')
dev_set.head()

Unnamed: 0,premise,hypothesis,label
0,"By starting at the soft underbelly, the 16,000...","General Nelson A. Miles had 30,000 troops in h...",0
1,"The class had broken into a light sweat, but w...",The class grew more tense as time went on.,1
2,"Samson had his famous haircut here, but he wou...",It was unknown where exactly within the town S...,1
3,A man with a black shirt holds a baby while a ...,A darkly dressed man passes a crying baby to a...,0
4,I know that many of you are interested in addr...,The problems must be addressed,1


In [None]:
# Read the training split from 'train.csv' in the working directory.
# NOTE(review): as with the dev split, the preview shows an 'Unnamed: 0' column that looks
# like a saved index loaded as ordinary data.
train_set = pd.read_csv('train.csv')
train_set.head()

Unnamed: 0,premise,hypothesis,label
0,yeah i don't know cut California in half or so...,Yeah. I'm not sure how to make that fit. Maybe...,1
1,actual names will not be used,"For the sake of privacy, actual names are not ...",1
2,The film was directed by Randall Wallace.,The film was directed by Randall Wallace and s...,1
3,"""How d'you know he'll sign me on?""Anse studie...",Anse looked at himself in a cracked mirror.,1
4,In the light of the candles his cheeks looked ...,Drew regarded his best friend and noted that i...,1


In [None]:
stop_words = nltk.corpus.stopwords.words('english')
# Frozen-set copy for constant-time membership tests; `stop_words` itself stays a list.
_stop_word_set = frozenset(stop_words)
lemmatizer = nltk.stem.WordNetLemmatizer()

def clean_text(text):
    """Normalise one sentence into a list of lemmatised content tokens.

    Steps, in order: lower-case the text, replace every character that is neither a
    word character nor whitespace with a space, tokenise with ``nltk.word_tokenize``,
    drop English stop words, lemmatise each remaining token with ``WordNetLemmatizer``
    (default part of speech), and discard tokens shorter than two characters.

    Args:
        text: The raw sentence. Non-string values are converted with ``str``;
            missing values (``None`` or a float NaN) are treated as empty text.

    Returns:
        list[str]: The surviving tokens in their original order (may be empty).

    NOTE(review): the stop-word list removes negations such as "not" and "no"
    (e.g. "actual names will not be used" -> [actual, name, used]), which may matter
    for an entailment task. Left unchanged here to keep the existing features stable.
    """
    # A missing CSV cell arrives as float NaN; str() would turn it into the bogus token "nan".
    if text is None or (isinstance(text, float) and text != text):
        return []

    text = str(text).lower()

    # Punctuation becomes whitespace so that e.g. "16,000" splits into "16" and "000".
    text = re.sub(r'[^\w\s]', ' ', text)

    processed = []
    for word in nltk.word_tokenize(text):
        # Stop words are filtered on the surface form, before lemmatisation.
        if word in _stop_word_set:
            continue

        word = lemmatizer.lemmatize(word).strip()

        if len(word) < 2:
            continue

        processed.append(word)

    return processed

In [None]:
# Add a '<column>_tokens' list column for both text columns of both splits
# (dev first, then train; premise before hypothesis).
for frame in (dev_set, train_set):
    for column in ('premise', 'hypothesis'):
        frame[f'{column}_tokens'] = frame[column].apply(clean_text)

In [None]:
dev_set.head()

Unnamed: 0,premise,hypothesis,label,premise_tokens,hypothesis_tokens
0,"By starting at the soft underbelly, the 16,000...","General Nelson A. Miles had 30,000 troops in h...",0,"[starting, soft, underbelly, 16, 000, troop, g...","[general, nelson, mile, 30, 000, troop, attack]"
1,"The class had broken into a light sweat, but w...",The class grew more tense as time went on.,1,"[class, broken, light, sweat, gasping, air]","[class, grew, tense, time, went]"
2,"Samson had his famous haircut here, but he wou...",It was unknown where exactly within the town S...,1,"[samson, famous, haircut, would, find, hard, r...","[unknown, exactly, within, town, samson, recei..."
3,A man with a black shirt holds a baby while a ...,A darkly dressed man passes a crying baby to a...,0,"[man, black, shirt, hold, baby, blue, shirted,...","[darkly, dressed, man, pass, cry, baby, man, l..."
4,I know that many of you are interested in addr...,The problems must be addressed,1,"[know, many, interested, addressing, issue, le...","[problem, must, addressed]"


In [None]:
train_set.head()

Unnamed: 0,premise,hypothesis,label,premise_tokens,hypothesis_tokens
0,yeah i don't know cut California in half or so...,Yeah. I'm not sure how to make that fit. Maybe...,1,"[yeah, know, cut, california, half, something]","[yeah, sure, make, fit, maybe, could, cut, cal..."
1,actual names will not be used,"For the sake of privacy, actual names are not ...",1,"[actual, name, used]","[sake, privacy, actual, name, used]"
2,The film was directed by Randall Wallace.,The film was directed by Randall Wallace and s...,1,"[film, directed, randall, wallace]","[film, directed, randall, wallace, star, mel, ..."
3,"""How d'you know he'll sign me on?""Anse studie...",Anse looked at himself in a cracked mirror.,1,"[know, sign, anse, studied, unkempt, clean, re...","[anse, looked, cracked, mirror]"
4,In the light of the candles his cheeks looked ...,Drew regarded his best friend and noted that i...,1,"[light, candle, cheek, looked, even, hollow, t...","[drew, regarded, best, friend, noted, light, l..."


## Dataset analysis

In [None]:
# Hold-out split of the labelled training data: a seeded 90% sample becomes `train_df`,
# and every row whose index label was not sampled becomes `test_df`.
# NOTE(review): neither frame is referenced by the later cells visible in this notebook.
train_df = train_set.sample(frac=0.9, random_state=42)
held_out_mask = ~train_set.index.isin(train_df.index)
test_df = train_set.loc[held_out_mask]
train_df.head()



Unnamed: 0,premise,hypothesis,label,premise_tokens,hypothesis_tokens
2166,"No, he waits until he has had a violent quarre...",He was clear and free from suspicion.,0,"[wait, violent, quarrel, whole, household, cog...","[clear, free, suspicion]"
10333,oh yeah but that's that's a good way to make a...,A marriage can work when you do dishonest thin...,0,"[oh, yeah, good, way, make, big, problem, marr...","[marriage, work, dishonest, thing]"
10835,Land that is acquired for or in connection wit...,Land that is acquired has no connection with i...,0,"[land, acquired, connection, item, general, pp...","[land, acquired, connection, item, general, pp]"
4688,and my wife is from Plains if you know where P...,My wife went to school in Plains.,1,"[wife, plain, know, plain]","[wife, went, school, plain]"
17561,Since 1980 the building has housed the Museum ...,The building has never been able to house any ...,0,"[since, 1980, building, housed, museum, ethnog...","[building, never, able, house, museum]"


# BPE tokenizer (might use)

In [None]:
# Byte-level BPE tokenizer: load a previously trained model when its files exist,
# otherwise train one on the cleaned train + dev text and save it.
BPE_DIR = "byte_bpe"
BPE_VOCAB = os.path.join(BPE_DIR, "vocab.json")
BPE_MERGES = os.path.join(BPE_DIR, "merges.txt")

# Check for the model files rather than just the directory: a run that failed after
# creating the directory must not be mistaken for a finished training run.
if os.path.exists(BPE_VOCAB) and os.path.exists(BPE_MERGES):
    # Re-running the notebook gets the trained tokenizer back instead of a blank one.
    tokenizer = ByteLevelBPETokenizer(BPE_VOCAB, BPE_MERGES)
else:
    os.makedirs(BPE_DIR, exist_ok=True)
    # Combine tokens from premise and hypothesis into one string per sample for both train and dev sets
    train_set['combined_text'] = train_set.apply(
        lambda row: ' '.join(row['premise_tokens'] + row['hypothesis_tokens']), axis=1)
    dev_set['combined_text'] = dev_set.apply(
        lambda row: ' '.join(row['premise_tokens'] + row['hypothesis_tokens']), axis=1)
    # Combine both sets into one list of sentences
    combined_sentences = pd.concat([train_set['combined_text'], dev_set['combined_text']])

    # Save the combined sentences, one per line, as the BPE training corpus
    combined_sentences.to_csv("combined_train_dev.txt", index=False, header=False)

    tokenizer = ByteLevelBPETokenizer()
    tokenizer.train(files="combined_train_dev.txt", vocab_size=10000, min_frequency=2)
    # Writes the vocab/merges files that the load branch above looks for
    tokenizer.save_model(BPE_DIR)



# TF-IDF word embeddings

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TfidfVectorizer expects raw strings, so glue each token list back together with single spaces.
join_tokens = ' '.join

train_premise_texts = train_set['premise_tokens'].map(join_tokens)
train_hypothesis_texts = train_set['hypothesis_tokens'].map(join_tokens)

dev_premise_texts = dev_set['premise_tokens'].map(join_tokens)
dev_hypothesis_texts = dev_set['hypothesis_tokens'].map(join_tokens)

# The vocabulary and IDF weights are learned from training premises and hypotheses only;
# the dev split is transformed with the fitted vectorizer but never fitted on.
all_texts = pd.concat([train_premise_texts, train_hypothesis_texts])
tfidf = TfidfVectorizer(max_features=5000).fit(all_texts)  # Adjust max_features as needed

# Dense arrays with one row per example and one column per TF-IDF feature.
# NOTE(review): densifying four matrices of up to 5000 columns is memory-hungry.
train_premise_tfidf = tfidf.transform(train_premise_texts).toarray()
train_hypothesis_tfidf = tfidf.transform(train_hypothesis_texts).toarray()

dev_premise_tfidf = tfidf.transform(dev_premise_texts).toarray()
dev_hypothesis_tfidf = tfidf.transform(dev_hypothesis_texts).toarray()



# GloVe embeddings and TF-IDF weighted word embeddings plus named entity

In [None]:
# Path to the 100-dimensional GloVe 6B vectors inside the extracted `glove_embeddings` directory.
glove = "./glove_embeddings/glove.6B.100d.txt"
def load_glove(glove_file):
    """Read a GloVe text file into a ``{word: vector}`` dictionary.

    Each useful line holds a token followed by its whitespace-separated vector
    components. Blank lines and lines with a token but no components are skipped,
    so a trailing newline or stray empty line no longer raises ``IndexError``.

    Args:
        glove_file: Path to the UTF-8 encoded embeddings file.

    Returns:
        dict[str, np.ndarray]: Maps each token to a 1-D ``float32`` array.
    """
    embeddings_dict = {}
    with open(glove_file, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            if len(values) < 2:
                # Nothing to store: either an empty line or a token without a vector.
                continue
            # float32 halves the memory footprint compared with numpy's default float64.
            embeddings_dict[values[0]] = np.array(values[1:], dtype=np.float32)
    return embeddings_dict

# Vector size of the embeddings file chosen above (the "100d" file); it is used as the
# length of the zero vector returned for sentences with no known tokens.
embedding_dim = 100
# Parse the whole embeddings file into an in-memory {word: vector} dictionary.
loaded_glove = load_glove(glove)



In [None]:
def sentence_embedding(tokens, embeddings_dict, embedding_dim):
    """Average the embedding vectors of the tokens that have one.

    Args:
        tokens: Iterable of token strings.
        embeddings_dict: Mapping from token to its embedding vector.
        embedding_dim: Length of the zero vector returned when no token is known.

    Returns:
        The element-wise mean of the known tokens' vectors, or
        ``np.zeros(embedding_dim)`` when none of the tokens is in the mapping.
    """
    known_vectors = []
    for token in tokens:
        if token in embeddings_dict:
            known_vectors.append(embeddings_dict[token])

    if len(known_vectors) == 0:
        # No token has an embedding: fall back to an all-zero sentence vector.
        return np.zeros(embedding_dim)

    return np.mean(known_vectors, axis=0)

def pairwise_embedding(premise_tokens, hypothesis_tokens, premise_tfidf, hypothesis_tfidf, embeddings_dict, embedding_dim):
    """Build the feature vector for one premise/hypothesis pair.

    The result is the float32 concatenation of, in order: the premise sentence
    embedding, the hypothesis sentence embedding, their absolute element-wise
    difference, their element-wise product, the premise TF-IDF vector and the
    hypothesis TF-IDF vector.

    Args:
        premise_tokens: Tokens of the premise.
        hypothesis_tokens: Tokens of the hypothesis.
        premise_tfidf: 1-D TF-IDF vector of the premise.
        hypothesis_tfidf: 1-D TF-IDF vector of the hypothesis.
        embeddings_dict: Mapping from token to embedding vector.
        embedding_dim: Embedding size, forwarded to ``sentence_embedding``.

    Returns:
        np.ndarray: 1-D ``float32`` feature vector.
    """
    p_vec = sentence_embedding(premise_tokens, embeddings_dict, embedding_dim)
    h_vec = sentence_embedding(hypothesis_tokens, embeddings_dict, embedding_dim)

    # Dense part: both sentence vectors plus two interaction terms.
    gap = np.abs(p_vec - h_vec)      # how far apart the two sentences are
    overlap = p_vec * h_vec          # where the two sentences agree in sign and magnitude
    dense_part = np.concatenate((p_vec, h_vec, gap, overlap)).astype(np.float32)

    # Append the sparse-style lexical features after the dense ones.
    feature_parts = (dense_part, premise_tfidf, hypothesis_tfidf)
    return np.concatenate(feature_parts).astype(np.float32)



In [None]:
import numpy as np
from tqdm import tqdm
tqdm.pandas()

def build_pair_features(frame, premise_tfidf, hypothesis_tfidf, embeddings_dict, embedding_dim):
    """Stack the pairwise feature vectors of every row of ``frame`` into a 2-D array.

    Rows are paired with the TF-IDF matrices by position, not by index label, so the
    result stays correct for frames whose index is not 0..n-1 (for example a sampled
    or filtered frame).

    Args:
        frame: DataFrame with 'premise_tokens' and 'hypothesis_tokens' columns.
        premise_tfidf: 2-D array with one TF-IDF row per row of ``frame``.
        hypothesis_tfidf: 2-D array with one TF-IDF row per row of ``frame``.
        embeddings_dict: Mapping from token to embedding vector.
        embedding_dim: Embedding size, forwarded to ``pairwise_embedding``.

    Returns:
        np.ndarray: Array with one feature row per row of ``frame``.

    Raises:
        ValueError: If the TF-IDF matrices do not have one row per row of ``frame``.
    """
    if not (len(frame) == len(premise_tfidf) == len(hypothesis_tfidf)):
        raise ValueError("frame and TF-IDF matrices must have the same number of rows")

    paired_rows = zip(frame['premise_tokens'], frame['hypothesis_tokens'],
                      premise_tfidf, hypothesis_tfidf)
    return np.stack([
        pairwise_embedding(p_tokens, h_tokens, p_tfidf, h_tfidf, embeddings_dict, embedding_dim)
        for p_tokens, h_tokens, p_tfidf, h_tfidf in tqdm(paired_rows, total=len(frame))
    ])

# One feature row per example; `embedding_dim` must match the loaded GloVe file.
train_embeddings = build_pair_features(
    train_set, train_premise_tfidf, train_hypothesis_tfidf, loaded_glove, embedding_dim)

dev_embeddings = build_pair_features(
    dev_set, dev_premise_tfidf, dev_hypothesis_tfidf, loaded_glove, embedding_dim)


# Traditional Approach

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid explored with 3-fold cross-validation on the training features.
param_grid = {
    'C': [0.01, 0.1, 0.2],  # Inverse regularization strength
    'solver': ['lbfgs',],  # Single solver; add more here to compare
    'max_iter': [500, 1000]  # More iterations for convergence
}

clf = GridSearchCV(LogisticRegression(), param_grid, cv=3, scoring='f1_macro', verbose=1, n_jobs=-1)
clf.fit(train_embeddings, train_set["label"].values)  # Train on enhanced embeddings

# Print Best Parameters
print("Best Parameters:", clf.best_params_)

# Evaluate on validation set
preds = clf.best_estimator_.predict(dev_embeddings)
# target_names are matched to `labels` in order. In the data previews label 1 marks pairs
# where the hypothesis follows from the premise and label 0 marks pairs where it does not,
# so 0 -> 'contradiction' and 1 -> 'entailment'.
# NOTE(review): label 0 may also cover neutral pairs; confirm against the task description.
print(classification_report(dev_set['label'].values, preds, labels=[0, 1],
                            target_names=['contradiction', 'entailment']))


Fitting 3 folds for each of 6 candidates, totalling 18 fits
