# COMP34812 Natural Language Understanding Courseworklow key lemming an stemming


## Install required packages

In [None]:
!pip install  tensorflow  pandas nltk numpy matplotlib scikit-learn sentencepiece tokenizers
!pip install -U spacy[cuda12x]
!python -m spacy download en_core_web_sm


Collecting en-core-web-sm==3.8.0
  Using cached https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [4]:
import tensorflow as tf
print("GPUs:", tf.config.list_physical_devices('GPU'))

GPUs: [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [5]:
import pandas as pd
import regex as re
import numpy as np
import nltk
import os
import tensorflow as tf
import spacy

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt_tab')

from nltk.corpus import stopwords
from tokenizers import ByteLevelBPETokenizer


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


In [6]:
print("GPUs:", tf.config.list_physical_devices('GPU'))
spacy.require_gpu()
nlp = spacy.load("en_core_web_sm")

GPUs: [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [7]:
from google.colab import drive
drive.mount('/content/drive')
if not os.path.exists('glove_embeddings'):
  !mkdir glove_embeddings
  !cp /content/drive/MyDrive/glove.6B.200d.txt glove_embeddings/
  !cp /content/drive/MyDrive/glove.6B.300d.txt glove_embeddings/

Mounted at /content/drive


In [8]:
if not os.path.exists('glove_embeddings'):
  !wget http://nlp.stanford.edu/data/glove.6B.zip -O glove.zip
  !unzip -j glove.zip "glove.6B.200d.txt" -d glove_embeddings

## Load dataset

In [9]:
dev_set = pd.read_csv('dev.csv')
dev_set.head()

Unnamed: 0,premise,hypothesis,label
0,"By starting at the soft underbelly, the 16,000...","General Nelson A. Miles had 30,000 troops in h...",0
1,"The class had broken into a light sweat, but w...",The class grew more tense as time went on.,1
2,"Samson had his famous haircut here, but he wou...",It was unknown where exactly within the town S...,1
3,A man with a black shirt holds a baby while a ...,A darkly dressed man passes a crying baby to a...,0
4,I know that many of you are interested in addr...,The problems must be addressed,1


In [10]:
train_set = pd.read_csv('train.csv')
train_set.head()

Unnamed: 0,premise,hypothesis,label
0,yeah i don't know cut California in half or so...,Yeah. I'm not sure how to make that fit. Maybe...,1
1,actual names will not be used,"For the sake of privacy, actual names are not ...",1
2,The film was directed by Randall Wallace.,The film was directed by Randall Wallace and s...,1
3,"""How d'you know he'll sign me on?""Anse studie...",Anse looked at himself in a cracked mirror.,1
4,In the light of the candles his cheeks looked ...,Drew regarded his best friend and noted that i...,1


In [11]:
stop_words = nltk.corpus.stopwords.words('english')
lemmatizer = nltk.stem.WordNetLemmatizer()

def clean_text(text):
    text = str(text).lower()

    # Keep basic punctuation (.,!?'), remove obscure punctuation
    text = re.sub(r"[^a-z0-9,.!?'\s]", ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()

    # Tokenize without removing stopwords or lemmatizing
    text = nltk.word_tokenize(text)

    """
    processed = []
    for word in text:
        if word in stop_words:
            continue

        word = lemmatizer.lemmatize(word)

        word = word.strip()

        if len(word) < 2:
            continue

        processed.append(word)
    """


    return text

In [12]:
dev_set['premise_tokens'] = dev_set['premise'].apply(clean_text)
dev_set['hypothesis_tokens'] = dev_set['hypothesis'].apply(clean_text)

train_set['premise_tokens'] = train_set['premise'].apply(clean_text)
train_set['hypothesis_tokens'] = train_set['hypothesis'].apply(clean_text)

In [13]:
# Remove rows with NaN labels in both train and dev sets
train_set = train_set.dropna(subset=['label'])
dev_set = dev_set.dropna(subset=['label'])

# Remove rows where premise or hypothesis are empty
train_set = train_set[train_set['premise'].notna() & train_set['premise'].str.strip().ne('')]
dev_set = dev_set[dev_set['premise'].notna() & dev_set['premise'].str.strip().ne('')]

train_set = train_set[train_set['hypothesis'].notna() & train_set['hypothesis'].str.strip().ne('')]
dev_set = dev_set[dev_set['hypothesis'].notna() & dev_set['hypothesis'].str.strip().ne('')]

# Normalize labels for both sets
train_set['label'] = train_set['label'].apply(lambda x: int(x))
dev_set['label'] = dev_set['label'].apply(lambda x: int(x))

In [14]:
dev_set.head()

Unnamed: 0,premise,hypothesis,label,premise_tokens,hypothesis_tokens
0,"By starting at the soft underbelly, the 16,000...","General Nelson A. Miles had 30,000 troops in h...",0,"[by, starting, at, the, soft, underbelly, ,, t...","[general, nelson, a., miles, had, 30,000, troo..."
1,"The class had broken into a light sweat, but w...",The class grew more tense as time went on.,1,"[the, class, had, broken, into, a, light, swea...","[the, class, grew, more, tense, as, time, went..."
2,"Samson had his famous haircut here, but he wou...",It was unknown where exactly within the town S...,1,"[samson, had, his, famous, haircut, here, ,, b...","[it, was, unknown, where, exactly, within, the..."
3,A man with a black shirt holds a baby while a ...,A darkly dressed man passes a crying baby to a...,0,"[a, man, with, a, black, shirt, holds, a, baby...","[a, darkly, dressed, man, passes, a, crying, b..."
4,I know that many of you are interested in addr...,The problems must be addressed,1,"[i, know, that, many, of, you, are, interested...","[the, problems, must, be, addressed]"


In [15]:
train_set.head()

Unnamed: 0,premise,hypothesis,label,premise_tokens,hypothesis_tokens
0,yeah i don't know cut California in half or so...,Yeah. I'm not sure how to make that fit. Maybe...,1,"[yeah, i, do, n't, know, cut, california, in, ...","[yeah, ., i, 'm, not, sure, how, to, make, tha..."
1,actual names will not be used,"For the sake of privacy, actual names are not ...",1,"[actual, names, will, not, be, used]","[for, the, sake, of, privacy, ,, actual, names..."
2,The film was directed by Randall Wallace.,The film was directed by Randall Wallace and s...,1,"[the, film, was, directed, by, randall, wallac...","[the, film, was, directed, by, randall, wallac..."
3,"""How d'you know he'll sign me on?""Anse studie...",Anse looked at himself in a cracked mirror.,1,"[how, d'you, know, he, 'll, sign, me, on, ?, a...","[anse, looked, at, himself, in, a, cracked, mi..."
4,In the light of the candles his cheeks looked ...,Drew regarded his best friend and noted that i...,1,"[in, the, light, of, the, candles, his, cheeks...","[drew, regarded, his, best, friend, and, noted..."


Dataset analysis

In [16]:
for idx, row in train_set.iterrows():
      if len(row['premise_tokens']) == [] or len(row['hypothesis_tokens']) == []:
        print(row)

# Glove embeddings

In [17]:
embedding_dim = 300
glove = f"./glove_embeddings/glove.6B.{embedding_dim}d.txt"
def load_glove(glove_file):
    embeddings_dict = {}
    with open(glove_file, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.strip().split()
            word = values[0]
            vector = np.array(values[1:], dtype=np.float32)  # <-- Convert to float32
            embeddings_dict[word] = vector
    return embeddings_dict


loaded_glove = load_glove(glove)



In [18]:
from sklearn.metrics.pairwise import cosine_similarity
def cosine_similarity(a, b):
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b) + 1e-5)


# Create sequence embeddings

In [19]:
def tokens_to_embedding_sequence(tokens, glove, dim):
    return [glove.get(tok, np.zeros(dim)) for tok in tokens]

In [20]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

def create_sequence_embedding(row, glove, dim, max_len):
    premise_seqs = []
    hypothesis_seqs = []

    for idx, row in row.iterrows():
        prem_seq = tokens_to_embedding_sequence(row['premise_tokens'], glove, dim)
        hyp_seq = tokens_to_embedding_sequence(row['hypothesis_tokens'], glove, dim)
        # Pad separately
        prem_seq = pad_sequences([prem_seq], maxlen=max_len, dtype='float32', padding='post', truncating='post')[0]
        hyp_seq = pad_sequences([hyp_seq], maxlen=max_len, dtype='float32', padding='post', truncating='post')[0]
        if (prem_seq.shape != (max_len, dim) or hyp_seq.shape != (max_len, dim)):
            print("Heres the issue")
            print(row['premise_tokens'])
            print(row['hypothesis_tokens'])
        premise_seqs.append(prem_seq)
        hypothesis_seqs.append(hyp_seq)

    # Explicit casting ensures consistent shape
    premise_seqs = np.stack(premise_seqs)
    hypothesis_seqs = np.stack(hypothesis_seqs)

    return premise_seqs, hypothesis_seqs



In [21]:
max_len = 60
embedding_dim = 300

train_prem_seq, train_hyp_seq = create_sequence_embedding(train_set, loaded_glove, embedding_dim, max_len)
dev_prem_seq, dev_hyp_seq = create_sequence_embedding(dev_set, loaded_glove, embedding_dim, max_len)


# Compute numeric featuer vectors
- NER
- CoSiegn Sim

In [22]:
def sentence_embedding(tokens, glove, dim):
    valid_embeddings = [glove[token] for token in tokens if token in glove]
    if not valid_embeddings:
        return np.zeros(dim)
    return np.mean(valid_embeddings, axis=0)

def cosine_similarity(a, b):
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b) + 1e-5)

def ner_features(text1, text2):
    doc1, doc2 = nlp(text1), nlp(text2)
    ents1 = set(ent.text.lower() for ent in doc1.ents)
    ents2 = set(ent.text.lower() for ent in doc2.ents)
    overlap = len(ents1 & ents2) / (len(ents1 | ents2) + 1e-5)
    return overlap

In [23]:
def prepare_numeric_features_fast(df, glove, dim, batch_size=256):
    numeric_feats = []

    texts1 = df['premise'].tolist()
    texts2 = df['hypothesis'].tolist()

    docs1 = list(nlp.pipe(texts1, batch_size=batch_size))
    docs2 = list(nlp.pipe(texts2, batch_size=batch_size))

    for idx in range(len(df)):
        prem_emb = sentence_embedding(df.iloc[idx]['premise_tokens'], glove, dim)
        hyp_emb = sentence_embedding(df.iloc[idx]['hypothesis_tokens'], glove, dim)
        cos_sim = cosine_similarity(prem_emb, hyp_emb)

        ents1 = set(ent.text.lower() for ent in docs1[idx].ents)
        ents2 = set(ent.text.lower() for ent in docs2[idx].ents)
        ner_overlap = len(ents1 & ents2) / (len(ents1 | ents2) + 1e-5)

        numeric_feats.append([cos_sim, ner_overlap])

    return np.array(numeric_feats)

In [24]:
# numeric features
train_numeric_feats = prepare_numeric_features_fast(train_set, loaded_glove, embedding_dim)
dev_numeric_feats = prepare_numeric_features_fast(dev_set, loaded_glove, embedding_dim)

#LSTM RNN appaorch


In [85]:
from sklearn.metrics import f1_score
from tensorflow.keras.callbacks import Callback
import numpy as np

class F1ScoreCallback(Callback):
    def __init__(self, validation_data, patience=5):
        super().__init__()
        self.X_val, self.y_val = validation_data
        self.best_weights = None
        self.best_f1 = 0
        self.patience = patience
        self.wait = 0

    def on_epoch_end(self, epoch, logs=None):
        val_pred = self.model.predict(self.X_val)
        val_pred = (val_pred.flatten() > 0.5).astype(int)
        f1 = f1_score(self.y_val, val_pred)

        print(f"\nEpoch {epoch + 1}: F1 Score = {f1:.4f}")

        if f1 > self.best_f1:
            self.best_f1 = f1
            self.best_weights = self.model.get_weights()
            self.wait = 0
        else:
            self.wait += 1
            if self.wait >= self.patience:
                print("Early stopping triggered based on F1")
                self.model.stop_training = True
                self.model.set_weights(self.best_weights)


In [25]:
import random

seed_value = 42
# Python's built-in random module
random.seed(seed_value)

# NumPy
np.random.seed(seed_value)

# TensorFlow
tf.random.set_seed(seed_value)

In [26]:
import tensorflow as tf
from tensorflow.keras import backend as K

def binary_f1(y_true, y_pred):
    y_pred = K.cast(K.round(y_pred), 'float32')  # round and cast prediction
    y_true = K.cast(y_true, 'float32')
    y_pred = K.round(y_pred)  # ← This thresholds the sigmoid to 0 or 1
    tp = K.sum(y_true * y_pred)
    fp = K.sum((1 - y_true) * y_pred)
    fn = K.sum(y_true * (1 - y_pred))

    precision = tp / (tp + fp + K.epsilon())
    recall = tp / (tp + fn + K.epsilon())

    f1 = 2 * precision * recall / (precision + recall + K.epsilon())
    return f1


In [99]:
import tensorflow as tf
from keras.metrics import categorical_accuracy
from tensorflow.keras.metrics import Precision, Recall
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Dropout, Bidirectional, Concatenate
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import Subtract, Multiply
from tensorflow.keras.layers import GlobalAveragePooling1D, GlobalMaxPooling1D, Dot, Softmax, LayerNormalization, MultiHeadAttention


# Input layers
prem_input = Input(shape=(max_len, embedding_dim), name='premise_input')
hyp_input = Input(shape=(max_len, embedding_dim), name='hypothesis_input')
numeric_input = Input(shape=(2,), name='numeric_features_input')

# Shared BiLSTM Encoder
shared_bilstm = Bidirectional(LSTM(164, return_sequences=True))  # return full sequence

prem_seq = shared_bilstm(prem_input)
hyp_seq = shared_bilstm(hyp_input)

# Attention weights (dot-product)
attention = Dot(axes=-1)([prem_seq, hyp_seq])  # shape: (batch, max_len, max_len)

# Attention distribution (normalize over hyp tokens)
prem_attn = Softmax(axis=-1)(attention)
hyp_attn = Softmax(axis=1)(attention)  # reverse attention

# Attended representations
prem_aligned = Dot(axes=[2, 1])([prem_attn, hyp_seq])  # (batch, max_len, 256)
hyp_aligned = Dot(axes=[2, 1])([hyp_attn, prem_seq])


def combine(seq, aligned):
    return Concatenate()([
        seq,
        aligned,
        Subtract()([seq, aligned]),
        Multiply()([seq, aligned])
    ])

prem_combined = combine(prem_seq, prem_aligned)
hyp_combined = combine(hyp_seq, hyp_aligned)

composition_bilstm = Bidirectional(LSTM(128, return_sequences=True))

prem_composed = composition_bilstm(prem_combined)
hyp_composed = composition_bilstm(hyp_combined)



prem_avg = GlobalAveragePooling1D()(prem_composed)
prem_max = GlobalMaxPooling1D()(prem_composed)

hyp_avg = GlobalAveragePooling1D()(hyp_composed)
hyp_max = GlobalMaxPooling1D()(hyp_composed)

combined = Concatenate()([prem_avg, prem_max, hyp_avg, hyp_max, numeric_input])

# Classification head

x = Dense(256, activation='relu')(combined)
#x = LayerNormalization()(x)
x = Dropout(0.3)(x)

#x = Dense(256, activation='relu')(x)
#x = LayerNormalization()(x)
#x = Dropout(0.2)(x)


output = Dense(1, activation='sigmoid')(x)

# Model definition
model = Model(inputs=[prem_input, hyp_input, numeric_input], outputs=output)

# Compile model
model.compile(optimizer=tf.keras.optimizers.AdamW(learning_rate=1e-3, weight_decay=1e-4),
              loss='binary_crossentropy',
              metrics=[ Precision(name='precision'), Recall(name='recall')])

model.summary()


In [100]:
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from sklearn.utils.class_weight import compute_class_weight

train_labels = train_set['label'].values
dev_labels = dev_set['label'].values
#early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True, mode= 'min')
# Compute class weights
classes = np.unique(train_labels)
class_weights = compute_class_weight(class_weight='balanced', classes=classes, y=train_labels)
class_weight_dict = dict(zip(classes, class_weights))



f1_callback = F1ScoreCallback(
    validation_data=([dev_prem_seq, dev_hyp_seq, dev_numeric_feats], dev_labels),
    patience=3
)
model.fit(
    [train_prem_seq, train_hyp_seq, train_numeric_feats], train_labels,
    validation_data=([dev_prem_seq, dev_hyp_seq, dev_numeric_feats], dev_labels),
    epochs=20,
    batch_size=240,
    callbacks=[f1_callback],
    class_weight=class_weight_dict
)


Epoch 1/20
[1m211/211[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 13ms/step

Epoch 1: F1 Score = 0.6773
[1m102/102[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 150ms/step - loss: 0.6477 - precision: 0.6168 - recall: 0.6068 - val_loss: 0.5940 - val_precision: 0.6866 - val_recall: 0.6682
Epoch 2/20
[1m211/211[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 10ms/step

Epoch 2: F1 Score = 0.6900
[1m102/102[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 117ms/step - loss: 0.5770 - precision: 0.6900 - recall: 0.7064 - val_loss: 0.5775 - val_precision: 0.7037 - val_recall: 0.6768
Epoch 3/20
[1m211/211[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 10ms/step

Epoch 3: F1 Score = 0.7133
[1m102/102[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 118ms/step - loss: 0.5405 - precision: 0.7244 - recall: 0.7392 - val_loss: 0.5735 - val_precision: 0.6960 - val_recall: 0.7315
Epoch 4/20
[1m211/211[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s

<keras.src.callbacks.history.History at 0x7bef10c6c850>

In [101]:
from sklearn.metrics import classification_report
import numpy as np

preds = model.predict([dev_prem_seq, dev_hyp_seq, dev_numeric_feats])
pred_labels = (preds.flatten() > 0.5).astype(int)

print(classification_report(dev_labels, pred_labels, target_names=['contradiction', 'entailment']))


[1m211/211[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 10ms/step
               precision    recall  f1-score   support

contradiction       0.70      0.66      0.68      3258
   entailment       0.70      0.73      0.71      3478

     accuracy                           0.70      6736
    macro avg       0.70      0.70      0.70      6736
 weighted avg       0.70      0.70      0.70      6736



In [103]:
#model.save('/content/drive/MyDrive/best_esim_model.keras')

# Traditional Approach

In [30]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler

"""
param_grid = {
    'C': [0.01, 0.1, 0.2],  # Regularization strength
    'solver': ['lbfgs','saga','newton-cg','sag' ],  # Different solvers for logistic regression
    'max_iter': [500, 1000]  # More iterations for convergence
}


scaler = StandardScaler()
train_emb_scaled = scaler.fit_transform(train_embeddings)
dev_emb_scaled = scaler.transform(dev_embeddings)

param_grid = {
    'C': [0.2,0.1],  # Regularization strength
    'solver': ['saga','newton-cg','sag' ],  # Different solvers for logistic regression
    'max_iter': [200, 300, ]  # More iterations for convergence
}

clf = GridSearchCV(LogisticRegression(), param_grid, cv=3, scoring='f1_macro', verbose=1, n_jobs=-1)
#clf = (solver='lbfgs', 'C'= 0.01, max_iter=500, cv=3, scoring='f1_macro', verbose=1, n_jobs=-1)
clf.fit(train_emb_scaled, train_set["label"].values)  # Train on enhanced embeddings

# Print Best Parameters
print("Best Parameters:", clf.best_params_)

# Evaluate on validation set
preds = clf.best_estimator_.predict(dev_emb_scaled)
print(classification_report(dev_set['label'].values, preds, target_names=['entailment', 'contradiction']))
"""


'\nparam_grid = {\n    \'C\': [0.01, 0.1, 0.2],  # Regularization strength\n    \'solver\': [\'lbfgs\',\'saga\',\'newton-cg\',\'sag\' ],  # Different solvers for logistic regression\n    \'max_iter\': [500, 1000]  # More iterations for convergence\n}\n\n\nscaler = StandardScaler()\ntrain_emb_scaled = scaler.fit_transform(train_embeddings)\ndev_emb_scaled = scaler.transform(dev_embeddings)\n\nparam_grid = {\n    \'C\': [0.2,0.1],  # Regularization strength\n    \'solver\': [\'saga\',\'newton-cg\',\'sag\' ],  # Different solvers for logistic regression\n    \'max_iter\': [200, 300, ]  # More iterations for convergence\n}\n\nclf = GridSearchCV(LogisticRegression(), param_grid, cv=3, scoring=\'f1_macro\', verbose=1, n_jobs=-1)\n#clf = (solver=\'lbfgs\', \'C\'= 0.01, max_iter=500, cv=3, scoring=\'f1_macro\', verbose=1, n_jobs=-1)\nclf.fit(train_emb_scaled, train_set["label"].values)  # Train on enhanced embeddings\n\n# Print Best Parameters\nprint("Best Parameters:", clf.best_params_)\n\n# 