# NLP TASK PART 1
TASK A) Word Embeddings
TASK B) Phrase Embedding (static)

(rest done in other parts)

## Installing required libraries

In [None]:
import gensim as gs
import gensim
import nltk
nltk.download('stopwords')
nltk.download('punkt') #required for preprocess function to work
from nltk.corpus import stopwords
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
nltk.download('brown')
import multiprocessing
import matplotlib.pyplot as plt
import pandas as pd
import spacy
import numpy as np
from scipy.spatial.distance import cosine
from sklearn.metrics import mean_squared_error
from sklearn.decomposition import PCA
import plotly.express as p
from sklearn.metrics import classification_report

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.


In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, Dropout, Input, Embedding, GlobalAveragePooling1D
from tensorflow.keras.layers import concatenate
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.regularizers import l2
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

## Part 1: Word Embeddings

### 1.a Constrained

#### Preprocessing text:

In [None]:
def read_path(file_name):
  with open(file_name, 'r') as file:
    text = file.read()
  return text

In [None]:
def preprocess_text(text):
  sentences = nltk.sent_tokenize(text)
  # stop_words = set(stopwords.words('english'))
  processed_sentences = []
  for sentence in sentences:
    words = nltk.word_tokenize(sentence)
    lemmatizer = WordNetLemmatizer()
    # filtered_words = [word.lower() for word in words if word.lower() not in stop_words]
    filtered_words = [word.lower() for word in words]
    filtered_words = [lemmatizer.lemmatize(word) for word in filtered_words]
    processed_sentences.append(filtered_words)
  return processed_sentences

In [None]:
afinn_111 = r"datasets/AFINN-111.txt"

In [None]:

brown_corpus = nltk.corpus.brown.words()
brown_corpus_processed = preprocess_text(' '.join(brown_corpus))


In [None]:
print(brown_corpus_processed)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



#### Model

In [None]:
cores = multiprocessing.cpu_count()
model = gs.models.Word2Vec(window = 10, vector_size = 300, negative = 5,
                           min_count = 2, workers = cores-1,
                           shrink_windows = True)
model.build_vocab(brown_corpus_processed)

In [None]:
model.train(brown_corpus_processed, total_examples = model.corpus_count, epochs = model.epochs)

(4085162, 5869235)

In [None]:
test = r"datasets/SimLex-999.txt"
def try_similarity(word1, word2, model):
    try:
        return model.wv.similarity(word1, word2)
    except KeyError:
        return 0

def grade_similarity(similarity):
  if(similarity > 0.8):
    return 1
  elif(similarity > 0.6):
    return 2
  elif(similarity > 0.4):
    return 3
  elif(similarity > 0.2):
    return 4
  else:
    return 5

In [None]:
df = pd.read_csv(test, sep="\t")
df = df[['word1', 'word2', 'SimLex999']]
df['SimLex999'] = df['SimLex999']/10
df['ModelSimilarity'] = df.apply(lambda row: try_similarity(row['word1'], row['word2'], model), axis=1)
print(df.head())

   word1        word2  SimLex999  ModelSimilarity
0    old          new      0.158         0.319609
1  smart  intelligent      0.920         0.744467
2   hard    difficult      0.877         0.735163
3  happy     cheerful      0.955         0.638997
4   hard         easy      0.095         0.910724


In [None]:

def add_afinn_scores(df, afinn_file):
    afinn_dict = {}
    with open(afinn_file) as f:
        for line in f:
            parts = line.strip().split('\t')
            if len(parts) == 2:
                word, score = parts
                afinn_dict[word] = float(score) / 5

    df['word1_afinn'] = df['word1'].map(afinn_dict).fillna(0)
    df['word2_afinn'] = df['word2'].map(afinn_dict).fillna(0)

    return df

df = add_afinn_scores(df, afinn_111)
print(len(df[df['word1_afinn']*df['word2_afinn']==0])/len(df))
print(df)


0.9209209209209209
      word1        word2  SimLex999  ModelSimilarity  word1_afinn  word2_afinn
0       old          new      0.158         0.319609          0.0          0.0
1     smart  intelligent      0.920         0.744467          0.2          0.4
2      hard    difficult      0.877         0.735163         -0.2         -0.2
3     happy     cheerful      0.955         0.638997          0.6          0.4
4      hard         easy      0.095         0.910724         -0.2          0.2
..      ...          ...        ...              ...          ...          ...
994    join      acquire      0.285         0.871304          0.2          0.0
995    send       attend      0.167         0.840495          0.0          0.0
996  gather       attend      0.480         0.882083          0.0          0.0
997  absorb     withdraw      0.297         0.758678          0.0          0.0
998  attend       arrive      0.608         0.775865          0.0          0.0

[999 rows x 6 columns]


Since all the values are graded between -5 and 5, normalizing them to fall between -1 and 1 would make score*10 be only even values. We consider that while making the function for our new model score. Also we can see from the percentage we printed that most of the words are not accounted for.


In [None]:

def new_model_score(df):
    df['NewSim'] = np.NaN
    for index, row in df.iterrows():
        new_sim = row['ModelSimilarity'] #def
        if row['word1_afinn'] == 0 or row['word2_afinn'] == 0 or row['word1_afinn'] == row['word2_afinn']:
            pass
        elif row['word1_afinn'] * row['word2_afinn'] < 0:
            # Opposite polarity handling
            max_val = max(abs(row['word1_afinn']), abs(row['word2_afinn']))
            min_val = min(abs(row['word1_afinn']), abs(row['word2_afinn']))
            if row['ModelSimilarity'] - max_val > 0:
                new_sim = row['ModelSimilarity'] - max_val
            elif row['ModelSimilarity'] - min_val > 0:
                new_sim = row['ModelSimilarity'] - min_val
        elif abs(row['word1_afinn'] - row['word2_afinn']) == 0.2:
            # Difference in polarity is one grade
            if row['ModelSimilarity'] - 0.1 > 0:
                new_sim = row['ModelSimilarity'] - 0.1
        elif abs(row['word1_afinn'] - row['word2_afinn']) > 0.2:
            # Significant difference in polarity
            if row['ModelSimilarity'] - 0.2 > 0:
                new_sim = row['ModelSimilarity'] - 0.2
            elif row['ModelSimilarity'] - 0.1 > 0:
                new_sim = row['ModelSimilarity'] - 0.1

        df.loc[index, 'NewSim'] = new_sim

    return df

df = new_model_score(df)
print(df)


      word1        word2  SimLex999  ModelSimilarity  word1_afinn  \
0       old          new      0.158         0.319609          0.0   
1     smart  intelligent      0.920         0.744467          0.2   
2      hard    difficult      0.877         0.735163         -0.2   
3     happy     cheerful      0.955         0.638997          0.6   
4      hard         easy      0.095         0.910724         -0.2   
..      ...          ...        ...              ...          ...   
994    join      acquire      0.285         0.871304          0.2   
995    send       attend      0.167         0.840495          0.0   
996  gather       attend      0.480         0.882083          0.0   
997  absorb     withdraw      0.297         0.758678          0.0   
998  attend       arrive      0.608         0.775865          0.0   

     word2_afinn    NewSim  
0            0.0  0.319609  
1            0.4  0.644467  
2           -0.2  0.735163  
3            0.4  0.638997  
4            0.2  0.710724

In [None]:
def calculate_accuracy(df, threshold):
    df['Difference'] = abs(df['SimLex999'] - df['ModelSimilarity'])
    df['IsAccurate'] = df['Difference'] <= threshold
    accuracy_percentage = (df['IsAccurate'].sum() / len(df)) * 100
    df.drop(columns = ['Difference', 'IsAccurate'], inplace = True)
    return accuracy_percentage

def calc_accuracy_grades(df):
    df["test_grade"] = df.apply(lambda row: grade_similarity(row['SimLex999']), axis = 1)
    df["model_grade"] = df.apply(lambda row: int(grade_similarity(row['NewSim'])), axis =1 )
    df['Difference'] = abs(df['test_grade'] - df['model_grade'])
    df['IsAccurate'] = df['Difference'] <= 1
    accuracy_percentage = (df['IsAccurate'].sum() / len(df)) * 100
    df.drop(columns = ['Difference', 'IsAccurate', 'test_grade', 'model_grade'], inplace = True)
    return accuracy_percentage

def calc_normalized_acc(df, threshold = 0.15):
    df['Difference'] = abs(df['SimLex999'] - df['NewSim'])
    df['IsAccurate'] = df['Difference'] <= threshold
    accuracy_percentage = (df['IsAccurate'].sum() / len(df)) * 100
    df.drop(columns = ['Difference', 'IsAccurate'], inplace = True)
    return accuracy_percentage


accuracy_30 = calculate_accuracy(df, 0.3)
accuracy_20 = calculate_accuracy(df, 0.2)
accuracy_10 = calculate_accuracy(df, 0.1)
accuracy_grade = calc_accuracy_grades(df)
accuracy_new = calc_normalized_acc(df)

print(f"Accuracy with 30% difference : {accuracy_30:.2f}%")
print(f"Accuracy with 20% difference : {accuracy_20:.2f}%")
print(f"Accuracy with 10% difference : {accuracy_10:.2f}%")
print(f"Accuracy with polarity accounted for (15% diff) : {accuracy_new:.2f}%")
print(f"Accuracy with grades and polarity accounted for : {accuracy_grade:.2f}%")


Accuracy with 30% difference : 47.65%
Accuracy with 20% difference : 33.03%
Accuracy with 10% difference : 16.02%
Accuracy with polarity accounted for (15% diff) : 26.33%
Accuracy with grades and polarity accounted for : 47.35%


In [None]:

mse_w2v = mean_squared_error(df['SimLex999'], df['ModelSimilarity'])
mse_w_polarity = mean_squared_error(df['SimLex999'], df['NewSim'])
print("Mean squared error without accounting for polarity: ", mse_w2v)
print("Mean squared error with accounting for polarity: " , mse_w_polarity)

Mean squared error without accounting for polarity:  0.1796314143992172
Mean squared error with accounting for polarity:  0.17155386703660036


In [None]:
#using PCA to come to 2 dimension, and we can keep polarity as the third dimension and plot
y = model.wv.index_to_key
pca = PCA(n_components =3)
X = pca.fit_transform(model.wv.get_normed_vectors())

In [None]:
fig = p.scatter_3d(X[500:1000], x=0, y=1, z=2, color = y[500:1000])
fig.show()

### 1.b Unconstrained representation of words

#### Preprocessing

In [None]:
!python -m spacy download en_core_web_lg
nlp = spacy.load("en_core_web_lg")




Traceback (most recent call last):
  File "/usr/lib/python3.10/importlib/metadata/__init__.py", line 927, in read_text
    return self._path.joinpath(filename).read_text(encoding='utf-8')
  File "/usr/lib/python3.10/pathlib.py", line 1134, in read_text
    with self.open(mode='r', encoding=encoding, errors=errors) as f:
  File "/usr/lib/python3.10/pathlib.py", line 1119, in open
    return self._accessor.open(self, mode, buffering, encoding, errors,
FileNotFoundError: [Errno 2] No such file or directory: '/usr/local/lib/python3.10/dist-packages/sphinxcontrib_serializinghtml-1.1.10.dist-info/entry_points.txt'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/lib/python3.10/runpy.py", line 187, in _run_module_as_main
    mod_name, mod_spec, code = _get_module_details(mod_name, _Error)
  File "/usr/lib/python3.10/runpy.py", line 146, in _get_module_details
    return _get_module_details(pkg_main_name, error)
  File "/usr/

In [None]:
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip glove.6B.zip


--2024-02-13 19:04:22--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2024-02-13 19:04:22--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2024-02-13 19:04:23--  https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


202

In [None]:
def load_glove_embeddings(path):
    embeddings_index = {}
    with open(path) as f:
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
    return embeddings_index

glove_path = 'glove.6B.100d.txt'

#### Word2Vec

In [None]:


def try_similarity_model1(word1, word2, model):
    try:
        word1_enc = nlp(word1)
        word2_enc = nlp(word2)
        return word1_enc.similarity(word2_enc)
    except KeyError:
        return 0

# df.drop(columns = ['ModelSimilarity', 'word1_afinn', 'word2_afinn', 'NewSim', 'test_grade', 'model_grade'], inplace = True)
df['Spacy_lm'] = df.apply(lambda row: try_similarity_model1(row['word1'], row['word2'], model), axis=1)
print(df)


      word1        word2  SimLex999  ModelSimilarity  word1_afinn  \
0       old          new      0.158         0.319609          0.0   
1     smart  intelligent      0.920         0.744467          0.2   
2      hard    difficult      0.877         0.735163         -0.2   
3     happy     cheerful      0.955         0.638997          0.6   
4      hard         easy      0.095         0.910724         -0.2   
..      ...          ...        ...              ...          ...   
994    join      acquire      0.285         0.871304          0.2   
995    send       attend      0.167         0.840495          0.0   
996  gather       attend      0.480         0.882083          0.0   
997  absorb     withdraw      0.297         0.758678          0.0   
998  attend       arrive      0.608         0.775865          0.0   

     word2_afinn    NewSim  Spacy_lm  
0            0.0  0.319609  0.157152  
1            0.4  0.644467  0.592976  
2           -0.2  0.735163  0.698994  
3            0.

In [None]:
def calculate_accuracy_spacy(df, threshold):
    df['Difference'] = abs(df['SimLex999'] - df['Spacy_lm'])
    df['IsAccurate'] = df['Difference'] <= threshold
    accuracy_percentage = (df['IsAccurate'].sum() / len(df)) * 100
    df.drop(columns = ['Difference', 'IsAccurate'], inplace = True)
    return accuracy_percentage

def calc_accuracy_grades_spacy(df):
    df["test_grade"] = df.apply(lambda row: grade_similarity(row['SimLex999']), axis = 1)
    df["model_grade"] = df.apply(lambda row: int(grade_similarity(row['Spacy_lm'])), axis =1 )
    df['Difference'] = abs(df['test_grade'] - df['model_grade'])
    df['IsAccurate'] = df['Difference'] <= 1
    accuracy_percentage = (df['IsAccurate'].sum() / len(df)) * 100
    df.drop(columns = ['Difference', 'IsAccurate', 'test_grade', 'model_grade'], inplace = True)
    return accuracy_percentage


accuracy_perc_spacy = calculate_accuracy_spacy(df, 0.15)
accuracy_grade_spacy = calc_accuracy_grades_spacy(df)


print("Accuracy of Spacy unconstricted model: ", accuracy_perc_spacy)
print("Accuracy of Spacy unconstricted model with grades: ", accuracy_grade_spacy)


mse_spacy = mean_squared_error(df['SimLex999'], df['Spacy_lm'])
print("Mse spacy: ", mse_spacy)

Accuracy of Spacy unconstricted model:  47.647647647647645
Accuracy of Spacy unconstricted model with grades:  76.87687687687688
Mse spacy:  0.06030644678805455


#### GloVe

In [None]:
embeddings_index = load_glove_embeddings(glove_path) #embeddings_index
def cosine_similarity(vec1, vec2):
    return np.dot(vec1, vec2) / (np.linalg.norm(vec1)* np.linalg.norm(vec2))

def try_similarity_glove(word1, word2, embeddings):
    try:
        word1_emb = embeddings[word1]
        word2_emb = embeddings[word2]
        return cosine_similarity(word1_emb, word2_emb)

    except KeyError:
        return 0



df['Glove_Sim'] = df.apply(lambda row: try_similarity_glove(row['word1'], row['word2'], embeddings_index), axis=1)
print(df)

      word1        word2  SimLex999  ModelSimilarity  word1_afinn  \
0       old          new      0.158         0.319609          0.0   
1     smart  intelligent      0.920         0.744467          0.2   
2      hard    difficult      0.877         0.735163         -0.2   
3     happy     cheerful      0.955         0.638997          0.6   
4      hard         easy      0.095         0.910724         -0.2   
..      ...          ...        ...              ...          ...   
994    join      acquire      0.285         0.871304          0.2   
995    send       attend      0.167         0.840495          0.0   
996  gather       attend      0.480         0.882083          0.0   
997  absorb     withdraw      0.297         0.758678          0.0   
998  attend       arrive      0.608         0.775865          0.0   

     word2_afinn    NewSim  Spacy_lm  Glove_Sim  
0            0.0  0.319609  0.157152   0.643249  
1            0.4  0.644467  0.592976   0.755273  
2           -0.2  0.7

In [None]:
def calculate_accuracy_glove(df, threshold):
    df['Difference'] = abs(df['SimLex999'] - df['Glove_Sim'])
    df['IsAccurate'] = df['Difference'] <= threshold
    accuracy_percentage = (df['IsAccurate'].sum() / len(df)) * 100
    df.drop(columns = ['Difference', 'IsAccurate'], inplace = True)
    return accuracy_percentage

def calc_accuracy_grades_glove(df):
    df["test_grade"] = df.apply(lambda row: grade_similarity(row['SimLex999']), axis = 1)
    df["model_grade"] = df.apply(lambda row: int(grade_similarity(row['Glove_Sim'])), axis =1 )
    df['Difference'] = abs(df['test_grade'] - df['model_grade'])
    df['IsAccurate'] = df['Difference'] <= 1
    accuracy_percentage = (df['IsAccurate'].sum() / len(df)) * 100
    df.drop(columns = ['Difference', 'IsAccurate', 'test_grade', 'model_grade'], inplace = True)
    return accuracy_percentage

accuracy_perc_glove = calculate_accuracy_glove(df, 0.15)
accuracy_grade_glove = calc_accuracy_grades_glove(df)

print("Accuracy of Glove unconstricted model: ", accuracy_perc_glove)
print("Accuracy of Glove unconstricted model with grades: ", accuracy_grade_glove)

mse_glove = mean_squared_error(df['SimLex999'], df['Glove_Sim'])

print("Mse glove: ", mse_glove)

Accuracy of Glove unconstricted model:  41.64164164164164
Accuracy of Glove unconstricted model with grades:  71.47147147147147
Mse glove:  0.07515397461959626


## Part 2: Phrase Embeddings


### Preprocessing


In [None]:
!pip install datasets




In [None]:
from datasets import load_dataset

dataset = load_dataset("PiC/phrase_similarity")
dataset

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading data:   0%|          | 0.00/1.42M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/202k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/403k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/7004 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['phrase1', 'phrase2', 'sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 7004
    })
    validation: Dataset({
        features: ['phrase1', 'phrase2', 'sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 1000
    })
    test: Dataset({
        features: ['phrase1', 'phrase2', 'sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 2000
    })
})

In [None]:

train_dataset = dataset['train']
validation_dataset = dataset['validation']
test_dataset = dataset['test']
train_df = train_dataset.to_pandas()
validation_df = validation_dataset.to_pandas()
test_df = test_dataset.to_pandas()
print(train_df)


                   phrase1                                    phrase2  \
0        newly formed camp                   recently made encampment   
1                 one data                     a particular statistic   
2     particular structure                           specific edifice   
3          involved people                  participating individuals   
4          different cross                         opposing inquiries   
...                    ...                                        ...   
6999        similar notice                    comparable notification   
7000             color map                              painted chart   
7001            dutch tool  device from the people of the Netherlands   
7002     secondary concern                         less serious issue   
7003            full order                   comprehensive commission   

                                              sentence1  \
0     newly formed camp is released from the membran...   
1    

In [None]:

nlp = spacy.load('en_core_web_lg')
embeddings_index = load_glove_embeddings(glove_path) #for GloVe

### Word2Vec: Hybrid (compositional and distributional method)

#### NN with similarity scores 

In [None]:

def vectorize_add(df):
    X = []
    y = []
    for index, row in df.iterrows():
        phrase1_vec = nlp(row['phrase1'])
        phrase2_vec = nlp(row['phrase2'])
        sim = phrase1_vec.similarity(phrase2_vec)

        X.append(sim)
        y.append(row['label'])

    return np.array(X).reshape(-1, 1), np.array(y)

X_train, y_train = vectorize_add(train_df)
X_test, y_test = vectorize_add(test_df)
X_validate, y_validate = vectorize_add(validation_df)


In [None]:


model = Sequential()
model.add(Dense(512, input_dim=1, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_validate, y_validate))
predictions = model.predict(X_test) > 0.5

print(classification_report(y_test, predictions.flatten()))



Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
              precision    recall  f1-score   support

           0       0.80      0.01      0.02      1000
           1       0.50      1.00      0.67      1000

    accuracy                           0.50      2000
   macro avg       0.65      0.50      0.34      2000
weighted avg       0.65      0.50      0.34      2000



#### NN with the phrase embeddings


In [None]:

def add_preprocessing(df):
    phrase1_embeddings = []
    phrase2_embeddings = []
    y = []
    for index, row in df.iterrows():
        phrase1_vec = sum(nlp(token).vector for token in row['phrase1'].split(' '))
        phrase2_vec = sum(nlp(token).vector for token in row['phrase2'].split(' '))
        norm_phrase1_vec = phrase1_vec / np.linalg.norm(phrase1_vec) if np.linalg.norm(phrase1_vec) != 0 else np.zeros(300)
        norm_phrase2_vec = phrase2_vec / np.linalg.norm(phrase2_vec) if np.linalg.norm(phrase2_vec) != 0 else np.zeros(300)

        phrase1_embeddings.append(norm_phrase1_vec)
        phrase2_embeddings.append(norm_phrase2_vec)
        y.append(row['label'])

    return np.array(phrase1_embeddings), np.array(phrase2_embeddings), np.array(y)

p1_train, p2_train, yfcnn_train = add_preprocessing(train_df)
p1_test, p2_test, yfcnn_test = add_preprocessing(test_df)
p1_validate, p2_validate, yfcnn_validate = add_preprocessing(validation_df)

In [None]:


def create_binary_classification_model(embedding_dim, fcnn_units, dropout_rate, l2_lambda):
    input_phrase1 = Input(shape=(embedding_dim,))
    input_phrase2 = Input(shape=(embedding_dim,))
    fcnn_en1 = Dense(fcnn_units, activation='relu', kernel_regularizer=l2(l2_lambda))(input_phrase1)
    fcnn_en2 = Dense(fcnn_units, activation='relu', kernel_regularizer=l2(l2_lambda))(input_phrase2)

    combined = concatenate([fcnn_en1, fcnn_en2])
    combined = Dropout(dropout_rate)(combined)

    classification = Dense(1, activation='sigmoid')(combined)

    model = Model(inputs=[input_phrase1, input_phrase2], outputs=classification)
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

    return model

embedding_dim = 300
fcnn_units = 128
dropout_rate = 0.5
l2_lambda = 0.001  # Example L2 regularization factor

model = create_binary_classification_model(embedding_dim, fcnn_units, dropout_rate, l2_lambda)
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
history = model.fit(
    [p1_train, p2_train], yfcnn_train,
    epochs=50,
    batch_size=32,
    validation_data=([p1_validate, p2_validate], yfcnn_validate),
    callbacks=[early_stopping]
)


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50


In [None]:

test_loss, test_accuracy = model.evaluate([p1_test, p2_test], yfcnn_test)
predictions = model.predict([p1_test, p2_test])

thresholded_predictions = (predictions > 0.5).astype(int)
print(classification_report(yfcnn_test, thresholded_predictions))
print(f"Test Loss: {test_loss}")
print(f"Test Accuracy: {test_accuracy}")

              precision    recall  f1-score   support

           0       0.50      0.99      0.66      1000
           1       0.55      0.02      0.03      1000

    accuracy                           0.50      2000
   macro avg       0.52      0.50      0.35      2000
weighted avg       0.52      0.50      0.35      2000

Test Loss: 0.6932875514030457
Test Accuracy: 0.5015000104904175


#### NN with embedding map


In [None]:


def rnn_preprocessing(df, max_len):
    tokenizer = Tokenizer(oov_token="<OOV>")
    tokenizer.fit_on_texts(df['phrase1'].tolist() + df['phrase2'].tolist())
    sequences_1 = tokenizer.texts_to_sequences(df['phrase1'].tolist())
    sequences_2 = tokenizer.texts_to_sequences(df['phrase2'].tolist())

    padded_sequences_1 = pad_sequences(sequences_1, maxlen=max_len, padding='post')
    padded_sequences_2 = pad_sequences(sequences_2, maxlen=max_len, padding='post')

    embedding_matrix = np.zeros((len(tokenizer.word_index) + 1, nlp.vocab.vectors_length))
    for word, i in tokenizer.word_index.items():
        embedding_vector = nlp(word).vector
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

    embedding_layer = Embedding(len(tokenizer.word_index) + 1,
                                nlp.vocab.vectors_length,
                                weights=[embedding_matrix],
                                input_length=max_len,
                                trainable=False)

    global_average_layer = GlobalAveragePooling1D()

    input_1 = Input(shape=(max_len,))
    input_2 = Input(shape=(max_len,))

    embedded_sequences_1 = embedding_layer(input_1)
    embedded_sequences_2 = embedding_layer(input_2)

    pooled_output_1 = global_average_layer(embedded_sequences_1)
    pooled_output_2 = global_average_layer(embedded_sequences_2)

    concatenated = tf.keras.layers.concatenate([pooled_output_1, pooled_output_2])
    predictions = Dense(1, activation='sigmoid')(concatenated)
    model = Model(inputs=[input_1, input_2], outputs=predictions)

    model.compile(optimizer='adam',
                  loss='binary_crossentropy',
                  metrics=['accuracy'])
    y = df['label'].values
    model.fit([padded_sequences_1, padded_sequences_2], y, epochs=10, validation_split=0.1)

    return model

max_len = 30
model = rnn_preprocessing(train_df, max_len)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


### Using Glove for Hybrid Methods of phrase embeddings

#### Adding word embeddings to get phrase embeddings

In [None]:

def add_preprocessing(df, embeddings):
    oov_vec = np.zeros(100)
    phrase1_embeddings = []
    phrase2_embeddings = []
    y = []

    for index, row in df.iterrows():
        phrase1_vec = sum(embeddings.get(token, oov_vec) for token in row['phrase1'].split(' '))
        phrase2_vec = sum(embeddings.get(token, oov_vec) for token in row['phrase2'].split(' '))

        norm_phrase1_vec = phrase1_vec / np.linalg.norm(phrase1_vec) if np.linalg.norm(phrase1_vec) != 0 else oov_vec
        norm_phrase2_vec = phrase2_vec / np.linalg.norm(phrase2_vec) if np.linalg.norm(phrase2_vec) != 0 else oov_vec

        phrase1_embeddings.append(norm_phrase1_vec)
        phrase2_embeddings.append(norm_phrase2_vec)
        y.append(row['label'])

    return np.array(phrase1_embeddings), np.array(phrase2_embeddings), np.array(y)

p1_traing, p2_traing, y_train_g1 = add_preprocessing(train_df, embeddings_index)
p1_testg, p2_testg, y_test_g1 = add_preprocessing(test_df, embeddings_index)
p1_validateg, p2_validateg,  y_validate_g1 = add_preprocessing(validation_df, embeddings_index)

In [None]:


def create_binary_classification_model(embedding_dim, fcnn_units, dropout_rate, l2_lambda):
    input_phrase1 = Input(shape=(embedding_dim,))
    input_phrase2 = Input(shape=(embedding_dim,))
    fcnn_en1 = Dense(fcnn_units, activation='relu', kernel_regularizer=l2(l2_lambda))(input_phrase1)
    fcnn_en2 = Dense(fcnn_units, activation='relu', kernel_regularizer=l2(l2_lambda))(input_phrase2)

    combined = concatenate([fcnn_en1, fcnn_en2])
    combined = Dropout(dropout_rate)(combined)

    classification = Dense(1, activation='sigmoid')(combined)

    model = Model(inputs=[input_phrase1, input_phrase2], outputs=classification)
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

    return model

embedding_dim = 100 #glove vectors are 100d
units = 128
dropout_rate = 0.5
l2_lambda = 0.001

model = create_binary_classification_model(embedding_dim, units, dropout_rate, l2_lambda)
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
history = model.fit(
    [p1_traing, p2_traing], y_train_g1,
    epochs=50,
    batch_size=32,
    validation_data=([p1_validateg, p2_validateg], y_validate_g1),
    callbacks=[early_stopping]
)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50


In [None]:

test_loss, test_accuracy = model.evaluate([p1_testg, p2_testg], y_test_g1)
predictions = model.predict([p1_testg, p2_testg])

thresholded_predictions = (predictions > 0.5).astype(int)
print(classification_report(y_test_g1, thresholded_predictions))
print(f"Test Loss: {test_loss}")
print(f"Test Accuracy: {test_accuracy}")

              precision    recall  f1-score   support

           0       0.49      0.88      0.63      1000
           1       0.39      0.07      0.13      1000

    accuracy                           0.48      2000
   macro avg       0.44      0.48      0.38      2000
weighted avg       0.44      0.48      0.38      2000

Test Loss: 0.6937527656555176
Test Accuracy: 0.4779999852180481


#### Using tensorflow embeddings to map

In [None]:


def preprocess_texts(df, tokenizer, max_len):
    sequences = tokenizer.texts_to_sequences(df.tolist())
    padded_sequences = pad_sequences(sequences, maxlen=max_len, padding='post')
    return padded_sequences

def glove_model_2_preprocessing(train_df, validation_df, max_len, embeddings_index):
    tokenizer = Tokenizer(oov_token="<OOV>")
    tokenizer.fit_on_texts(train_df['phrase1'].tolist() + train_df['phrase2'].tolist())

    train_sequences_1 = preprocess_texts(train_df['phrase1'], tokenizer, max_len)
    train_sequences_2 = preprocess_texts(train_df['phrase2'], tokenizer, max_len)
    val_sequences_1 = preprocess_texts(validation_df['phrase1'], tokenizer, max_len)
    val_sequences_2 = preprocess_texts(validation_df['phrase2'], tokenizer, max_len)

    embedding_matrix = np.zeros((len(tokenizer.word_index) + 1, 100))
    embeddings_index['<OOV>'] = np.zeros(100)
    for word, i in tokenizer.word_index.items():
        embedding_vector = embeddings_index.get(word, embeddings_index['<OOV>'])
        embedding_matrix[i] = embedding_vector

    embedding_layer = Embedding(len(tokenizer.word_index) + 1, 100, weights=[embedding_matrix], input_length=max_len, trainable=False)
    global_average_layer = GlobalAveragePooling1D()

    input_1 = Input(shape=(max_len,))
    input_2 = Input(shape=(max_len,))

    embedded_sequences_1 = embedding_layer(input_1)
    embedded_sequences_2 = embedding_layer(input_2)

    pooled_output_1 = global_average_layer(embedded_sequences_1)
    pooled_output_2 = global_average_layer(embedded_sequences_2)

    concatenated = tf.keras.layers.concatenate([pooled_output_1, pooled_output_2])
    predictions = Dense(1, activation='sigmoid')(concatenated)
    model = Model(inputs=[input_1, input_2], outputs=predictions)

    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    y_train = train_df['label'].values
    y_val = validation_df['label'].values
    model.fit([train_sequences_1, train_sequences_2], y_train, epochs=10, validation_data=([val_sequences_1, val_sequences_2], y_val))
    return model, tokenizer

def preprocess_test_data(test_df, tokenizer, max_len):
    test_sequences_1 = preprocess_texts(test_df['phrase1'], tokenizer, max_len)
    test_sequences_2 = preprocess_texts(test_df['phrase2'], tokenizer, max_len)
    y_test = test_df['label'].values
    return [test_sequences_1, test_sequences_2], y_test

# Example usage
max_len = 15
model, tokenizer = glove_model_2_preprocessing(train_df, validation_df, max_len, embeddings_index)
test_data, y_test = preprocess_test_data(test_df, tokenizer, max_len)
y_pred = model.predict(test_data)
y_pred_classes = (y_pred > 0.5).astype("int32")
print(classification_report(y_test, y_pred_classes))


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
              precision    recall  f1-score   support

           0       0.47      0.60      0.53      1000
           1       0.44      0.32      0.37      1000

    accuracy                           0.46      2000
   macro avg       0.46      0.46      0.45      2000
weighted avg       0.46      0.46      0.45      2000

