In [2]:
!python --version

Python 3.8.8


In [3]:
!pip install glove-python-binary


Collecting glove-python-binary
  Downloading glove_python_binary-0.2.0-cp38-cp38-win_amd64.whl (244 kB)
Installing collected packages: glove-python-binary
Successfully installed glove-python-binary-0.2.0


In [4]:
!pip install ir_datasets

In [1]:
import glove

In [2]:
import ir_datasets
dataset = ir_datasets.load('cranfield')

In [3]:
import nltk
from nltk.corpus import stopwords
import re
from nltk.stem import WordNetLemmatizer, PorterStemmer

# Download necessary nltk resources
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\NAS\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\NAS\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\NAS\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\NAS\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [4]:
def preprocess_text(text, remove_stopwords: bool):
    # Convert text to lower case
    text = text.lower()
    text = ''.join(c for c in text if c.isalnum() or c.isspace())

    # Remove punctuations
    text = re.sub(r'[^\w\s]', '', text)

    # Tokenize text
    tokens = nltk.word_tokenize(text)

    # Remove stop words
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [token for token in tokens if token not in stop_words]

    # Lemmatize tokens
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in filtered_tokens]

    # Stem tokens
    stemmer = PorterStemmer()
    stemmed_tokens = [stemmer.stem(token) for token in lemmatized_tokens]

    # Join tokens back into a single string
    #preprocessed_text = ' '.join(stemmed_tokens)

    return stemmed_tokens

In [5]:
import pandas as pd
df_docs = pd.DataFrame(columns=['doc_id', 'processed_text'])
df_q = pd.DataFrame(columns=['q_id', 'processed_text'])
for item in dataset.docs_iter():
  proc1 = preprocess_text(item[1], remove_stopwords=True)
  proc2 = preprocess_text(item[2], remove_stopwords=True)
  #df.loc[len(df.index)] = [item[0], [preprocess_text(item[1], remove_stopwords=True), preprocess_text(item[2], remove_stopwords=True)]] 
  proc1.extend(proc2)
  df_docs.loc[len(df_docs.index)] = [item[0], proc1]
for item in dataset.queries_iter():
  df_q.loc[len(df_q.index)] = [item[0], preprocess_text(item[1], remove_stopwords=True)] 
result = df_q.head(10)
print("First 10 rows of the DataFrame:")
print(result)

First 10 rows of the DataFrame:
  q_id                                     processed_text
0    1  [similar, law, must, obey, construct, aeroelas...
1    2  [structur, aeroelast, problem, associ, flight,...
2    4  [problem, heat, conduct, composit, slab, solv,...
3    8  [criterion, develop, show, empir, valid, flow,...
4    9  [chemic, kinet, system, applic, hyperson, aero...
5   10  [theoret, experiment, guid, turbul, couett, fl...
6   12  [possibl, relat, avail, pressur, distribut, og...
7   13  [method, dash, exact, approxim, dash, present,...
8   15  [paper, intern, slip, flow, heat, transfer, st...
9   18  [realga, transport, properti, air, avail, wide...


In [6]:
from glove import Corpus, Glove

#Creating a corpus object
corpus = Corpus()

In [7]:
glove_model = Glove(no_components=100, learning_rate=0.05)

In [8]:
texts1 = df_docs.processed_text.tolist()
texts2 = df_q.processed_text.tolist()
texts1.extend(texts2)

In [9]:
for item in texts1:
    item = ' '.join(item)


In [10]:
corpus.fit(texts1, window=10)


In [11]:
glove = Glove(no_components=100, learning_rate=0.05) 
glove.fit(corpus.matrix, epochs=30, no_threads=4, verbose=True)
glove.add_dictionary(corpus.dictionary)
glove.save('glove.model')

Performing 30 training epochs with 4 threads
Epoch 0
Epoch 1
Epoch 2
Epoch 3
Epoch 4
Epoch 5
Epoch 6
Epoch 7
Epoch 8
Epoch 9
Epoch 10
Epoch 11
Epoch 12
Epoch 13
Epoch 14
Epoch 15
Epoch 16
Epoch 17
Epoch 18
Epoch 19
Epoch 20
Epoch 21
Epoch 22
Epoch 23
Epoch 24
Epoch 25
Epoch 26
Epoch 27
Epoch 28
Epoch 29


In [12]:
word = 'obey'
vector = glove.word_vectors[glove.dictionary[word]]
print(vector)

[ 0.00323711  0.02157388  0.02013845  0.05786494  0.02666519 -0.05733531
  0.01794501 -0.0173818   0.00369853 -0.02493317  0.03123947  0.01160073
  0.00741265 -0.0371027   0.07284716  0.03153904 -0.05933648  0.00953027
 -0.06518831 -0.02914302  0.04494675  0.01566875 -0.03119379 -0.00547448
  0.0258231  -0.06966225 -0.01717727  0.04970566 -0.04653273 -0.01066145
  0.08944519  0.06836338  0.06253167 -0.00469795  0.03439438 -0.05933104
  0.03320623  0.02144831 -0.00131006  0.00375011  0.02226873  0.00848228
  0.04017506 -0.09067384 -0.04334787 -0.01733106 -0.08960709 -0.04704098
  0.03996418  0.02780876 -0.0347527   0.03649128 -0.02689545 -0.0345197
  0.02025353 -0.03379716 -0.04788756 -0.05448413  0.06855027  0.00630814
 -0.02884657 -0.03271238  0.02273263 -0.00884147  0.00305772  0.08825389
  0.04358397  0.03927837 -0.04344946 -0.02475672  0.00704874  0.05484252
  0.08617181 -0.00557333 -0.03564884 -0.0018755  -0.0265557   0.00471218
  0.05913074  0.04606015  0.00658584 -0.04712975 -0.

In [13]:
import numpy as np
def get_embedding(query,model):
  #print(query)
  wordList = query.split()
  #print(wordList)
  total = np.zeros(100)
  for word in wordList:
    total = total + glove.dictionary[word]
  average = total / len(wordList)
  return average

In [22]:
import numpy as np

def cosine_similarity_calc(vec_1,vec_2):

   sim = np.dot(vec_1,vec_2)/(np.linalg.norm(vec_1)*np.linalg.norm(vec_2))

   return sim

In [30]:
def cosine_similarity2(a, b):
    nominator = np.dot(a, b)
    
    a_norm = np.sqrt(np.sum(a**2))
    b_norm = np.sqrt(np.sum(b**2))
    
    denominator = a_norm * b_norm
    
    cosine_similarity = nominator / denominator
    
    return cosine_similarity

In [14]:
from sklearn.metrics.pairwise import cosine_similarity
def get_score(document,query,model):
  query_emb = get_embedding(query,model)
  document_emb = get_embedding(document,model)
  similarity = cosine_similarity(query_emb.reshape(1, -1), document_emb.reshape(1, -1))
  #similarity = cosine_similarity2(query_emb, document_emb)
  return similarity

In [15]:
def calculateAllScores(model):
  df_qrel = pd.DataFrame(columns=['query_id','doc_id','score','relevance'])
  for item in dataset.qrels_iter():
    df_qrel.loc[len(df_qrel.index)] = [item[0], item[1], 0,item[2]] 
  
  for item in df_qrel.iterrows():
    try:
     s_doc = ' '.join((((df_docs.loc[df_docs['doc_id'] == item[1].doc_id]).processed_text).tolist())[0])
     s_query = ' '.join((((df_q.loc[df_q['q_id'] == item[1].query_id]).processed_text).tolist())[0])
     item[1].score = get_score(s_doc,s_query,model)[0][0]
     #print(item[1].score)
    except:
      #print("exception")
      continue
  return df_qrel

#df_qrel.head(200)

In [16]:
def sortScores(model):
  df_qrel = calculateAllScores(model)
  df_qrel['query_id'] = pd.to_numeric(df_qrel['query_id'])
  df_qrel['doc_id'] = pd.to_numeric(df_qrel['doc_id'])
  df_qrel.head(200)
  
  df_sorted = df_qrel.sort_values(by=['query_id', 'score'], ascending=[True, False])
  return df_sorted

In [19]:
import math

def DCG(model):
  df_sorted = sortScores(model)
  unique_queries = df_sorted['query_id'].unique()
  original_queries = (pd.to_numeric(df_q['q_id'])).unique()
  listOfQueries = list(set(unique_queries) & set(original_queries))
  
  mean_DCG = 0
  for item in listOfQueries:
    docs = df_sorted.loc[df_sorted['query_id'] == item]
    #print(docs)
    DCG = 0
    k = len(docs)
    for i in range(1,k+1):
      DCG += ((2 ** (docs.iloc[i-1].relevance)) / math.log(i+1, 2))
    mean_DCG += DCG
  mean_DCG /= len(unique_queries)
  #print(mean_DCG)
  return (mean_DCG)

In [20]:
DCG(glove)

18.14683159716936