# Vector Space Model

In [1]:
import glob
import nltk
nltk.download('popular');
from nltk.corpus import stopwords
import string
from collections import Counter
import numpy as np
from collections import OrderedDict
from collections import Counter 
from collections import defaultdict
import time
from tqdm import tqdm

[nltk_data] Downloading collection 'popular'
[nltk_data]    | 
[nltk_data]    | Downloading package cmudict to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/cmudict.zip.
[nltk_data]    | Downloading package gazetteers to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/gazetteers.zip.
[nltk_data]    | Downloading package genesis to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/genesis.zip.
[nltk_data]    | Downloading package gutenberg to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/gutenberg.zip.
[nltk_data]    | Downloading package inaugural to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/inaugural.zip.
[nltk_data]    | Downloading package movie_reviews to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping corpora/movie_reviews.zip.
[nltk_data]    | Downloading package names to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/names.zip.
[nltk_data]    | Downloading package shakespeare to /root/nltk_data...
[nlt

## Import and Extract Data from Google Drive to current working directory

In [2]:
# importing required modules 
from zipfile import ZipFile 
path="/content/drive/My Drive/IR_A1/"
# specifying the zip file name and path
file_name = path+"ACL txt.zip"
  
# opening the zip file in READ mode 
with ZipFile(file_name, 'r') as zip1: 
    # extracting all the files in current folder
    print('Extracting all the files now...') 
    zip1.extractall() 
    print('Done!') 

Extracting all the files now...
Done!


## Read files, remove punctuations and stop words

In [3]:
def wordList_removePuncs(fld_path):                             #give path of the folder containing all documents
    dic = {}
    stop_words = set(stopwords.words('english')) # All stop words
    file_names = glob.glob(fld_path)  # Read all names of documents in the list
    for file in tqdm(file_names,position=0):
        name = file.split('/')[-1]
        with open(file, 'r', errors='ignore') as f: #ignore files with errors
            x = f.read()
        x = x.split() # split in list
        x = [''.join(c for c in s if c not in string.punctuation) for s in x] #remove punctuations
        x = [w for w in x if not w in stop_words and len(w) >= 3 and w.isalpha()] #remove stop words, digits and 2 letter words
        x = [s for s in x if s] #Remove empty list elements
        dic[name] =' '.join(x)  #Join list to make a string
    return dic

## Calculate Term frequency.

In [4]:
def termFrequencyInDoc(doc_dict):
    tf_docs = {}  # create a dict for storing term frequency in document
    tf_docs = {doc: {} for doc in doc_dict.keys()}
    for doc_id,doc in tqdm(doc_dict.items(),position=0):
        tf_docs[doc_id] = dict(Counter(doc.lower().split()))
    return tf_docs

## Calculate Word Document Frequency

In [5]:
def wordDocFre(doc_dict):
  DF = defaultdict(int) # create a default dict for storing word document frequency
  for doc in tqdm(doc_dict.values(),position=0):
      words = doc.lower().split()
      for word in set(words):
        DF[word] += 1  # defaultdict simplifies "if key in word_idf: ..." part
  return DF

## Calculate Inverse Document Frequency

In [6]:
def inverseDocFre(doc_fre,length):
    idf= {} # create a dict for storing inverse word document frequency
    for word in list(df.keys()):     
        idf[word] = np.log2((length+1) / doc_fre[word])
    return idf

## Calculate TF-IDF

In [7]:
def tfidf(tf,idf_scr,doc_dict):
    tf_idf_scr = {} # create a dict for storing tf-idf values
    tf_idf_scr={doc: {} for doc in doc_dict.keys()}
    for doc_id,doc in tqdm(doc_dict.items(),position=0):
      for word in doc.lower().split():
        tf_idf_scr[doc_id][word] = tf[doc_id].get(word, 0) * idf_scr[word]
    return tf_idf_scr

## Vector Space Model

In [8]:
def vectorSpaceModel(query, doc_dict,tfidf_scr):
    query_vocab = [] # create a list for storing query vocabulary
    for word in query.lower().split():
        if word not in query_vocab:
            query_vocab.append(word)

    query_wc = {} # create a dict for storing query word count
    for word in query_vocab:
        query_wc[word] = query.lower().split().count(word)
    
    relevance_scores = {} # create a dict for storing relevance scores for query words
    for doc_id in doc_dict.keys():
        score = 0
        for word in query_vocab:
            try:
              score += query_wc[word] * tfidf_scr[doc_id][word]
            except:
              continue
        relevance_scores[doc_id] = score
        #sort relevance scores and return top 5 for the query.
    sorted_value = OrderedDict(sorted(relevance_scores.items(), key=lambda x: x[1], reverse = True))
    top_5 = {k: sorted_value[k] for k in list(sorted_value)[:5]}
    return top_5

## Retreive Top 5 Query Relevent Documents

In [9]:
t0 = time.time()                       #starting time
doc_dict=wordList_removePuncs('/content/ACL txt/*.txt')     #return document dictionary         
tf=termFrequencyInDoc( doc_dict)       #return term frequency
df=wordDocFre(doc_dict)                #return word document frequency
M = len(doc_dict)                      #Total number of documents
idf_dict = inverseDocFre(df,M)         #returns idf scores
tf_idf = tfidf(tf,idf_dict,doc_dict)   #returns tf-idf socres

#All Query Words
query1 = 'LDA'
query2 = 'Topic modelling'
query3 = 'Generative models'
query4 = 'Semantic relationships between terms'
query5 = 'Natural Language Processing'
query6 = 'Text Mining'
query7 = 'Translation model'
query8 = 'Learning procedures for the lexicon'
query9 = 'Semantic evaluations'
query10 = 'System results and combination'

#Top 5 documents for all Queries using Vector Space Model
top1 = vectorSpaceModel(query1, doc_dict,tf_idf)    
top2 = vectorSpaceModel(query2, doc_dict,tf_idf)    
top3 = vectorSpaceModel(query3, doc_dict,tf_idf)    
top4 = vectorSpaceModel(query4, doc_dict,tf_idf)    
top5 = vectorSpaceModel(query5, doc_dict,tf_idf)   
top6 = vectorSpaceModel(query6, doc_dict,tf_idf)  
top7 = vectorSpaceModel(query7, doc_dict,tf_idf)  
top8 = vectorSpaceModel(query8, doc_dict,tf_idf)  
top9 = vectorSpaceModel(query9, doc_dict,tf_idf)   
top10 = vectorSpaceModel(query10, doc_dict,tf_idf)   

t1 = time.time() #Ending Time
print('\n')
print('Total time of execution : ',str(t1-t0),' seconds') #print total time for program execution
print('\n')
#Print top 5 documents for all queries
print('Top 5 Documents for LDA: \n', list(top1))
print('\n')
print('Top 5 Documents for Topic modelling: \n', list(top2))
print('\n')
print('Top 5 Documents for Generative models: \n', list(top3))
print('\n')
print('Top 5 Documents for Semantic relationships between terms: \n', list(top4))
print('\n')
print('Top 5 Documents for Natural Language Processing: \n', list(top5))
print('\n')
print('Top 5 Documents for Text Mining: \n', list(top6))
print('\n')
print('Top 5 Documents for Translation model: \n', list(top7))
print('\n')
print('Top 5 Documents for Learning procedures for the lexicon: \n', list(top8))
print('\n')
print('Top 5 Documents for Semantic evaluations: \n', list(top9))
print('\n')
print('Top 5 Documents for System results and combination: \n', list(top10))

100%|██████████| 21941/21941 [02:29<00:00, 146.60it/s]
100%|██████████| 21941/21941 [00:13<00:00, 1667.16it/s]
100%|██████████| 21941/21941 [00:17<00:00, 1219.43it/s]
100%|██████████| 21941/21941 [00:52<00:00, 418.46it/s]




Total time of execution :  235.8566906452179  seconds


Top 5 Documents for LDA: 
 ['J14-2003.pdf.txt', 'D09-1026.pdf.txt', 'D11-1050.pdf.txt', 'N10-1070.pdf.txt', 'P10-1117.pdf.txt']


Top 5 Documents for Topic modelling: 
 ['J14-2003.pdf.txt', 'P12-1079.pdf.txt', 'Q15-1004.pdf.txt', 'N15-1074.pdf.txt', 'W10-4104.pdf.txt']


Top 5 Documents for Generative models: 
 ['W06-1668.pdf.txt', 'W11-0100.pdf.txt', 'J03-4003.pdf.txt', 'D09-1111.pdf.txt', 'D09-1058.pdf.txt']


Top 5 Documents for Semantic relationships between terms: 
 ['W11-0100.pdf.txt', 'J08-2004.pdf.txt', 'W15-3808.pdf.txt', 'W09-2004.pdf.txt', 'J09-2003.pdf.txt']


Top 5 Documents for Natural Language Processing: 
 ['W11-0100.pdf.txt', 'J14-1005.pdf.txt', 'J87-1020.pdf.txt', 'W14-55.x.pdf.txt', 'J86-2001.pdf.txt']


Top 5 Documents for Text Mining: 
 ['D09-1162.pdf.txt', 'P06-1062.pdf.txt', 'P12-1062.pdf.txt', 'W09-2609.pdf.txt', 'P09-1098.pdf.txt']


Top 5 Documents for Translation model: 
 ['J85-2006.pdf.txt', 'J03-3003

## Document names with their Relevance Scores

In [10]:
print('Document Names with their Relevance Scores\n')
print('Top 5 Documents for LDA: \n', list(top1.items()))
print('\n')
print('Top 5 Documents for Topic modelling: \n', list(top2.items()))
print('\n')
print('Top 5 Documents for Generative models: \n', list(top3.items()))
print('\n')
print('Top 5 Documents for Semantic relationships between terms: \n', list(top4.items()))
print('\n')
print('Top 5 Documents for Natural Language Processing: \n', list(top5.items()))
print('\n')
print('Top 5 Documents for Text Mining: \n', list(top6.items()))
print('\n')
print('Top 5 Documents for Translation model: \n', list(top7.items()))
print('\n')
print('Top 5 Documents for Learning procedures for the lexicon: \n', list(top8.items()))
print('\n')
print('Top 5 Documents for Semantic evaluations: \n', list(top9.items()))
print('\n')
print('Top 5 Documents for System results and combination: \n', list(top10.items()))

Document Names with their Relevance Scores

Top 5 Documents for LDA: 
 [('J14-2003.pdf.txt', 381.11829876445427), ('D09-1026.pdf.txt', 351.8015065518039), ('D11-1050.pdf.txt', 342.0292424809205), ('N10-1070.pdf.txt', 312.7124502682702), ('P10-1117.pdf.txt', 298.05405416194503)]


Top 5 Documents for Topic modelling: 
 [('J14-2003.pdf.txt', 375.99023016444085), ('P12-1079.pdf.txt', 335.6400591224033), ('Q15-1004.pdf.txt', 300.7921841315527), ('N15-1074.pdf.txt', 298.9580854478237), ('W10-4104.pdf.txt', 278.7829999268049)]


Top 5 Documents for Generative models: 
 [('W06-1668.pdf.txt', 187.482857604789), ('W11-0100.pdf.txt', 184.37811943982004), ('J03-4003.pdf.txt', 151.6418584247996), ('D09-1111.pdf.txt', 132.91153581921373), ('D09-1058.pdf.txt', 132.148611029166)]


Top 5 Documents for Semantic relationships between terms: 
 [('W11-0100.pdf.txt', 1010.6154364316802), ('J08-2004.pdf.txt', 200.0156245489287), ('W15-3808.pdf.txt', 182.73011439548486), ('W09-2004.pdf.txt', 171.03532256989