In [1]:
#@author - Tejasvi Sharma
#import libraries
import gensim
import nltk
import nltk.data
from os import listdir
from os.path import isfile, join
from nltk.tokenize import TreebankWordTokenizer
from nltk.corpus import stopwords 
from nltk.stem import WordNetLemmatizer
from nltk.stem.snowball import SnowballStemmer
import re

In [2]:
#code to read file one by one, save data in a list

sentence_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
treebank_tokenizer = TreebankWordTokenizer()

dir_base = "./data/"


def read_file(filename):
    input_file_text = open(filename , encoding='utf-8').read()
    return input_file_text

    
def read_directory_files(directory):
    file_texts = []
    files = [f for f in listdir(directory) if isfile(join(directory, f))]
    for f in files:
        file_texts.append(read_file(join(directory, f) ))
    return file_texts
    
text_corpus = read_directory_files(dir_base)


In [3]:
#tokenize data, tokenize whole document once
token_corpus= [treebank_tokenizer.tokenize(document) for document in text_corpus]

In [4]:
#remove stop words
#use lemmatization to find root words, as some might use different forms of same word.
lemmatizer = WordNetLemmatizer()
pattern = re.compile("[A-Za-z./-]*") #only takes words which have only letters, can also have acronyms, which can be a jargon
#stemmer = SnowballStemmer("english")
stop_words = set(stopwords.words('english')) 
corpus_filtered=[]
for document in token_corpus: 
    document_corpus=[]
    for word in document:
        if(word not in stop_words):
            #removing alphanumeric, so that alphanumeric wont be added in dictionary
            if(pattern.fullmatch(word)):
                document_corpus.append(lemmatizer.lemmatize(str.lower(word).replace(".", ""))) #removed . as many words had to forms. example example.
    corpus_filtered.append(document_corpus)
            

In [8]:
#as corpus has acronyms, check whether the acronyms are present in corpus or not
#this step is required if corpus acronyms
#corpus_filtered[0]

In [32]:
#create a dictionary from corpus
dictionary = gensim.corpora.Dictionary(corpus_filtered)
print(dictionary[5])
print("Number of words in dictionary:",len(dictionary))
# for i in range(len(dictionary)):
#     print(i, dictionary[i])

audit
Number of words in dictionary: 3829


In [33]:
#getting numbers of the word and the number of time it occurs in that document.(Term frequency)
corpus = [dictionary.doc2bow(doc) for doc in corpus_filtered]

In [34]:
#create a tf-idf model from corpus
tf_idf = gensim.models.TfidfModel(corpus)

In [35]:
sims = gensim.similarities.Similarity('/media/tejasvi-ts/My_Files/f18_ds_nlp/homework/homework_1/dir_sim/',tf_idf[corpus],
                                      num_features=len(dictionary))

In [36]:
#getting similarity
#for document in corpus_filtered:

query_doc_bow = dictionary.doc2bow(corpus_filtered[2] + corpus_filtered[3]+corpus_filtered[4])
query_doc_tf_idf = tf_idf[query_doc_bow]
print(sims[query_doc_tf_idf])

[ 0.06732434  0.04589299  0.54607368  0.59901953  0.63795596  0.01676748
  0.078996    0.01244059  0.02219053  0.04291663  0.00741527  0.07085299
  0.01845528  0.02686126  0.04910208  0.10107413  0.08841018  0.08548425
  0.00860582  0.01216373  0.0143261   0.01580157  0.0264921   0.02096275
  0.02392753  0.04421113  0.02416983  0.06441585  0.0298551   0.037147
  0.01229415  0.01202513  0.03113357  0.00759161  0.02138072  0.01551744
  0.05650915  0.03298256  0.04764617  0.02986821  0.04929609  0.0409489
  0.00524068  0.06515984  0.25890231  0.05219238  0.05204027  0.02736784
  0.09526758  0.26274782  0.00462185  0.01548065  0.04341775  0.06014821
  0.01636594  0.0312711   0.01634337  0.0057677   0.03937663  0.00815066
  0.04056709  0.07860883  0.02613876  0.02070126  0.02881328  0.02674234
  0.03058406  0.02696064  0.03458256  0.01010247  0.0427427   0.00676735
  0.01015245  0.03466513  0.06519137  0.08855504  0.0397654   0.0331906
  0.0288224   0.00443691  0.01032401  0.02330595  0.035