In [1]:
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize, sent_tokenize

file_docs = []

with open ('blog1.txt') as f:
    tokens = sent_tokenize(f.read())
    for line in tokens:
        file_docs.append(line)

print("Number of documents:",len(file_docs))

Number of documents: 12


[nltk_data] Downloading package punkt to C:\Users\MY
[nltk_data]     PC\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Open file and tokenize sentences

In [2]:
gen_docs = [[w.lower() for w in word_tokenize(text)] 
            for text in file_docs]

# Tokenize words and create dictionary

In [3]:
from gensim import corpora



In [4]:
dictionary = corpora.Dictionary(gen_docs)
print(dictionary.token2id)

{'.': 0, 'a': 1, 'across': 2, 'bit': 3, 'come': 4, 'different': 5, 'from': 6, 'is': 7, 'my': 8, 'other': 9, 'placement': 10, 'stories': 11, 'story': 12, 'that': 13, 'the': 14, 'will': 15, 'you': 16, 'began': 17, 'break': 18, 'during': 19, 'for': 20, 'i': 21, 'internship': 22, 'preparation': 23, 'semester': 24, 'session': 25, 'two-month': 26, ',': 27, '10': 28, 'able': 29, 'an': 30, 'and': 31, 'any': 32, 'appeared': 33, 'but': 34, 'clear': 35, 'companies': 36, 'company': 37, 'due': 38, 'even': 39, 'favoring': 40, 'first': 41, 'flaws': 42, 'get': 43, 'hope': 44, 'in': 45, 'intern': 46, 'luck': 47, 'many': 48, 'me': 49, 'not': 50, 'of': 51, 'round': 52, 'strategy': 53, 't': 54, 'to': 55, 'wasn': 56, 'with': 57, 'written': 58, '’': 59, 'all': 60, 'combined': 61, 'correct': 62, 'factors': 63, 'goal': 64, 'higher': 65, 'led': 66, 'path': 67, 'pursuing': 68, 'should': 69, 'stick': 70, 'studies': 71, 'these': 72, 'think': 73, 'this': 74, '3rd': 75, 'continued': 76, 'never': 77, 'placements': 7

# Create a bag of words

In [5]:
corpus = [dictionary.doc2bow(gen_doc) for gen_doc in gen_docs]

In [6]:
from gensim import models
from gensim import *
import numpy as np

## Tf-Idf is calculated by multiplying a local component (TF) with a global component (IDF) and optionally normalizing the result to unit length. Term frequency is how often the word shows up in the document and inverse document frequency scales the value by how rare the word is in the corpus. In simple terms, words that occur more frequently across the documents get smaller weights.

In [19]:
tf_idf = models.TfidfModel(corpus)
for doc in tf[corpus]:
    print([[dictionary[id], np.around(freq, decimals=2)] for id, freq in doc])

[['a', 0.11], ['across', 0.31], ['bit', 0.31], ['come', 0.31], ['different', 0.31], ['from', 0.31], ['is', 0.31], ['my', 0.04], ['other', 0.31], ['placement', 0.17], ['stories', 0.31], ['story', 0.31], ['that', 0.11], ['the', 0.05], ['will', 0.22], ['you', 0.14]]
[['my', 0.05], ['the', 0.15], ['began', 0.45], ['break', 0.33], ['during', 0.2], ['for', 0.13], ['i', 0.05], ['internship', 0.16], ['preparation', 0.25], ['semester', 0.45], ['session', 0.33], ['two-month', 0.45]]
[['a', 0.07], ['my', 0.02], ['the', 0.1], ['i', 0.05], ['preparation', 0.11], [',', 0.07], ['10', 0.14], ['able', 0.2], ['an', 0.14], ['and', 0.06], ['any', 0.2], ['appeared', 0.2], ['but', 0.11], ['clear', 0.14], ['companies', 0.11], ['company', 0.2], ['due', 0.2], ['even', 0.2], ['favoring', 0.2], ['first', 0.29], ['flaws', 0.2], ['get', 0.2], ['hope', 0.2], ['in', 0.22], ['intern', 0.2], ['luck', 0.2], ['many', 0.2], ['me', 0.09], ['not', 0.14], ['of', 0.09], ['round', 0.29], ['strategy', 0.14], ['t', 0.09], ['to'

## Now, we are going to create similarity object. The main class is Similarity, which builds an index for a given set of documents.The Similarity class splits the index into several smaller sub-indexes, which are disk-based. Let's just create similarity object then you will understand how we can use it for comparing.

In [20]:
sims = similarities.Similarity('',tf_idf[corpus],
                                        num_features=len(dictionary))

# Second Document where I have stored another text file as a blog

In [21]:
file2_docs = []

with open ('blog2.txt') as f:
    tokens = sent_tokenize(f.read())
    for line in tokens:
        file2_docs.append(line)

print("Number of documents:",len(file2_docs))  
for line in file2_docs:
    query_doc_tf_idf = [w.lower() for w in word_tokenize(line)]
    query_doc_bow = dictionary.doc2bow(query_doc_tf_idf) #update an existing dictionary and

Number of documents: 19


# Document similarities to query


In [23]:
query_doc_tf_idf = tf_idf[query_doc_bow]
print('Comparing Result:', sims[query_doc_tf_idf]) 

Comparing Result: [0.11946748 0.06188476 0.05472439 0.07977556 0.12668498 0.32534105
 0.02892251 0.10277206 0.1436688  0.09660573 0.08641316 0.01215397]


In [24]:
import numpy as np
sum_of_sims =(np.sum(sims[query_doc_tf_idf], dtype=np.float32))
print(sum_of_sims)


1.2384144


# Next below code is for calculating the average similarity

In [25]:
percentage_of_similarity = round(float((sum_of_sims / len(file_docs)) * 100))
print(f'Average similarity float: {float(sum_of_sims / len(file_docs))}')
print(f'Average similarity percentage: {float(sum_of_sims / len(file_docs)) * 100}')
print(f'Average similarity rounded percentage: {percentage_of_similarity}')

Average similarity float: 0.10320120056470235
Average similarity percentage: 10.320120056470234
Average similarity rounded percentage: 10


# We find the similarity between the two blog is 10%