In [3]:
import nltk
import urllib.request
import urllib.parse
import urllib.error
import re
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

In [4]:
from nltk.corpus import wordnet
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/rishabhjain/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [5]:
def importData(site):
    print('Downloading books')
    print('Extracting ', site)
    uh1 = urllib.request.urlopen(site)
    data = uh1.read().decode('utf8')
    return data

In [7]:
def preProcessing(data):
    data = data.lower()
    data = re.sub('^Section [1-9].', '', data)
    data = re.sub(r'==.*?==+', '', data)
    data = re.sub(r'CHAPTER \d+', '', data)
    data = re.sub('[\(\[].*?[\)\]]', '', data)
    data = re.sub(r'[^a-zA-Z0-9\s]', '', data)
    data = data.replace('\n', '')
    return data

In [6]:
data1 = importData('http://www.gutenberg.org/files/1342/1342-0.txt')
data2 = importData('http://www.gutenberg.org/files/829/829-0.txt')
data3 = importData("https://www.gutenberg.org//cache/epub/22381/pg22381.txt")

Downloading books
Extracting  http://www.gutenberg.org/files/1342/1342-0.txt
Downloading books
Extracting  http://www.gutenberg.org/files/829/829-0.txt
Downloading books
Extracting  https://www.gutenberg.org//cache/epub/22381/pg22381.txt


In [13]:
from nltk.corpus import stopwords
data1 = preProcessing(data1)

In [9]:
data2 = preProcessing(data2)

In [10]:
data3 = preProcessing(data3)

In [14]:
stop_words = set(stopwords.words('english'))

In [16]:
corpus = [data1,data2,data3]

vectorizer = TfidfVectorizer(min_df=0.1,stop_words=stop_words)
trsfm = vectorizer.fit_transform(corpus)
pd.DataFrame(trsfm.toarray(),columns=vectorizer.get_feature_names(),index=['B1','B2','B3'])



Unnamed: 0,10,100,101,102,103,104,105,106,107,108,...,zealots,zealous,zenith,zephyrs,zephyrus,zetes,zethus,zeus,zodiac,zwin
B1,0.000739,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
B2,0.000622,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.001054,0.001602,0.001054,0.0,0.0,0.0,0.0,0.0,0.001054,0.001054
B3,0.001949,0.004401,0.004401,0.0022,0.0011,0.007701,0.011002,0.007701,0.007701,0.009902,...,0.0,0.000837,0.0,0.0011,0.007701,0.005501,0.005501,0.236545,0.0,0.0


In [17]:
from sklearn.metrics.pairwise import cosine_similarity

cosine_similarity(trsfm[0],trsfm)


array([[1.        , 0.47258612, 0.2607772 ]])

In [18]:
cosine_similarity(trsfm[1],trsfm)

array([[0.47258612, 1.        , 0.47534742]])

In [20]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

def lemmatizetext(sentence):
    token_words=nltk.word_tokenize(sentence) 
    lemma_sentence=[]
    for word in token_words:
        lemma_sentence.append(lemmatizer.lemmatize(word))
        lemma_sentence.append(" ")
    return "".join(lemma_sentence)

data1 = lemmatizetext(data1)
data2 = lemmatizetext(data2)
data3 = lemmatizetext(data3)

In [22]:
corpus = [data1,data2,data3]

vectorizer = TfidfVectorizer(min_df=0.1,stop_words=stop_words)
trsfm = vectorizer.fit_transform(corpus)
pd.DataFrame(trsfm.toarray(),columns=vectorizer.get_feature_names(),index=['B1','B2','B3'])



Unnamed: 0,10,100,101,102,103,104,105,106,107,108,...,zealot,zealous,zenith,zephyr,zephyrus,zetes,zethus,zeus,zodiac,zwin
B1,0.000573,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
B2,0.000469,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000795,0.001209,0.000795,0.0,0.0,0.0,0.0,0.0,0.000795,0.000795
B3,0.001303,0.002941,0.002941,0.00147,0.000735,0.005146,0.007351,0.005146,0.005146,0.006616,...,0.0,0.000559,0.0,0.000735,0.005146,0.003676,0.003676,0.158055,0.0,0.0


In [23]:
cosine_similarity(trsfm[0],trsfm)

array([[1.        , 0.61686472, 0.52198365]])

In [24]:
cosine_similarity(trsfm[1],trsfm)

array([[0.61686472, 1.        , 0.69378468]])