In [1]:
#import the dependencies
import json
import pandas as pd
import numpy as np
import spacy
pd.set_option('display.max_colwidth', 50)

In [2]:
#load the spacy model
snl = spacy.load("en_core_web_lg")

In [3]:
from nltk.corpus import stopwords
stopWords = set(stopwords.words('english'))
punctuations = "!\"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~\n"

In [4]:
#Loading the data with pandas
df = pd.read_json('data/Export_DataFrame.json')


In [5]:
def spacy_tokenizer(sentence):
    """Tokenize ``sentence`` with the loaded spaCy pipeline and clean the tokens.

    Every token is replaced by its lower-cased lemma, or by its lower-cased
    surface form when the lemma equals ``"-PRON-"``. Tokens that are stop
    words, are at most one character long, or are found in ``punctuations``
    are dropped; leading/trailing dots are stripped from the survivors.

    Returns the list of cleaned token strings.
    """
    cleaned = []
    for token in snl(sentence):
        # Case normalization and lemmatization.
        if token.lemma_ != "-PRON-":
            text = token.lemma_.lower()
        else:
            text = token.lower_
        # Drop stop words and one-character tokens.
        if text in stopWords:
            continue
        if len(text) <= 1:
            continue
        # `punctuations` is a plain string, so this is a substring test:
        # only runs of characters that appear contiguously in it are dropped.
        if text in punctuations:
            continue
        cleaned.append(text.strip("."))
    return cleaned

In [6]:
#Removing empty string 
def remove_empty(x):
    """Drop empty / whitespace-only entries from a comma-separated string or a list.

    Parameters
    ----------
    x : str or list of str
        A comma-separated string, or a list of strings.

    Returns
    -------
    str or list
        For a string: the non-blank comma-separated parts re-joined with ",".
        For a list: a new list without blank items.
        Any other value is returned unchanged (previously such values fell
        through and were silently replaced by ``None``).
    """
    # isinstance (rather than an exact type check) also accepts str/list subclasses.
    if isinstance(x, str):
        return ",".join(part for part in x.split(",") if part.strip())
    if isinstance(x, list):
        return [item for item in x if item.strip()]
    return x

# Drop empty / whitespace-only entries from every value of the cleanText column.
df['cleanText'] = df['cleanText'].apply(remove_empty)

In [7]:
def preprocess(data):
    """Turn raw text into a cleaned token list.

    Runs ``spacy_tokenizer`` on ``data`` and passes the resulting tokens
    through ``remove_empty``.
    """
    return remove_empty(spacy_tokenizer(data))

In [8]:
df.head()

Unnamed: 0,title,text,url,cleanText
0,Pandemic,"A pandemic (from Greek πᾶν, pan, ""all"" and δῆμ...",https://en.wikipedia.org/wiki/Pandemic,"[pandemic, greek, πᾶν, pan, δῆμος, demo, peopl..."
1,Epidemiology of HIV/AIDS,"HIV/AIDS, or Human Immunodeficiency Virus, is ...",https://en.wikipedia.org/wiki/Epidemiology_of_...,"[hiv, aids, human, immunodeficiency, virus, co..."
2,Antonine Plague,"The Antonine Plague of 165 to 180 AD, also kno...",https://en.wikipedia.org/wiki/Antonine_Plague,"[antonine, plague, 165, 180, ad, also, know, p..."
3,Basic reproduction number,"In epidemiology, the basic reproduction number...",https://en.wikipedia.org/wiki/Basic_reproducti...,"[epidemiology, basic, reproduction, number, ba..."
4,Bills of mortality,Bills of mortality were the weekly mortality s...,https://en.wikipedia.org/wiki/Bills_of_mortality,"[bill, mortality, weekly, mortality, statistic..."


In [9]:
# Collect the cleanText value of every row into a list (one entry per document;
# nothing is concatenated here -- flattening happens in a later cell).
corpus_ = list(df.cleanText)

In [10]:
corpus_[:2]

[['pandemic',
  'greek',
  'πᾶν',
  'pan',
  'δῆμος',
  'demo',
  'people',
  'epidemic',
  'infectious',
  'disease',
  'spread',
  'across',
  'large',
  'region',
  'instance',
  'multiple',
  'continent',
  'worldwide',
  'affect',
  'substantial',
  'number',
  'people',
  'widespread',
  'endemic',
  'disease',
  'stable',
  'number',
  'infected',
  'people',
  'pandemic',
  'widespread',
  'endemic',
  'disease',
  'stable',
  'number',
  'infected',
  'people',
  'recurrence',
  'seasonal',
  'influenza',
  'generally',
  'exclude',
  'occur',
  'simultaneously',
  'large',
  'region',
  'globe',
  'rather',
  'spread',
  'worldwide',
  'throughout',
  'human',
  'history',
  'number',
  'pandemic',
  'disease',
  'smallpox',
  'tuberculosis',
  'fatal',
  'pandemic',
  'record',
  'history',
  'black',
  'death',
  'also',
  'know',
  'plague',
  'kill',
  'estimate',
  '75–200',
  'million',
  'people',
  '14th',
  'century',
  'term',
  'use',
  'yet',
  'later',
  'pandemi

In [11]:
# Flatten the per-document token lists into one long list of tokens
import itertools
vocabulary = list(itertools.chain.from_iterable(corpus_))

In [12]:
vocabulary

['pandemic',
 'greek',
 'πᾶν',
 'pan',
 'δῆμος',
 'demo',
 'people',
 'epidemic',
 'infectious',
 'disease',
 'spread',
 'across',
 'large',
 'region',
 'instance',
 'multiple',
 'continent',
 'worldwide',
 'affect',
 'substantial',
 'number',
 'people',
 'widespread',
 'endemic',
 'disease',
 'stable',
 'number',
 'infected',
 'people',
 'pandemic',
 'widespread',
 'endemic',
 'disease',
 'stable',
 'number',
 'infected',
 'people',
 'recurrence',
 'seasonal',
 'influenza',
 'generally',
 'exclude',
 'occur',
 'simultaneously',
 'large',
 'region',
 'globe',
 'rather',
 'spread',
 'worldwide',
 'throughout',
 'human',
 'history',
 'number',
 'pandemic',
 'disease',
 'smallpox',
 'tuberculosis',
 'fatal',
 'pandemic',
 'record',
 'history',
 'black',
 'death',
 'also',
 'know',
 'plague',
 'kill',
 'estimate',
 '75–200',
 'million',
 'people',
 '14th',
 'century',
 'term',
 'use',
 'yet',
 'later',
 'pandemic',
 'include',
 '1918',
 'influenza',
 'pandemic',
 'spanish',
 'flu',
 'curre

In [13]:
# remove duplicate from corpus_vocab
def unique_list(l):
    """Return the items of ``l`` without duplicates, in first-seen order.

    The previous implementation ignored ``l`` and always read the module-level
    ``vocabulary``; it also rescanned the result list for every item.

    Parameters
    ----------
    l : iterable of hashable
        Items to de-duplicate (e.g. token strings).

    Returns
    -------
    list
        Each distinct item once, ordered by first occurrence.
    """
    # dict keys are unique and keep insertion order.
    return list(dict.fromkeys(l))

In [14]:
corpus = unique_list(vocabulary)

In [15]:
print('Number of words in the corpus:',len(vocabulary))
print('The number of unique words in the corpus:', len(corpus))

Number of words in the corpus: 3713
The number of unique words in the corpus: 1522


In [16]:
N = len(corpus_)
N

26

In [17]:
vocabulary[20:30]

['number',
 'people',
 'widespread',
 'endemic',
 'disease',
 'stable',
 'number',
 'infected',
 'people',
 'pandemic']

In [18]:
# Write the flattened token list to disk as JSON.
# NOTE(review): the file is called corpus.json, but it receives `vocabulary`
# (duplicates included), not the de-duplicated `corpus` -- confirm this is intended.
with open('data/corpus.json', 'w') as outfile:
    outfile.write(json.dumps(vocabulary))

In [19]:
vocabulary[:20]

['pandemic',
 'greek',
 'πᾶν',
 'pan',
 'δῆμος',
 'demo',
 'people',
 'epidemic',
 'infectious',
 'disease',
 'spread',
 'across',
 'large',
 'region',
 'instance',
 'multiple',
 'continent',
 'worldwide',
 'affect',
 'substantial']

# Compute TF-IDF scores for the documents

In [20]:
# Document frequency: for every token, the number of documents that contain it.
DF = {}
for i, tokens in enumerate(corpus_):
    for w in tokens:
        # setdefault creates the set on first sight of a token; this replaces a
        # bare `except:` that would also have swallowed unrelated errors.
        DF.setdefault(w, set()).add(i)
# Collapse each set of document indices into its size.
DF = {token: len(doc_ids) for token, doc_ids in DF.items()}
    

In [24]:
DF['pandemic']

17

In [25]:
total_vocabulary = [x for x in DF]


In [26]:
def doc_freq(word):
    """Return the document frequency stored for ``word`` in ``DF``.

    Looks ``word`` up in the module-level ``DF`` mapping and returns 0 when it
    is absent. Unlike the former bare ``except:``, genuine errors (for example
    ``DF`` not having been built yet) are no longer silently turned into 0.
    """
    return DF.get(word, 0)
    

In [27]:
doc_freq('pandemic')

17

In [31]:
from collections import Counter

# Build tf_idf[(document index, token)] = tf * idf for the first N documents.
# tf  = occurrences of the token in the document / number of tokens in the document
# idf = log(N / (document frequency + 1))
doc = 0
tf_idf = {}
for i in range(N):
    tokens = corpus_[i]
    counter = Counter(tokens)
    words_count = len(tokens)

    for token in np.unique(tokens):
        tf = counter[token]/words_count
        print(round(tf, 6))
        # Deliberately NOT named `df`: that name holds the DataFrame loaded
        # earlier, and assigning to it here would overwrite the DataFrame.
        doc_count = doc_freq(token)
        print(doc_count)
        idf = np.log(N/(doc_count + 1))
        tf_idf[doc, token] = tf * idf
        print(round(tf_idf[doc, token], 5))
        print('*******')
    doc += 1

0.010753
1
0.02758
*******
0.010753
2
0.02322
*******
0.010753
1
0.02758
*******
0.010753
1
0.02758
*******
0.010753
8
0.01141
*******
0.010753
6
0.01411
*******
0.010753
12
0.00745
*******
0.010753
1
0.02758
*******
0.010753
5
0.01577
*******
0.010753
1
0.02758
*******
0.010753
3
0.02013
*******
0.010753
6
0.01411
*******
0.010753
2
0.02322
*******
0.010753
11
0.00831
*******
0.010753
1
0.02758
*******
0.043011
16
0.01827
*******
0.021505
3
0.04025
*******
0.010753
10
0.00925
*******
0.010753
10
0.00925
*******
0.010753
1
0.02758
*******
0.010753
1
0.02758
*******
0.010753
4
0.01773
*******
0.010753
3
0.02013
*******
0.010753
1
0.02758
*******
0.010753
1
0.02758
*******
0.021505
4
0.03546
*******
0.010753
6
0.01411
*******
0.010753
11
0.00831
*******
0.021505
13
0.01331
*******
0.021505
6
0.02822
*******
0.010753
6
0.01411
*******
0.021505
10
0.0185
*******
0.010753
1
0.02758
*******
0.010753
4
0.01773
*******
0.010753
10
0.00925
*******
0.021505
7
0.02535
*******
0.010753
3
0.02013
*

0.003831
1
0.00983
*******
0.007663
2
0.01655
*******
0.003831
1
0.00983
*******
0.038314
1
0.09827
*******
0.007663
1
0.01965
*******
0.003831
1
0.00983
*******
0.003831
1
0.00983
*******
0.003831
1
0.00983
*******
0.007663
1
0.01965
*******
0.003831
1
0.00983
*******
0.003831
1
0.00983
*******
0.003831
6
0.00503
*******
0.003831
4
0.00632
*******
0.003831
2
0.00827
*******
0.003831
2
0.00827
*******
0.003831
1
0.00983
*******
0.007663
3
0.01434
*******
0.015326
11
0.01185
*******
0.003831
2
0.00827
*******
0.007663
1
0.01965
*******
0.003831
1
0.00983
*******
0.003831
4
0.00632
*******
0.003831
3
0.00717
*******
0.011494
1
0.02948
*******
0.003831
1
0.00983
*******
0.015326
16
0.00651
*******
0.003831
1
0.00983
*******
0.003831
1
0.00983
*******
0.003831
6
0.00503
*******
0.003831
1
0.00983
*******
0.003831
1
0.00983
*******
0.003831
1
0.00983
*******
0.003831
1
0.00983
*******
0.003831
3
0.00717
*******
0.003831
10
0.0033
*******
0.003831
4
0.00632
*******
0.003831
1
0.00983
*******

0.003623
1
0.00929
*******
0.003623
5
0.00531
*******
0.003623
3
0.00678
*******
0.003623
5
0.00531
*******
0.007246
4
0.01195
*******
0.003623
2
0.00782
*******
0.007246
5
0.01063
*******
0.007246
3
0.01356
*******
0.01087
6
0.01426
*******
0.003623
3
0.00678
*******
0.003623
2
0.00782
*******
0.003623
1
0.00929
*******
0.003623
2
0.00782
*******
0.003623
2
0.00782
*******
0.01087
10
0.00935
*******
0.003623
1
0.00929
*******
0.007246
1
0.01859
*******
0.003623
3
0.00678
*******
0.007246
2
0.01565
*******
0.003623
2
0.00782
*******
0.007246
1
0.01859
*******
0.003623
2
0.00782
*******
0.007246
5
0.01063
*******
0.003623
1
0.00929
*******
0.003623
3
0.00678
*******
0.003623
1
0.00929
*******
0.003623
8
0.00384
*******
0.003623
17
0.00133
*******
0.003623
2
0.00782
*******
0.007246
11
0.0056
*******
0.007246
3
0.01356
*******
0.003623
3
0.00678
*******
0.003623
1
0.00929
*******
0.003623
1
0.00929
*******
0.003623
1
0.00929
*******
0.003623
1
0.00929
*******
0.003623
1
0.00929
*******
0

*******
0.009709
2
0.02097
*******
0.009709
3
0.01817
*******
0.009709
3
0.01817
*******
0.019417
2
0.04193
*******
0.019417
2
0.04193
*******
0.009709
1
0.0249
*******
0.009709
2
0.02097
*******
0.009709
3
0.01817
*******
0.009709
2
0.02097
*******
0.009709
6
0.01274
*******
0.038835
4
0.06403
*******
0.009709
3
0.01817
*******
0.009709
8
0.0103
*******
0.009709
1
0.0249
*******
0.009709
1
0.0249
*******
0.009709
12
0.00673
*******
0.009709
10
0.00835
*******
0.019417
3
0.03635
*******
0.009709
1
0.0249
*******
0.009709
3
0.01817
*******
0.009709
4
0.01601
*******
0.009709
1
0.0249
*******
0.019417
2
0.04193
*******
0.009709
6
0.01274
*******
0.009709
17
0.00357
*******
0.009709
3
0.01817
*******
0.009709
1
0.0249
*******
0.009709
2
0.02097
*******
0.009709
2
0.02097
*******
0.009709
2
0.02097
*******
0.009709
5
0.01424
*******
0.009709
1
0.0249
*******
0.009709
1
0.0249
*******
0.009709
2
0.02097
*******
0.009709
1
0.0249
*******
0.019417
4
0.03201
*******
0.009709
1
0.0249
*******
0

0.007407
2
0.016
*******
0.007407
2
0.016
*******
0.007407
1
0.019
*******
0.014815
1
0.038
*******
0.007407
2
0.016
*******
0.007407
1
0.019
*******
0.007407
5
0.01086
*******
0.007407
5
0.01086
*******
0.007407
4
0.01221
*******
0.007407
12
0.00513
*******
0.111111
6
0.1458
*******
0.037037
11
0.02864
*******
0.007407
1
0.019
*******
0.007407
3
0.01387
*******
0.003067
8
0.00325
*******
0.003067
3
0.00574
*******
0.003067
1
0.00787
*******
0.003067
1
0.00787
*******
0.003067
1
0.00787
*******
0.003067
2
0.00662
*******
0.003067
1
0.00787
*******
0.003067
2
0.00662
*******
0.003067
6
0.00403
*******
0.003067
2
0.00662
*******
0.003067
12
0.00213
*******
0.003067
5
0.0045
*******
0.003067
1
0.00787
*******
0.009202
2
0.01987
*******
0.003067
2
0.00662
*******
0.003067
1
0.00787
*******
0.003067
1
0.00787
*******
0.003067
1
0.00787
*******
0.003067
1
0.00787
*******
0.009202
2
0.01987
*******
0.003067
2
0.00662
*******
0.003067
1
0.00787
*******
0.003067
1
0.00787
*******
0.003067
4
0.0

In [32]:
tf_idf

{(0, '14th'): 0.027580100617865987,
 (0, '1918'): 0.023220260745735185,
 (0, '75–200'): 0.027580100617865987,
 (0, 'across'): 0.027580100617865987,
 (0, 'affect'): 0.011407225383712501,
 (0, 'aids'): 0.014109531064152353,
 (0, 'also'): 0.007453195489891885,
 (0, 'black'): 0.027580100617865987,
 (0, 'century'): 0.0157670652558433,
 (0, 'continent'): 0.027580100617865987,
 (0, 'cov-2'): 0.0201269051279741,
 (0, 'covid-19'): 0.014109531064152353,
 (0, 'current'): 0.023220260745735185,
 (0, 'death'): 0.008313869765951417,
 (0, 'demo'): 0.027580100617865987,
 (0, 'disease'): 0.018274545977000685,
 (0, 'endemic'): 0.0402538102559482,
 (0, 'epidemic'): 0.009249475970140985,
 (0, 'estimate'): 0.009249475970140985,
 (0, 'exclude'): 0.027580100617865987,
 (0, 'fatal'): 0.027580100617865987,
 (0, 'flu'): 0.017727512103090128,
 (0, 'generally'): 0.0201269051279741,
 (0, 'globe'): 0.027580100617865987,
 (0, 'greek'): 0.027580100617865987,
 (0, 'history'): 0.035455024206180255,
 (0, 'hiv'): 0.014109

In [33]:
doc
def unique_list(l):
    """Return the items of ``l`` without duplicates, in first-seen order.

    The previous implementation ignored ``l`` and always read the module-level
    ``vocabulary``; it also rescanned the result list for every item.

    Parameters
    ----------
    l : iterable of hashable
        Items to de-duplicate (e.g. token strings).

    Returns
    -------
    list
        Each distinct item once, ordered by first occurrence.
    """
    # dict keys are unique and keep insertion order.
    return list(dict.fromkeys(l))

# Vectorize query 

In [34]:
query = "highest pandemic casualties"
query1 = preprocess(query)

In [35]:
query

'highest pandemic casualties'

In [63]:
def unique_list(l):
    """Return the items of ``l`` without duplicates, in first-seen order.

    The previous implementation ignored ``l`` and iterated over the
    module-level ``query`` instead, which is a string, so it produced unique
    characters rather than unique items of the argument.

    Parameters
    ----------
    l : iterable of hashable
        Items to de-duplicate (e.g. the tokens of a query).

    Returns
    -------
    list
        Each distinct item once, ordered by first occurrence.
    """
    # dict keys are unique and keep insertion order.
    return list(dict.fromkeys(l))


In [64]:
def vectorize(query, vocab=corpus_):
    """Compute a TF-IDF weight for every distinct token of ``query``.

    The query is run through ``preprocess``; for each distinct token the
    weight is ``tf * idf`` with ``tf = count / number of query tokens`` and
    ``idf = log(N / (doc_freq(token) + 1))``.

    ``vocab`` is accepted but never used by the body.

    Returns a dict mapping token -> weight.
    """
    from collections import Counter

    tokens = preprocess(query)
    occurrences = Counter(tokens)
    total = len(tokens)

    weights = {}
    for token in np.unique(tokens):
        term_frequency = occurrences[token] / total
        inverse_doc_frequency = np.log(N / (doc_freq(token) + 1))
        weights[token] = term_frequency * inverse_doc_frequency
    return weights

In [65]:
vectorize(query, corpus_)

{'casualty': 0.8549831191538455,
 'high': 0.3536239868950875,
 'pandemic': 0.12257492670843911}

In [None]:
tf_idf

In [None]:
DFs

In [None]:
df_tf = pd.DataFrame.from_records(DFs).fillna(0)
df_tf.head()

In [None]:
df_tf.info

In [None]:
def doc_freq(word):
    """Return the document frequency stored for ``word`` in ``DF``.

    Looks ``word`` up in the module-level ``DF`` mapping and returns 0 when it
    is absent. Unlike the former bare ``except:``, genuine errors (for example
    ``DF`` not having been built yet) are no longer silently turned into 0.
    """
    return DF.get(word, 0)

In [None]:
from collections import Counter

# Build tf_idf[(document index, token)] = tf * idf for the first N documents.
# tf  = occurrences of the token in the document / number of tokens in the document
# idf = log((N + 1) / (document frequency + 1))
doc = 0
tf_idf = {}

for i in range(N):
    tokens = corpus_[i]
    counter = Counter(tokens)
    words_count = len(tokens)

    for token in np.unique(tokens):
        tf = counter[token]/words_count
        # Deliberately NOT named `df`: that name is used for the DataFrame,
        # and assigning to it here would overwrite the DataFrame.
        doc_count = doc_freq(token)
        idf = np.log((N+1)/(doc_count+1))

        tf_idf[doc, token] = tf*idf

    doc += 1

In [None]:
tf_idf

In [None]:
df_tfidf = pd.DataFrame.from_records([tf_idf])
df_tfidf.head()

In [None]:
# #print(tf)
# #print(df)
# #print(tf_idf)
#df_tfidf = pd.DataFrame.from_records(tf_idf)

In [None]:
#df_tfidf.head()

In [None]:
# Count how many times each distinct token occurs in `vocabulary`.
total = set(vocabulary)
wordDict = {word: 0 for word in total}
for word in vocabulary:
    wordDict[word] = wordDict[word] + 1

In [None]:
df_tf = pd.DataFrame([wordDict]).transpose()
df_tf = df_tf.rename(columns={0: 'frequency'})
df_tf.head()

In [None]:
def computeTF(wordDict, doc):
    """Compute the term frequency of every word in ``wordDict``.

    Parameters
    ----------
    wordDict : dict
        Mapping word -> number of occurrences.
    doc : sized collection
        The tokens the counts were taken from; its length is the denominator.
        (The previous version ignored this argument and always divided by the
        length of the module-level ``vocabulary``.)

    Returns
    -------
    dict
        Mapping word -> count / len(doc).
    """
    corpusCount = len(doc)
    return {word: count/float(corpusCount) for word, count in wordDict.items()}
#running our sentences through the tf function:
tf = computeTF(wordDict, vocabulary)
#Converting to dataframe for visualization
dfTf = pd.DataFrame([tf]).transpose()

In [None]:
dfTf.head()

In [None]:
import math
def computeIDF(corpus_):
    """Compute a smoothed inverse document frequency for every word.

    The previous version initialised every count to 0 with ``dict.fromkeys``
    and then read that same 0 back, so every word received the identical
    value ``log10(N)`` regardless of the documents' contents.

    Parameters
    ----------
    corpus_ : list of dict
        One ``{word: count}`` mapping per document.

    Returns
    -------
    dict
        Mapping word -> ``log10(N / (n_docs + 1))`` where ``N`` is the number
        of documents and ``n_docs`` the number of documents in which the
        word's count is greater than zero. Empty input gives an empty dict.
    """
    N = len(corpus_)
    docCounts = {}
    for wordCounts in corpus_:
        for word, count in wordCounts.items():
            # Register every key, even with a zero count, so it still gets an IDF.
            docCounts[word] = docCounts.get(word, 0) + (1 if count > 0 else 0)
    return {word: math.log10(N / (float(n_docs) + 1))
            for word, n_docs in docCounts.items()}
#inputing our sentences in the log file
idfs = computeIDF([wordDict])

In [None]:
dict.fromkeys(corpus_[0].keys)

In [None]:
# # Compute Term Frequency (TF)
# for i in range(n_docs):
#     words = corpus[i].split(" ") # Words in the document
#     for w in words:
#         df_tf[w][i] = df_tf[w][i] + (1 / len(words))
        
# df_tf

In [None]:
total = set(res)
wordDict = dict.fromkeys(total, 0)
for word in res:
    wordDict[word] +=1

In [None]:
df_tf = pd.DataFrame([wordDict]).transpose()
df_tf.head()

In [None]:
df_tf.head()

In [None]:
df_tf = df_tf.rename(columns={0: 'frequency'})

In [None]:
df_tf.head()

In [None]:
dfTf = dfTf.rename(columns={0: 'term_frequency'})
dfTf.head()

In [None]:
def computeIDF(corpus_vocab):
    """Compute a smoothed inverse document frequency for every word.

    The previous version initialised every count to 0 with ``dict.fromkeys``
    and then read that same 0 back, so every word received the identical
    value ``log10(N)`` regardless of the documents' contents.

    Parameters
    ----------
    corpus_vocab : list of dict
        One ``{word: count}`` mapping per document.

    Returns
    -------
    dict
        Mapping word -> ``log10(N / (n_docs + 1))`` where ``N`` is the number
        of documents and ``n_docs`` the number of documents in which the
        word's count is greater than zero. Empty input gives an empty dict.
    """
    # Local import so this cell does not rely on another cell having imported math.
    import math

    N = len(corpus_vocab)
    docCounts = {}
    for wordCounts in corpus_vocab:
        for word, count in wordCounts.items():
            # Register every key, even with a zero count, so it still gets an IDF.
            docCounts[word] = docCounts.get(word, 0) + (1 if count > 0 else 0)
    return {word: math.log10(N / (float(n_docs) + 1))
            for word, n_docs in docCounts.items()}
#inputing our sentences in the log file
idfs = computeIDF([corpus_vocab])

In [None]:
def computeTFIDF(tfBow, idfs):
    """Multiply every term frequency in ``tfBow`` by the matching entry of ``idfs``.

    Returns a dict with the same keys as ``tfBow``; a word missing from
    ``idfs`` raises ``KeyError``.
    """
    return {term: frequency*idfs[term] for term, frequency in tfBow.items()}
#running our two sentences through the IDF:
idfFirst = computeTFIDF(tfFirst, idfs)
idfSecond = computeTFIDF(tfSecond, idfs)
#putting it in a dataframe
idf= pd.DataFrame([idfFirst, idfSecond])
print(idf)

In [None]:
len(flat_token_texts)

Compute TF-IDF vectors for every document

In [None]:
df = pd.read_json('Export_DataFrame.json')

In [None]:
df.shape

In [None]:
df.head()

In [None]:
# Building the corpus vocabulary from the dataframe
def build_corpus_vocabulary(documents):
    '''
    Join the tokens of every document into one space-separated string.

    documents: iterable of token sequences, one per document (for example the
               ``tokenized_text`` column of the dataframe).

    return: list with one string per document, in input order.

    The previous version ignored ``documents`` and always read
    ``df['tokenized_text']`` from the module scope.
    '''
    corpus_vocabulary = [" ".join(text) for text in documents]
    return corpus_vocabulary
    

In [None]:
corpus_vocabulary = build_corpus_vocabulary(df.tokenized_text)

In [None]:
corpus_vocabulary[:2]

In [None]:
# Build the TF-IDF matrix for the space-joined documents
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus_vocabulary)
# get_feature_names() was removed in newer scikit-learn releases; prefer
# get_feature_names_out() when it exists and fall back for older versions.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()
# X.toarray() densifies the sparse matrix (the original note here flags the
# array conversion as the source of a problem).
df_tfidf = pd.DataFrame(X.toarray(), columns=feature_names)

In [None]:
df_tfidf.head()

In [None]:
df_tfidf.tail()

In [None]:
X.shape

In [None]:
X[:2]

In [None]:
# Once we've created the instance, we can "transform" our counts
results = tfidf.fit_transform(corpus)

In [None]:
doc = df['tokenized_text']

In [None]:
doc[:200]

In [None]:
#doc[:200]

In [None]:
corpus[:200]

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Create TfidfVectorizer object
vectorizer = TfidfVectorizer()

# Generate matrix of word vectors
tfidf_matrix = vectorizer.fit_transform(doc)

# Print the shape of tfidf_matrix
print(tfidf_matrix.shape)

In [None]:
# compute and print the cosine similarity matrix
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import linear_kernel
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)
print(cosine_sim)

In [None]:
df['title'][:3]

In [None]:
#indices = pd.Series(df.index, index=df['tokenized_text']).drop_duplicates()
def get_recommendations(title, cosine_sim, indices):
    """Return the titles of the ten entries most similar to ``title``.

    title: key looked up in ``indices`` to find the row of ``cosine_sim``.
    cosine_sim: pairwise similarity matrix, indexed by row position.
    indices: mapping from title to row position.

    The scores of the selected row are ranked from highest to lowest; the
    top-ranked entry is skipped (presumably the queried document itself --
    verify) and the next ten positions are used to select from ``df['title']``.
    """
    row = indices[title]
    # Pair every score with its position, then rank by score, best first.
    ranked = sorted(enumerate(cosine_sim[row]), key=lambda pair: pair[1], reverse=True)
    # Positions 1..10 of the ranking.
    tokens_indices = [position for position, _score in ranked[1:11]]
    return df['title'].iloc[tokens_indices]

In [None]:
indices = pd.Series(df.index, index=df['title']).drop_duplicates()
doc = df['tokenized_text']

In [None]:
indices[20:26]

In [None]:
tfidf = TfidfVectorizer(stop_words='english')

# Construct the TF-IDF matrix
tfidf_matrix = tfidf.fit_transform(doc)

# Generate the cosine similarity matrix
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

# Generate recommendations
print(get_recommendations("Swine influenza", cosine_sim, indices))

In [None]:
get_recommendations("Swine influenza", cosine_sim, indices)[:5]

In [None]:
# Counting unique words in the documents
from collections import Counter
bag_of_words = Counter(tokens)
bag_of_words

In [None]:
bag_of_words.most_common(4)

In [None]:

>>> times_harry_appears = bag_of_words['harry']
>>> num_unique_words = len(bag_of_words)
>>> tf = times_harry_appears / num_unique_words
>>> round(tf, 4)
0.1818

# Apply TfidfVectorizer

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import pairwise_distances
from sklearn.metrics.pairwise import euclidean_distances
from scipy.spatial import distance
import pandas as pd
import numpy as np

In [None]:
X = df.iloc[:, -1]
X = X.dropna(how = 'any')

In [None]:
X.head()

In [None]:
## Converting 3D array of array into 1D array
def arr_convert_1d(arr):
    """Collapse two levels of nesting by concatenating along axis 0 twice.

    ``arr`` is converted with ``np.array`` first; for a 3-D input the result
    is a 1-D array containing all elements in order.
    """
    flattened = np.array(arr)
    for _ in range(2):
        flattened = np.concatenate(flattened, axis=0)
    return flattened
   
## Cosine Similarity
cos = []
def cosine(trans):
    """Append the cosine similarity of ``trans[0]`` and ``trans[1]`` to ``cos``."""
    similarity = cosine_similarity(trans[0], trans[1])
    cos.append(similarity)

## Manhatten Distance
manhatten = []
def manhatten_distance(trans):
    """Append the manhattan distance of ``trans[0]`` and ``trans[1]`` to ``manhatten``."""
    manhattan_result = pairwise_distances(trans[0], trans[1], metric='manhattan')
    manhatten.append(manhattan_result)

## Euclidean Distance
euclidean = []
def euclidean_function(vectors):
    """Append the euclidean distance of ``vectors[0]`` and ``vectors[1]`` to ``euclidean``."""
    euclidean.append(euclidean_distances(vectors[0], vectors[1]))
   
# This Function finds the similarity between two 
# sentences by using above functions.
  
## TF - IDF
def tfidf(str):
    """Fit a TfidfVectorizer on the dataset and score ``str`` with the distance helpers.

    Reads ``Export_DataFrame.json``, fits a ``TfidfVectorizer`` on the last of
    the selected columns, transforms ``[str]`` and passes the result to
    ``euclidean_function``, ``cosine`` and ``manhatten_distance`` before
    returning ``convert()``.

    Note: the parameter name shadows the built-in ``str`` inside this function.
    """
    ques = []
    # The dataset is loaded from disk on every call; adjust the column
    # selection below if a dataset with a different layout is used.
    df = pd.read_json('Export_DataFrame.json')
      
    # Keep columns 1..3 and drop rows with any missing value.
    x = df.iloc[:, 1:4]
    x = x.dropna(how = 'any')
      
    # Collect the last selected column (j == -1) of every remaining row.
    for k in range(len(x)):
        for j in [-1]:
            ques.append(x.iloc[k, j])
    vect = TfidfVectorizer()
    # Fit on the whole dataset so the vectors are based on the words
    # of the corpus/dataset.
    vect.fit(ques)
   
    # NOTE(review): `corpus` holds a single document, so `trans` presumably has
    # one row while the helpers below index trans[0] AND trans[1] -- confirm.
    corpus = [str]
    trans = vect.transform(corpus)
   
    euclidean_function(trans)
    cosine(trans)
    manhatten_distance(trans)
    # NOTE(review): `convert` only appears as commented-out code in this
    # notebook, so this call looks like it would fail with NameError -- confirm.
    return convert()

   
# def convert():
#     dataf = pd.DataFrame()
#     lis2 = arr_convert_1d(manhatten)
#     dataf['manhatten'] = lis2
#     lis2 = arr_convert_1d(cos)
#     dataf['cos_sim'] = lis2
#     lis2 = arr_convert_1d(euclidean)
#     dataf['euclidean'] = lis2
#     return dataf
   
# newData = pd.DataFrame(); 
# str = "hello i am pulkit"
# newData = tfidf(str);
# print(newData)

In [None]:
corpus[:200]

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

vectorizer = TfidfVectorizer(preprocessor=nlp.clean_tf_idf_text)
docs_tfidf = vectorizer.fit_transform(allDocs)

def get_tf_idf_query_similarity(vectorizer, docs_tfidf, query):
    """
    Score a query against every document.

    vectorizer: fitted TfidfVectorizer used to transform the query
    docs_tfidf: tfidf vectors for all docs
    query: query text

    return: flattened cosine similarities between the query and all docs
    """
    query_vector = vectorizer.transform([query])
    similarity_matrix = cosine_similarity(query_vector, docs_tfidf)
    return similarity_matrix.flatten()