### Term Frequency

In [4]:
# Toy corpus for the term-frequency / TF-IDF walkthrough: three short
# free-text documents, each stored as a single string.
documents = [
    "This Document focuses on processing free text data",
    "This also includes natural language processing",
    "text processing via bag of words",
]

In [5]:
# Tokenise every document on whitespace (case-sensitive, no punctuation handling).
document_words = [doc.split() for doc in documents]

# Distinct tokens across all documents, sorted so that column indices are
# deterministic. A set comprehension flattens the token lists in one pass;
# sum(lists, []) would copy the growing list on every addition.
vocab = sorted({word for words in document_words for word in words})

# Token -> position in `vocab`.
vocab_dict = {word: index for index, word in enumerate(vocab)}
print(vocab)

['Document', 'This', 'also', 'bag', 'data', 'focuses', 'free', 'includes', 'language', 'natural', 'of', 'on', 'processing', 'text', 'via', 'words']


In [6]:
# Show the token -> index mapping; these indices are the column positions
# used when filling the term-frequency matrix.
print(vocab_dict)

{'Document': 0, 'This': 1, 'also': 2, 'bag': 3, 'data': 4, 'focuses': 5, 'free': 6, 'includes': 7, 'language': 8, 'natural': 9, 'of': 10, 'on': 11, 'processing': 12, 'text': 13, 'via': 14, 'words': 15}


In [7]:
import numpy as np

In [8]:
# Term-frequency matrix: one row per document, one column per vocabulary
# token; entry (row, col) is the number of times that token occurs in that
# document.
X = np.zeros((len(documents), len(vocab)), dtype=int)
for row, tokens in enumerate(document_words):
    for token in tokens:
        col = vocab_dict[token]
        X[row, col] += 1
print(X)

[[1 1 0 0 1 1 1 0 0 0 0 1 1 1 0 0]
 [0 1 1 0 0 0 0 1 1 1 0 0 1 0 0 0]
 [0 0 0 1 0 0 0 0 0 0 1 0 1 1 1 1]]


### Inverse Document Frequency

In [9]:
# Inverse document frequency: idf_j = log(N / df_j), where N is the number of
# documents and df_j is the number of documents that contain token j.
# A token present in every document therefore gets idf = log(1) = 0.
n_docs = X.shape[0]
doc_freq = (X != 0).sum(axis=0)  # per-column count of documents containing the token
idf = np.log(n_docs / doc_freq)
print(idf)

[1.09861229 0.40546511 1.09861229 1.09861229 1.09861229 1.09861229
 1.09861229 1.09861229 1.09861229 1.09861229 1.09861229 1.09861229
 0.         0.40546511 1.09861229 1.09861229]


### TFIDF

In [10]:
# TF-IDF: weight each term count by the IDF of its token. `idf` holds one
# value per column of X, so broadcasting scales every column by its own weight.
Xidf = np.multiply(X, idf)
print(Xidf)

[[1.09861229 0.40546511 0.         0.         1.09861229 1.09861229
  1.09861229 0.         0.         0.         0.         1.09861229
  0.         0.40546511 0.         0.        ]
 [0.         0.40546511 1.09861229 0.         0.         0.
  0.         1.09861229 1.09861229 1.09861229 0.         0.
  0.         0.         0.         0.        ]
 [0.         0.         0.         1.09861229 0.         0.
  0.         0.         0.         0.         1.09861229 0.
  0.         0.40546511 1.09861229 1.09861229]]


### Cosine similarity

In [11]:
# Pairwise cosine similarity between documents. For these non-negative TF-IDF
# vectors the value lies between zero (the documents share no weighted terms)
# and one (the two vectors point in the same direction, e.g. identical
# TF-IDF representations).
row_norms = np.linalg.norm(Xidf, axis=1, keepdims=True)  # L2 norm of each document row
Xnorm = Xidf / row_norms
# Rows are unit length, so their dot products are the cosines.
M = Xnorm @ Xnorm.T
print(M)

[[1.         0.02916832 0.02916832]
 [0.02916832 1.         0.        ]
 [0.02916832 0.         1.        ]]


### Word embedding and word2vec

In [12]:
# Second toy corpus (replaces the earlier `documents`): three lower-case
# sentences, each stored as a single string.
documents = [
    "pittsburgh has some excellent new restaurants",
    "boston is a city with great cuisine",
    "postgresql is a relational database management system",
]

In [13]:
# Tokenise every document on whitespace.
document_words = [doc.split() for doc in documents]

# Distinct tokens across all documents, sorted so that indices are
# deterministic. A set comprehension flattens the token lists in one pass;
# sum(lists, []) would copy the growing list on every addition.
vocab = sorted({word for words in document_words for word in words})

# Token -> position in `vocab`.
vocab_dict = {word: index for index, word in enumerate(vocab)}
print(vocab_dict, "\n")

{'a': 0, 'boston': 1, 'city': 2, 'cuisine': 3, 'database': 4, 'excellent': 5, 'great': 6, 'has': 7, 'is': 8, 'management': 9, 'new': 10, 'pittsburgh': 11, 'postgresql': 12, 'relational': 13, 'restaurants': 14, 'some': 15, 'system': 16, 'with': 17} 



In [14]:
# One-hot vector for "pittsburgh": all zeros except a 1 at the token's
# vocabulary index.
pittsburgh_index = vocab_dict["pittsburgh"]
epit = np.zeros(len(vocab))
epit[pittsburgh_index] = 1
print(epit)

[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]


### TFIDF in gensim

In [15]:
import gensim as gs
import numpy as np

In [16]:
# gensim pipeline: token lists -> Dictionary -> bag-of-words corpus -> TF-IDF model.
dictionary = gs.corpora.Dictionary(document_words)
corpus = list(map(dictionary.doc2bow, document_words))
tfidf = gs.models.TfidfModel(corpus)

# Apply the model to the corpus and collect the weighted vectors into a
# sparse matrix. It is transposed before printing so that each printed row
# is one document.
# NOTE(review): column order follows gensim's own token ids, which need not
# match `vocab_dict` - confirm before comparing columns with the NumPy version.
tfidf_corpus = tfidf[corpus]
X_tfidf = gs.matutils.corpus2csc(tfidf_corpus)
print(X_tfidf.todense().T)

[[0.40824829 0.40824829 0.40824829 0.40824829 0.40824829 0.40824829
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.        ]
 [0.         0.         0.         0.         0.         0.
  0.16073254 0.43550663 0.43550663 0.43550663 0.43550663 0.16073254
  0.43550663 0.         0.         0.         0.         0.        ]
 [0.         0.         0.         0.         0.         0.
  0.16073254 0.         0.         0.         0.         0.16073254
  0.         0.43550663 0.43550663 0.43550663 0.43550663 0.43550663]]


In [17]:
# Build a similarity index over the TF-IDF vectors of the corpus.
M = gs.similarities.MatrixSimilarity(tfidf[corpus])

# Query the index with the same TF-IDF corpus to get the document-by-document
# similarity matrix. Indexing (`M[query]`) is the documented query interface;
# gensim's docs say get_similarities() should not be called directly.
print(M[tfidf[corpus]])

[[1.        0.        0.       ]
 [0.        1.        0.0516699]
 [0.        0.0516699 1.       ]]


  if np.issubdtype(vec.dtype, np.int):
