In [None]:
import numpy as np
import pandas as pd

import nltk
from nltk.corpus import stopwords
from nltk.corpus import inaugural

### Read the inaugural corpus into a dataframe with 2 columns: docid and doc
**docid = simple index**

**doc = raw text**

In [None]:
# Build the corpus frame in a single construction: one record per inaugural
# address, with docid being the position of the file in inaugural.fileids().
# (Growing a frame with DataFrame.append inside a loop is quadratic, and
# DataFrame.append no longer exists in pandas >= 2.0.)
df = pd.DataFrame(
    [
        {'docid': i, 'doc': inaugural.raw(fileid)}
        for i, fileid in enumerate(inaugural.fileids())
    ],
    columns=['docid', 'doc'],
)

In [None]:
# Preview the first rows to confirm the frame holds the docid and doc columns.
df.head()

In [None]:
# Start from NLTK's English stop-word list and extend it with extra
# punctuation and numeric tokens that should also be ignored.
stop_words = set(stopwords.words('english'))
new_stops = [
    '"', '(', ')', ',', '-', '--', '.', '."', '14th', ':', ';', "'s",
    '!', '";', '"?', '?"', '$', "'", '),', ',"', '.)', '', '...', '....',
    '[', '?', ']', '000',
]
stop_words |= set(new_stops)

### scikit-learn contains a very convenient TF-IDF function
see https://scikit-learn.org/stable/modules/feature_extraction.html#text-feature-extraction

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize TfidfVectorizer with the custom stop words. Recent scikit-learn
# validates stop_words as 'english', a list, or None, so the set is converted
# to a list (sorted, so that inspecting it gives a stable order).
tfidf_vectorizer = TfidfVectorizer(use_idf=True, stop_words=sorted(stop_words))

# fit_transform learns the vocabulary and IDF weights from the corpus and
# returns the tf-idf transformed corpus as a document-by-term matrix
X = tfidf_vectorizer.fit_transform(df['doc'])

In [None]:
# shape? -> (number of documents, number of retained terms)
X.shape


In [None]:
# stop_words is an attribute (not a method); read it to inspect the stop words
# the vectorizer was configured with
tfidf_vectorizer.stop_words

In [None]:
# call get_feature_names_out() to see retained terms
# (get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2)
tfidf_vectorizer.get_feature_names_out()

### Compute the truncated SVD
see https://scikit-learn.org/stable/modules/decomposition.html#truncated-singular-value-decomposition-and-latent-semantic-analysis

In [None]:
from sklearn.decomposition import TruncatedSVD

# Keep 5 components; the fixed random_state makes repeated fits reproducible.
svd = TruncatedSVD(
    n_components=5,
    n_iter=7,
    random_state=42,
)

In [None]:
# Fit the truncated SVD on the TF-IDF document-by-term matrix X.
svd.fit(X)

**With the SVD computed, we can now extract the factors**

`svd.transform(X)` produces $U * \Sigma$

`svd.components_` generates $V^T$

`svd.singular_values_` generates $\Sigma$ in vector form

**With some math we can get $U$, along with $\Sigma$ and $V^T$**

In [None]:
# fit_transform yields U * Sigma; dividing each column by its singular value
# (broadcast across rows) recovers U.
U_sigma = svd.fit_transform(X)
S = svd.singular_values_   # Sigma as a 1-D vector
VT = svd.components_       # V^T, one row per component
U = U_sigma / S

### Looking at $\sigma_i, i=1,\ldots,k$, we can see the diminishing signal as $i$ increases

In [None]:
import matplotlib.pyplot as plt

# Plot each singular value against its 1-based component index; labels and a
# title let the figure stand on its own when the notebook is skimmed.
fig, ax = plt.subplots()
component_ids = range(1, len(S) + 1)
ax.plot(component_ids, S, 'ro')
ax.set_xticks(component_ids)
ax.set_xlabel('component index $i$')
ax.set_ylabel(r'singular value $\sigma_i$')
ax.set_title('Singular values of the truncated SVD')
plt.show()

### To understand the meaning of each _feature vector_ we can look at the terms with largest positive values
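`argsort()` applied to an array returns the indices that would sort it in ascending order, so negating the values first lists the indices of the largest entries first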

In [None]:
# Negate so the ascending argsort lists the largest weights first, then keep
# the column indices of the 5 largest entries of the first component.
(-VT[0]).argsort()[:5]

**for each singular vector (feature vector), find the index of the 5 largest terms, and extract the corresponding terms**

In [None]:
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# returns the same vocabulary, indexed by column of the TF-IDF matrix.
terms = tfidf_vectorizer.get_feature_names_out()

# Iterate over however many components were fitted (rows of VT) rather than a
# hard-coded count, printing the 5 highest-weighted terms of each.
for k, component in enumerate(VT):
    for i in (-component).argsort()[:5]:
        print(k, terms[i])

---
### Let's look at performing IR with a query

In [None]:
# Free-text query to match against the corpus documents.
query = 'peace and prosperity and love and care'

**We need to transform the query using the corpus tfidf**

`sklearn.feature_extraction.text.TfidfVectorizer` has a method `transform()` that does exactly this

In [None]:
# transform() (not fit_transform) reuses the vectorizer already fitted on the
# corpus; it expects an iterable of documents, hence the one-element list.
query_xform = tfidf_vectorizer.transform([query])

In [None]:
# Show the TF-IDF representation of the query.
print(query_xform)

**Now we project the transformed query into the document-by-feature space**

Let's invert $\Sigma$ by first converting the vector to a diagonal matrix with `np.diag()`
and then inverting it using `np.linalg.inv()`

In [None]:
# Expand the singular-value vector into a diagonal matrix, then invert it.
Sigma = np.diag(S)
Sinv = np.linalg.inv(Sigma)

In [None]:
# Inspect the inverse of the diagonal singular-value matrix.
Sinv

In [None]:
# Sanity check: the diagonal of Sinv should equal the reciprocals of the
# singular values. Computed from S rather than from a number copied out of one
# run's output, so the check stays valid when the notebook is re-run.
1 / S

In [None]:
# Sigma times its inverse should come out as the identity matrix.
np.diag(S) @ Sinv

In [None]:
# Projection matrix V * Sigma^{-1}: maps a term-space vector into the
# reduced feature space.
Z = VT.T @ Sinv

In [None]:
# Inspect the projection matrix (one row per term, one column per component).
Z

In [None]:
# Densify the query's TF-IDF row and project it into the reduced feature space.
qk = query_xform.toarray() @ Z

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Sanity check: a (non-zero) vector compared with itself should score 1.
cosine_similarity(qk,qk)

In [None]:
# First row of U: the reduced-space representation of the first document.
U[0]

In [None]:
# Find the document whose row of U is most similar (by cosine) to the
# projected query qk.
#   maxindex: index of the best match so far, or -1 if no document has scored
#             above the starting threshold of 0.
#   maxcos:   best cosine similarity so far, kept as a plain float.
maxindex = -1
maxcos = 0.0

# A single call scores every document; the result has shape (n_docs, 1), so it
# is flattened to one scalar per document. (Previously each row was scored in
# its own call, which left maxcos holding a 1x1 array, and an np.nditer was
# created but never used.)
similarities = cosine_similarity(U, qk).ravel()

for i, cos in enumerate(similarities):
    print(i, cos, maxcos, maxindex)
    if cos > maxcos:
        maxcos = float(cos)
        maxindex = i

In [None]:
# Report the best cosine similarity and the index of the document achieving it.
print(maxcos, maxindex)

In [None]:
# None disables column-width truncation; the old -1 sentinel was deprecated in
# pandas 1.0 and is rejected by current pandas.
pd.set_option('display.max_colwidth', None)

# Show the best-matching document found by the search loop, rather than a
# docid hard-coded from one particular run.
print(df[df['docid'] == maxindex])

In [None]:
# Rich display of the best-matching document. The original `df[]` was an
# unfinished indexing expression (a SyntaxError); selecting the row chosen by
# the search loop is assumed to be the intent.
df[df['docid'] == maxindex]