# Latent Semantic Analysis

## Singular Value Decomposition
(4.4.2)

- ~5,000 SMS messages (4,837 in this dataset), each labelled spam or ham
- 16 topics

In [None]:
# Importing dependencies

import pandas as pd
import numpy as np

from nltk.tokenize.casual import casual_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer

# from nlpia.data.loaders import get_data

In [None]:
# Load the SMS spam dataset straight from the nlpia GitHub repository
# (this replaces the outdated `nlpia.data.loaders.get_data('sms-spam')` helper).
pd.set_option("display.width", 120)  # wider console output; display only
url = "https://raw.githubusercontent.com/totalgood/nlpia/master/src/nlpia/data/sms-spam.csv"
sms = pd.read_csv(url)

# Label every row "sms<N>" followed by "!" repeated `spam` times, so that
# positive (spam) instances stand out wherever these labels are shown.
# (Assumes `spam` holds 0/1 integers - TODO confirm against the CSV.)
index = [f"sms{row_num}{'!' * is_spam}" for row_num, is_spam in enumerate(sms.spam)]
sms.index = index
sms["text"].head(6)

In [None]:
# Build the TF-IDF document-term matrix, tokenizing with NLTK's casual
# tokenizer, and convert the result to a dense array via `.toarray()`.
tfidf = TfidfVectorizer(tokenizer=casual_tokenize)
tfidf_sparse = tfidf.fit_transform(sms.text)
tfidf_docs = tfidf_sparse.toarray()

# Report the size of the corpus and of the learned vocabulary.
n_docs, vocab_size = len(tfidf_docs), len(tfidf.vocabulary_)
print("number of instances:\t", n_docs)
print("size of the vocabulary:\t", vocab_size)

In [None]:
# Peek at three 10-row windows of the TF-IDF array (the first rows, the last
# rows, and rows 2000-2009), each followed by a separator line.
separator = "=" * 80
for window in (slice(None, 10), slice(-10, None), slice(2000, 2010)):
    print(tfidf_docs[window])
    print(separator)

# Length of a single row (one entry per column) and the number of rows.
print(len(tfidf_docs[2000]))
print(len(tfidf_docs))

In [None]:
# Wrap the TF-IDF array in a DataFrame and centre every column by subtracting
# its mean. This is a simple normalisation: not the best alternative, but
# enough in this case.
tfidf_docs = pd.DataFrame(tfidf_docs)
column_means = tfidf_docs.mean()
tfidf_docs = tfidf_docs.sub(column_means, axis="columns")
tfidf_docs.shape

In [None]:
# Same three 10-row windows as before, now taken from the centred DataFrame:
# first rows, last rows, and rows 2000-2009, each followed by a separator.
separator = "=" * 80
for rows in (slice(None, 10), slice(-10, None), slice(2000, 2010)):
    print(tfidf_docs.iloc[rows])
    print(separator)

In [None]:
# Sum of the `spam` column, i.e. the number of messages flagged as spam
# (assuming the flag is 0/1).
sms["spam"].sum()

So, in summary, this is what we have:

- 4,837 SMS messages
- 9,232 different 1-grams
- 638 spam messages (13%)
- About 6.6:1 ham-to-spam ratio (4,199 ham vs. 638 spam)

By consolidating the dimensions (words) into a smaller number of dimensions (topics), the NLP
pipeline becomes more “general”.

## Principal Component Analysis on SMS

In [None]:
from sklearn.decomposition import PCA, TruncatedSVD

# `n_components` is the crucial parameter: it tells PCA how many "topics"
# (principal components) to keep.
# `random_state` is fixed because, for a matrix this large, PCA's default
# solver choice is expected to be the randomized one; without a seed the topic
# vectors could change from one run to the next.
pca = PCA(n_components=16, random_state=42)

# Fit the model and project every document onto the topics in one step.
pca_topic_vectors = pca.fit_transform(tfidf_docs)

# Label the topics "topic0" ... "topic15" and the rows with the SMS labels
# ("!" marks spam) so the table is readable.
columns = ['topic{}'.format(i) for i in range(pca.n_components)]
pca_topic_vectors = pd.DataFrame(pca_topic_vectors, columns=columns, index=index)
pca_topic_vectors.round(3).head(6)

In [None]:
# Inspect the fitted vocabulary: a mapping from each term to a unique integer.
# That integer is just the term's index number (its column), not a count.
tfidf.vocabulary_

In [None]:
# Order the vocabulary by column index (the dict values), not by term
# frequency, and split it into two parallel tuples: the indices and the
# terms that own them.
index_term_pairs = sorted((col, term) for term, col in tfidf.vocabulary_.items())
column_nums, terms = zip(*index_term_pairs)
terms

In [None]:
# Show the (unique) index number the vocabulary assigns to a few sample tokens.
for token in ("!", '"', "between", ":)"):
    print(f"'{token}':\t{tfidf.vocabulary_[token]}")

In [None]:
# NOTE: this cell recomputes the same (column index, term) ordering that was
# already built a few cells above; it is kept so the cell still runs on its own.
# The vocabulary is ordered by column index (the dict values), not by count.
by_column = sorted(tfidf.vocabulary_.items(), key=lambda item: item[1])
terms, column_nums = zip(*by_column)
terms

In [None]:
# Print the first ten (column index, term) pairs of the sorted vocabulary.
for position in range(10):
    col_idx, term = column_nums[position], terms[position]
    print(col_idx, term)

In [None]:
# Topic-term weight table: one row per PCA topic, one column per vocabulary
# term (`terms` is ordered by column index, matching `pca.components_`).
# The row labels are derived from the fitted components instead of a
# hard-coded 16, so the cell keeps working if the number of topics changes.
topic_labels = ['topic{}'.format(i) for i in range(len(pca.components_))]
weights = pd.DataFrame(pca.components_, columns=terms, index=topic_labels)
pd.options.display.max_columns = 8  # keep the very wide table readable
weights.head(5).round(3)

Checking the topic values for some _typical_ spam words

In [None]:
# Pull out the weight columns of a handful of typical "deal"/spam tokens,
# round them to three decimals and scale by 100 so they are easier to compare.
pd.set_option("display.max_columns", 12)
deal_terms = '! ;) :) half off free crazy deal only $ 80 %'.split()
deals = weights[deal_terms].round(3).mul(100)
deals

Could you identify "pro-deal" or "anti-deal" topics?

In [None]:
# Add up the "deal" term weights of every topic: a large positive total hints
# at a pro-deal topic, a negative total at an anti-deal one.
# Observation recorded with this notebook (may differ between runs/versions):
#   - Topics 4, 8, and 9 appear to all contain positive “deal” topic sentiment
#   - Topics 0, 3, 5, and 10 appear to be “anti-deal” topics
deals.sum(axis=1)

## Truncated SVD for SMS message semantic analysis

(Ideal for sparse matrices $\rightarrow$ better for large datasets)

In [None]:
# 16 topics.
# n_iter=100: iterate through the data 100 times (default is 5).
# random_state is fixed because the default TruncatedSVD solver is randomized;
# without a seed the topic vectors may differ from run to run.
svd = TruncatedSVD(n_components=16, n_iter=100, random_state=42)

# Column labels are derived from this model's own topic count rather than
# borrowed from the PCA cell, so they always match the output width.
svd_topic_names = ['topic{}'.format(i) for i in range(svd.n_components)]

# Decompose the (centred) TF-IDF vectors and transform them into topic vectors.
svd_topic_vectors = pd.DataFrame(
    svd.fit_transform(tfidf_docs.values),
    columns=svd_topic_names,
    index=index,
)

# Because the TF-IDF matrix was mean-centred beforehand, these should closely
# match the PCA topic vectors (possibly with flipped signs on some topics).
svd_topic_vectors.round(3).head(6)

In [None]:
# 2 topics (stated explicitly instead of relying on the library default).
# n_iter=100: iterate through the data 100 times (default is 5).
# random_state is fixed because the default TruncatedSVD solver is randomized;
# without a seed the topic vectors may differ from run to run.
svd = TruncatedSVD(n_components=2, n_iter=100, random_state=42)

# Column labels follow the model's topic count: "topic0", "topic1".
svd_topic_names = ['topic{}'.format(i) for i in range(svd.n_components)]

# Decompose the (centred) TF-IDF vectors and transform them into topic vectors.
# NOTE: this rebinds `svd` and `svd_topic_vectors`, so the cells below work
# with this 2-topic representation, not the 16-topic one.
svd_topic_vectors = pd.DataFrame(
    svd.fit_transform(tfidf_docs.values),
    columns=svd_topic_names,
    index=index,
)

# With only two topics these vectors are NOT the same as the 16-topic PCA
# vectors shown earlier; they are a much coarser 2-D summary of each message.
svd_topic_vectors.round(3).head(6)

Computing the cosine similarity **over topic vectors** to see how close (or far) the vectors are

In [None]:
# Scale every topic vector (row) to unit length (L2 norm). Once the rows are
# unit vectors, their dot product equals their cosine similarity.
row_norms = np.linalg.norm(svd_topic_vectors, axis=1)
svd_topic_vectors = svd_topic_vectors.div(row_norms, axis=0)

# Pairwise cosine similarity between the first ten messages.
first_ten = svd_topic_vectors.iloc[:10]
(first_ten @ first_ten.T).round(1)
# Let's analyse columns sms0 and sms2!

### Homework:

1. Build a Naive Bayes classifier using LSA vectors of different dimensions (e.g., 2, 4, 8, 16, 32).
2. Build a search engine (semantic search) by integrating our keyword-based retrieval system with this representation.