In [8]:
import numpy as np
# A bare zip() only displays the lazy iterator object, not the (word, number) pairs.
# np.random.rand is unseeded here, so the numbers differ on every run.
zip('cat dog apple lion NYC love'.split(), np.random.rand(6))

<zip at 0x7fdd70bcc0c0>

In [9]:
# Wrapping the zip in list() materializes the (word, random number) pairs so they can be displayed.
list(zip('cat dog apple lion NYC love'.split(), np.random.rand(6)))

[('cat', 0.3180158844946426),
 ('dog', 0.3757127774440463),
 ('apple', 0.5332539392393864),
 ('lion', 0.9083701256434001),
 ('NYC', 0.7216707434632881),
 ('love', 0.6602352366798563)]

# 4.2 Latent Semantic Analysis

In [10]:
# NOTE(review): the recorded output of this cell is
# "ModuleNotFoundError: No module named 'nlpia.book'", so the installed nlpia
# presumably lacks the book-examples subpackage — confirm the installation.
from nlpia.book.examples.ch04_catdog_lsa_3x6x16 import word_topic_vectors
# Display the transposed word-topic vectors rounded to one decimal place.
word_topic_vectors.T.round(1)

ModuleNotFoundError: No module named 'nlpia.book'

In [None]:
from nlpia.book.examples.ch04_catdog_lsa_sorted import lsa_models, prettify_tdm
# lsa_models() is unpacked into two results; presumably one for bag-of-words
# and one for TF-IDF, judging by the names — confirm.
bow_svd, tfidf_svd = lsa_models()
# bow_svd is expanded into keyword arguments, so it must be a mapping.
prettify_tdm(**bow_svd)

In [None]:
# Pull the 'tdm' entry out of bow_svd; presumably the term-document matrix
# as a DataFrame (later cells use its .index, .shape and .values) — confirm.
tdm = bow_svd['tdm']
tdm

# Singular Value Decomposition

In [None]:
import numpy as np
import pandas as pd

# Factor tdm into U, the singular values s, and Vt.
U, s, Vt = np.linalg.svd(tdm)

# Show U with its rows labelled by tdm's index, rounded to two decimals.
pd.DataFrame(U, index=tdm.index).round(2)

In [None]:
# Singular values returned by the SVD, rounded to one decimal place.
s.round(1)

In [None]:
# Build the rectangular singular-value matrix S so that U.dot(S).dot(Vt)
# reproduces tdm: zeros everywhere except the singular values on the main
# diagonal. Without the fill, S is all zeros and every reconstruction made
# from it is the zero matrix.
S = np.zeros((len(U), len(Vt)))
np.fill_diagonal(S, s)
S

In [None]:
# Root-mean-square error between tdm and its reconstruction as dimensions are
# dropped one at a time, starting with the smallest singular value
# (np.linalg.svd returns them in descending order).
#
# The truncated matrix is built from `s` inside this cell, so the cell can be
# re-run safely and does not depend on (or destroy) the contents of `S`.
S_trunc = np.zeros((len(U), len(Vt)))
np.fill_diagonal(S_trunc, s)

err = []
for numdim in range(len(s), 0, -1):
    # Discard the smallest remaining singular value, then rebuild the matrix.
    S_trunc[numdim - 1, numdim - 1] = 0
    reconstructed_tdm = U.dot(S_trunc).dot(Vt)
    # np.prod replaces np.product, which was removed in NumPy 2.0.
    err.append(np.sqrt(((reconstructed_tdm - tdm).values.flatten() ** 2).sum() / np.prod(tdm.shape)))
np.array(err).round(2)

# 4.4 Principal Component Analysis

## Load SMS Messages

In [None]:
import pandas as pd
from nlpia.data.loaders import get_data

sms = get_data('sms-spam')

# Label each message 'sms<row number>' and append '!' repeated by its spam
# value (presumably a 0/1 flag, so spam rows end in a single '!' — confirm).
index = ['sms{}{}'.format(row_num, '!' * spam_flag)
         for row_num, spam_flag in enumerate(sms.spam)]
index

In [None]:
# Replace the row labels of sms with the labels built in `index`.
sms.index = index
sms.head(6)

## Calculate TF-IDF vectors for each message

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize.casual import casual_tokenize

# Use NLTK's casual_tokenize in place of the vectorizer's default tokenizer.
tfidf = TfidfVectorizer(tokenizer=casual_tokenize)
# Fit the vocabulary on the message texts and compute their TF-IDF vectors in one step.
tfidf_docs = tfidf.fit_transform(raw_documents=sms.text)
tfidf_docs

In [None]:
# Convert to a dense array (fit_transform presumably returned a sparse matrix — confirm).
# NOTE(review): the name is rebound, so running this cell a second time calls
# .toarray() on the already-converted result.
tfidf_docs = tfidf_docs.toarray()

In [None]:
# Number of entries in the fitted vocabulary.
len(tfidf.vocabulary_)

In [None]:
tfidf_docs = pd.DataFrame(tfidf_docs)
# Center the data: subtract each column's mean from that column.
tfidf_docs = tfidf_docs - tfidf_docs.mean()
tfidf_docs.shape

In [None]:
# Sum of the spam column; presumably the number of spam messages if spam is a 0/1 flag — confirm.
sms.spam.sum()

### We have an imbalanced dataset and more features (words) than messages. Therefore, we are more likely to overfit, and our spam filter will depend only on spammy words appearing in the spammy messages. But spammers could simply use synonyms.

In [None]:
from sklearn.decomposition import PCA

# Fit a 16-component PCA on the centered TF-IDF vectors (fit() returns the estimator itself).
pca = PCA(n_components=16).fit(tfidf_docs)

# One column label per component: 'topic0' ... 'topic15'.
columns = ['topic{}'.format(topic_num) for topic_num in range(pca.n_components)]

# Project every message onto the components and label the rows with `index`.
pca_topic_vectors = pd.DataFrame(
    pca.transform(tfidf_docs),
    columns=columns,
    index=index,
)

In [None]:
# First six rows of the topic vectors, rounded to three decimals.
pca_topic_vectors.round(3).head(6)

In [None]:
# Components learned by the fitted PCA model (in scikit-learn, one row per component).
pca.components_

In [None]:
# TfidfVectorizer stores the vocabulary as a dictionary that maps each term to an index number
tfidf.vocabulary_

In [None]:
# Only the index numbers (the dictionary's values) from the vocabulary mapping.
tfidf.vocabulary_.values()

In [None]:
# Pair each index number with its term; a bare zip only displays the lazy iterator object.
zip(tfidf.vocabulary_.values(), tfidf.vocabulary_.keys())

In [None]:
# Sorting the (index number, term) pairs orders them by index number.
sorted(zip(tfidf.vocabulary_.values(), tfidf.vocabulary_.keys()))

In [None]:
# Order the vocabulary by index number, then unzip the pairs into two
# parallel tuples: the index numbers and their terms.
column_nums, terms = zip(*sorted(
    (col_num, term) for term, col_num in tfidf.vocabulary_.items()
))