# Week 6 Topic Modeling

In [None]:
import pandas as pd
import numpy as np
import gensim
import nltk
# Fetch NLTK's 'punkt' resource through the NLTK downloader.
# NOTE(review): nothing in the visible cells uses nltk or gensim — confirm this download is still needed.
nltk.download('punkt')

## Truncated SVD example

In [None]:
# Get TruncatedSVD: factor a small toy matrix (9 rows x 3 columns).
from sklearn.decomposition import TruncatedSVD

sample = [
    [1, 0, 0],
    [1, 1, 2],
    [1, 1, 0],
    [1, 1, 0],
    [1, 1, 0],
    [1, 0, 0],
    [1, 0, 1],
    [0, 1, 1],
    [0, 0, 1],
]

# The randomized solver starts from a random matrix; a fixed seed keeps the
# factors (including their signs) identical from run to run, in line with the
# seeded models used later in this notebook.
svd_model_sample = TruncatedSVD(n_components=2, algorithm='randomized', n_iter=15, random_state=122)
# Only the fitted attributes are needed in this cell, so fit() is enough;
# the projected rows are obtained with transform() where U is derived.
svd_model_sample.fit(sample)

# Generate Sigma: the singular values of the two retained components.
sigma = svd_model_sample.singular_values_
sigma

In [None]:
# Generate VT: the fitted right singular vectors, one row per retained
# component (the model was built with n_components=2).
VT = svd_model_sample.components_
VT

In [None]:
# Generate U: transform() projects the data onto the components, which yields
# U scaled by the singular values. Dividing each column by its singular value
# recovers U; broadcasting the division avoids building and inverting a
# diagonal matrix.
U = svd_model_sample.transform(sample) / svd_model_sample.singular_values_
U

In [None]:
# Relationship between words and topics.
# sigma is 1-D, so the product broadcasts across U's columns and scales each
# topic column by its singular value (same result as U · diag(sigma)).
# NOTE(review): reading the rows of `sample` as words follows the original label — confirm.
U*sigma

In [None]:
# Relationship between topics and documents.
# diag(sigma) · VT scales each row of VT (one row per topic) by its singular value.
# NOTE(review): reading the columns of `sample` as documents follows the original label — confirm.
np.diag(sigma).dot(VT)

## LSA with fetch_20newsgroups and scikit-learn

In [None]:
# Load data
import pandas as pd
import numpy as np
from sklearn.datasets import fetch_20newsgroups

# Fetch the 20 Newsgroups corpus with a fixed shuffle seed; `remove` asks
# scikit-learn to strip headers, footers and quoted replies from each post.
# NOTE(review): no `subset` is passed, so the library's default split is used — confirm it is the intended one.
dataset = fetch_20newsgroups(shuffle=True, random_state=1, remove=('headers', 'footers', 'quotes'))
# The post texts; later cells index into this and wrap it in a DataFrame.
documents = dataset.data

In [None]:
# An example document: display the first post as loaded, before any cleaning.
documents[0]

In [None]:
# Data Cleansing
news_df = pd.DataFrame({'document': documents})

# 1) Replace every non-letter character with a space. `regex=True` is
#    required: since pandas 2.0, `str.replace` treats the pattern as a literal
#    string by default, which would leave the text untouched.
letters_only = news_df['document'].str.replace(r"[^a-zA-Z]", " ", regex=True)

# 2) Drop short tokens (3 characters or fewer), then 3) lowercase everything.
news_df['clean_doc'] = letters_only.apply(
    lambda text: ' '.join(word for word in text.split() if len(word) > 3)
).str.lower()

In [None]:
# An example of a preprocessed document: the first post after cleaning.
news_df['clean_doc'][0]

In [None]:
# Build TF-IDF Matrix
from sklearn.feature_extraction.text import TfidfVectorizer

# Vectorizer settings (see the TfidfVectorizer documentation for exact semantics):
#   stop_words='english' - use the built-in English stop-word list
#   max_features=1000    - cap the vocabulary at 1000 terms
#   max_df=0.5           - ignore terms that appear in more than half of the documents
#   smooth_idf=True      - apply IDF smoothing
vectorizer = TfidfVectorizer(stop_words='english', 
max_features= 1000,
max_df = 0.5, 
smooth_idf=True)

# Learn the vocabulary and IDF weights from the cleaned text and build the
# document-term matrix (one row per document).
X = vectorizer.fit_transform(news_df['clean_doc'])
# Display the matrix dimensions: (number of documents, number of terms).
X.shape

In [None]:
# Get TruncatedSVD
from sklearn.decomposition import TruncatedSVD

# LSA: factor the TF-IDF matrix into 20 latent topics (seeded for reproducibility).
svd_model = TruncatedSVD(n_components=20, algorithm='randomized', n_iter=100, random_state=122)
svd_model.fit(X)

# Vocabulary in the column order of X. `get_feature_names()` was removed in
# scikit-learn 1.2; prefer `get_feature_names_out()` and fall back only on
# releases that predate it. `.tolist()` keeps `terms` a plain list of str.
try:
    terms = vectorizer.get_feature_names_out().tolist()
except AttributeError:
    terms = vectorizer.get_feature_names()

def get_topics(components, feature_names, n=5, decimals=5):
    """Print the ``n`` highest-weighted terms of every topic.

    Args:
        components: 2-D NumPy array with one row of term weights per topic
            (for example a fitted model's ``components_``).
        feature_names: Sequence mapping a column index to its term.
        n: Number of top terms to list per topic.
        decimals: Number of decimal places kept for each printed weight.

    Returns:
        None. One line per topic is written to stdout, e.g.
        ``Topic 1: [('term', 0.12345), ...]``.
    """
    for idx, topic in enumerate(components, start=1):
        # argsort is ascending, so walk the last ``n`` positions backwards
        # to get the largest weights first.
        top_indices = topic.argsort()[:-n - 1:-1]
        # Cast to built-in str/float so the printed list does not show NumPy
        # scalar wrappers (``np.str_(...)`` / ``np.float64(...)``) on NumPy 2.x.
        top_terms = [
            (str(feature_names[i]), round(float(topic[i]), decimals))
            for i in top_indices
        ]
        print(f"Topic {idx}:", top_terms)

# Print the top terms of each LSA topic (one topic per row of svd_model.components_).
get_topics(svd_model.components_,terms)

## LDA with fetch_20newsgroups and scikit-learn

In [None]:
from sklearn.decomposition import LatentDirichletAllocation
# 20-topic LDA using the 'online' learning method, seeded for reproducibility.
# NOTE(review): max_iter=1 limits training to a single iteration, presumably to
# keep the cell fast — consider raising it if the topics look poorly separated.
lda_model = LatentDirichletAllocation(n_components=20, learning_method='online', random_state=123, max_iter=1)

In [None]:
# Fit LDA on the TF-IDF matrix and display the transformed output for X.
# NOTE(review): LDA is usually fit on raw term counts (e.g. CountVectorizer output) rather than TF-IDF weights — confirm this is intended.
lda_model.fit_transform(X)

In [None]:
# Inspect the fitted topic-term matrix and its shape.
print(lda_model.components_)
print(lda_model.components_.shape) 

In [None]:
# Vocabulary in the column order of X. `get_feature_names()` was removed in
# scikit-learn 1.2; prefer `get_feature_names_out()` and fall back only on
# releases that predate it. `.tolist()` keeps `terms` a plain list of str.
try:
    terms = vectorizer.get_feature_names_out().tolist()
except AttributeError:
    terms = vectorizer.get_feature_names()

def get_topics(components, feature_names, n=5, decimals=2):
    """Print the ``n`` highest-weighted terms of every topic.

    Args:
        components: 2-D NumPy array with one row of term weights per topic
            (for example a fitted model's ``components_``).
        feature_names: Sequence mapping a column index to its term.
        n: Number of top terms to list per topic.
        decimals: Number of decimal places kept for each printed weight.

    Returns:
        None. One line per topic is written to stdout, e.g.
        ``Topic 1: [('term', 12.34), ...]``.
    """
    for idx, topic in enumerate(components, start=1):
        # argsort is ascending, so walk the last ``n`` positions backwards
        # to get the largest weights first.
        top_indices = topic.argsort()[:-n - 1:-1]
        # Cast to built-in str/float so the printed list does not show NumPy
        # scalar wrappers (``np.str_(...)`` / ``np.float64(...)``) on NumPy 2.x.
        top_terms = [
            (str(feature_names[i]), round(float(topic[i]), decimals))
            for i in top_indices
        ]
        print(f"Topic {idx}:", top_terms)
        
# Print the top terms of each LDA topic (one topic per row of lda_model.components_).
get_topics(lda_model.components_,terms)