In [1]:
# Attach Google Drive to the Colab runtime; the corpus is read from it below.
from google.colab import drive

drive.mount(mountpoint='/content/drive')

Mounted at /content/drive


In [2]:
import os
import pandas as pd
import glob 
from nltk.tokenize import RegexpTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

def load_data(genre, base_dir="/content/drive/My Drive/pg_data", encoding="utf-8"):
  """Read every ``.txt`` file of one genre folder into memory.

  Args:
    genre: Name of the sub-folder of ``base_dir`` whose text files are read.
    base_dir: Root directory of the corpus. The default is the Drive folder
      this notebook has always read from.
    encoding: Text encoding used to decode the files.

  Returns:
    A list with one whitespace-stripped string per file, ordered by file
    path. The list is empty when no file matches.

  Raises:
    UnicodeDecodeError: If a file cannot be decoded with ``encoding``.
  """
  pattern = os.path.join(base_dir, genre, "*.txt")

  # glob yields paths in arbitrary order; sorting keeps the document order
  # (and therefore the row order of anything built from it) stable across runs.
  paths = sorted(glob.glob(pattern))

  documents_list = []
  for path in paths:
    # Explicit encoding so decoding does not depend on the machine's locale.
    with open(path, "r", encoding=encoding) as doc:
      documents_list.append(doc.read().strip())
  return documents_list

In [6]:
documents_list = load_data("fiction")

# Initialize regex tokenizer
tokenizer = RegexpTokenizer(r'\w+')  # \w+ matches each run of one or more word characters

# Vectorize documents using TF-IDF.
# token_pattern is not used once a custom tokenizer is supplied; passing None
# states that explicitly and avoids scikit-learn's "token_pattern will not be
# used" warning. The resulting features are unchanged.
# NOTE(review): single-character tokens survive this tokenizer -- the topic
# listing printed in this notebook contains 's', 't' and 'd'. Consider
# filtering them if they are noise.
tfidf = TfidfVectorizer(lowercase=True,
                        stop_words='english',
                        ngram_range=(1, 1),
                        tokenizer=tokenizer.tokenize,
                        token_pattern=None)

# Fit and Transform the documents
train_data = tfidf.fit_transform(documents_list)

In [28]:
# Number of latent topics (SVD components) to keep
num_components = 5

# Truncated SVD model used for the LSA decomposition
lsa = TruncatedSVD(
    n_components=num_components,
    n_iter=100,
    random_state=42,
)

# fit_transform gives the document-side projection, i.e. U * SIGMA
U_SIGMA = lsa.fit_transform(train_data)

# Singular values, and the components transposed so that each row is a term
Sigma = lsa.singular_values_
V_transpose = lsa.components_.transpose()

# Divide the scaling back out to recover U
U = U_SIGMA / Sigma

[ 0.0023418  -0.00209438 -0.00485932 -0.00469996 -0.00266133]
Topic 0:  ['s', 'said', 'man', 't', 'time']
Topic 1:  ['artagnan', 'athos', 'porthos', 'd', 'said']
Topic 2:  ['fathom', 'renaldo', 'count', 'ferdinand', 'melvil']
Topic 3:  ['mr', 'lydgate', 'dorothea', 'casaubon', 'bulstrode']
Topic 4:  ['raskolnikov', 'sonia', 't', 'razumihin', 'dounia']


In [24]:
# doc2topic from matrix U: one row of U per document, in corpus order.
# NOTE(review): the keys are the complete document texts, so identical
# documents share a single entry and the saved JSON repeats the whole corpus --
# consider keying by file name instead.
doc2topic = dict(zip(documents_list, U.tolist()))

# term2topic from matrix V: the vocabulary index of each term selects its row
term2topic = {
    word: V_transpose[row].tolist()
    for word, row in tfidf.vocabulary_.items()
}

In [25]:
import json

# save mappings
# ensure_ascii=False writes non-ASCII characters verbatim, so the files are
# opened explicitly as UTF-8 instead of relying on the locale's default
# encoding (which could fail or produce non-UTF-8 JSON).
with open("doc2topic.json", mode="w", encoding="utf-8") as fp:
    json.dump(doc2topic, fp, ensure_ascii=False, indent=4)

with open("term2topic.json", mode="w", encoding="utf-8") as fp:
    json.dump(term2topic, fp, ensure_ascii=False, indent=4)