In [2]:
import glob
import os

import numpy as np
import pandas as pd
from nltk.tokenize import RegexpTokenizer
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer

def load_data(genre):
    """Read every ``.txt`` document of one genre into memory.

    Files are looked up in ``../Dataset/<genre>_docs/`` relative to the
    current working directory.

    Parameters
    ----------
    genre : str
        Genre name used to build the directory name ``<genre>_docs``.

    Returns
    -------
    list of str
        The UTF-8 decoded contents of each file with leading and trailing
        whitespace stripped, ordered by file path. Empty when the directory
        does not exist or holds no ``.txt`` files.
    """
    # Build the pattern with os.path only: the file never imports pathlib,
    # so relying on `Path` here raises NameError on a fresh kernel.
    pattern = os.path.join("..", "Dataset", genre + "_docs", "*.txt")

    # glob returns matches in arbitrary, filesystem-dependent order; sort so
    # the position of each document is reproducible across runs and machines.
    documents_list = []
    for file_path in sorted(glob.glob(pattern)):
        with open(file_path, "r", encoding="utf-8") as doc:
            documents_list.append(doc.read().strip())
    return documents_list

In [None]:
import json

def save_embeddings(term2topic, doc2topic, genre, k):
    """Write the document and term embeddings to JSON files.

    Two files are created in the current working directory:
    ``doc2topic_<genre>_<k>.json`` followed by ``term2topic_<genre>_<k>.json``.
    Both are UTF-8 encoded, indented by four spaces, and keep non-ASCII
    characters unescaped.

    Parameters
    ----------
    term2topic : dict
        JSON-serialisable mapping written to the ``term2topic`` file.
    doc2topic : dict
        JSON-serialisable mapping written to the ``doc2topic`` file.
    genre : str
        Genre name embedded in both file names.
    k : int
        Number of topics, embedded in both file names.
    """
    print("Saving embeddings ... ")

    # Both files share the same "_<genre>_<k>.json" tail.
    suffix = "_" + genre + "_" + str(k) + ".json"
    outputs = (("doc2topic", doc2topic), ("term2topic", term2topic))
    for prefix, mapping in outputs:
        with open(prefix + suffix, mode="w", encoding="utf-8") as out_file:
            json.dump(mapping, out_file, ensure_ascii=False, indent=4)

In [None]:
def LSA(text_genre, k):
    """Fit a k-component LSA model on one genre and save its embeddings.

    The documents returned by ``load_data(text_genre)`` are vectorised with
    TF-IDF (lower-cased, English stop words removed, unigrams, word-character
    tokens) and factorised with ``TruncatedSVD``. The resulting document and
    term coordinates are passed to ``save_embeddings``.

    Parameters
    ----------
    text_genre : str
        Genre name forwarded to ``load_data`` and ``save_embeddings``.
    k : int
        Number of SVD components (topics).

    Returns
    -------
    None
        The embeddings are handed to ``save_embeddings``; nothing is returned.

    Raises
    ------
    ValueError
        If no documents are found for ``text_genre``.
    """
    print("Training LSA model for genre "+text_genre+" and k="+str(k))
    documents_list = load_data(text_genre)
    if not documents_list:
        # Fail early with a message naming the genre instead of the
        # vectorizer's generic "empty vocabulary" ValueError.
        raise ValueError(
            "No documents found for genre '" + text_genre + "'; cannot train LSA model"
        )

    # Tokens are maximal runs of word characters.
    tokenizer = RegexpTokenizer(r'\w+')

    # token_pattern is ignored whenever a custom tokenizer is supplied;
    # passing None states that explicitly and avoids the UserWarning
    # scikit-learn emits otherwise.
    tfidf = TfidfVectorizer(lowercase=True,
                            stop_words='english',
                            ngram_range=(1, 1),
                            tokenizer=tokenizer.tokenize,
                            token_pattern=None)

    # Document-term TF-IDF matrix (one row per document).
    train_data = tfidf.fit_transform(documents_list)

    num_components = k
    lsa = TruncatedSVD(n_components=num_components, n_iter=100, random_state=42)

    # fit_transform returns the documents projected on the components (U * Sigma).
    U_SIGMA = lsa.fit_transform(train_data)

    Sigma = lsa.singular_values_
    # components_ holds V^T (components x terms); transpose to get one row per term.
    V = lsa.components_.T

    # Recover U by undoing the Sigma scaling. Singular values that are
    # (numerically) zero occur when the matrix rank is below k; dividing by
    # them yields NaN/inf or pure noise, and NaN is not valid JSON. Use the
    # pseudo-inverse convention and set those columns to zero instead.
    tolerance = Sigma.max() * max(train_data.shape) * np.finfo(Sigma.dtype).eps
    U = np.divide(U_SIGMA, Sigma, out=np.zeros_like(U_SIGMA), where=Sigma > tolerance)

    # doc2topic from matrix U.
    # NOTE: keys are the full document texts, so identical documents share
    # a single entry (the last one wins).
    doc2topic = {doc: U[idx].tolist() for idx, doc in enumerate(documents_list)}

    # term2topic from matrix V; vocabulary_ maps each term to its column index.
    term2topic = {
        term: V[idx].tolist() for term, idx in tfidf.vocabulary_.items()
    }

    save_embeddings(term2topic, doc2topic, text_genre, num_components)

In [None]:
# Train one LSA model per genre for every power-of-two topic count
# k = 2, 4, 8, 16, 32, 64.
for exponent in range(1, 7):
    k = 2 ** exponent
    for genre_name in ("poetry", "fiction", "nonfiction"):
        LSA(genre_name, k)