In [1]:
import os
import pandas as pd
import glob 
from nltk.tokenize import RegexpTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from pathlib import Path

def load_data(genre, data_dir="../Dataset"):
    """Read every ``*.txt`` document of one genre into memory.

    Documents are looked up in ``<data_dir>/<genre>_docs/``.

    Args:
        genre: Genre name used to build the folder name, e.g. ``"poetry"``.
        data_dir: Folder that contains the ``<genre>_docs`` directories.
            Defaults to ``"../Dataset"``, which is resolved relative to the
            current working directory.

    Returns:
        A list with the whitespace-stripped text of each file, ordered by
        file path. The list is empty when the folder is missing or contains
        no ``.txt`` files.
    """
    docs_dir = Path(data_dir) / (genre + "_docs")

    # glob.glob yields paths in arbitrary, filesystem-dependent order; sorting
    # keeps the document order (and anything derived from it) reproducible.
    doc_paths = sorted(glob.glob(os.path.join(docs_dir, "*.txt")))

    documents_list = []
    for doc_path in doc_paths:
        with open(doc_path, "r", encoding="utf-8") as doc:
            documents_list.append(doc.read().strip())
    return documents_list

In [15]:
import pickle

def save_embeddings_txt(term2topic, genre, k):
    """Pickle ``term2topic`` into ``term2topic_<genre>_<k>.txt``.

    Despite the ``.txt`` extension the file holds binary pickle data, so it
    must be read back in ``"rb"`` mode and decoded with ``pickle.loads``
    (or ``pickle.load`` on the open file).
    """
    out_name = "term2topic_" + genre + "_" + str(k) + ".txt"
    with open(out_name, mode="wb") as out_file:
        payload = pickle.dumps(term2topic)
        out_file.write(payload)

In [5]:
import json

def save_embeddings_json(term2topic, genre, k):
    """Write the term-to-topic mapping to ``term2topic_<genre>_<k>.json``.

    Args:
        term2topic: Dict mapping each term to its embedding. Values may be
            plain lists or NumPy arrays/scalars; the latter are converted
            with ``tolist()`` because ``json`` cannot serialize them natively.
        genre: Genre name used in the output filename.
        k: Number of topics, used in the output filename.

    Raises:
        TypeError: If a value is neither JSON-serializable nor convertible
            through ``tolist()``.
    """
    def _to_jsonable(value):
        # Fallback used by json for objects it does not know how to encode.
        if hasattr(value, "tolist"):
            return value.tolist()
        raise TypeError(
            "Object of type " + type(value).__name__ + " is not JSON serializable"
        )

    print("Saving embeddings ... ")

    # Serialize before opening the file so a failed conversion does not leave
    # a truncated JSON file behind.
    payload = json.dumps(term2topic, ensure_ascii=False, indent=4, default=_to_jsonable)

    with open("term2topic_"+genre+"_"+str(k)+".json", mode="w", encoding="utf-8") as fp:
        fp.write(payload)


In [11]:
def LSA(text_genre, k):
    """Train an LSA model for one genre and pickle its term embeddings.

    The genre's documents are vectorized with TF-IDF, reduced to ``k`` latent
    topics with truncated SVD, and the resulting term-to-topic mapping is
    written to disk through ``save_embeddings_txt``.

    Args:
        text_genre: Genre name; passed to ``load_data`` and used in the
            output filename.
        k: Number of SVD components (topics).
    """
    print("Training LSA model for genre "+text_genre+" and k="+str(k))
    documents_list = load_data(text_genre)

    # Tokenize on runs of word characters (\w+).
    tokenizer = RegexpTokenizer(r'\w+')

    # Vectorize the documents using TF-IDF over lowercase unigrams.
    tfidf = TfidfVectorizer(lowercase=True,
                            stop_words='english',
                            ngram_range=(1, 1),
                            tokenizer=tokenizer.tokenize)
    train_data = tfidf.fit_transform(documents_list)

    # Fixed random_state keeps the decomposition reproducible across runs.
    lsa = TruncatedSVD(n_components=k, n_iter=100, random_state=42)

    # Only the term side of the decomposition is saved, so fitting is enough.
    # Document embeddings, if ever needed, are
    # lsa.fit_transform(train_data) / lsa.singular_values_ (i.e. U), which
    # requires guarding against zero singular values.
    lsa.fit(train_data)

    # components_ has one row per topic; transposing gives one row per term.
    term_vectors = lsa.components_.T

    # Map every vocabulary term to its row of topic weights.
    term2topic = {
        term: term_vectors[idx] for term, idx in tfidf.vocabulary_.items()
    }

    save_embeddings_txt(term2topic, text_genre, k)

In [16]:
#TEST 
# LSA("poetry", 2)

Training LSA model for genre poetry and k=2


In [22]:
# Train one model per topic count: k = 2, 4, 8, 16, 32, 64.
for k in [2 ** exponent for exponent in range(1, 7)]:
    # LSA("poetry", k)
    LSA("fiction", k)
    # LSA("nonfiction", k)

Training LSA model for genre fiction and k=2




Training LSA model for genre fiction and k=4
Training LSA model for genre fiction and k=8
Training LSA model for genre fiction and k=16
Training LSA model for genre fiction and k=32
Training LSA model for genre fiction and k=64


In [18]:
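# Use this to load a saved term2topic file. Despite the .txt extension it is a
# binary pickle, so open it in 'rb' mode and only unpickle files you generated yourself.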

# infile = open("term2topic_poetry_2.txt",'rb')
# new_dict = pickle.load(infile)
# infile.close()

In [21]:
# TEST: check that types are correct 

# print(type(new_dict))
# i = 0
# for term, embedding in new_dict.items():
#     if i  == 5:
#         break
#     print(term," : ", embedding)
#     print(type(term),type(embedding))
#     i+=1

<class 'dict'>
complete  :  [0.00536542 0.00082792]
<class 'str'> <class 'numpy.ndarray'>
poetical  :  [0.00261243 0.00082802]
<class 'str'> <class 'numpy.ndarray'>
works  :  [0.00800802 0.01094901]
<class 'str'> <class 'numpy.ndarray'>
edgar  :  [0.00152548 0.00181107]
<class 'str'> <class 'numpy.ndarray'>
allan  :  [0.00152632 0.00140544]
<class 'str'> <class 'numpy.ndarray'>
