In [None]:
import pandas as pd
import numpy
import pickle
import time
import joblib
import gensim
import matplotlib.pyplot as plt

from itertools import islice
from scipy.linalg import block_diag
from sklearn.decomposition import NMF
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.corpora import Dictionary, bleicorpus
from gensim.matutils import hellinger
from gensim.models.coherencemodel import CoherenceModel

In [None]:
# Functions


# Create a new document term matrix using the topic distribution
def create_matrix(windows_H, windows_terms):
    """
    Create the topic-term matrix from all window topics that have been added so far.

    The per-window matrices are placed on a block diagonal (one block per window),
    then every set of columns that carries the same term is summed into a single
    column, so each distinct term appears exactly once in the result.

    Parameters:
    ----------
    windows_H: list of 2-D arrays, one per window (topics in rows, terms in columns)
    windows_terms: list of term lists, one per window; the total number of terms
                   must equal the total number of columns across windows_H,
                   otherwise the DataFrame construction raises ValueError

    Returns:
    -------
    M_concat: 2-D numpy array with one row per window topic and one column per
              distinct term
    all_terms: list of the distinct terms, in the column order of M_concat
               (sorted, because groupby sorts its keys)
    """
    # Flatten the per-window term lists in window order; a comprehension avoids the
    # quadratic list re-copying of sum(list_of_lists, []).
    all_windows_terms = [term for window_terms in windows_terms for term in window_terms]

    # Create a block diagonal matrix of all topics: one row per window topic and one
    # column per entry of all_windows_terms (terms may repeat across windows).
    M = block_diag(*windows_H)

    # Identify duplicated terms (columns) and sum them.
    # groupby(..., axis=1) is deprecated in recent pandas, so the frame is transposed,
    # grouped on its (term) index, and transposed back; the result is the same.
    dfM = pd.DataFrame(data=M, columns=all_windows_terms).T.groupby(level=0).sum().T

    # Transform the dataframe back to a matrix and keep the column names (in matrix
    # order) as the final list of terms.
    M_concat = dfM.to_numpy()
    all_terms = list(dfM.columns)

    print('--- New document-terms have been created ---')

    return M_concat, all_terms



# Track the dynamic of a given topic (the `topic` argument)
def track_dynamic(topic, W, windows_topic_list, windows_topic=None, year=None):
    """
    Link topics in the first stage with topic in second stage using the matrix W.

    Every row of W (one first-stage topic) is assigned to the second-stage topic
    (column) holding its largest weight. The assignments are then split window by
    window, and for each window the first-stage topics assigned to `topic` are
    collected.

    Parameters:
    ----------
    topic: index of the second-stage topic whose dynamic is tracked
    W: weight matrix from the second stage; iterated row by row, each row must
       provide argmax()
    windows_topic_list: topic list from the first stage, one list of topics per window
    windows_topic: optional number of first-stage topics in each window. When omitted,
                   the module-level `windows_topic` is used if it exists (the previous
                   behaviour); otherwise the length of each entry of
                   windows_topic_list is used.
    year: optional window labels used in the output column names. When omitted, the
          module-level `year` is used if it exists (the previous behaviour);
          otherwise the window positions 0..n-1 are used.

    Returns:
    -------
    df: DataFrame with one column per matched first-stage topic, named
        'Year_<label>_<rank>', each column holding that topic's entries
    dynamic_topic_list: for each window, the positions (within that window) of the
                        first-stage topics assigned to `topic`

    Raises:
    ------
    ValueError: if W has fewer rows than the total number of topics in windows_topic
    """
    # Resolve the two values this function used to read from notebook globals.
    # Explicit arguments win; the globals are kept as a fallback so existing calls
    # behave as before; deriving from windows_topic_list is the last resort.
    module_scope = globals()
    if windows_topic is None:
        windows_topic = module_scope.get('windows_topic')
    if windows_topic is None:
        # presumably each window has exactly one row of W per listed topic -- confirm
        windows_topic = [len(window_topics) for window_topics in windows_topic_list]
    if year is None:
        year = module_scope.get('year')
    if year is None:
        year = list(range(len(windows_topic_list)))

    # For each topic from the first stage (rows) find the topic in the second stage
    # (columns) with the higher weight.
    topic_second = [topic_first.argmax() for topic_first in W]

    required_rows = sum(windows_topic)
    if required_rows > len(topic_second):
        raise ValueError(
            'W has ' + str(len(topic_second)) + ' rows but windows_topic requires '
            + str(required_rows)
        )

    # Split the first-stage assignments by window, consuming rows in order.
    topic_first_year = []
    start = 0
    for size in windows_topic:
        topic_first_year.append(topic_second[start:start + size])
        start += size

    # For each window, positions of the first-stage topics mapped to `topic`.
    dynamic_topic_list = [
        [position for position, assigned in enumerate(topic_first_year[y]) if assigned == topic]
        for y in range(len(year))
    ]

    # Look the matched positions up in the first-stage topic lists
    # (list of windows, each a list of matched topics).
    dynamic_topic = [
        [windows_topic_list[y][position] for position in positions]
        for y, positions in enumerate(dynamic_topic_list)
    ]

    # Lay the matched topics out as DataFrame columns, one per (window, rank).
    topic_print = []
    names = []
    for y, window_topics in enumerate(dynamic_topic):
        for t, matched_topic in enumerate(window_topics):
            topic_print.append(matched_topic)
            names.append('Year_' + str(year[y]) + '_' + str(t))

    df = pd.DataFrame(topic_print).transpose()
    df.columns = names

    return df, dynamic_topic_list


In [None]:
# Create a new term-document matrix: combining all the top terms from the windows NMF.
#
# NOTE(review): `year`, `select_year`, `terms` and `topic_select` are read below but are
# not defined anywhere in this section -- they must already exist in the kernel.
# Confirm where they come from so the notebook survives Restart & Run All.
path = '/project/biocomplexity/sdad/projects_data/ncses/prd/Dynamic_Topics_Modelling/nmf_fullabstract/'
batch = 7
n_topics = list(range(20,61,5))

# Per-window accumulators, reset on every run of this cell so re-running it does not
# append the same windows twice.
windows_topic_list = []
windows_topic = []
windows_W = []
windows_H = []
windows_terms = []

# Load the first-stage result once: it does not depend on the fiscal year, and it must
# be read with joblib.load (joblib.dump writes a file and cannot be called with only a path).
(model, max_coherence) = joblib.load(path + 'first_stage.pkl')

# Build the windows H matrix
for fy in year:
    # Load the NMF models fitted for this fiscal year
    (nmf_time,topics_list,W_list,H_list) = joblib.load( path+'nmf_out/windows_nmf'+str(fy)+'.pkl' )

    # Build the list of terms for all topics (top_n) in a given fiscal year.
    # NOTE(review): `model` is called here, so it has to be a callable returning a value
    # present in n_topics; if it is a list this should be an index lookup. The lookup
    # also uses `select_year` rather than the loop variable `fy` -- confirm both.
    k = model(year.index(select_year))
    index = n_topics.index(k)
    fy_topic_list = topics_list[index]

    # Get the H and W matrix for the model
    W = W_list[index]
    H = H_list[index]

    # Select the terms that appear in at least one topic and subset H to those columns.
    # Sorting makes the column order reproducible across kernels (set order is not).
    topic_terms = sorted({term for topic_words in fy_topic_list for term in topic_words})
    term_position = {term: position for position, term in enumerate(topic_terms)}
    indcol = [terms.index(i) for i in topic_terms]
    subH = H[:,indcol]

    # For each topic (rows) set the weight of terms that are not listed in the topic to 0.
    # Relies on subH being a NumPy array, so that each row is a writable view into it.
    for i, row in enumerate(subH):
        # columns holding this topic's own top_n terms
        keep = {term_position[term] for term in fy_topic_list[i]}
        notop = [col for col in range(len(topic_terms)) if col not in keep]
        row[notop] = 0

    # append the result
    windows_topic_list.append(fy_topic_list)
    windows_topic.append(topic_select)
    windows_W.append(W)
    windows_H.append(subH)
    windows_terms.append(topic_terms)
    

In [None]:
# Build the new document-term matrix M
M, all_terms = create_matrix(windows_H, windows_terms)

# Run an NMF model on the new document-term matrix.
# The result is bound to the names nmf_time, topics_list, W_list and H_list,
# replacing any earlier values held under those names.
nmf_time, topics_list, W_list, H_list = nmf_models(
    doc_term_matrix=M,
    n_topics=n_topics,
    vectorizer=all_terms,
    rand_start=batch * len(n_topics),
)

# Save the result for the second nmf