In [2]:
# IMPORT PACKAGES
import spacy, pandas, numpy, string
from spacy.lang.de import German
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
from time import time
import pyLDAvis
import pyLDAvis.sklearn

  from collections import Iterable
  from collections import Mapping


In [10]:
# LOAD DATA S.T. 1 LINE IN THE ';'-SEPARATED CSV FILE = 1 DOCUMENT
def load_data (path, one_doc_per_person):
    """Read ``path + '.csv'`` and return its records as raw lines and as cell lists.

    The file is decoded as UTF-8; every double-quote character and every
    byte-order mark is removed before the text is split into lines.

    Parameters:
        path: file path WITHOUT the ``.csv`` extension (it is appended here).
        one_doc_per_person: when truthy, all rows sharing the same id (2nd cell)
            are merged into one row whose text (1st cell) is the space-joined
            text of those rows. The age (3rd cell) of the last row seen for that
            id is kept; the 4th cell is set to ``2000_01_01`` and every further
            column to ``0``, i.e. the remaining metadata is NOT meaningful.

    Returns:
        (lines, table): ``lines`` is the list of ';'-joined record strings and
        ``table`` is the same data with each line split on ';'.

    Raises:
        IndexError: in one-doc-per-person mode, if a row has fewer than 3 cells.
    """
    # 'with' closes the handle deterministically (the handle used to be leaked).
    with open(path + '.csv', encoding = 'utf-8') as csv_file:
        data_raw = csv_file.read().replace('"', '').replace('\ufeff', '')

    records = data_raw.split('\n')
    # A trailing newline yields one empty final element; drop only that.
    # (Dropping the last element unconditionally would lose the final record
    # of a file that does not end with a newline.)
    if records and records[-1] == '':
        records.pop()

    # rebuild the "raw" data by combining records of the same person
    # !!! metadata (date, temporal focus, interlocutor) are invalid
    if one_doc_per_person and records:
        num_cols = records[0].count(';') + 1
        words_by_person = {}
        age_by_person = {}
        for row in records:
            cells = row.split(';')
            words, person_id, age = cells[0], cells[1], cells[2]

            if person_id in words_by_person:
                words_by_person[person_id] += ' ' + words
            else:
                words_by_person[person_id] = words

            # the age of the last row seen for this person wins
            age_by_person[person_id] = age

        # one synthetic row per person, padded to the original column count
        records = [
            words + ';' + person_id + ';' + age_by_person[person_id] + ';2000_01_01' + (num_cols - 4) * ';0'
            for person_id, words in words_by_person.items()
        ]

    input_table = [row.split(';') for row in records]

    return records, input_table

In [4]:
# BUILD AN (UNFITTED) TF OR TF-IDF VECTORIZER
def vectorize (vectorizer_type, min_df, max_df):
    """Create an unfitted vectorizer of the requested kind.

    Parameters:
        vectorizer_type: 'tfidf' for a TfidfVectorizer, 'tf' for a CountVectorizer.
        min_df, max_df: forwarded unchanged as the vectorizer's ``min_df`` / ``max_df``.

    Returns:
        The new vectorizer, or None (after printing an error line) when
        ``vectorizer_type`` is neither 'tfidf' nor 'tf'.
    """
    # Reject unknown kinds first so the happy paths below are plain returns.
    if vectorizer_type not in ('tfidf', 'tf'):
        print('error: unknown vectorizer')
        return None

    document_frequency_bounds = {'min_df': min_df, 'max_df': max_df}

    if vectorizer_type == 'tf':
        return CountVectorizer(**document_frequency_bounds)

    return TfidfVectorizer(**document_frequency_bounds)

In [5]:
# GENERATE SPARSE DOC-TERM MATRIX 
def generate_input_matrix(vectorizer, input_table):
    """Fit ``vectorizer`` on the first cell of every row and return the result.

    Parameters:
        vectorizer: object exposing ``fit_transform(iterable_of_documents)``;
            it is fitted in place by this call.
        input_table: sequence of rows; ``row[0]`` is taken as the document text.

    Returns:
        Whatever ``vectorizer.fit_transform`` returns for the collected texts
        (the document-term matrix).
    """
    # Only the first column carries text; the remaining cells are metadata.
    documents = [row[0] for row in input_table]
    return vectorizer.fit_transform(documents)

In [6]:
# NMF, Source: https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.NMF.html
def generate_nmf_topic_model(doc_term_matrix, beta_loss, n_topics, max_iterations, random_state = None):
    """Fit an NMF topic model on ``doc_term_matrix``.

    Parameters:
        doc_term_matrix: matrix handed to ``NMF.fit``.
        beta_loss: 'frobenius' or 'kullback-leibler'.
        n_topics: passed to NMF as ``n_components``.
        max_iterations: passed to NMF as ``max_iter``.
        random_state: optional seed forwarded to NMF; None keeps the
            estimator's own default.

    Returns:
        (matrix_factorization, nmf_model): the NMF estimator and the value
        returned by its ``fit`` call.

    Raises:
        ValueError: if ``beta_loss`` is not one of the two supported values.
    """
    # Validate before any work: previously an unknown value only printed a
    # message and then crashed with UnboundLocalError on the return statement.
    if beta_loss not in ('frobenius', 'kullback-leibler'):
        raise ValueError("invalid beta_loss %r: expected 'frobenius' or 'kullback-leibler'" % (beta_loss,))

    nmf_options = dict(beta_loss = beta_loss,
                       n_components = n_topics,
                       max_iter = max_iterations,
                       random_state = random_state)

    # NMF - Frobenius-norm : ||A||_Fro^2 = \sum_{i,j} A_{ij}^2
    # math: d_{Fro}(X, Y) = \frac{1}{2} ||X - Y||_{Fro}^2 = \frac{1}{2} \sum_{i,j} (X_{ij} - {Y}_{ij})^2
    #
    # NMF - Kullback-Leibler divergence:
    # math: d_{KL}(X, Y) = \sum_{i,j} (X_{ij} log(\frac{X_{ij}}{Y_{ij}}) - X_{ij} + Y_{ij})
    if beta_loss == 'kullback-leibler':
        # Per the scikit-learn NMF docs, a non-Frobenius loss needs the
        # multiplicative-update solver.
        nmf_options['solver'] = 'mu'

    matrix_factorization = NMF(**nmf_options)
    nmf_model = matrix_factorization.fit(doc_term_matrix)

    return matrix_factorization, nmf_model

In [7]:
# LDA, Source: https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.LatentDirichletAllocation.html
def generate_lda_topic_model(doc_topic_prior, topic_word_prior, doc_term_matrix, n_topics, max_iterations, learning_method, learning_offset, random_state = None):
    """Fit a LatentDirichletAllocation topic model on ``doc_term_matrix``.

    Parameters:
        doc_topic_prior, topic_word_prior: forwarded to the estimator under
            the same keyword names.
        doc_term_matrix: matrix handed to the estimator's ``fit``.
        n_topics: passed as ``n_components``.
        max_iterations: passed as ``max_iter``.
        learning_method, learning_offset: forwarded under the same names.
        random_state: optional seed forwarded to the estimator so a run can be
            reproduced; None keeps the estimator's own default.

    Returns:
        (matrix_factorization, lda_model): the estimator and the value
        returned by its ``fit`` call.
    """
    matrix_factorization = LatentDirichletAllocation(n_components = n_topics,
                                                    doc_topic_prior = doc_topic_prior,
                                                    topic_word_prior = topic_word_prior,
                                                    learning_method = learning_method,
                                                    learning_offset = learning_offset,
                                                    max_iter = max_iterations,
                                                    random_state = random_state)

    lda_model = matrix_factorization.fit(doc_term_matrix)

    return matrix_factorization, lda_model

In [8]:
# PRINT TOPICS, Source: https://scikit-learn.org/stable/auto_examples/applications/plot_topics_extraction_with_nmf_lda.html
def print_topics (path, topic_model, feature_names, n_top_words, save):
    """Write or print the ``n_top_words`` highest-weighted terms of every topic.

    Each topic becomes one line of ``term (weight),`` entries, ordered by
    descending weight, with weights rounded to 5 decimals.

    Parameters:
        path: output path prefix; ``'_topics.csv'`` is appended when saving.
        topic_model: object whose ``components_`` iterates over one weight
            vector per topic (each vector must support ``argsort()``).
        feature_names: indexable mapping term index -> term.
        n_top_words: number of terms listed per topic.
        save: truthy -> lines are written to ``path + '_topics.csv'`` with the
            prefix ``#<k>, ``; falsy -> lines are printed with the prefix
            ``Topic #<k>: ``.

    Returns:
        The line of the LAST topic only (kept for compatibility with existing
        callers), or '' when the model has no topics.
    """
    line_prefix = "#%d, " if save else "Topic #%d: "

    # Build every line once; the two output modes differ only in prefix and sink.
    topic_lines = []
    for topic_number, topic in enumerate(topic_model.components_, start = 1):
        # argsort is ascending: walk it backwards and stop after n_top_words entries
        top_term_indices = topic.argsort()[:-n_top_words - 1:-1]
        top_terms = " ".join("{} ({}),".format(feature_names[term_idx], str(round(topic[term_idx], 5)))
                             for term_idx in top_term_indices)
        topic_lines.append(line_prefix % topic_number + top_terms)

    if save:
        # NOTE(review): latin-1 cannot encode every character a UTF-8 corpus may
        # contain (the other files here are read/written as UTF-8); kept as-is so
        # existing consumers of *_topics.csv keep working - confirm before changing.
        with open(path + '_topics.csv', 'w', encoding = 'latin-1') as doc_out:
            for line in topic_lines:
                doc_out.write(line + '\n')
    else:
        for line in topic_lines:
            print(line)

    # An empty model used to raise UnboundLocalError here.
    return topic_lines[-1] if topic_lines else ''

In [12]:
# CALCULATE AND PRINT TOPIC DISTRIBUTION
def calculate_topic_distribution(path, topic_model, doc_term_matrix, data_list, feature_names):
    """Compute per-document topic weights, save them as CSV and return them.

    Parameters:
        path: output path prefix; ``'_distr.csv'`` is appended.
        topic_model: fitted model exposing ``components_`` (one entry per
            topic) and ``transform(doc_term_matrix)``.
        doc_term_matrix: matrix passed to ``topic_model.transform``.
        data_list: one ';'-joined record string per document; each becomes an
            index label prefixed with its 1-based row number.
        feature_names: unused; kept so existing callers keep working.

    Returns:
        pandas.DataFrame with one ``Topic #k: `` column per topic (weights
        rounded to 5 decimals) plus ``dominant_topic`` (1-based index of the
        largest weight, or 0 when all weights of a document are equal),
        sorted by ``dominant_topic``.
    """
    # TRANSFORM DATA INTO DATAFRAME
    row_labels = [str(row_number) + ';' + row for row_number, row in enumerate(data_list, start = 1)]
    columns = ["Topic #%d: " % (index + 1) for index, _ in enumerate(topic_model.components_)]
    values = numpy.round(topic_model.transform(doc_term_matrix), 5)
    df = pandas.DataFrame(values, index = row_labels, columns = columns)
    # The index label doubles as the header for the ';'-joined record fields.
    df.index.names = ['Row;Record;ID;Age;Date;W;Partner;Family;Friend;Stranger;Past;Future']

    # CALCULATE DOMINANT TOPIC / DOC BUT MASK IT WHEN ALL TOPICS HAVE EQUAL WEIGHT
    dominant_topic = numpy.argmax(values, axis = 1) + 1
    flat_distr = numpy.equal(numpy.amax(values, axis = 1), numpy.amin(values, axis = 1))
    dominant_topic[flat_distr] = 0
    df['dominant_topic'] = dominant_topic
    df = df.sort_values('dominant_topic')

    # PRINT TO CSV
    # Only the non-default options are passed. The former explicit
    # ``line_terminator=None`` keyword no longer exists in pandas >= 2.0
    # (renamed to ``lineterminator``) and made the call raise TypeError there.
    df.to_csv(path + '_distr.csv', sep = ';', encoding = 'utf-8')

    return df

In [19]:
# NMF FILE GENERATOR
def generate_nmf_models(max_dfs, min_dfs, n_top_words, max_iterations, data_list, input_table, save):
    """Fit and export NMF topic models for a grid of settings.

    For every (max_df, min_df) pair and every topic count from 1 to 40, four
    models are fitted: TF-IDF or TF input combined with Frobenius or
    Kullback-Leibler loss. Each model's topics are emitted via
    ``print_topics`` and its topic distribution via
    ``calculate_topic_distribution`` under
    ``./IO_YO/NMF_<input>_<loss>_min<min_df>_max<max_df>_k<n_topics>``.

    Parameters:
        max_dfs, min_dfs: iterables of document-frequency bounds for the vectorizers.
        n_top_words: number of terms listed per topic.
        max_iterations: iteration limit passed to the NMF fit.
        data_list: raw record lines, used to label the distribution rows.
        input_table: split records; the first cell of each row is the document text.
        save: forwarded to ``print_topics``.

    Returns:
        None.
    """
    def feature_names_of(vectorizer):
        # get_feature_names() was removed in scikit-learn 1.2; prefer its
        # replacement and fall back only on releases that predate it.
        getter = getattr(vectorizer, 'get_feature_names_out', None) or vectorizer.get_feature_names
        return getter()

    for max_df in max_dfs:
        for min_df in min_dfs:
            # The doc-term matrices depend only on (min_df, max_df), so they are
            # built once here instead of once per topic count.
            tf_vectorizer = vectorize ('tf', min_df, max_df)
            tf_doc_term_matrix = generate_input_matrix (tf_vectorizer, input_table)
            tf_feature_names = feature_names_of(tf_vectorizer)

            tfidf_vectorizer = vectorize ('tfidf', min_df, max_df)
            tfidf_doc_term_matrix = generate_input_matrix (tfidf_vectorizer, input_table)
            tfidf_feature_names = feature_names_of(tfidf_vectorizer)

            # (file stem, input matrix, its feature names, loss, progress message)
            model_variants = (
                ('NMF_TFIDF_FR', tfidf_doc_term_matrix, tfidf_feature_names, 'frobenius', 'nmf with TFIDF and Frobenius done'),
                ('NMF_TFIDF_KL', tfidf_doc_term_matrix, tfidf_feature_names, 'kullback-leibler', 'nmf with TFIDF and Kullback-Leibler done'),
                ('NMF_TF_FR', tf_doc_term_matrix, tf_feature_names, 'frobenius', 'nmf with TF and Frobenius done'),
                ('NMF_TF_KL', tf_doc_term_matrix, tf_feature_names, 'kullback-leibler', 'nmf with TF and Kullback-Leibler done'),
            )

            for n_topics in range(1, 41):

                model_specification_in_file_name = '_min' + str(min_df) + '_max' + str(max_df) + '_k' + str(n_topics)
                print('in current iteration, min_df=' + str(min_df) + ', max_df=' + str(max_df) + ', number of topics=' + str(n_topics))

                for file_stem, doc_term_matrix, feature_names, beta_loss, done_message in model_variants:
                    path = './IO_YO/' + file_stem + model_specification_in_file_name
                    matrix_factorization, topic_model = generate_nmf_topic_model (doc_term_matrix, beta_loss, n_topics, max_iterations)
                    print_topics (path, topic_model, feature_names, n_top_words, save)
                    calculate_topic_distribution(path, topic_model, doc_term_matrix, data_list, feature_names)
                    print(done_message)

    return None

In [33]:
# LDA FILE GENERATOR
def generate_lda_models(max_dfs, min_dfs, n_top_words, alphas, betas, max_iterations, data_list, input_table, save):
    """Fit and export LDA topic models for a grid of settings.

    For every combination of max_df, min_df, alpha, beta and topic count from
    1 to 40, an LDA model is fitted on the TF doc-term matrix (online learning,
    learning offset 50). Its topics are emitted via ``print_topics`` and its
    topic distribution via ``calculate_topic_distribution`` under
    ``./IO_YO/lda_min<min_df>_max<max_df>_alpha<alpha>_beta<beta>_k<n_topics>``.

    Parameters:
        max_dfs, min_dfs: iterables of document-frequency bounds for the vectorizer.
        n_top_words: number of terms listed per topic.
        alphas: iterable of values used as the model's ``doc_topic_prior``.
        betas: iterable of values used as the model's ``topic_word_prior``.
        max_iterations: iteration limit passed to the LDA fit.
        data_list: raw record lines, used to label the distribution rows.
        input_table: split records; the first cell of each row is the document text.
        save: forwarded to ``print_topics``.

    Returns:
        None.
    """
    for max_df in max_dfs:
        for min_df in min_dfs:

            tf_vectorizer = vectorize ('tf', min_df, max_df)
            tf_doc_term_matrix = generate_input_matrix (tf_vectorizer, input_table)

            # get_feature_names() was removed in scikit-learn 1.2; prefer its
            # replacement and fall back only on releases that predate it.
            # The vocabulary is fixed once the vectorizer is fitted, so read it once.
            read_feature_names = getattr(tf_vectorizer, 'get_feature_names_out', None) or tf_vectorizer.get_feature_names
            feature_names = read_feature_names()

            for alpha in alphas:
                for beta in betas:
                    for n_topics in range(1, 41):

                        print('in current iteration, min_df=' + str(min_df) + ', max_df=' + str(max_df) + ', alpha=' + str(alpha) + ', beta=' + str(beta) + ', number of topics=' + str(n_topics))
                        path = './IO_YO/lda' + '_min' + str(min_df) + '_max' + str(max_df) + '_alpha' + str(alpha) + '_beta' + str(beta) + '_k' + str(n_topics)

                        matrix_factorization, topic_model = generate_lda_topic_model (alpha, beta, tf_doc_term_matrix, n_topics, max_iterations, learning_method = 'online', learning_offset = 50.)
                        print_topics (path, topic_model, feature_names, n_top_words, save)
                        calculate_topic_distribution(path, topic_model, tf_doc_term_matrix, data_list, feature_names)

    return None

In [41]:
# TUNERS
# Module-level settings read by the EXECUTE section below.
#doc_types = ['1norm', '1Snorm', '2norm', '2Snorm']
# alphas / betas: candidate values handed to generate_lda_models, which uses them
# as the LDA doc_topic_prior / topic_word_prior respectively.
alphas = [1.0]
betas = [1.0]
# max_dfs / min_dfs: candidate document-frequency bounds handed to the vectorizers.
max_dfs = [70]
min_dfs = [1, 2]
# number of top-weighted terms listed per topic by print_topics
n_top_words = 5
# iteration limit passed to the NMF / LDA fits
max_iterations = 200
save = True    # True = print_topics writes '<path>_topics.csv', False = it prints the topics to the console. NOTE(review): the generator functions call calculate_topic_distribution unconditionally, so '<path>_distr.csv' is written either way.
one_doc_per_person = False     # True: 1 doc = 1 person (rows with the same id are merged by load_data), False: 1 doc = 1 record

# -----------------------------------------------------------
# EXECUTE

# Reads './IO_YO/all_2Snorm.csv' (load_data appends the '.csv' extension).
data_list, input_table = load_data('./IO_YO/all_2Snorm', one_doc_per_person)

# Uncomment one (or both) of the calls below to run the corresponding model grid.
#generate_nmf_models(max_dfs, min_dfs, n_top_words, max_iterations, data_list, input_table, save)

#generate_lda_models(max_dfs, min_dfs, n_top_words, alphas, betas, max_iterations, data_list, input_table, save)