In [1]:
# IMPORT PACKAGES
import spacy, pandas, numpy, string
from spacy.lang.de import German
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
from time import time
import pyLDAvis
import pyLDAvis.sklearn

  from collections import Iterable
  from collections import Mapping


In [2]:
# LOAD DATA S.T. 1 LINE IN XLSX = 1 DOCUMENT
def load_data (path):
    data_raw = open(path + '.csv', encoding = 'utf-8').read().replace('\"', '').replace('\ufeff', '')
    data_list = data_raw.split('\n')
    
    data_list_remove_empty_last_line = []
    for row in range(0, len(data_list)-1):
        data_list_remove_empty_last_line.append(data_list[row])
    
    input_table = [row.split(';') for row in data_list_remove_empty_last_line]
    
    print(path + '.csv: data loaded')
    return data_list_remove_empty_last_line, input_table

In [3]:
# CALCULATE TF / IDF / TF-IDF VALUES
def vectorize (vectorizer_type, min_df, max_df):
    print(vectorizer_type)
    
    if vectorizer_type == 'tfidf':
        vectorizer = TfidfVectorizer(max_df = max_df,
                                     min_df = min_df)
        print('Tfidf-vectorizing done')
        return vectorizer
    
    elif vectorizer_type == 'tf':
        vectorizer = CountVectorizer(max_df = max_df,
                                     min_df = min_df)
        print('Count-vectorizing done')
        return vectorizer                 
            
    else:
        print('error: unknown vectorizer')
        return None

In [4]:
# GENERATE SPARSE DOC-TERM MATRIX 
def generate_input_matrix(vectorizer, input_table):
    t = time()

    column1 = [row[0] for row in input_table] 
    doc_term_matrix = vectorizer.fit_transform(column1)
    
    print('doc-term matrix generated in %s s' %round((time() - t), 5) + ', matrix dimensions: ' + str(doc_term_matrix.shape))
    return doc_term_matrix

In [5]:
# NMF, Source: https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.NMF.html
def generate_nmf_topic_model(doc_term_matrix, beta_loss, n_topics, max_iterations):
    t = time()

    # NMF - Frobenius-norm : ||A||_Fro^2 = \sum_{i,j} A_{ij}^2
    # math: d_{Fro}(X, Y) = \frac{1}{2} ||X - Y||_{Fro}^2 = \frac{1}{2} \sum_{i,j} (X_{ij} - {Y}_{ij})^2
    if beta_loss == 'frobenius':
        matrix_factorization = NMF(beta_loss = beta_loss, n_components = n_topics, max_iter = max_iterations)
        nmf_model = matrix_factorization.fit(doc_term_matrix)
        print('NMF Frobenius topic_model created in %s s' %round((time() - t), 5))

    # NMF - Kullback-Leibler divergence:
    # math: d_{KL}(X, Y) = \sum_{i,j} (X_{ij} log(\frac{X_{ij}}{Y_{ij}}) - X_{ij} + Y_{ij})
    elif beta_loss == 'kullback-leibler':
        matrix_factorization = NMF(beta_loss = beta_loss, n_components = n_topics, max_iter = max_iterations, solver = 'mu')
        nmf_model = matrix_factorization.fit(doc_term_matrix)
        print('NMF Kullback-Leibler topic_model created in %s s' %round((time() - t), 5))           
    
    else:
        print('error: invalid beta_loss')
        
    return matrix_factorization, nmf_model

In [6]:
# LDA, Source: https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.LatentDirichletAllocation.html
#def generate_lda_topic_model(doc_topic_prior, doc_term_matrix, n_topics, max_iterations, learning_method, learning_offset):
def generate_lda_topic_model(doc_term_matrix, n_topics, max_iterations, learning_method, learning_offset):    
    t = time()
    matrix_factorization = LatentDirichletAllocation(n_components = n_topics,
                                                #    doc_topic_prior = doc_topic_prior,
                                                #    topic_word_prior = topic_word_prior, 
                                                    learning_method = learning_method,
                                                    learning_offset = learning_offset, 
                                                    max_iter = max_iterations)
    
    lda_model = matrix_factorization.fit(doc_term_matrix)
    print('LDA topic_model created in %s s' %round((time() - t), 5))     
    
    return matrix_factorization, lda_model

In [7]:
# PRINT TOPICS, Source: https://scikit-learn.org/stable/auto_examples/applications/plot_topics_extraction_with_nmf_lda.html
def print_topics (path, topic_model, feature_names, n_top_words, save):

    # calculates the top words of topics                        
    if (save):
        with open(path + '_topics.csv', 'w', encoding = 'latin-1') as doc_out:
            for idx, topic in enumerate(topic_model.components_):
                topic_list = "#%d, " % (idx + 1)
                topic_list += " ".join(["{} ({}),".format(feature_names[idx], str(round(topic[idx], 3))) for idx in topic.argsort()[:-n_top_words - 1:-1]]) 
                doc_out.write(topic_list + '\n')
                
    else:
        for idx, topic in enumerate(topic_model.components_):
            topic_list = "Topic #%d: " % (idx + 1)
            topic_list += " ".join(["{} ({}),".format(feature_names[idx], str(round(topic[idx], 3))) for idx in topic.argsort()[:-n_top_words - 1:-1]]) 
            print(topic_list)

    return topic_list

In [8]:
# CALCULATE AND PRINT TOPIC DISTRIBUTION
def calculate_topic_distribution(path, topic_model, doc_term_matrix, data_list, feature_names): 

    # TRANSFORM DATA INTO DATAFRAME  
    row_index = 1
    rows = []
    for row in data_list:
        rows.append(str(row_index) + ';' + row)
        row_index = row_index + 1
    
    columns = ["Topic #%d: " % (index + 1) for index, topic in enumerate(topic_model.components_)]
    values = numpy.round(topic_model.transform(doc_term_matrix), 2)
    df = pandas.DataFrame(values, rows, columns)
    df.index.names = ['Row;Record;ID;Age;Date;W;Partner;Family;Friend;Stranger;Past;Future']

    # CALCULATE DOMINANT TOPIC / DOC
    dominant_topic = numpy.argmax(df.values, axis = 1) + 1  
    df['dominant_topic'] =   dominant_topic
#    df = df.sort_values('dominant_topic')

#    topic_total = numpy.sum(values, axis = 0)
#    df.loc['total:', columns] = topic_total
#    df.loc['total:', dominant_topic] = numpy.argmax(topic_total, 0) + 1
    
    # PRINT TO CSV
    df.to_csv(path + '_distr.csv', sep=';', columns=None, header=True, index=True, index_label=None, mode='w', encoding='utf-8', compression='infer', quoting=None, quotechar='"', line_terminator=None, chunksize=None, date_format=None, doublequote=True, escapechar=None, decimal='.')
    
    print('topic distribution is done and printed')
    return df

In [9]:
# TUNERS
n_top_words = 5
max_iterations = 200
save = True
# -----------------------------------------------------------
#doc_types = ['1norm', '1Snorm', '2norm', '2Snorm']

data_list, input_table = load_data('./IO_YO/all_2Snorm')

for i in range(3, 21):
    n_topics = i
    min_df = 1
    max_df = 90
    model_specification_in_file_name = '_min' + str(min_df) + '_max' + str(max_df) + '_' + str(n_topics)
    print('number of topics in current iteration: ' + str(i))
    
    tf_vectorizer = vectorize ('tf', min_df, max_df)
    tf_doc_term_matrix = generate_input_matrix (tf_vectorizer, input_table) 
    
#    tfidf_vectorizer = vectorize ('tfidf', min_df, max_df)
#    tfidf_doc_term_matrix = generate_input_matrix (tfidf_vectorizer, input_table)

# PREPARE LDA TOPIC MODEL
    path = './IO_YO/lda' + model_specification_in_file_name
#    matrix_factorization, topic_model = generate_lda_topic_model (1.0, tf_doc_term_matrix, n_topics, max_iterations, learning_method = 'online', learning_offset = 50.) #, random_state = 0
    matrix_factorization, topic_model = generate_lda_topic_model (tf_doc_term_matrix, n_topics, max_iterations, learning_method = 'online', learning_offset = 50.) #, random_state = 0   
    topic_list = print_topics (path, topic_model, tf_vectorizer.get_feature_names(), n_top_words, save)
    df = calculate_topic_distribution(path, topic_model, tf_doc_term_matrix, data_list, tf_vectorizer.get_feature_names())
    print('lda done')
    """
# PREPARE NMF TOPIC MODEL
    path = './IO_YO/nmf_fr' + model_specification_in_file_name
    matrix_factorization, topic_model = generate_nmf_topic_model (tfidf_doc_term_matrix, 'frobenius', n_topics, max_iterations)
    topic_list = print_topics (path, topic_model, tfidf_vectorizer.get_feature_names(), n_top_words, save)
    df = calculate_topic_distribution(path, topic_model, tfidf_doc_term_matrix, data_list, tfidf_vectorizer.get_feature_names())
    print('nmf_frob done')

# PREPARE NMF TOPIC MODEL
    path = './IO_YO/nmf_kl' + model_specification_in_file_name
    matrix_factorization, topic_model = generate_nmf_topic_model (tfidf_doc_term_matrix, 'kullback-leibler', n_topics, max_iterations)
    topic_list = print_topics (path, topic_model, tfidf_vectorizer.get_feature_names(), n_top_words, save)
    df = calculate_topic_distribution(path, topic_model, tfidf_doc_term_matrix, data_list, tfidf_vectorizer.get_feature_names())
    print('nmf_kl done')
    
# PREPARE NMF TOPIC MODEL
    path = './IO_YO/nmf_tf' + model_specification_in_file_name
    matrix_factorization, topic_model = generate_nmf_topic_model (tf_doc_term_matrix, 'frobenius', n_topics, max_iterations)
    topic_list = print_topics (path, topic_model, tf_vectorizer.get_feature_names(), n_top_words, save)
    df = calculate_topic_distribution(path, topic_model, tf_doc_term_matrix, data_list, tf_vectorizer.get_feature_names())    
    print('nmf with TF done')    
    """
#pyLDAvis.enable_notebook()
#pyLDAvis.sklearn.prepare(matrix_factorization, doc_term_matrix, vectorizer)

./IO_YO/all_2Snorm.csv: data loaded
number of topics in current iteration: 3
tf
Count-vectorizing done
doc-term matrix generated in 0.06782 s, matrix dimensions: (5769, 7061)
LDA topic_model created in 87.76807 s
topic distribution is done and printed
lda done
number of topics in current iteration: 4
tf
Count-vectorizing done
doc-term matrix generated in 0.08996 s, matrix dimensions: (5769, 7061)
LDA topic_model created in 94.5638 s
topic distribution is done and printed
lda done
number of topics in current iteration: 5
tf
Count-vectorizing done
doc-term matrix generated in 0.05059 s, matrix dimensions: (5769, 7061)
LDA topic_model created in 91.17317 s
topic distribution is done and printed
lda done
number of topics in current iteration: 6
tf
Count-vectorizing done
doc-term matrix generated in 0.12012 s, matrix dimensions: (5769, 7061)
LDA topic_model created in 96.82005 s
topic distribution is done and printed
lda done
number of topics in current iteration: 7
tf
Count-vectorizing don

In [10]:
for i in range(3, 21):
    n_topics = i
    min_df = 2
    max_df = 90
    model_specification_in_file_name = '_min' + str(min_df) + '_max' + str(max_df) + '_' + str(n_topics)
    print('number of topics in current iteration: ' + str(i))
    
    tf_vectorizer = vectorize ('tf', min_df, max_df)
    tf_doc_term_matrix = generate_input_matrix (tf_vectorizer, input_table) 
    
# PREPARE LDA TOPIC MODEL
    path = './IO_YO/lda' + model_specification_in_file_name
#    matrix_factorization, topic_model = generate_lda_topic_model (1.0, tf_doc_term_matrix, n_topics, max_iterations, learning_method = 'online', learning_offset = 50.) #, random_state = 0
    matrix_factorization, topic_model = generate_lda_topic_model (tf_doc_term_matrix, n_topics, max_iterations, learning_method = 'online', learning_offset = 50.) #, random_state = 0   
    topic_list = print_topics (path, topic_model, tf_vectorizer.get_feature_names(), n_top_words, save)
    df = calculate_topic_distribution(path, topic_model, tf_doc_term_matrix, data_list, tf_vectorizer.get_feature_names())
    print('lda done')

number of topics in current iteration: 3
tf
Count-vectorizing done
doc-term matrix generated in 0.09374 s, matrix dimensions: (5769, 2433)
LDA topic_model created in 73.62526 s
topic distribution is done and printed
lda done
number of topics in current iteration: 4
tf
Count-vectorizing done
doc-term matrix generated in 0.06025 s, matrix dimensions: (5769, 2433)
LDA topic_model created in 72.35988 s
topic distribution is done and printed
lda done
number of topics in current iteration: 5
tf
Count-vectorizing done
doc-term matrix generated in 0.06016 s, matrix dimensions: (5769, 2433)
LDA topic_model created in 73.62006 s
topic distribution is done and printed
lda done
number of topics in current iteration: 6
tf
Count-vectorizing done
doc-term matrix generated in 0.06538 s, matrix dimensions: (5769, 2433)
LDA topic_model created in 81.7235 s
topic distribution is done and printed
lda done
number of topics in current iteration: 7
tf
Count-vectorizing done
doc-term matrix generated in 0.059

In [11]:
for i in range(3, 21):
    n_topics = i
    min_df = 1
    max_df = 85
    model_specification_in_file_name = '_min' + str(min_df) + '_max' + str(max_df) + '_' + str(n_topics)
    print('number of topics in current iteration: ' + str(i))
    
    tf_vectorizer = vectorize ('tf', min_df, max_df)
    tf_doc_term_matrix = generate_input_matrix (tf_vectorizer, input_table) 
    
# PREPARE LDA TOPIC MODEL
    path = './IO_YO/lda' + model_specification_in_file_name
#    matrix_factorization, topic_model = generate_lda_topic_model (1.0, tf_doc_term_matrix, n_topics, max_iterations, learning_method = 'online', learning_offset = 50.) #, random_state = 0
    matrix_factorization, topic_model = generate_lda_topic_model (tf_doc_term_matrix, n_topics, max_iterations, learning_method = 'online', learning_offset = 50.) #, random_state = 0   
    topic_list = print_topics (path, topic_model, tf_vectorizer.get_feature_names(), n_top_words, save)
    df = calculate_topic_distribution(path, topic_model, tf_doc_term_matrix, data_list, tf_vectorizer.get_feature_names())
    print('lda done')

number of topics in current iteration: 3
tf
Count-vectorizing done
doc-term matrix generated in 0.10024 s, matrix dimensions: (5769, 7058)
LDA topic_model created in 70.91324 s
topic distribution is done and printed
lda done
number of topics in current iteration: 4
tf
Count-vectorizing done
doc-term matrix generated in 0.07296 s, matrix dimensions: (5769, 7058)
LDA topic_model created in 73.36167 s
topic distribution is done and printed
lda done
number of topics in current iteration: 5
tf
Count-vectorizing done
doc-term matrix generated in 0.07099 s, matrix dimensions: (5769, 7058)
LDA topic_model created in 72.35954 s
topic distribution is done and printed
lda done
number of topics in current iteration: 6
tf
Count-vectorizing done
doc-term matrix generated in 0.0607 s, matrix dimensions: (5769, 7058)
LDA topic_model created in 93.86444 s
topic distribution is done and printed
lda done
number of topics in current iteration: 7
tf
Count-vectorizing done
doc-term matrix generated in 0.058

In [12]:
for i in range(3, 21):
    n_topics = i
    min_df = 2
    max_df = 85
    model_specification_in_file_name = '_min' + str(min_df) + '_max' + str(max_df) + '_' + str(n_topics)
    print('number of topics in current iteration: ' + str(i))
    
    tf_vectorizer = vectorize ('tf', min_df, max_df)
    tf_doc_term_matrix = generate_input_matrix (tf_vectorizer, input_table) 
    
# PREPARE LDA TOPIC MODEL
    path = './IO_YO/lda' + model_specification_in_file_name
#    matrix_factorization, topic_model = generate_lda_topic_model (1.0, tf_doc_term_matrix, n_topics, max_iterations, learning_method = 'online', learning_offset = 50.) #, random_state = 0
    matrix_factorization, topic_model = generate_lda_topic_model (tf_doc_term_matrix, n_topics, max_iterations, learning_method = 'online', learning_offset = 50.) #, random_state = 0   
    topic_list = print_topics (path, topic_model, tf_vectorizer.get_feature_names(), n_top_words, save)
    df = calculate_topic_distribution(path, topic_model, tf_doc_term_matrix, data_list, tf_vectorizer.get_feature_names())
    print('lda done')

number of topics in current iteration: 3
tf
Count-vectorizing done
doc-term matrix generated in 0.07909 s, matrix dimensions: (5769, 2430)
LDA topic_model created in 78.282 s
topic distribution is done and printed
lda done
number of topics in current iteration: 4
tf
Count-vectorizing done
doc-term matrix generated in 0.08471 s, matrix dimensions: (5769, 2430)
LDA topic_model created in 77.00756 s
topic distribution is done and printed
lda done
number of topics in current iteration: 5
tf
Count-vectorizing done
doc-term matrix generated in 0.06116 s, matrix dimensions: (5769, 2430)
LDA topic_model created in 69.19299 s
topic distribution is done and printed
lda done
number of topics in current iteration: 6
tf
Count-vectorizing done
doc-term matrix generated in 0.06904 s, matrix dimensions: (5769, 2430)
LDA topic_model created in 57.88906 s
topic distribution is done and printed
lda done
number of topics in current iteration: 7
tf
Count-vectorizing done
doc-term matrix generated in 0.0534

In [13]:
for i in range(3, 21):
    n_topics = i
    min_df = 1
    max_df = 80
    model_specification_in_file_name = '_min' + str(min_df) + '_max' + str(max_df) + '_' + str(n_topics)
    print('number of topics in current iteration: ' + str(i))
    
    tf_vectorizer = vectorize ('tf', min_df, max_df)
    tf_doc_term_matrix = generate_input_matrix (tf_vectorizer, input_table) 

# PREPARE LDA TOPIC MODEL
    path = './IO_YO/lda' + model_specification_in_file_name
    matrix_factorization, topic_model = generate_lda_topic_model (1.0, tf_doc_term_matrix, n_topics, max_iterations, learning_method = 'online', learning_offset = 50.) #, random_state = 0
#    matrix_factorization, topic_model = generate_lda_topic_model (tf_doc_term_matrix, n_topics, max_iterations, learning_method = 'online', learning_offset = 50.) #, random_state = 0   
    topic_list = print_topics (path, topic_model, tf_vectorizer.get_feature_names(), n_top_words, save)
    df = calculate_topic_distribution(path, topic_model, tf_doc_term_matrix, data_list, tf_vectorizer.get_feature_names())
    print('lda done')

number of topics in current iteration: 3
tf
Count-vectorizing done
doc-term matrix generated in 0.06058 s, matrix dimensions: (5769, 7056)


TypeError: generate_lda_topic_model() got multiple values for argument 'learning_method'

In [None]:
for i in range(3, 21):
    n_topics = i
    min_df = 2
    max_df = 80
    model_specification_in_file_name = '_min' + str(min_df) + '_max' + str(max_df) + '_' + str(n_topics)
    print('number of topics in current iteration: ' + str(i))
    
    tf_vectorizer = vectorize ('tf', min_df, max_df)
    tf_doc_term_matrix = generate_input_matrix (tf_vectorizer, input_table) 

# PREPARE LDA TOPIC MODEL
    path = './IO_YO/lda' + model_specification_in_file_name
#    matrix_factorization, topic_model = generate_lda_topic_model (1.0, tf_doc_term_matrix, n_topics, max_iterations, learning_method = 'online', learning_offset = 50.) #, random_state = 0
    matrix_factorization, topic_model = generate_lda_topic_model (tf_doc_term_matrix, n_topics, max_iterations, learning_method = 'online', learning_offset = 50.) #, random_state = 0   
    topic_list = print_topics (path, topic_model, tf_vectorizer.get_feature_names(), n_top_words, save)
    df = calculate_topic_distribution(path, topic_model, tf_doc_term_matrix, data_list, tf_vectorizer.get_feature_names())
    print('lda done')

In [None]:
"""
# TUNERS
n_top_words = 5
max_iterations = 200
save = True
min_df = 2
max_df = 85
# -----------------------------------------------------------
data_list, input_table = load_data('./IO_YO/all_2Snorm')

# ALPHA 0.5
for i in range(3, 21):
    n_topics = i
    alpha = 0.5
    beta = None
    model_specification_in_file_name = '_min' + str(min_df) + '_max' + str(max_df) + '_alfa' + str(alpha) + '_' + str(n_topics)
    print('number of topics in current iteration: ' + str(i))
    
    tf_vectorizer = vectorize ('tf', min_df, max_df)
    tf_doc_term_matrix = generate_input_matrix (tf_vectorizer, input_table) 

    path = './IO_YO/topic_lda' + model_specification_in_file_name
    matrix_factorization, topic_model = generate_lda_topic_model (alpha, tf_doc_term_matrix, n_topics, max_iterations, learning_method = 'online', learning_offset = 50.) #, random_state = 0
    topic_list = print_topics (path, topic_model, tf_vectorizer.get_feature_names(), n_top_words, save)
    df = calculate_topic_distribution(path, topic_model, tf_doc_term_matrix, data_list, tf_vectorizer.get_feature_names()) 

# ALPHA 1.0
for i in range(3, 21):
    n_topics = i
    alpha = 1.0
    beta = None
    model_specification_in_file_name = '_min' + str(min_df) + '_max' + str(max_df) + '_alfa' + str(alpha) + '_' + str(n_topics)
    print('number of topics in current iteration: ' + str(i))
    
    tf_vectorizer = vectorize ('tf', min_df, max_df)
    tf_doc_term_matrix = generate_input_matrix (tf_vectorizer, input_table) 

    path = './IO_YO/topic_lda' + model_specification_in_file_name
    matrix_factorization, topic_model = generate_lda_topic_model (alpha, tf_doc_term_matrix, n_topics, max_iterations, learning_method = 'online', learning_offset = 50.) #, random_state = 0
    topic_list = print_topics (path, topic_model, tf_vectorizer.get_feature_names(), n_top_words, save)
    df = calculate_topic_distribution(path, topic_model, tf_doc_term_matrix, data_list, tf_vectorizer.get_feature_names())

# ALPHA 1.5
for i in range(3, 21):
    n_topics = i
    alpha = 1.5
    beta = None
    model_specification_in_file_name = '_min' + str(min_df) + '_max' + str(max_df) + '_alfa' + str(alpha) + '_' + str(n_topics)
    print('number of topics in current iteration: ' + str(i))
    
    tf_vectorizer = vectorize ('tf', min_df, max_df)
    tf_doc_term_matrix = generate_input_matrix (tf_vectorizer, input_table) 

    path = './IO_YO/topic_lda' + model_specification_in_file_name
    matrix_factorization, topic_model = generate_lda_topic_model (alpha, tf_doc_term_matrix, n_topics, max_iterations, learning_method = 'online', learning_offset = 50.) #, random_state = 0
    topic_list = print_topics (path, topic_model, tf_vectorizer.get_feature_names(), n_top_words, save)
    df = calculate_topic_distribution(path, topic_model, tf_doc_term_matrix, data_list, tf_vectorizer.get_feature_names())    

# ALPHA 2.0
for i in range(3, 21):
    n_topics = i
    alpha = 2.0
    beta = None
    model_specification_in_file_name = '_min' + str(min_df) + '_max' + str(max_df) + '_alfa' + str(alpha) + '_' + str(n_topics)
    print('number of topics in current iteration: ' + str(i))
    
    tf_vectorizer = vectorize ('tf', min_df, max_df)
    tf_doc_term_matrix = generate_input_matrix (tf_vectorizer, input_table) 

# PREPARE LDA TOPIC MODEL
    path = './IO_YO/topic_lda' + model_specification_in_file_name
    matrix_factorization, topic_model = generate_lda_topic_model (alpha, tf_doc_term_matrix, n_topics, max_iterations, learning_method = 'online', learning_offset = 50.) #, random_state = 0
    topic_list = print_topics (path, topic_model, tf_vectorizer.get_feature_names(), n_top_words, save)
    df = calculate_topic_distribution(path, topic_model, tf_doc_term_matrix, data_list, tf_vectorizer.get_feature_names())     
"""
