In [1]:
# IMPORT PACKAGES
import spacy, pandas, numpy, string
from spacy.lang.de import German
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
from time import time

In [2]:
# LOAD DATA S.T. 1 LINE IN XLSX = 1 DOCUMENT
def load_data (path):
    data_raw = open(path + '.csv', encoding = 'utf-8').read().replace('\"', '').replace('\ufeff', '')
    data_list = data_raw.split('\n')
    input_table = [row.split(';') for row in data_list]
    
    print(path + '.csv: data loaded')
    return data_list, input_table

In [3]:
# CALCULATE TF / IDF / TF-IDF VALUES
def vectorize (vectorizer_type, min_df, max_df):
    print(vectorizer_type)
    
    if vectorizer_type == 'tfidf':
        vectorizer = TfidfVectorizer(max_df = max_df, #ignore terms that appear more than in...
                                     min_df = min_df) #ignore terms that appear less than in...
        print('Tfidf-vectorizing done')
        return vectorizer
    
    elif vectorizer_type == 'tf':
        vectorizer = CountVectorizer(max_df = max_df,
                                     min_df = min_df)            
        print('Count-vectorizing done')
        return vectorizer                 
            
    else:
        print('error: unknown vectorizer')
        return None

In [4]:
# GENERATE DOC-TERM MATRIX 
def generate_input_matrix(vectorizer, input_table):
    t = time()

    column1 = [row[0] for row in input_table] 
    doc_term_matrix = vectorizer.fit_transform(column1)
    
    print('doc-term matrix generated in %s s' %round((time() - t), 5) + ', matrix dimensions: ' + str(doc_term_matrix.shape))
    return doc_term_matrix

In [5]:
# NMF, Source: https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.NMF.html
def generate_nmf_topic_model(doc_term_matrix, beta_loss, n_topics, max_iterations):
    t = time()

    # NMF - Frobenius-norm : ||A||_Fro^2 = \sum_{i,j} A_{ij}^2
    # math: d_{Fro}(X, Y) = \frac{1}{2} ||X - Y||_{Fro}^2 = \frac{1}{2} \sum_{i,j} (X_{ij} - {Y}_{ij})^2
    if beta_loss == 'frobenius':
        matrix_factorization = NMF(beta_loss = beta_loss, n_components = n_topics, max_iter = max_iterations)
        nmf_model = matrix_factorization.fit(doc_term_matrix)
        print('NMF Frobenius topic_model created in %s s' %round((time() - t), 5))

    # NMF - Kullback-Leibler divergence:
    # math: d_{KL}(X, Y) = \sum_{i,j} (X_{ij} log(\frac{X_{ij}}{Y_{ij}}) - X_{ij} + Y_{ij})
    elif beta_loss == 'kullback-leibler':
        matrix_factorization = NMF(beta_loss = beta_loss, n_components = n_topics, max_iter = max_iterations, solver = 'mu')
        nmf_model = matrix_factorization.fit(doc_term_matrix)
        print('NMF Kullback-Leibler topic_model created in %s s' %round((time() - t), 5))           
    
    else:
        print('error: invalid beta_loss')
        
    return matrix_factorization, nmf_model

In [6]:
# LDA, Source: https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.LatentDirichletAllocation.html
def generate_lda_topic_model(doc_term_matrix, n_topics, max_iter):
    t = time()
    matrix_factorization = LatentDirichletAllocation(n_components = n_topics,
                                                    max_iter = max_iter)
    
    lda_model = matrix_factorization.fit(doc_term_matrix)
    print('LDA topic_model created in %s s' %round((time() - t), 5))     
    
    return matrix_factorization, lda_model

In [7]:
# PRINT TOPICS, Source: https://scikit-learn.org/stable/auto_examples/applications/plot_topics_extraction_with_nmf_lda.html
def print_topics (path, model, model_type, feature_names, n_topics, n_top_words, save):
    # calculates the top words of topics                    
    if (save):
        with open(path + '_tm_' + model_type + '_' + str(n_topics) + '_' + str(n_top_words) + '.csv', 'w', encoding = 'latin-1') as doc_out:
            for idx, topic in enumerate(model.components_):
                topic_list = "Topic #%d: " % (idx + 1)
                topic_list += " ".join(["{} ({}),".format(feature_names[idx], str(round(topic[idx], 3))) for idx in topic.argsort()[:-n_top_words - 1:-1]]) 
                doc_out.write(topic_list + '\n')
                
    else:
        for idx, topic in enumerate(model.components_):
            topic_list = "Topic #%d: " % (idx + 1)
            topic_list += " ".join(["{} ({}),".format(feature_names[idx], str(round(topic[idx], 3))) for idx in topic.argsort()[:-n_top_words - 1:-1]]) 
            print(topic_list)

    return topic_list

In [48]:
# CALCULATE AND PRINT TOPIC DISTRIBUTION
def calculate_topic_distribution(path, topic_model, doc_term_matrix, data_list, feature_names): 

    # TRANSFORM DATA INTO DATAFRAME  
    row_index = 1
    rows = []
    for row in data_list:
        rows.append(str(row_index) + ';' + row)
        row_index = row_index + 1
    
    columns = ["Topic #%d: " % (index + 1) for index, topic in enumerate(topic_model.components_)]
    values = numpy.round(topic_model.transform(doc_term_matrix), 2)
    df = pandas.DataFrame(values, rows, columns)
    df.index.names = ['Transcriptions']

    # CALCULATE DOMINANT TOPIC / DOC
    dominant_topic = numpy.argmax(df.values, axis = 1) + 1  
    df['dominant_topic'] =   dominant_topic
#    df = df.sort_values('dominant_topic')

    topic_total = numpy.sum(values, axis = 0)
    df.loc['total:', columns] = topic_total
#    df.loc['total:', dominant_topic] = numpy.argmax(topic_total, 0) + 1
    
    # PRINT TO CSV
    df.to_csv(path + 'distr.csv', sep=';', columns=None, header=True, index=True, index_label=None, mode='w', encoding='utf-8', compression='infer', quoting=None, quotechar='"', line_terminator=None, chunksize=None, date_format=None, doublequote=True, escapechar=None, decimal='.')
    
    print('topic distribution is done and printed')
    return df
    

In [46]:
# TUNERS
n_topics = 5
n_top_words = 5
max_iterations = 200
save = False
# -----------------------------------------------------------
# levels = ['age', 'person']
level = 'person'
paths = ['./IO_YO/all_1norm', './IO_YO/all_1Snorm', './IO_YO/all_2norm', './IO_YO/all_2Snorm']
path = paths[3]

data_list, input_table = load_data(path)

# PREPARE LDA TOPIC MODEL
vectorizer = vectorize ('tf', min_df = 2, max_df = 90)
doc_term_matrix = generate_input_matrix (vectorizer, input_table)
matrix_factorization, topic_model = generate_lda_topic_model (doc_term_matrix, n_topics, max_iterations) 
topic_list = print_topics (path, topic_model, 'lda', vectorizer.get_feature_names(), n_topics, n_top_words, save)

./IO_YO/all_2Snorm.csv: data loaded
tf
Count-vectorizing done
doc-term matrix generated in 0.18278 s, matrix dimensions: (5770, 2485)
LDA topic_model created in 196.12348 s
Topic #1: danken (69.194), frank (62.195), schlecht (60.194), tagen (58.781), neu (49.228),
Topic #2: woche (68.274), kaufen (39.978), erzählen (39.281), schwierig (38.193), gott (36.785),
Topic #3: kennen (87.192), brauchen (55.279), schlafen (50.596), echt (46.658), problem (46.189),
Topic #4: schreiben (110.192), denken (79.039), arbeiten (75.443), warten (55.881), stimmen (50.16),
Topic #5: stehen (99.19), fragen (66.191), abend (62.191), geil (53.073), laufen (51.744),


In [49]:
df = calculate_topic_distribution('./IO_YO/topic_lda_', topic_model, doc_term_matrix, data_list, vectorizer.get_feature_names())

topic distribution is done and printed


In [17]:
# PREPARE NMF TOPIC MODEL
vectorizer = vectorize ('tfidf', min_df = 2, max_df = 90)
doc_term_matrix = generate_input_matrix (vectorizer, input_table)
matrix_factorization, topic_model = generate_nmf_topic_model (doc_term_matrix, 'frobenius', n_topics, max_iterations)
topic_list = print_topics (path, topic_model, 'nmf', vectorizer.get_feature_names(), n_topics, n_top_words, save)
df = calculate_topic_distribution('./IO_YO/topic_nmf_', topic_model, doc_term_matrix)

# PREPARE NMF TOPIC MODEL
vectorizer = vectorize ('tfidf', min_df = 2, max_df = 90)
doc_term_matrix = generate_input_matrix (vectorizer, input_table)
matrix_factorization, topic_model = generate_nmf_topic_model (doc_term_matrix, 'kullback-leibler', n_topics, max_iterations)
topic_list = print_topics (path, topic_model, 'nmf', vectorizer.get_feature_names(), n_topics, n_top_words, save)
df = calculate_topic_distribution('./IO_YO/topic_nmf_', topic_model, doc_term_matrix)

# PREPARE NMF TOPIC MODEL
vectorizer = vectorize ('tf', min_df = 2, max_df = 90)
doc_term_matrix = generate_input_matrix (vectorizer, input_table)
matrix_factorization, topic_model = generate_nmf_topic_model (doc_term_matrix, 'frobenius', n_topics, max_iterations)
topic_list = print_topics (path, topic_model, 'nmf', vectorizer.get_feature_names(), n_topics, n_top_words, save)
df = calculate_topic_distribution('./IO_YO/topic_nmf_', topic_model, doc_term_matrix)

import pyLDAvis
import pyLDAvis.sklearn
pyLDAvis.enable_notebook()
pyLDAvis.sklearn.prepare(matrix_factorization, doc_term_matrix, vectorizer)

./IO_YO/all_2Snorm.csv: data loaded
tf
Count-vectorizing done
doc-term matrix generated in 0.12029 s, matrix dimensions: (5770, 2485)
LDA topic_model created in 215.21102 s
Topic #1: fahren (91.192), woche (89.194), warten (86.193), jahr (81.007), danken (69.193),
Topic #2: klein (70.238), brauchen (67.117), denken (64.671), geil (62.194), schlecht (53.806),
Topic #3: lustig (65.191), super (50.624), abend (50.347), gross (47.641), zeigen (45.192),
Topic #4: kennen (87.193), neu (71.48), tagen (62.779), schreiben (56.529), verstehen (44.203),
Topic #5: laufen (59.926), geld (47.194), komisch (41.194), gefühl (37.466), fertigen (36.506),


ValueError: cannot reindex from a duplicate axis