In [1]:
# IMPORT PACKAGES
import spacy, pandas, numpy
from spacy.lang.de import German #to create spacy parser
#nlp = spacy.load('de_core_news_sm')

# Source: https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Source: https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.NMF.html
# Source: https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.LatentDirichletAllocation.html
from sklearn.decomposition import NMF, LatentDirichletAllocation

#from sklearn.pipeline import Pipeline #process gets faster, + for the method calls

# source: https://docs.python.org/3/library/time.html
from time import time


In [2]:
# LOAD DATA SUCH THAT 1 LINE IN THE CSV FILE = 1 DOCUMENT
def load_data(path):
    """Read the file ``path + '.csv'`` and return its lines as a list of strings.

    The file is decoded as latin-1. Every line becomes one list entry and is
    returned exactly as read, i.e. including its trailing newline character.
    A short confirmation message is printed once the file has been read.
    """
    file_name = path + '.csv'

    with open(file_name, 'r', encoding = 'latin-1') as doc:
        data = doc.readlines() # string list, one entry per line

    print(file_name + ': data loaded')

    return data

In [3]:
# CREATE THE VECTORIZER THAT CALCULATES TF (COUNT) OR TF-IDF VALUES
def vectorizer(vectorizer_type, min_df, max_df):
    """Create an (unfitted) text vectorizer of the requested type.

    vectorizer_type: 'tfidf' builds a TfidfVectorizer, 'tf' a CountVectorizer.
    min_df / max_df: passed through unchanged to the vectorizer constructor
                     (document-frequency cut-offs for rare / frequent terms).

    Returns the new vectorizer object, or None (after printing an error
    message) when vectorizer_type is neither 'tfidf' nor 'tf'.
    """
    print(vectorizer_type)

    if vectorizer_type == 'tfidf':
        tfidf_vectorizer = TfidfVectorizer(min_df = min_df, max_df = max_df)
        print('Tfidf-vectorizing done')
        return tfidf_vectorizer

    if vectorizer_type == 'tf':
        count_vectorizer = CountVectorizer(min_df = min_df, max_df = max_df)
        print('Count-vectorizing done')
        return count_vectorizer

    # anything else is not supported
    print('error: unknown vectorizer')
    return None

In [4]:
# GENERATE DOC-TERM MATRIX 
def generate_input_matrix(vectorizer, data):
    """Fit ``vectorizer`` on ``data`` and return the resulting doc-term matrix.

    vectorizer: object offering ``fit_transform`` (e.g. the result of the
                vectorizer factory in this notebook).
    data:       the documents handed to ``fit_transform``.

    Prints the elapsed time and the ``shape`` of the matrix.
    """
    started = time()
    doc_term_matrix = vectorizer.fit_transform(data)
    elapsed = round((time() - started), 5)

    dimensions = str(doc_term_matrix.shape)
    print('doc-term matrix generated in %s s' % elapsed + ', matrix dimensions: ' + dimensions)

    return doc_term_matrix

In [5]:
# NMF: Find two non-negative matrices (W, H) whose product approximates the non-negative matrix X
# Source: https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.NMF.html
def generate_nmf_topic_model(doc_term_matrix, beta_loss, n_topics, max_iterations):
    """Fit an NMF topic model on ``doc_term_matrix`` and return the fitted model.

    doc_term_matrix: input matrix handed to ``NMF.fit``.
    beta_loss:       'frobenius' or 'kullback-leibler'.
    n_topics:        number of components (topics) to extract.
    max_iterations:  passed to NMF as ``max_iter``.

    Returns whatever ``NMF(...).fit(doc_term_matrix)`` returns.

    Raises ValueError for any other ``beta_loss``. Previously this case printed
    an error and then failed with UnboundLocalError on the return statement.
    """
    t = time()

    # NMF - Frobenius-norm : ||A||_Fro^2 = \sum_{i,j} A_{ij}^2
    # math: d_{Fro}(X, Y) = \frac{1}{2} ||X - Y||_{Fro}^2 = \frac{1}{2} \sum_{i,j} (X_{ij} - {Y}_{ij})^2
    if beta_loss == 'frobenius':
        label = 'Frobenius'
        extra_options = {}

    # NMF - Kullback-Leibler divergence:
    # math: d_{KL}(X, Y) = \sum_{i,j} (X_{ij} log(\frac{X_{ij}}{Y_{ij}}) - X_{ij} + Y_{ij})
    elif beta_loss == 'kullback-leibler':
        label = 'Kullback-Leibler'
        # this loss is fitted with the multiplicative-update solver
        extra_options = {'solver': 'mu'}

    # NMF - Itakura-Saito divergence (not implemented here):
    # math. d_{IS}(X, Y) = \sum_{i,j} (\frac{X_{ij}}{Y_{ij}} - log(\frac{X_{ij}}{Y_{ij}}) - 1)
    # NOTE(review): the scikit-learn docs state the input may not contain zeros
    # for this loss, which a doc-term matrix normally does -- confirm before adding it.
    else:
        raise ValueError("error: invalid beta_loss %r (expected 'frobenius' or 'kullback-leibler')" % (beta_loss,))

    matrix_factorization = NMF(beta_loss = beta_loss,
                               n_components = n_topics,
                               max_iter = max_iterations,
                               **extra_options)
    nmf_model = matrix_factorization.fit(doc_term_matrix)
    print('NMF ' + label + ' topic_model created in %s s' %round((time() - t), 5))

    return nmf_model

In [6]:
# LDA: fit a Latent Dirichlet Allocation topic model on the doc-term matrix
# Source: https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.LatentDirichletAllocation.html
def generate_lda_topic_model(doc_term_matrix, n_topics, max_iter):
    """Fit a LatentDirichletAllocation model and return the fitted model.

    doc_term_matrix: input matrix handed to ``fit``.
    n_topics:        passed to the estimator as ``n_components``.
    max_iter:        passed to the estimator as ``max_iter``.

    Prints the time the fit took.
    """
    started = time()

    estimator = LatentDirichletAllocation(max_iter = max_iter, n_components = n_topics)
    lda_model = estimator.fit(doc_term_matrix)

    elapsed = round((time() - started), 5)
    print('LDA topic_model created in %s s' % elapsed)

    return lda_model

In [7]:
# PRINT TOPICS TO CONSOLE OR (IF save IS SET) SAVE TOPICS TO FILE
# Source: https://scikit-learn.org/stable/auto_examples/applications/plot_topics_extraction_with_nmf_lda.html
def visualize_topics (path, model, model_type, feature_names, n_topics, n_top_words, save):
    """Write the top words of every topic to a file, or print them.

    path:          prefix of the output file name (used only when saving).
    model:         fitted model; one topic per row of ``model.components_``.
    model_type:    label used in the output file name.
    feature_names: indexable collection mapping a term index to its term.
    n_topics:      used in the output file name only.
    n_top_words:   number of highest-weighted terms listed per topic.
    save:          truthy -> write the lines to
                   ``<path>_tm_<model_type>_<n_topics>_<n_top_words>.csv``
                   (latin-1); falsy -> print the lines instead.

    Each line looks like ``Topic #1: term (weight), term (weight),`` with the
    weight rounded to 3 decimals.

    Returns the line of the LAST topic (as before), or '' when the model has
    no components. The empty case used to fail with UnboundLocalError.
    """
    topic_lines = []
    for topic_number, topic in enumerate(model.components_, start = 1):
        # indices of the n_top_words largest weights, largest first
        top_term_ids = topic.argsort()[:-n_top_words - 1:-1]
        terms = " ".join("{} ({}),".format(feature_names[term_id], str(round(topic[term_id], 3)))
                         for term_id in top_term_ids)
        topic_lines.append("Topic #%d: " % topic_number + terms)

    if (save):
        out_name = path + '_tm_' + model_type + '_' + str(n_topics) + '_' + str(n_top_words) + '.csv'
        with open(out_name, 'w', encoding = 'latin-1') as doc_out:
            for line in topic_lines:
                doc_out.write(line + '\n')
    else:
        for line in topic_lines:
            print(line)

    return topic_lines[-1] if topic_lines else ''
    

In [8]:
# TUNERS
vectorizer_types = ['tf', 'tfidf']
# NMF beta-loss variants; any model_type NOT in this list selects LDA below.
# generate_nmf_topic_model only handles 'frobenius' and 'kullback-leibler'.
model_types = ['frobenius', 'kullback-leibler', 'itakura-saito']
vectorizer_type = vectorizer_types[0]
# NOTE(review): 'nmf' is not in model_types, so this setting runs LDA while the
# output file name still contains 'nmf' -- confirm the intended model / label.
model_type = 'nmf'
n_topics = 40
max_iterations = 200
n_top_words = 5
save = True

# tuners for group level analysis
paths = ['./IO_YO/young', './IO_YO/old']
doc_type = ['_tm_xy_c', '_tm_norm', '_norm']

# tuners for person level analysis
#participant_ids = nlp(open('PARTICIPANT_ID.txt').read())
#paths = ['./IO_P/' + token.text for token in participant_ids]

# EXECUTE
# load data
path = './IO_YO/old'
data = load_data(path + doc_type[2])

use_nmf = model_type in model_types

# choose vectorizer: tf-idf for NMF, raw counts (tf) for LDA.
# min_df = 2 would ignore words that occur only once; max_df = 1.0 is the default.
# The result is bound to its own name: rebinding `vectorizer` would shadow the
# factory function and make this cell fail when it is run a second time.
doc_vectorizer = vectorizer('tfidf' if use_nmf else 'tf', min_df = 1, max_df = 1.0)

# generate input matrix
doc_term_matrix = generate_input_matrix(doc_vectorizer, data)

# fit the topic model
if use_nmf:
    topic_model = generate_nmf_topic_model(doc_term_matrix, model_type, n_topics, max_iterations)
else:
    topic_model = generate_lda_topic_model(doc_term_matrix, n_topics, max_iterations)

# get_feature_names() was removed in scikit-learn 1.2; prefer the replacement
# and fall back to the old method on older installations.
get_names = getattr(doc_vectorizer, 'get_feature_names_out', None)
if get_names is None:
    get_names = doc_vectorizer.get_feature_names
feature_names = get_names()

# print / save topics
topic_list = visualize_topics(path, topic_model, model_type, feature_names, n_topics, n_top_words, save)

./IO_YO/old_norm.csv: data loaded
tf
Count-vectorizing done
doc-term matrix generated in 0.03988 s, matrix dimensions: (2140, 4927)
LDA topic_model created in 48.44751 s


In [9]:
# Disabled batch run over all entries of `paths`: the loop below is wrapped in
# a string literal, so it is never executed; as the last expression of the cell
# the string is merely echoed as the cell output.
# NOTE(review): if this is re-enabled, `vectorizer = vectorizer(...)` rebinds
# the name of the factory function to its return value, so a later iteration
# would call the returned object instead of the function -- bind the result to
# a different variable name.
"""
# EXECUTE
# load data
for path in paths:
    data = load_data(path + doc_type[0])

    if (model_type in model_types):
        # choose vectorizer     min_df = 2 ignorálja a csak 1x elôforduló szavakat, max_df = 1.0 alap
#        vectorizer = vectorizer('tfidf', min_df = 2, max_df = 1.0)
        vectorizer = vectorizer('tfidf', min_df = 1, max_df = 1.0)
        # generate input matrix
        doc_term_matrix = generate_input_matrix(vectorizer, data)
        # do black magic
        topic_model = generate_nmf_topic_model(doc_term_matrix, model_type, n_topics, max_iterations)
        # print topics
        topic_list = visualize_topics (path, topic_model, model_type, vectorizer.get_feature_names(), n_topics, n_top_words, save)
    
    else:
        # choose vectorizer    min_df = 2 ignorálja a csak 1x elôforduló szavakat, max_df = 1.0 alap
#        vectorizer = vectorizer('tf', min_df = 2, max_df = 1.0)
        vectorizer = vectorizer('tf', min_df = 1, max_df = 1.0)
        # generate input matrix
        doc_term_matrix = generate_input_matrix(vectorizer, data)
        # do black magic
        topic_model = generate_lda_topic_model(doc_term_matrix, n_topics, max_iterations) 
        # print topics
        topic_list = visualize_topics (path, topic_model, model_type, vectorizer.get_feature_names(), n_topics, n_top_words, save)
"""

"\n# EXECUTE\n# load data\nfor path in paths:\n    data = load_data(path + doc_type[0])\n\n    if (model_type in model_types):\n        # choose vectorizer     min_df = 2 ignorálja a csak 1x elôforduló szavakat, max_df = 1.0 alap\n#        vectorizer = vectorizer('tfidf', min_df = 2, max_df = 1.0)\n        vectorizer = vectorizer('tfidf', min_df = 1, max_df = 1.0)\n        # generate input matrix\n        doc_term_matrix = generate_input_matrix(vectorizer, data)\n        # do black magic\n        topic_model = generate_nmf_topic_model(doc_term_matrix, model_type, n_topics, max_iterations)\n        # print topics\n        topic_list = visualize_topics (path, topic_model, model_type, vectorizer.get_feature_names(), n_topics, n_top_words, save)\n    \n    else:\n        # choose vectorizer    min_df = 2 ignorálja a csak 1x elôforduló szavakat, max_df = 1.0 alap\n#        vectorizer = vectorizer('tf', min_df = 2, max_df = 1.0)\n        vectorizer = vectorizer('tf', min_df = 1, max_df = 1.0