In [93]:
import pandas as pd
import gensim.models
import gensim.corpora
import gensim as gs
import pyLDAvis as pvis
import pyLDAvis.gensim
import gensim.models.coherencemodel
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
import seaborn as sn
from sklearn.metrics import f1_score
from gensim.models import FastText
from sklearn.metrics import classification_report

In [94]:
def make_topic_data(dataset_name):
    print("loading vector data for", dataset_name)
    sentences = pd.read_csv("../cleaned/" + dataset_name + "_stems.csv", delimiter=",").astype(str).values.tolist() 
    for index, sample in enumerate(sentences): 
            sentences[index] = list(filter((" ").__ne__, sample))
    #sentences_whole = [" ".join(sentence) for sentence in sentences_split]
    #tokens = [token for sentence in sentences_split for token in sentence]
    dic = gs.corpora.Dictionary(sentences)
    corpus = [dic.doc2bow(sample) for sample in sentences]
    #print("--- sentences_split: \n", sentences_split, "\n")
    #print("--- sentences_whole: \n", sentences_whole, "\n")
    #print("--- tokens: \n", tokens, "\n")
    return sentences, dic, corpus

def visualize_lda(model, corpus, dic):
    pvis.enable_notebook()
    vis = pvis.gensim.prepare(model, corpus, dic)
    vis.show()

def get_coherence_score(model, sentences, dic):
    # the higher the better it is, nutzen um versch. modelle zu vergleichen (mit untersch. topic-anzahö)
    coherence_model_lda = gensim.models.coherencemodel.CoherenceModel(model=model, texts=sentences, dictionary=dic, coherence='c_v')
    coherence_score = coherence_model_lda.get_coherence()
    #print('\nCoherence Score: ', coherence_lda)
    return coherence_score

def draw_plot(dataset_name, x, y, best_coherence, best_num_topics):
    fig, ax = plt.subplots()
    ax.plot(x, y)
    ax.set(xlabel="num_topics", ylabel="coherence")
    desc = "dataset: {}\nbest coherence: {}, with topics: {}".format(dataset_name, best_coherence, best_num_topics)
    fig.text(0.5, -0.07, desc, ha='center')
    plt.grid()
    plt.show()
    fig.savefig("../img/num_topics_" + dataset_name + ".png", bbox_inches="tight")
    
def find_best_topic_num(dataset_name, lim_low, lim_high):
    coherences = []
    models = []
    sentences, dic, corpus = make_data(dataset_name)
    for i in range(lim_low, lim_high+1):
        print(dataset_name + "... loop {} / {}".format(i, lim_high))
        #lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=dic, num_topics=i, random_state=100,
        #                               update_every=1, chunksize=100, passes=10, per_word_topics=True)
        lda_model = gensim.models.ldamulticore.LdaMulticore(corpus=corpus, id2word=dic, num_topics=i, random_state=100,
                                       chunksize=100, passes=10, per_word_topics=True)#update_every=1, 
        models.append(lda_model)
        coherences.append(get_coherence_score(lda_model, sentences, dic))
    max_coherence_index = coherences.index(max(coherences))
    draw_plot(dataset_name, rst(ange(lim_low, len(coherences)+lim_low)), coherences, max(coherences), max_coherence_index+lim_low)
    models[max_coherence_index].save("../models/tm_" + dataset_name + ".model")

def classify_with_lr(train_x, test_x, train_y, test_y): 
    print("building lr model")
    lr = LogisticRegression(multi_class="multinomial", solver="newton-cg")
    print("... training model")
    lr.fit(train_x, train_y)
    print("... calcularing score")
    pred_y = lr.predict(test_x)
    # model metadata
    score, f1_scoore = lr.score(train_x, train_y), f1_score(test_y, pred_y, average="weighted")
    return (test_y, pred_y, score, f1_scoore), lr.coef_ 

def make_topic_data(dataset_name, num_topics):
    print("loading topic data for", dataset_name)
    # load inputs and labels
    dataset = pd.read_csv("../cleaned/" + dataset_name + "_stems.csv").astype(str).values.tolist() 
    targets = pd.read_csv("../cleaned/" + dataset_name + "_clean.csv")["a"].tolist()
    # remove placeholders from the stems dataset
    for index, sample in enumerate(dataset): 
            dataset[index] = list(filter((" ").__ne__, sample))
    # create dic, copora and lda-model
    dic = gs.corpora.Dictionary(sentences)
    corpus = [dic.doc2bow(sample) for sample in sentences]
    lda_model = gensim.models.ldamulticore.LdaMulticore(corpus=corpus, id2word=dic, num_topics=num_topics, random_state=100,
                                           chunksize=100, passes=10, per_word_topics=True)#update_every=1, 
    
    vecs = []
    # for every sentence in the dataset
    for i, sample in enumerate(dataset):
        # get the vector-representations from the doc
        sentence = dic.doc2bow(dataset[i])
        # get the topics from the document (they are ordered by the topic ic)
        topics = lda_model.get_document_topics(sentence, minimum_probability=0.0)
        # write the probability for every topic into a single list
        topic_vec = [topics[i][1] for i in range(num_topics)] 
        # append the prob-vector for this sentence into the all-vectors-list
        vecs.append(topic_vec)
    dataset = vecs
    
    train_x, test_x, train_y, test_y = train_test_split(dataset, targets, test_size=0.2)
    return dic, corpus, lda_model, train_x, test_x, train_y, test_y

In [91]:
datasets = ["test"]
num_topics_dict = {
    "norm_tweet": 8,
    "norm_emotion": 8,
    "norm_test": 8,
    "test": 8
}

In [52]:
# find the optimal number of topics for each dataset
for dataset_name in datasets: 
    find_best_topic_num(dataset_name, 5, 8)
sentences, dic, corpus = make_data("norm_tweet")

loading vector data for norm_tweet


In [92]:
# train loistic regression over the topic distributions
all_results = []
coefficients = []
for dataset_name in datasets: 
    dic, corpus, lda_model, train_x, test_x, train_y, test_y = make_topic_data("test", num_topics_dict.get(dataset_name))
    results, coef = classify_with_lr(*load_lex_data(dataset, feature_set))
    all_results.append([dataset_name, *results])
    coefficients.append(coef)
    
# print reports, make graphics
for index, result in enumerate(all_results): 
    with open("../img/report_lr_" + result[0] + "_"  + result[1] + ".txt", 'w') as f:
        print((result[0] + "_" + result[1] + " (" + str(result[5]) + "):\n" + 
          classification_report(result[2], result[3],target_names=classes)), file=f)
    draw_coefficients_plot(result[0], re

#topics = lda_model.show_topics()
#for topic in topics[:5]: 
#    print(topic)

#print(lda_model.get_topic_terms(topicid=1, topn=2))

8
loading topic data for test


Process ForkPoolWorker-52:
Process ForkPoolWorker-53:
Traceback (most recent call last):
  File "/usr/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
Traceback (most recent call last):
Process ForkPoolWorker-51:
  File "/usr/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/usr/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/usr/lib/python3.6/multiprocessing/pool.py", line 103, in worker
    initializer(*initargs)
  File "/home/marcel/.local/lib/python3.6/site-packages/gensim/models/ldamulticore.py", line 334, in worker_e_step
    chunk_no, chunk, worker_lda = input_queue.get()
Traceback (most recent call last):
  File "/usr/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/usr/lib/python3.6/multiprocessing/queues.py", line 94, in get
    res = self._recv_bytes()
  File "/usr/lib/pyth

KeyboardInterrupt: 

  File "/usr/lib/python3.6/multiprocessing/synchronize.py", line 95, in __enter__
    return self._semlock.__enter__()
KeyboardInterrupt


In [55]:
# Converting Topics to Feature Vectors
#new_doc = [dic.doc2bow(sample) for sample in sentences_split[:1]]
# ein neues doc muss ein satz sein (also eine liste)
train_vecs = []
for i, sentence in enumerate(sentences[:1]):
    #print(sentences[i])
    new_doc = dic.doc2bow(sentences[i])
    top_topics = lda_model.get_document_topics(new_doc, minimum_probability=0.0)
    print(top_topics)
    print(lda_model.get_topic_terms(0))
    topic_vec = [top_topics[i][1] for i in range(20)] # das hier anpassen
    #topic_vec.extend([rev_train.iloc[i].real_counts]) # counts of reviews for restaurant
    #topic_vec.extend([len(rev_train.iloc[i].text)]) # length review
    train_vecs.append(topic_vec)
    #print(topic_vec)


[(0, 0.003907854), (1, 0.003907854), (2, 0.003907854), (3, 0.003907854), (4, 0.003907854), (5, 0.003907854), (6, 0.003907854), (7, 0.003907854), (8, 0.003907854), (9, 0.003907854), (10, 0.003907854), (11, 0.87885654), (12, 0.003907854), (13, 0.003907854), (14, 0.003907854), (15, 0.003907854), (16, 0.003907854), (17, 0.003907854), (18, 0.003907854), (19, 0.003907854), (20, 0.003907854), (21, 0.003907854), (22, 0.003907854), (23, 0.003907854), (24, 0.003907854), (25, 0.003907854), (26, 0.003907854), (27, 0.003907854), (28, 0.003907854), (29, 0.003907854), (30, 0.003907854), (31, 0.003907854)]
[(746, 0.1665889), (604, 0.078676336), (628, 0.062612906), (556, 0.046343762), (40, 0.03341817), (1532, 0.029774545), (2371, 0.02411643), (2297, 0.022226531), (962, 0.02126315), (2056, 0.019251674)]


In [None]:
# trainings, testdaten speichern
# für jeden datensatz ein model speichern
    # iterieren mit unterschiedlicher topic-anzahl
    
# gucken welche topic-ids mit den vier Klassen übereinstimmen
    # vlt die wörter der klasse in den lexika suchen? score ausrechnen (wörter in lexikon/länge der topic-wörter?)
    # das aber nur zusätzlich zum manuellen machen
    
# testdaten laufen lassen
    # gucken welches die höchste topic-id ist
    # schauen ob sie der id entspricht die ich oben aufgestellt habe und klassifikation danach machen
    # confusion matrix etc berechnen

# für jeden unbekannten satz den ich eingebe:
    # in bow umrechnen
    # topic-scores der vier klassen berechhnen
    # klassifizieren
    
# den kram aufschreiben :(

In [None]:
# show the first 10 topics
#print(lda_model.show_topics(num_topics=10))
# show all tokens that are part of a topic (only the top 2 words)
#print(lda_model.get_topic_terms(topicid=0, topn=2))
# get the word for a id in the dic
#print(id2word[143])
# print corpus
#print(corpus[:1])
# print corpus but with names, not with ids
#print([[(id2word[id], freq) for id, freq in cp] for cp in corpus[:1]])
#visualize_lda(lda_model, corpus, dic)
#lda_model.save("location.model")

In [82]:
#pvis.enable_notebook()
#vis = pvis.gensim.prepare(lda_model, corpus, dic)
#vis

In [72]:
#new_doc = [dic.doc2bow(sample) for sample in sentences_split[:1]]
# ein neues doc muss ein satz sein (also eine liste)
#new_doc2 = dic.doc2bow(*sentences_split[:1])
#print(new_doc)
#print(new_doc2)
# get topics from a new document (fremd am besten)
#top = lda_model.get_document_topics(new_doc, minimum_probability=None, minimum_phi_value=None, per_word_topics=False)
#top2 = lda_model.get_document_topics(new_doc2, minimum_probability=None, minimum_phi_value=None, per_word_topics=False)
# zeige alle topics in dem document
#for i, x in enumerate(top):
#    print(x)
#for i, x in enumerate(top2):
#    print(x)