In [1]:
import gensim.models
import gensim.corpora
import gensim as gs
import pandas as pd
from gensim.models import FastText
from gensim.models.phrases import Phrases, Phraser

In [4]:
def save_models(dataset_name, num_topics):
    """Train and persist the dictionary, LDA, FastText and bigram models for a dataset.

    Reads ``../cleaned/<dataset_name>_stems.csv`` (every row is treated as one
    tokenised sample) and ``../cleaned/<dataset_name>_clean.csv`` (column ``"t"``),
    then writes:

    * ``../models/dictionary/<dataset_name>_dictionary``
    * ``../models/topic_models/<dataset_name>_ldamodel``
    * ``../models/word_embeddings/<dataset_name>_fasttext``
    * ``../models/bigrams/bigram_<dataset_name>.pkl``

    Args:
        dataset_name: Base name of the dataset files, e.g. ``"norm_tweet"``.
        num_topics: Number of topics the LDA model is trained with.

    Returns:
        None. Every model is saved to disk.
    """
    print("loading topic data for", dataset_name)
    # load the token rows; every cell is coerced to str
    rows = pd.read_csv("../cleaned/" + dataset_name + "_stems.csv").astype(str).values.tolist()
    # remove placeholders from the stems dataset (cells that hold a single space)
    print("removing placeholders")
    dataset = [[token for token in row if token != " "] for row in rows]
    # create dictionary, corpus and LDA model
    print("making dic")
    dic = gs.corpora.Dictionary(dataset)
    dic.save("../models/dictionary/" + dataset_name + "_dictionary")
    print("making corpus")
    corpus = [dic.doc2bow(sample) for sample in dataset]
    print("making lda")
    lda_model = gensim.models.ldamulticore.LdaMulticore(
        corpus=corpus,
        id2word=dic,
        num_topics=num_topics,
        random_state=100,
        chunksize=100,
        passes=10,
        per_word_topics=True,
    )
    lda_model.save("../models/topic_models/" + dataset_name + "_ldamodel")
    print("making fasttext")
    # Gensim expects every sentence to be a list of tokens. Feeding it the
    # space-joined strings makes it iterate over single characters, so the
    # embeddings would be learned for characters instead of words. Train on the
    # token lists directly.
    vector_model = FastText(size=32, window=3, min_count=1)
    vector_model.build_vocab(sentences=dataset)
    vector_model.train(
        sentences=dataset,
        total_examples=len(dataset),
        total_words=vector_model.corpus_total_words,
        epochs=10,
    )
    vector_model.save("../models/word_embeddings/" + dataset_name + "_fasttext")
    # make bigram model
    # Empty text cells are read back as NaN (a float), which has no .split();
    # drop them before tokenising.
    sentences = pd.read_csv("../cleaned/" + dataset_name + "_clean.csv")["t"].dropna().astype(str).tolist()
    tokenized = [sentence.split() for sentence in sentences]
    phrases = Phrases(tokenized)
    bigram = Phraser(phrases)
    bigram.save("../models/bigrams/bigram_" + dataset_name + ".pkl")

In [6]:
def load_test(dataset_name, num_topics):
    """Load and return the saved LDA topic model of ``dataset_name``.

    The model is read from ``../models/topic_models/<dataset_name>_ldamodel``.

    Note: ``num_topics`` is accepted but not used, and although a
    "loading dic" message is printed, no dictionary is loaded here.
    """
    print("loading dic")
    print("loading topic model")
    model_path = "".join(("../models/topic_models/", dataset_name, "_ldamodel"))
    return gensim.models.ldamulticore.LdaMulticore.load(model_path)
    

In [24]:
# Load the tweet topic model and collect the top words of every shown topic.
lda_model = load_test("norm_tweet", 10)
topics = lda_model.show_topics(formatted=False)
t = [[entry[0] for entry in topic[1]] for topic in topics]
print(t)

# For every pair of neighbouring topics, print the words they share
# (set intersection, despite the "u" label in the output).
for i, (a, b) in enumerate(zip(t, t[1:])):
    print(i, "u", i+1, set(a) & set(b))

loading dic
loading topic model
[['probabl', 'birthday', 'tiangong', 'prohibit', '2017', 'decommiss', 'laboratori', 'satellit', 'profan', 'unfortun'], ['mean', 'bed', 'readi', 'experi', 'fail', 'funni', 'clear', 'give', 'tree', 'consid'], ['year', "'", 'see', 'fantast', 'young', 'servic', 'everybodi', 'cuddl', 'usa', 'genius'], ['long', 'twitter', 'damn', 'bout', 'ya', 'parti', 'sent', 'app', 'doin', 'grad'], ['have', 'hous', 'water', 'nice', 'citi', 'pay', 'stomach', 'hors', 'bf', 'carri'], ['hour', 'wo', 'black', 'friday', 'dentist', 'abl', 'burst', 'limit', 'energi', 'mental'], ['biggest', 'step', 'okay', 'woke', 'practic', 'natur', 'shower', 'boyfriend', 'exact', '2012'], ['god', 'talk', 'phone', 'pop', 'drop', 'teacher', 'winner', 'batteri', 'tyler', 'tickl'], ['fun', 'soon', 'decid', 'keep', 'yay', '1st', 'date', 'worth', 'floor', 'hilari'], ['night', 'find', '/', 'idea', 'aw', 'wear', 'came', 'size', 'switch', 'brain']]
0 u 1 set()
1 u 2 set()
2 u 3 set()
3 u 4 set()
4 u 5 set()

In [22]:
# Load the emotion topic model and collect the top words of every shown topic.
lda_model = load_test("norm_emotion", 10)
topics = lda_model.show_topics(formatted=False)
t = [[entry[0] for entry in topic[1]] for topic in topics]
print(t)

# For every pair of neighbouring topics, print the words they share
# (set intersection, despite the "u" label in the output).
for i, (a, b) in enumerate(zip(t, t[1:])):
    print(i, "u", i+1, set(a) & set(b))

loading dic
loading topic model
[['m', 'feel', 'read', 'let', 'ill', 'book', 'hour', 'stay', 'readi', 'ladi'], ['lagi', 'greenlight', 'causal', 'rez', 'kurang', 'laa', 'chantel', 'tahan', 'sanskrit', 'disqus'], ['lagi', 'greenlight', 'causal', 'rez', 'kurang', 'laa', 'chantel', 'tahan', 'sanskrit', 'disqus'], ['lagi', 'greenlight', 'causal', 'rez', 'kurang', 'laa', 'chantel', 'tahan', 'sanskrit', 'disqus'], ['lagi', 'greenlight', 'causal', 'rez', 'kurang', 'laa', 'chantel', 'tahan', 'sanskrit', 'disqus'], ['know', 'feel', 'past', 'hand', 'came', 'deep', 'rough', 'ghost', 'brick', 'kurang'], ['lagi', 'greenlight', 'causal', 'rez', 'kurang', 'laa', 'chantel', 'tahan', 'sanskrit', 'disqus'], ['lagi', 'greenlight', 'causal', 'rez', 'kurang', 'laa', 'chantel', 'tahan', 'sanskrit', 'disqus'], ['lagi', 'greenlight', 'causal', 'rez', 'kurang', 'laa', 'chantel', 'tahan', 'sanskrit', 'disqus'], ['lagi', 'greenlight', 'causal', 'rez', 'kurang', 'laa', 'chantel', 'tahan', 'sanskrit', 'disqus']]
0 

In [4]:
# Number of LDA topics per dataset, looked up by dataset name.
num_topics_dict = {"norm_tweet": 79, "norm_emotion": 186}

# Datasets to process when the training loop below is re-enabled.
datasets = ["norm_emotion"]

# Disabled training / smoke-test driver.
# NOTE: num_topics_dict has no "test" key, so the "test" lines below would
# raise KeyError as written.
#for dataset in datasets: 
#    save_models(dataset, num_topics_dict[dataset])
#dataset_name = "test"
#save_models(dataset_name, num_topics_dict[dataset_name])

#load_test(dataset_name, num_topics_dict[dataset_name] + 10)