In [53]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import  LatentDirichletAllocation
import pickle

In [55]:
# Read the pre-built bundle from disk. pickle.load can run arbitrary code
# embedded in the file, so only point this at a pickle you produced yourself.
with open("topic_modelling.pickle", "rb") as pickle_file:
    pickle_data = pickle.load(pickle_file)

# The "data" entry is the corpus that gets vectorised further down.
data = pickle_data["data"]

In [65]:
# Raw term-count vectoriser; the options are collected in one place so they
# are easy to tune.
vectorizer_options = {
    'max_df': 0.90,           # ignore terms found in more than 90% of documents
    'min_df': 0.05,           # ignore terms found in fewer than 5% of documents
    'max_features': 100000,   # upper bound on vocabulary size
    'ngram_range': (1, 2),    # unigrams and bigrams
    'stop_words': 'english',  # scikit-learn's built-in English stop-word list
}
tf_vectorizer = CountVectorizer(**vectorizer_options)

In [67]:
# Learn the vocabulary from `data` and build the document-term count matrix.
# NOTE(review): presumably `data` is a sequence of raw text documents, one per
# transcript — confirm against whatever produced the pickle.
dtm_tf = tf_vectorizer.fit_transform(data)

In [68]:
# Configure a 5-topic LDA estimator trained with the online variational method.
#
# scikit-learn 0.19 renamed the `n_topics` keyword to `n_components` and 0.21
# removed the old spelling, so passing `n_topics` raises TypeError on current
# releases. Pick whichever keyword the installed version understands so the
# cell runs on both old and new scikit-learn.
import inspect

lda_kwargs = dict(max_iter=500,
                  learning_method='online',
                  n_jobs=-1,            # use every available core
                  learning_offset=50.,
                  random_state=0)       # fixed seed for reproducible topics

lda_signature = inspect.signature(LatentDirichletAllocation.__init__)
if 'n_components' in lda_signature.parameters:
    lda_kwargs['n_components'] = 5
else:
    lda_kwargs['n_topics'] = 5

lda = LatentDirichletAllocation(**lda_kwargs)

In [69]:
# Fit the LDA model on the document-term counts. fit() returns the estimator,
# so its repr is echoed as this cell's output.
lda.fit(dtm_tf)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7,
             learning_method='online', learning_offset=50.0,
             max_doc_update_iter=100, max_iter=500, mean_change_tol=0.001,
             n_jobs=-1, n_topics=5, perp_tol=0.1, random_state=0,
             topic_word_prior=None, total_samples=1000000.0, verbose=0)

In [75]:
# Persist the fitted LDA model so it can be reloaded without retraining.
# `sklearn.externals.joblib` was removed in scikit-learn 0.23; prefer the
# standalone joblib package and fall back to the bundled copy on old installs.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib
joblib.dump(lda, 'topic_modeling_sklearn.pkl')

['topic_modeling_sklearn.pkl']

In [76]:
# Reload the model that the previous cell wrote to disk; `clf` is the copy
# used for the transforms further down.
clf = joblib.load('topic_modeling_sklearn.pkl') 

In [70]:
# Vocabulary terms indexed by feature (column) number of the vectoriser.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2 in
# favour of get_feature_names_out(), which returns an ndarray; convert it back
# to a list so the result has the same type on every version.
if hasattr(tf_vectorizer, 'get_feature_names_out'):
    tf_feature_names = tf_vectorizer.get_feature_names_out().tolist()
else:
    tf_feature_names = tf_vectorizer.get_feature_names()
#print_top_words(lda, tf_feature_names, 12)

In [71]:
#for i in range(len(file_details)):
#for i in range(5):
#    print(file_details[i], lda.transform(dtm_tf[i]))

In [72]:
import pyLDAvis
# NOTE(review): newer pyLDAvis releases appear to have renamed this submodule
# (to pyLDAvis.lda_model) — confirm against the installed version.
import pyLDAvis.sklearn
# Have pyLDAvis render its visualisations inline in the notebook.
pyLDAvis.enable_notebook()

In [77]:
# Interactive topic view for the reloaded model (`clf`), with the topic map
# laid out using the 'mmds' scaling option.
pyLDAvis.sklearn.prepare(clf, dtm_tf, tf_vectorizer, mds='mmds')

In [74]:
# Same visualisation for the in-memory model (`lda`), this time with the
# 'tsne' layout option for the topic map.
pyLDAvis.sklearn.prepare(lda, dtm_tf, tf_vectorizer, mds='tsne')

In [78]:
# Per-document labels stored in the same pickle as the corpus; the printed
# results further down show them to be transcript file names.
# NOTE(review): assumed to be in the same order as the rows of dtm_tf — confirm.
file_details = pickle_data["file_details"]

In [88]:
# Topic weights for the first document only (row 0 of the count matrix);
# a quick look at what transform() returns.
r = clf.transform(dtm_tf[0])

In [127]:
import numpy as np
# Toy example of the argsort idiom used to rank topics per document.
r = np.array([[3, 4, 1, 2, 5]])
order = np.argsort(r)              # column indices, smallest value first
ranks = np.argsort(order)          # argsort applied a second time
top_three = order[0][-3:][::-1]    # indices of the three largest values, biggest first
print(r, order, ranks, top_three, sep="\n")

[[3 4 1 2 5]]
[[2 3 0 1 4]]
[[2 3 0 1 4]]
[4 1 0]


In [115]:
# Pair every document with its three highest-weighted topics.
#
# Infer the topic weights for the whole matrix in one call instead of calling
# transform() once per sparse row: each row is still processed independently,
# so the ranking is unchanged, but the per-call overhead is paid only once.
doc_topic = clf.transform(dtm_tf)

# argsort sorts ascending, so the last three columns of each row are the three
# strongest topic ids, weakest of the three first (the display cell reverses
# them). The stored tuples keep the original (labels, file_detail) layout.
top_three = doc_topic.argsort(axis=1)[:, -3:]
topics = [(labels, file_details[i]) for i, labels in enumerate(top_three)]

In [128]:
# Show each document's stored topic ids in reverse order (strongest first),
# followed by its file name.
for top_labels, source_name in topics:
    print(top_labels[::-1], source_name)

[2 1 4] 3 Kinds Of People In Your Life-IyxeODanWJs.en.txt
[2 0 4] Sex With Other Women-kxCxD99XmdU.en.txt
[2 0 1] Escape THIS Malicious Psychological Trap-M4Ko32D5ALw.en.txt
[2 1 0] My boss is jerk, how do i tell him off-FVesOtzV32I.en.txt
[2 4 1] Save Yourself First-AtDDOO0ScGU.en.txt
[2 1 0] The Power of Doing Things You Hate-PODtipE3MHc.en.txt
[1 2 4] Selling Your Soul vs  Selling Juice-HHxkyyBqm-s.en.txt
[2 1 4] Sorry, I can't handle this (Leadership)-U4cXh8sdF9A.en.txt
[2 0 1] I only want virgins (who owns her vagina)-o2eOmAVxfZc.en.txt
[1 4 0] # Daily Habits of The Strong & Successful-F9WDR3pTYYo.en.txt
[2 1 0] Be More AGGRESSIVE!!-OPVWU6oSPZg.en.txt
[2 0 1] Easy Sex with Dumb Girls-mB246CSYqEU.en.txt
[1 2 0] Freedom to Do Whatever You Want-cTrjxhC4_SU.en.txt
[2 1 4] You MUST Promote Yourself-tfkj3pbLhvE.en.txt
[2 1 0] Redneck and a Muslim Hug-rhMM8ECJBHo.en.txt
[1 2 4] Cure Rigid Bodybuilder Syndrome-5O7ixEIPB04.en.txt
[2 0 4] Believe In YOURSELF - Rant Against Religion & Scienc

[2 1 0] The Art of Selfish Generosity-SYeNeDPrkQk.en.txt
[1 2 4] Reborn Through Battle Scars-3DmYrACkfj8.en.txt
[2 1 0] Go Fail, then Go Home-E_ghqZNdoqc.en.txt
[2 1 4] Nobody Respects Me-XBrn1ZxRI60.en.txt
[1 0 2] Best Fitness Books and Bogus Scientists (w_ Paul Chek)-gBPG7MqM6Fk.en.txt
[2 1 0] How To Get Your Passion Back-gPDzrg18q1w.en.txt
[2 1 0] When Is It Okay To FIGHT-0kwXQsbJvn4.en.txt
[2 1 4] How To Get Your Ex-Girlfriend Back-YjXK7zjTJLA.en.txt
[2 1 0] Create a Commanding Presence-EnNIPilEci0.en.txt
[2 1 4] Speak Up with Confidence-R-uwVYUmJt8.en.txt
[1 2 4] Make Him Do It Without Saying A Word-l6_EdU09vvI.en.txt
[1 2 4] What to do when everyone hates you-ifaPkMtkdi4.en.txt
[2 1 4] My Life is Great, But I'm Still Depressed-7AYxP2bTLlo.en.txt
[2 1 4] No Really Means Yes-r95yRi4GZPA.en.txt
[1 2 0] Get Hired and Never Be Fired-pmu2MGRBF1w.en.txt
[2 1 4] To be IN the World, but not OF it-CtOf5unCN2A.en.txt
[1 4 0] Bioenergetics For Your Face-e_wBaM4doN0.en.txt
[1 4 2] Fear of pub

[2 0 1] Being Vulnerable & Self Soothing-nLSMHqfu-_0.en.txt
[1 2 4] Conviction and Courage you can do ANYTHING!-imbFQMpFUvI.en.txt
[2 0 1] 24 year old virgin (fear vs pleasure)-_ZfdVc9jqSg.en.txt
[1 2 4] How to release trapped emotion speaking in tongues)-oNCuM08nZss.en.txt
[1 0 2] 5 Ways Training Saved My Life-0EpfRxL0PF4.en.txt
[2 1 0] Selling your Soul vs  Broken Heart-8WdkEzhpzCM.en.txt
[2 0 1] Stop Being So Awesome-de9RGVkEwuQ.en.txt
[1 2 4] Why I refer to myself in the 3rd person 'Elliott Hulse'-x7DqKj2kDO4.en.txt
[2 4 0] Truth About Your GUILT-kzx0gQdGnaA.en.txt
[2 1 4] Cheating on My Mean Wife-WaR7ltNrTbM.en.txt
[2 1 4] Save The Vagina-MQD8Y2lF4PM.en.txt
[2 1 0] Become More Attractive To Women-mxrP_yAceww.en.txt
[2 1 0] Stop Sucking Her Tits!-s_GVFUIq3UQ.en.txt
[1 0 2] Building 'Good Will' and Influence Is More Powerful Than Marketing [Yo Elliott Miami 2013]-s_WRb1Hqo8M.en.txt
[2 4 1] Ugly Girl-FvQsiWfv0Uc.en.txt
[1 2 4] Always Acting Fake-M5vUlYTGXhU.en.txt
[1 4 2] Train VIRTU

In [117]:
# Save the (top-topic ids, file detail) pairs for use outside this notebook.
with open("topic_modelling_file_list.pickle", "wb") as out_file:
    pickle.dump(topics, out_file)