In [1]:
from __future__ import print_function
from time import time

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
import pandas as pd
import os
import lda
import numpy as np

In [2]:
import csv

In [219]:
# --- Topic-model configuration -------------------------------------------
# Number of latent topics to fit, and how many words to keep per topic.
n_topics, n_top_words = 20, 20

# Vocabulary cap handed to the count vectorizer as max_features.
n_features = 6000

# Number of sampling iterations handed to the LDA model.
n_iter = 500

# To experiment on a subset, define n_sample here (e.g. 2000) and re-enable
# the commented-out .iloc[:n_sample] slices in the loading cell.

In [250]:
# Directory holding the cleaned CSV exports, and the files available in it.
path = './data/'
f_lst = [
    'new_clean_biology.csv',
    'new_clean_cooking.csv',
    'new_clean_crypto.csv',
    'new_clean_diy.csv',
    'new_clean_robotics.csv',
    'new_clean_travel.csv',
]

print('loading dataset...')
t0 = time()

# Index 5 selects the travel file; pick another index to process another file.
f = f_lst[5]
dataset = pd.read_csv(path + f, header=0)

# The whole column is used; append .iloc[:n_sample] to both lines to subsample.
data_sample = dataset['question']  # .iloc[:n_sample]
tag_sample = dataset['tags']  # .iloc[:n_sample]

# Number of documents that will be vectorized and modelled.
n_sample = len(data_sample)
print(n_sample)
print('done in %0.3fs.' % (time() - t0))

loading dataset...
19278
done in 0.147s.


In [251]:
# TF-IDF representation of the questions over the full unigram vocabulary
# (English stop words removed, no max_features cap).
# min_df=1 keeps every term that occurs in at least one document, which is the
# same selection the previous integer min_df=0 produced; recent scikit-learn
# releases reject an integer min_df below 1.
TfIdfVectorizer = TfidfVectorizer(analyzer='word',
                                  ngram_range=(1, 1),
                                  min_df=1,
                                  stop_words='english')
matrix = TfIdfVectorizer.fit_transform(data_sample)

# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); support both and keep feature_names a plain list.
if hasattr(TfIdfVectorizer, 'get_feature_names_out'):
    feature_names = list(TfIdfVectorizer.get_feature_names_out())
else:
    feature_names = TfIdfVectorizer.get_feature_names()

# NOTE: this vocabulary is not capped, so its column order is only valid for
# `matrix`; do not use it to label columns produced by a different vectorizer.


In [252]:
# Raw term counts for LDA: English stop words dropped, vocabulary capped at
# n_features terms.
print('extracting tf features for LDA...')
tf_vectorizer = CountVectorizer(
    stop_words='english',
    max_features=n_features,
)

# Only the fit/transform step is timed, not the vectorizer construction.
t0 = time()
tf = tf_vectorizer.fit_transform(data_sample)
print('done in %0.3fs.' % (time() - t0))

extracting tf features for LDA...
done in 1.068s.


In [253]:
# Fit the LDA model on the count matrix and collect the top words per topic.
print('fitting LDA...')
t0 = time()
model = lda.LDA(n_topics=n_topics, n_iter=n_iter, random_state=1)
model.fit(tf)
topic_word = model.topic_word_

# The columns of topic_word follow the columns of `tf`, so the words must come
# from tf_vectorizer (the vectorizer that produced `tf`), not from the TF-IDF
# vectorizer whose uncapped vocabulary has a different column order.
# get_feature_names() was removed in scikit-learn 1.2; support both spellings.
if hasattr(tf_vectorizer, 'get_feature_names_out'):
    lda_vocab = np.asarray(tf_vectorizer.get_feature_names_out())
else:
    lda_vocab = np.asarray(tf_vectorizer.get_feature_names())

# Map topic index -> array of its n_top_words highest-weight words, heaviest
# first. (The former slice [:-n_top_words:-1] returned one word too few.)
topic_word_dict = {}
for i, topic_dist in enumerate(topic_word):
    top_columns = np.argsort(topic_dist)[::-1][:n_top_words]
    topic_words = lda_vocab[top_columns]
    topic_word_dict[i] = topic_words
print('done in %0.3fs.' % (time() - t0))

INFO:lda:n_documents: 19278
INFO:lda:vocab_size: 6000
INFO:lda:n_words: 816405
INFO:lda:n_topics: 20
INFO:lda:n_iter: 500


fitting LDA...


INFO:lda:<0> log likelihood: -9622323
INFO:lda:<10> log likelihood: -7005140
INFO:lda:<20> log likelihood: -6642906
INFO:lda:<30> log likelihood: -6497020
INFO:lda:<40> log likelihood: -6423438
INFO:lda:<50> log likelihood: -6387314
INFO:lda:<60> log likelihood: -6364565
INFO:lda:<70> log likelihood: -6347697
INFO:lda:<80> log likelihood: -6333139
INFO:lda:<90> log likelihood: -6324904
INFO:lda:<100> log likelihood: -6316666
INFO:lda:<110> log likelihood: -6310219
INFO:lda:<120> log likelihood: -6304970
INFO:lda:<130> log likelihood: -6300032
INFO:lda:<140> log likelihood: -6295761
INFO:lda:<150> log likelihood: -6292635
INFO:lda:<160> log likelihood: -6290116
INFO:lda:<170> log likelihood: -6286801
INFO:lda:<180> log likelihood: -6282958
INFO:lda:<190> log likelihood: -6281784
INFO:lda:<200> log likelihood: -6279282
INFO:lda:<210> log likelihood: -6278329
INFO:lda:<220> log likelihood: -6276400
INFO:lda:<230> log likelihood: -6274189
INFO:lda:<240> log likelihood: -6272822
INFO:lda:<2

done in 50.795s.


In [254]:
# Per-document topic weights from the fitted model (one row per document).
doc_topic = model.doc_topic_

# Preview the first ten documents: tags plus the strongest topic, computed two
# ways. The two indices can disagree (presumably when weights tie), because
# argmax and a reversed argsort pick different positions among equal values.
for row in range(10):
    weights = doc_topic[row]
    print("{} (top topic:) {}".format(tag_sample[row], weights.argmax()))
    print(weights.argsort()[::-1][0])

caribbean cruising vacations (top topic:) 15
15
guides extreme-tourism amazon-river amazon-jungle (top topic:) 14
14
loyalty-programs routes ewr singapore-airlines sin (top topic:) 2
2
romania transportation (top topic:) 14
14
extreme-tourism antarctica (top topic:) 1
1
usa airport-transfer taxis seattle (top topic:) 11
17
sightseeing public-transport transportation argentina (top topic:) 3
3
safety international-travel money exchange (top topic:) 4
4
russia visas china mongolia trans-siberian (top topic:) 12
12
online-resources transportation peru south-america bolivia (top topic:) 9
9


In [255]:
# Export, for every document, the words of its three strongest topics
# ("<name>_all.csv") and the subset of those words found in the question text
# ("<name>_cleaned.csv").
result = []

# Output prefix: input file name without its extension and without the
# 'new_clean_' prefix (e.g. 'new_clean_travel.csv' -> 'travel').
base_name = os.path.splitext(os.path.basename(f))[0]
name_prefix = 'new_clean_'
if base_name.startswith(name_prefix):
    base_name = base_name[len(name_prefix):]

fieldnames_all = ['doc', 'topic_words']
fieldnames_clean = ['doc', 'cleaned_topic_words']

# `with` guarantees both files are closed even if a row fails to be written.
with open(base_name + '_all.csv', 'w') as all_word_file, \
        open(base_name + '_cleaned.csv', 'w') as clean_word_file:
    writer1 = csv.DictWriter(all_word_file, fieldnames=fieldnames_all)
    writer1.writeheader()
    writer2 = csv.DictWriter(clean_word_file, fieldnames=fieldnames_clean)
    writer2.writeheader()

    # Walk documents and their topic-weight rows in lockstep; zip stops at the
    # shorter input, so a subsampled model never indexes past its last row.
    for doc, question, topic_weights in zip(dataset['doc'],
                                            dataset['question'],
                                            doc_topic):
        # Indices of the (up to) three highest-weight topics, strongest first.
        top_3 = topic_weights.argsort()[::-1][:3]
        topic_words = [word
                       for topic_id in top_3
                       for word in topic_word_dict[topic_id]]

        # NOTE(review): `in` on a string is a substring test, so a topic word
        # also matches inside longer words; confirm whether whole-word
        # matching was intended before changing it.
        clean_w = [word for word in topic_words if word in question]

        writer1.writerow({'doc': doc, 'topic_words': ' '.join(topic_words)})
        writer2.writerow({'doc': doc, 'cleaned_topic_words': ' '.join(clean_w)})
    