In [19]:
import pandas as pd

data = pd.read_csv('ytvideo.csv', header=None)
data.columns = ['video_title']
data['index'] = data.index
documents = data

In [20]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
np.random.seed(2018)

In [21]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     /afs/crc.nd.edu/user/y/ywu6/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [22]:
stemmer = SnowballStemmer('english')
def lemmatize_stemming(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))

def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) >= 3:
            result.append(lemmatize_stemming(token))
    return result

In [23]:
doc_sample = documents[documents['index'] == 250].values[0][0]
print('original document: ')
words = []
for word in doc_sample.split(' '):
    words.append(word)
print(words)
print('\n\n tokenized and lemmatized document: ')
print(preprocess(doc_sample))

original document: 
['The', 'perfect', 'ice', '-', 'Ice', 'boats', 'on', 'Lake', 'Geneva']


 tokenized and lemmatized document: 
['perfect', 'ice', 'ice', 'boat', 'lake', 'geneva']


In [24]:
processed_docs = documents['video_title'].map(preprocess)

In [25]:
processed_docs[:10]

0                                [want, talk, marriag]
1      [trump, presid, week, tonight, john, oliv, hbo]
2    [racist, superman, rudi, mancuso, king, bach, ...
3                      [nickelback, lyric, real, fake]
4                                     [dare, go, bald]
5                                        [week, iphon]
6          [roy, moor, jeff, session, cold, open, snl]
7                           [ice, cream, gadget, test]
8    [greatest, showman, offici, trailer, centuri, ...
9                 [rise, robot, wonâ, mean, end, work]
Name: video_title, dtype: object

In [26]:
dictionary = gensim.corpora.Dictionary(processed_docs)

In [27]:
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]

In [28]:
bow_doc_200 = bow_corpus[200]

for i in range(len(bow_doc_200)):
    print("Word {} (\"{}\") appears {} time.".format(bow_doc_200[i][0], 
                                                     dictionary[bow_doc_200[i][0]], 
                                                     bow_doc_200[i][1]))

Word 768 ("airlin") appears 1 time.
Word 769 ("brent") appears 1 time.
Word 770 ("fli") appears 1 time.
Word 771 ("pella") appears 1 time.
Word 772 ("shouldn") appears 1 time.
Word 773 ("spirit") appears 1 time.


In [29]:
from gensim import corpora, models

tfidf = models.TfidfModel(bow_corpus)
corpus_tfidf = tfidf[bow_corpus]

In [30]:
from pprint import pprint

for doc in corpus_tfidf:
    pprint(doc)
    break

[(0, 0.6385308231954088), (1, 0.4296007165482703), (2, 0.6385308231954088)]


In [31]:
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=30, id2word=dictionary, passes=2, workers=2)

In [32]:
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.028*"surpris" + 0.028*"lyric" + 0.023*"video" + 0.014*"jimmi" + 0.014*"pitbul" + 0.014*"finish" + 0.014*"fallon" + 0.014*"pay" + 0.014*"cabello" + 0.014*"jungl"
Topic: 1 
Words: 0.031*"audio" + 0.025*"star" + 0.025*"offici" + 0.022*"bodi" + 0.022*"hilari" + 0.022*"coller" + 0.022*"know" + 0.019*"danc" + 0.019*"sun" + 0.019*"phillip"
Topic: 2 
Words: 0.030*"watch" + 0.021*"secret" + 0.018*"lace" + 0.018*"reveal" + 0.018*"lay" + 0.018*"wig" + 0.018*"aaliyahjay" + 0.015*"hous" + 0.015*"judiciari" + 0.015*"make"
Topic: 3 
Words: 0.035*"spaghetti" + 0.035*"burrito" + 0.018*"democraci" + 0.018*"dictat" + 0.018*"smart" + 0.018*"eminem" + 0.018*"card" + 0.018*"apart" + 0.018*"mug" + 0.018*"walk"
Topic: 4 
Words: 0.024*"jason" + 0.018*"offici" + 0.015*"trailer" + 0.012*"mcadam" + 0.012*"later" + 0.012*"bateman" + 0.012*"quindent" + 0.012*"game" + 0.012*"alan" + 0.012*"momoa"
Topic: 5 
Words: 0.046*"week" + 0.024*"player" + 0.024*"end" + 0.020*"iphon" + 0.017*"hbo" + 0.017*"jo

In [33]:
lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics=30, id2word=dictionary, passes=2, workers=4)

for idx, topic in lda_model_tfidf.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.020*"pacif" + 0.016*"classic" + 0.013*"electr" + 0.013*"homemad" + 0.013*"airplan" + 0.012*"rim" + 0.012*"chang" + 0.012*"format" + 0.012*"carrier" + 0.012*"ocean"
Topic: 1 
Words: 0.018*"slow" + 0.017*"cream" + 0.017*"gadget" + 0.016*"test" + 0.015*"ice" + 0.014*"littl" + 0.013*"train" + 0.012*"jimmi" + 0.010*"kiwi" + 0.010*"audio"
Topic: 2 
Words: 0.018*"iii" + 0.018*"betray" + 0.015*"nickelback" + 0.015*"offici" + 0.015*"act" + 0.014*"rachel" + 0.012*"reput" + 0.012*"video" + 0.012*"movi" + 0.011*"jason"
Topic: 3 
Words: 0.021*"talk" + 0.020*"marriag" + 0.020*"want" + 0.019*"awesom" + 0.019*"pet" + 0.015*"insid" + 0.015*"peopl" + 0.014*"detect" + 0.014*"car" + 0.013*"liza"
Topic: 4 
Words: 0.010*"alt" + 0.010*"pleader" + 0.009*"audio" + 0.008*"confession" + 0.008*"dashboard" + 0.008*"offici" + 0.008*"video" + 0.008*"airlin" + 0.008*"spirit" + 0.008*"shouldn"
Topic: 5 
Words: 0.020*"peopl" + 0.016*"shower" + 0.015*"video" + 0.015*"offici" + 0.013*"miss" + 0.012*"si

In [34]:
test_title = 'Taylor Swift Music Video Bad Blood'
bow_vector = dictionary.doc2bow(preprocess(test_title))

for index, score in sorted(lda_model[bow_vector], key=lambda tup: -1*tup[1]):
    print("Score: {}\t Topic: {}".format(score, lda_model.print_topic(index, 5)))

Score: 0.34910067915916443	 Topic: 0.038*"face" + 0.030*"offici" + 0.020*"old" + 0.020*"taylor" + 0.019*"iphon"
Score: 0.3286830484867096	 Topic: 0.090*"video" + 0.089*"offici" + 0.059*"music" + 0.029*"amor" + 0.025*"world"
Score: 0.17221234738826752	 Topic: 0.017*"cake" + 0.017*"studi" + 0.017*"helbig" + 0.017*"movi" + 0.017*"molli"


In [35]:
test_title = 'Best Vine 2018'
bow_vector = dictionary.doc2bow(preprocess(test_title))

for index, score in sorted(lda_model_tfidf[bow_vector], key=lambda tup: -1*tup[1]):
    print("Score: {}\t Topic: {}".format(score, lda_model_tfidf.print_topic(index, 5)))

Score: 0.5166414976119995	 Topic: 0.031*"spotmini" + 0.024*"new" + 0.021*"amor" + 0.018*"tea" + 0.018*"spill"
Score: 0.01666753552854061	 Topic: 0.020*"pacif" + 0.016*"classic" + 0.013*"electr" + 0.013*"homemad" + 0.013*"airplan"
Score: 0.01666753552854061	 Topic: 0.018*"slow" + 0.017*"cream" + 0.017*"gadget" + 0.016*"test" + 0.015*"ice"
Score: 0.01666753552854061	 Topic: 0.018*"iii" + 0.018*"betray" + 0.015*"nickelback" + 0.015*"offici" + 0.015*"act"
Score: 0.01666753552854061	 Topic: 0.021*"talk" + 0.020*"marriag" + 0.020*"want" + 0.019*"awesom" + 0.019*"pet"
Score: 0.01666753552854061	 Topic: 0.010*"alt" + 0.010*"pleader" + 0.009*"audio" + 0.008*"confession" + 0.008*"dashboard"
Score: 0.01666753552854061	 Topic: 0.020*"peopl" + 0.016*"shower" + 0.015*"video" + 0.015*"offici" + 0.013*"miss"
Score: 0.01666753552854061	 Topic: 0.017*"take" + 0.017*"offer" + 0.017*"lambo" + 0.017*"carmax" + 0.014*"fun"
Score: 0.01666753552854061	 Topic: 0.019*"veteransday" + 0.019*"thank" + 0.018*"talk"

In [40]:
test_title = 'food in china'
bow_vector = dictionary.doc2bow(preprocess(test_title))

for index, score in sorted(lda_model[bow_vector], key=lambda tup: -1*tup[1]):
    print("Score: {}\t Topic: {}".format(score, lda_model.print_topic(index, 5)))

Score: 0.34444043040275574	 Topic: 0.035*"offici" + 0.026*"video" + 0.015*"china" + 0.015*"jimmer" + 0.015*"week"
Score: 0.34443923830986023	 Topic: 0.015*"frank" + 0.015*"japanes" + 0.015*"wild" + 0.015*"sakura" + 0.015*"sean"
Score: 0.011111441068351269	 Topic: 0.028*"surpris" + 0.028*"lyric" + 0.023*"video" + 0.014*"jimmi" + 0.014*"pitbul"
Score: 0.011111441068351269	 Topic: 0.031*"audio" + 0.025*"star" + 0.025*"offici" + 0.022*"bodi" + 0.022*"hilari"
Score: 0.011111441068351269	 Topic: 0.030*"watch" + 0.021*"secret" + 0.018*"lace" + 0.018*"reveal" + 0.018*"lay"
Score: 0.011111441068351269	 Topic: 0.035*"spaghetti" + 0.035*"burrito" + 0.018*"democraci" + 0.018*"dictat" + 0.018*"smart"
Score: 0.011111441068351269	 Topic: 0.024*"jason" + 0.018*"offici" + 0.015*"trailer" + 0.012*"mcadam" + 0.012*"later"
Score: 0.011111441068351269	 Topic: 0.046*"week" + 0.024*"player" + 0.024*"end" + 0.020*"iphon" + 0.017*"hbo"
Score: 0.011111441068351269	 Topic: 0.023*"open" + 0.023*"video" + 0.023*"l