In [74]:
import pandas as pd

# Load the raw video titles. The CSV has no header row, so its single column
# is given an explicit name.
# NOTE(review): a later cell's output shows the token 'wonâ', which hints at
# mis-encoded characters in the titles — confirm the file's encoding.
data = pd.read_csv('ytvideo.csv', header=None)
data.columns = ['video_title']

# `documents` is the very same DataFrame object as `data` (an alias, not a copy).
documents = data

# Mirror the row labels into an ordinary column so rows can be selected by it.
documents['index'] = documents.index

In [75]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
# Seed NumPy's global random generator with a fixed value so that any
# NumPy-based randomness later in the session is repeatable.
np.random.seed(2018)

In [76]:
import nltk
# Fetch the WordNet corpus into the local nltk_data directory (presumably
# needed by the WordNetLemmatizer used below). The log output that follows
# shows the package was already up to date and the call returned True.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     /afs/crc.nd.edu/user/y/ywu6/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [77]:
# Both helpers are created once and shared by every call, instead of building
# a fresh WordNetLemmatizer for each token.
stemmer = SnowballStemmer('english')
_lemmatizer = WordNetLemmatizer()


def lemmatize_stemming(text):
    """Lemmatize a single token as a verb, then stem the result.

    Parameters
    ----------
    text : str
        One token (word) to normalise.

    Returns
    -------
    str
        The output of the English Snowball stemmer applied to the WordNet
        lemma of ``text`` taken with ``pos='v'``.
    """
    return stemmer.stem(_lemmatizer.lemmatize(text, pos='v'))

def preprocess(text):
    """Turn a raw title into a list of normalised tokens.

    The text is tokenised with ``gensim.utils.simple_preprocess``. Tokens that
    are gensim stop words or shorter than three characters are dropped, and
    each remaining token is passed through ``lemmatize_stemming``.

    Parameters
    ----------
    text : str
        The document to process.

    Returns
    -------
    list
        The processed tokens, in their original order.
    """
    stop_words = gensim.parsing.preprocessing.STOPWORDS
    return [
        lemmatize_stemming(tok)
        for tok in gensim.utils.simple_preprocess(text)
        if len(tok) >= 3 and tok not in stop_words
    ]

In [78]:
# Pick the row whose 'index' column equals 250 and take its first cell, i.e.
# the first column of the first matching row.
doc_sample = documents.loc[documents['index'] == 250].iloc[0, 0]

# Show the raw title split on single spaces...
print('original document: ')
words = doc_sample.split(' ')
print(words)

# ...next to the same title after the preprocessing pipeline.
print('\n\n tokenized and lemmatized document: ')
print(preprocess(doc_sample))

original document: 
['The', 'perfect', 'ice', '-', 'Ice', 'boats', 'on', 'Lake', 'Geneva']


 tokenized and lemmatized document: 
['perfect', 'ice', 'ice', 'boat', 'lake', 'geneva']


In [79]:
# Run `preprocess` on every title; the result is a Series holding one token
# list per title (see the preview in the next cell).
processed_docs = documents['video_title'].map(preprocess)

In [80]:
# Preview the first ten processed titles.
processed_docs.head(10)

0                                [want, talk, marriag]
1      [trump, presid, week, tonight, john, oliv, hbo]
2    [racist, superman, rudi, mancuso, king, bach, ...
3                      [nickelback, lyric, real, fake]
4                                     [dare, go, bald]
5                                        [week, iphon]
6          [roy, moor, jeff, session, cold, open, snl]
7                           [ice, cream, gadget, test]
8    [greatest, showman, offici, trailer, centuri, ...
9                 [rise, robot, wonâ, mean, end, work]
Name: video_title, dtype: object

In [81]:
# Build a gensim Dictionary from the token lists. It is used below both to
# convert documents to bag-of-words form and to look a token up by its
# integer id.
dictionary = gensim.corpora.Dictionary(processed_docs)

In [82]:
# Convert every processed title to its bag-of-words representation.
bow_corpus = list(map(dictionary.doc2bow, processed_docs))

In [83]:
bow_doc_200 = bow_corpus[200]

# Each bag-of-words entry is a (token id, count) pair; print the id, the
# token the dictionary stores for that id, and the count.
for token_id, token_count in bow_doc_200:
    print("Word {} (\"{}\") appears {} time.".format(token_id,
                                                     dictionary[token_id],
                                                     token_count))

Word 768 ("airlin") appears 1 time.
Word 769 ("brent") appears 1 time.
Word 770 ("fli") appears 1 time.
Word 771 ("pella") appears 1 time.
Word 772 ("shouldn") appears 1 time.
Word 773 ("spirit") appears 1 time.


In [84]:
from gensim import corpora, models

# Fit a TF-IDF model on the bag-of-words corpus, then index the model with
# that same corpus to obtain its TF-IDF-weighted form.
tfidf = models.TfidfModel(bow_corpus)
corpus_tfidf = tfidf[bow_corpus]

In [85]:
from pprint import pprint

# Pretty-print only the first TF-IDF document; nothing is printed when the
# corpus is empty.
first_doc = next(iter(corpus_tfidf), None)
if first_doc is not None:
    pprint(first_doc)

[(0, 0.6385308231954088), (1, 0.4296007165482703), (2, 0.6385308231954088)]
