In [5]:
from __future__ import division, print_function, absolute_import
from past.builtins import basestring

import os

import pandas as pd

from twip.constant import DATA_PATH
import string

In [6]:
import matplotlib
from IPython.display import display, HTML 
%matplotlib inline
np = pd.np
display(HTML("<style>.container { width:100% !important; }</style>"))
pd.set_option('display.max_rows', 6)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 500)

In [7]:
import gzip
from gensim.models import TfidfModel
from gensim.corpora import Dictionary

Load previously cleaned data

In [9]:
# Load the previously cleaned tables (dates, numeric fields, text) from DATA_PATH.
dates = pd.read_csv(os.path.join(DATA_PATH, 'datetimes.csv.gz'), engine='python')
nums = pd.read_csv(os.path.join(DATA_PATH, 'numbers.csv.gz'), engine='python')
with gzip.open(os.path.join(DATA_PATH, 'text.csv.gz'), 'rb') as f:
    # `DataFrame.from_csv` is deprecated (removed in pandas 1.0). `read_csv` with
    # `index_col=0, parse_dates=True` reproduces the defaults it used.
    df = pd.read_csv(f, index_col=0, parse_dates=True, encoding='utf8')
df.tokens

87        [python, never, stop, learning, what, you, enj...
88                              [Watching, Boa, vs, Python]
90          [Monty, Python, The, silly, walk, via, YouTube]
                                ...                        
193375    [RT, RealPython, List, of, Python, API, Wrappe...
193376                          [Watching, Boa, vs, Python]
193377              [IT, Digital, Go, Senior, Python, Djan]
Name: tokens, dtype: object

In [10]:
# Demonstration of a failure: straight after loading, each `df.tokens` entry is a
# single string rather than a list of tokens, so gensim rejects it. The error is
# caught and shown so the notebook still runs top to bottom.
try:
    d = Dictionary.from_documents(df.tokens)
except TypeError as err:
    print('TypeError: {}'.format(err))

TypeError: doc2bow expects an array of unicode tokens on input, not a single string

In [11]:
# Peek at the first row of the freshly loaded `tokens` column.
df['tokens'].iloc[0]

u'[python, never, stop, learning, what, you, enjoy, doing]'

When we said "QUOTE_NONNUMERIC" we didn't mean **ALL** nonnumeric fields ;)

In [16]:
# Rebuild real token lists by whitespace-splitting the raw text column,
# overwriting the stringified `tokens` column that came out of the CSV.
df['tokens'] = df['txt'].str.split()
df['tokens']

87        [python, never, stop, learning, what, you, enj...
88                              [Watching, Boa, vs, Python]
90          [Monty, Python, The, silly, walk, via, YouTube]
                                ...                        
193375    [RT, RealPython, List, of, Python, API, Wrappe...
193376                          [Watching, Boa, vs, Python]
193377              [IT, Digital, Go, Senior, Python, Djan]
Name: tokens, dtype: object

In [18]:
# First three documents as plain Python lists inside an object array.
df['tokens'].values[:3]

array([ [u'python', u'never', u'stop', u'learning', u'what', u'you', u'enjoy', u'doing'],
       [u'Watching', u'Boa', u'vs', u'Python'],
       [u'Monty', u'Python', u'The', u'silly', u'walk', u'via', u'YouTube']], dtype=object)

In [22]:
# Build the gensim vocabulary from the token lists (one list per document).
d = Dictionary.from_documents(df['tokens'])
d

<gensim.corpora.dictionary.Dictionary at 0x7fd71c63e4d0>

In [20]:
# Demonstration of a failure: handing the Dictionary to TfidfModel positionally
# does not work (the working call further down passes it as `dictionary=`).
# The error is caught and shown so the notebook still runs top to bottom.
try:
    tfidf = TfidfModel(d)
except TypeError as err:
    print('TypeError: {}'.format(err))

TypeError: object of type 'int' has no len()

*Hint-Hint:* `gensim` is sprinting this week at PyCon!

In [24]:
# IPython introspection: pull up the help for TfidfModel to see what it expects.
TfidfModel?

In [26]:
# Demonstration of a failure: raw text strings are not an acceptable corpus.
# The error is caught and shown so the notebook still runs top to bottom.
try:
    TfidfModel(df.txt)
except ValueError as err:
    print('ValueError: {}'.format(err))

ValueError: need more than 1 value to unpack

In [27]:
# Demonstration of a failure: plain token lists are not an acceptable corpus either.
# The error is caught and shown so the notebook still runs top to bottom.
try:
    TfidfModel(df.tokens)
except ValueError as err:
    print('ValueError: {}'.format(err))

ValueError: too many values to unpack

In [28]:
# Convert each token list to a bag-of-words lazily and fit TF-IDF on that stream.
bow_stream = (d.doc2bow(doc) for doc in df['tokens'])
TfidfModel(bow_stream)

<gensim.models.tfidfmodel.TfidfModel at 0x7fd71c63e050>

But there's a simpler way.  
We already have a vocabulary  
with term and document frequencies in a matrix...  

In [33]:
# NOTE(review): the output recorded under this cell is identical to the output of
# the `pd.Series(d.iteritems())` cell that follows ((id, token) pairs), so it looks
# stale -- re-run to see what `d.dfs` actually holds.
pd.Series(d.dfs)

0        (72303, Wrestling_Memes)
1          (18504, RobTalksLobos)
2                  (16337, Prony)
                   ...           
86898    (40524, dontusethiscode)
86899             (81118, PCcuck)
86900          (54126, Stockholm)
dtype: object

In [34]:
# Walk the dictionary as pairs; the recorded output shows (token id, token) tuples.
pd.Series(d.iteritems())

0        (72303, Wrestling_Memes)
1          (18504, RobTalksLobos)
2                  (16337, Prony)
                   ...           
86898    (40524, dontusethiscode)
86899             (81118, PCcuck)
86900          (54126, Stockholm)
dtype: object

OK, now I get it  

- `document` is a list of strings (ordered sequence of tokens)  
- `bow` (bag of words) is a list of `(word ID, count)` pairs: a `Counter`-like mapping between word IDs and their counts in one document  
- `TfidfModel` is a transformation from a BOW into a BORF,  a "bag of relative frequencies"  

TFIDF = BORF = term frequencies normalized by document occurrence counts


In [37]:
# Bag-of-words for the first three documents, one list of (token id, count) pairs each.
first_docs = df['tokens'][:3]
pd.Series(d.doc2bow(doc) for doc in first_docs)

0    [(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1...
1                   [(8, 1), (9, 1), (10, 1), (11, 1)]
2    [(8, 1), (12, 1), (13, 1), (14, 1), (15, 1), (...
dtype: object

Did it assign 0 to the first word it found?  
Sort-of...  

In [39]:
# Id of the lower-case token 'python' (2 in the recorded run, not 0, even though
# it is the first token of the first document).
d.token2id['python']

2

In [40]:
# Capitalised 'Python' is a separate vocabulary entry from 'python' (8 vs 2 in the recorded run).
d.token2id['Python']

8

In [41]:
# Ids do not simply follow token position: 'you' is the sixth token of the first
# document, yet it maps to 7 in the recorded run.
d.token2id['you']

7

In [None]:
# Reverse lookup, id -> token. Use `d[0]` rather than `d.id2token[0]`: gensim fills
# `id2token` lazily, so on a fresh Dictionary the raw attribute can still be empty
# and raise KeyError, whereas indexing the Dictionary builds it on demand.
d[0]  # guesses anyone?

In [35]:
# Build the model from the existing Dictionary via the `dictionary=` keyword
# instead of feeding it a corpus.
tfidf = TfidfModel(dictionary=d)
tfidf

<gensim.models.tfidfmodel.TfidfModel at 0x7fd71c63ef90>

In [None]:
# What tab-completion on `tfidf.` would offer: the model's public attributes.
# (A bare `tfidf.` is a syntax error and stops Restart & Run All.)
[name for name in dir(tfidf) if not name.startswith('_')]

In [42]:
# NOTE(review): presumably the number of documents behind the model's statistics
# (183070 in the recorded run) -- confirm against the gensim docs.
tfidf.num_docs

183070

In [43]:
# NOTE(review): presumably the total number of non-zero bag-of-words entries over
# all documents (2392121 in the recorded run) -- confirm against the gensim docs.
tfidf.num_nnz

2392121

In [44]:
# Persist the fitted model to a file named 'tfidf' under DATA_PATH.
tfidf_path = os.path.join(DATA_PATH, 'tfidf')
tfidf.save(tfidf_path)

In [45]:
# Read the saved model back from DATA_PATH into a second variable.
tfidf_path = os.path.join(DATA_PATH, 'tfidf')
tfidf2 = TfidfModel.load(tfidf_path)

In [46]:
# Round-trip sanity check: the reloaded model should report the same value as
# `tfidf.num_nnz` (2392121 in the recorded run).
tfidf2.num_nnz

2392121