In [2]:
from __future__ import division, print_function, absolute_import, unicode_literals
from builtins import str

import os

import pandas as pd

# Absolute location of the shared `Data` directory, three levels above the working directory.
DATA_PATH = os.path.abspath(
    os.path.join(*(['..'] * 3 + ['Data']))
)
import string

In [3]:
import matplotlib
from IPython.display import display, HTML 
%matplotlib inline
np = pd.np
display(HTML("<style>.container { width:100% !important; }</style>"))
pd.set_option('display.max_rows', 6)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 500)

In [4]:
import gzip
from gensim.models import TfidfModel
from gensim.corpora import Dictionary

Load previously cleaned data

In [None]:
# Load the three previously cleaned tables stored under DATA_PATH.
dates = pd.read_csv(os.path.join(DATA_PATH, 'datetimes.csv.gz'), engine='python')
nums = pd.read_csv(os.path.join(DATA_PATH, 'numbers.csv.gz'), engine='python')
# The text table is decompressed explicitly and the open handle is handed to pandas.
with gzip.open(os.path.join(DATA_PATH, 'text.csv.gz'), 'rb') as f:
    # Call through `pd`: no bare `read_csv` name is imported in the cells above,
    # so the unqualified call raised NameError.
    df = pd.read_csv(f)
df.tokens

In [13]:
# Build a gensim Dictionary after casting every element of each `tokens` row to `str`.
d = Dictionary.from_documents(
    list(map(str, tokens_row)) for tokens_row in df.tokens
)

In [14]:
# Peek at the first `tokens` value; the recorded output shows one string, not a list.
df['tokens'].iloc[0]

"['python', 'never', 'stop', 'learning', 'what', 'you', 'enjoy', 'doing']"

When we said "QUOTE_NONNUMERIC" we didn't mean **ALL** nonnumeric fields ;)

In [15]:
# Rebuild `tokens` as real lists by whitespace-splitting the raw `txt` column.
df['tokens'] = df['txt'].str.split()
df['tokens']

0         [python, never, stop, learning, what, you, enj...
1                               [Watching, Boa, vs, Python]
2           [Monty, Python, The, silly, walk, via, YouTube]
                                ...                        
183067    [RT, RealPython, List, of, Python, API, Wrappe...
183068                          [Watching, Boa, vs, Python]
183069    [Чертова, дюжина, вакансий, в, IT, и, Digital,...
Name: tokens, dtype: object

In [16]:
# First three token lists, viewed through the underlying object array.
df['tokens'].values[:3]

array([['python', 'never', 'stop', 'learning', 'what', 'you', 'enjoy', 'doing'],
       ['Watching', 'Boa', 'vs', 'Python'],
       ['Monty', 'Python', 'The', 'silly', 'walk', 'via', 'YouTube']], dtype=object)

In [17]:
# Rebuild the Dictionary, this time from the proper token lists.
d = Dictionary.from_documents(df['tokens'])
d

<gensim.corpora.dictionary.Dictionary at 0x7f87abc40f28>

In [18]:
# First attempt: hand the Dictionary to TfidfModel as its first positional argument.
# The recorded output shows this failing with a TypeError.
# NOTE(review): presumably that positional slot expects a bag-of-words corpus; compare
# the later calls that stream `doc2bow` output or pass `dictionary=` - confirm.
tfidf = TfidfModel(d)

TypeError: object of type 'int' has no len()

*Hint-Hint:* `gensim` is sprinting this week at PyCon!

In [19]:
# IPython `?` introspection: pull up TfidfModel's help to see what the constructor expects.
TfidfModel?

In [20]:
# Second attempt: the raw text column as the corpus.
# Recorded output: ValueError (expected 2 values to unpack, got 1).
TfidfModel(df.txt)

ValueError: not enough values to unpack (expected 2, got 1)

In [21]:
# Third attempt: the token lists as the corpus.
# Recorded output: ValueError (too many values to unpack, expected 2).
TfidfModel(df.tokens)

ValueError: too many values to unpack (expected 2)

In [22]:
# Working form: encode each token list with the Dictionary and stream the results in as the corpus.
TfidfModel(d.doc2bow(doc_tokens) for doc_tokens in df.tokens)

<gensim.models.tfidfmodel.TfidfModel at 0x7f879a4bdfd0>

But there's a simpler way.  
We already have a vocabulary  
with term and document frequencies in a matrix...  

In [23]:
# The Dictionary's `dfs` mapping (token ID -> document frequency), viewed as a Series.
pd.Series(data=d.dfs)

0          444
1        53491
2          323
         ...  
87144        1
87145        1
87146        1
dtype: int64

In [24]:
# Try to view the Dictionary's (id, token) items as a Series.
# NOTE(review): the recorded output repeats the same items view in every row instead of
# one (id, token) pair per row; `pd.Series(dict(d.items()))` is presumably what was
# intended - confirm before relying on this cell.
pd.Series(d.iteritems())

0        ((59729, showerthought), (50000, descending), ...
1        ((59729, showerthought), (50000, descending), ...
2        ((59729, showerthought), (50000, descending), ...
                               ...                        
87144    ((59729, showerthought), (50000, descending), ...
87145    ((59729, showerthought), (50000, descending), ...
87146    ((59729, showerthought), (50000, descending), ...
dtype: object

OK, now I get it  

- `document` is a list of strings (ordered sequence of tokens)  
- `bow` (a bag of words) is a `Counter`-like list of `(word_id, count)` pairs recording how often each word ID occurs in one document
- `TfidfModel` is a transformation from a BOW into a BORF,  a "bag of relative frequencies"  

TFIDF = BORF = term frequencies normalized by document occurrence counts


In [25]:
# Bag-of-words encoding of the first three documents only.
pd.Series(
    d.doc2bow(doc_tokens) for doc_tokens in df.tokens[:3]
)

0    [(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1...
1                   [(8, 1), (9, 1), (10, 1), (11, 1)]
2    [(8, 1), (12, 1), (13, 1), (14, 1), (15, 1), (...
dtype: object

Did it assign 0 to the first word it found?  
Sort-of...  

In [26]:
# Integer ID the Dictionary assigned to the lowercase token 'python'.
d.token2id['python']

1

In [27]:
# The capitalised 'Python' is a separate token with its own ID (tokens are case-sensitive here).
d.token2id['Python']

8

In [28]:
# ID assigned to 'you', another word from the first document.
d.token2id['you']

3

In [29]:
# Reverse lookup: which token ended up with ID 0?
d.id2token[0]  # guesses anyone?

'never'

In [30]:
# The simpler route: pass the existing Dictionary through the `dictionary` keyword
# instead of streaming a bag-of-words corpus into the constructor.
tfidf = TfidfModel(dictionary=d)
tfidf

<gensim.models.tfidfmodel.TfidfModel at 0x7f87a2ef3860>

In [31]:
# The model's `dfs` mapping, keyed by token ID.
# NOTE(review): this echoes the entire dict into the output; wrapping it as
# `pd.Series(tfidf.dfs)` would respect the `display.max_rows` limit set earlier.
tfidf.dfs

{0: 444,
 1: 53491,
 2: 323,
 3: 9048,
 4: 611,
 5: 1658,
 6: 68,
 7: 2374,
 8: 117768,
 9: 36,
 10: 67,
 11: 1155,
 12: 2297,
 13: 126,
 14: 5553,
 15: 60,
 16: 6823,
 17: 15,
 18: 1679,
 19: 3078,
 20: 8735,
 21: 1973,
 22: 2114,
 23: 10592,
 24: 4578,
 25: 3350,
 26: 715,
 27: 7365,
 28: 8051,
 29: 467,
 30: 2987,
 31: 1372,
 32: 33,
 33: 58,
 34: 1604,
 35: 374,
 36: 38276,
 37: 53,
 38: 27,
 39: 23,
 40: 781,
 41: 589,
 42: 5189,
 43: 304,
 44: 36475,
 45: 13282,
 46: 6512,
 47: 98,
 48: 1231,
 49: 4616,
 50: 1581,
 51: 2454,
 52: 4647,
 53: 25,
 54: 8,
 55: 472,
 56: 67101,
 57: 1856,
 58: 3375,
 59: 6109,
 60: 717,
 61: 7338,
 62: 103,
 63: 462,
 64: 20774,
 65: 41,
 66: 551,
 67: 212,
 68: 1001,
 69: 2961,
 70: 703,
 71: 5888,
 72: 59,
 73: 448,
 74: 232,
 75: 42,
 76: 43,
 77: 2473,
 78: 80,
 79: 1096,
 80: 143,
 81: 1,
 82: 4548,
 83: 2244,
 84: 593,
 85: 1732,
 86: 2649,
 87: 27,
 88: 17750,
 89: 52,
 90: 402,
 91: 66,
 92: 84,
 93: 5810,
 94: 3045,
 95: 358,
 96: 16228,
 97

In [42]:
# Document count recorded by the model; it matches the 183070 rows of `df.tokens` shown earlier.
tfidf.num_docs

183070

In [43]:
# NOTE(review): presumably the total number of non-zero (document, token) entries -
# confirm against the gensim documentation.
tfidf.num_nnz

2392121

In [44]:
# Persist the fitted model as `tfidf` inside the data directory.
tfidf.save(
    os.path.join(DATA_PATH, 'tfidf')
)

In [45]:
# Read the saved model back from the data directory into a second object.
tfidf2 = TfidfModel.load(
    os.path.join(DATA_PATH, 'tfidf')
)

In [46]:
# Round-trip check: the reloaded model reports the same `num_nnz` as the original (2392121).
tfidf2.num_nnz

2392121