In [3]:
from __future__ import division, print_function, absolute_import
from builtins import str
import sys
import collections

import matplotlib
%matplotlib inline
import pandas as pd
from IPython.display import display, HTML 
# A bare HTML(...) expression is only rendered when it is the last line of a cell;
# here it is followed by other statements, so it must go through display() to take effect.
display(HTML("<style>.container { width:100% !important; }</style>"))
# Keep rendered tables short (6 rows) but allow up to 300 columns.
pd.set_option('display.max_rows', 6)
pd.set_option('display.max_columns', 300)

import csv
import os
import re
from decimal import Decimal
from traceback import print_exc

# `pd.np` was deprecated in pandas 1.0 and removed in 2.0; import numpy directly instead.
import numpy as np

# Project directories, resolved three levels above the current working directory.
BASE_PATH = os.path.abspath(os.path.join('..', '..', '..'))
DATA_PATH = os.path.join(BASE_PATH, 'Data')
MODELS_PATH = os.path.join(BASE_PATH, 'Models')
# Make modules stored in the Data and Models directories importable.
sys.path.append(DATA_PATH)
sys.path.append(MODELS_PATH)

# The raw table gets its own name so that `tld_iana` is never a DataFrame at one
# moment and a mapping the next.
tld_iana_df = pd.read_csv(os.path.join(DATA_PATH, 'tlds-from-iana.csv'))
# Map each TLD (surrounding whitespace and leading dots removed) to a (sponsor, -1)
# tuple, ordered with the longest TLD strings first.
# NOTE(review): the meaning of the -1 placeholder is not visible here -- confirm with its consumers.
tld_iana = collections.OrderedDict(
    sorted(zip((tld.strip().lstrip('.') for tld in tld_iana_df.domain),
               [(sponsor.strip(), -1) for sponsor in tld_iana_df.sponsor]),
           key=lambda x: len(x[0]),
           reverse=True))

In [4]:
# Load the gzipped vocabulary table, using its first column as the index.
# The quoting constant comes from the stdlib `csv` module rather than from
# `pd.io.common.csv`, which only worked while pandas happened to import csv internally.
tfdf = pd.read_csv(os.path.join(DATA_PATH, 'tweet_vocab.csv.gz'), index_col=0, compression='gzip',
                   quotechar='"', quoting=csv.QUOTE_NONNUMERIC, low_memory=False)
# Summary statistics, rounded and cast to whole numbers for readability.
tfdf.describe().round().astype(int)

FileNotFoundError: [Errno 2] No such file or directory: '/home/hobs/src/AgileMachineLearning/Data/tweet_vocab.csv.gz'

If you try to allocate a 16k word by 100k document DataFrame of 64-bit integers, you'll get a memory error on a 16 GB laptop.  
Later we'll learn about "constant RAM" tools that can handle an unlimited stream of documents with a large (1M word) vocabulary. But first let's be frugal and see what we can do with robust, mature tools like Pandas.  
Rather than cutting back on those 100k tweets, let's cut back on the words. What are all those 16k words, and how often is each one used? (Maybe we can ignore infrequent words.)  

In [None]:
# Rough size, in gigabytes, of a dense table holding one 8-byte value for each
# of the len(tfdf) rows across 100 * 1000 documents.
cell_count = 100 * 1000 * len(tfdf)
GB = 8 * cell_count / 1.e9
GB

In [None]:
# Show the table as the cell's output; pandas abbreviates it according to the display.max_rows option.
tfdf

Fortunately the odd words are at the top and bottom of an alphabetical index!  
And it does look like the less useful tokens aren't used many times or in many documents.  
What do you notice that might help distinguish "natural" words (zoom, zoos, zope, zynga) from URLs and machine-code (000, zzp, zsl107)?  

In [None]:
# A single combined mask: keeping rows with df > 20 and (df - tf) / tf < 0.15
# already implies the looser df > 9 conditions, so one pass yields the same rows.
# NOTE(review): if `df` is a document count and `tf` a total occurrence count, then
# df <= tf and this ratio is never positive, so the 0.15 test would always pass -- confirm
# the intended direction of the difference.
df_minus_tf_ratio = (tfdf.df - tfdf.tf) / tfdf.tf
tfdf = tfdf[(tfdf.df > 20) & (df_minus_tf_ratio < 0.15)]
tfdf

In [None]:
NumPy arrays (the guts of a Pandas DataFrame) require 8 bytes for each 64-bit value (an int64 or a double-precision float64)

In [None]:
# Same dense-table estimate for the current tfdf: 8 bytes per value,
# 100 * 1000 documents, len(tfdf) rows, expressed in gigabytes.
total_bytes = 8 * 100 * 1000 * len(tfdf)
GB = total_bytes / 1.e9
GB

Memory requirements (4 GB) are doable  
But we've lost important words: **"zoom"**  
And there's still a bit of garbage: **"zh3gs0wbno"**  
These look like keys, slugs, hashes or URLs  
Even though the tweets.json format includes a column for URLs,  
the URLs are left within the raw text as well.  
Let's use a formal but simple grammar engine:

## Extended regular expressions 

In [None]:
# Prefer `pug.nlp.constant`; fall back to the `Data` package if that import fails.
# `except Exception` keeps the best-effort fallback for any import-time failure but,
# unlike a bare `except:`, lets KeyboardInterrupt and SystemExit propagate.
try:
    from pug.nlp import constant
except Exception:
    from Data import constant
# constant.uri_schemes_popular = ['chrome', 'https', 'http', ]
# Scheme followed by "://", e.g. "http://".
url_scheme_popular = r'(\b(' + '|'.join(constant.uri_schemes_popular) + r')[:][/]{2})'
# Host name ending in one of the popular TLDs. The TLD alternatives are wrapped in a
# non-capturing group so the leading "[.]" and trailing "\b" apply to every alternative
# (not just the first and last) while the number of capture groups stays the same.
fqdn_popular = r'(\b[a-zA-Z0-9-.]+\b([.](?:' + r'|'.join(constant.tld_popular) + r')\b)\b)'
# Path/query characters. The hyphen sits at the end of the class so it is a literal "-"
# rather than forming the range "#-_", which also admitted characters such as ( ) < > @ [ ].
url_path = r'(\b[\w/?=+#_&%~\'"\\.,-]*\b)'

# Show a few more rows than before, then list the IANA URI schemes.
pd.set_option('display.max_rows', 14)
pd.Series(constant.uri_schemes_iana)


In [None]:
# Complete URL pattern: a fixed list of schemes, "://", then the domain and path patterns.
# (`re` is imported with the other modules at the top of the notebook.)
url_popular = r'(\b' + r'(http|https|svn|git|apt)[:]//' + fqdn_popular + url_path + r'\b)'
tweet = "Play the [postiive sum game](http://totalgood.com/a/b?c=42) of life instead of svn://us.gov."
# findall returns one tuple per match, with one entry per capture group in the pattern.
re.findall(url_popular, tweet)