In [9]:
import glob
import re
from collections import Counter

In [10]:
path = './tweets/*.out'

In [11]:
emoticons_str = r"""
    (?:
        [:=;] # Eyes
        [oO\-]? # Nose (optional)
        [D\)\]\(\]/\\OpP] # Mouth
    )"""

regex_str = [
    emoticons_str,
    r'<[^>]+>', # HTML tags
    r'(?:@[\w_]+)', # @-mentions
    r"(?:\#+[\w_]+[\w\'_\-]*[\w_]+)", # hash-tags
    r'http[s]?://(?:[a-z]|[0-9]|[$-_@.&amp;+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+', # URLs
 
    r'(?:(?:\d+,?)+(?:\.?\d+)?)', # numbers
    r"(?:[a-z][a-z'\-_]+[a-z])", # words with - and '
    r'(?:[\w_]+)', # other words
    r'(?:\S)' # anything else
]

tokens_re = re.compile(r'('+'|'.join(regex_str)+')', re.VERBOSE | re.IGNORECASE)
emoticon_re = re.compile(r'^'+emoticons_str+'$', re.VERBOSE | re.IGNORECASE)


In [12]:
def tokenize(s):
    return tokens_re.findall(s)
 
def preprocess(s, lowercase=False):
    tokens = tokenize(s)
    if lowercase:
        tokens = [token if emoticon_re.search(token) else token.lower() for token in tokens]
    return tokens
 

In [13]:
files_path = glob.glob(path)
count_all = Counter()

In [24]:
for file_path in files_path:
    with open(file_path, 'r') as file:
        for line in file:
            terms_all = [term for term in preprocess(line)]
            count_all.update(terms_all)
print (len(count_all))
print (count_all.most_common(5))

1386808
[('.', 4760112), ('/', 3528324), ('de', 1755872), ('la', 1209528), (',', 1185590)]


In [26]:
print (sum(count_all.values()))

68022652


In [16]:
from nltk.corpus import stopwords
import nltk
import string

In [17]:
punctuation = list(string.punctuation)

In [18]:
stop = stopwords.words('spanish') + punctuation + ['rt', 'via']

In [27]:
count_stop = Counter()
for file_path in files_path:
    with open(file_path, 'r') as file:
        for line in file:
            terms_stop = [term for term in preprocess(line) if term not in stop]
            count_stop.update(terms_stop)
print (len(count_stop))
print (count_stop.most_common(5))

1386541
[(':/', 505175), ('com', 404705), ('http', 383480), ('ón', 353405), ('twitter', 302532)]


In [28]:
len(list(count_stop.elements()))

21219077

In [29]:
print (sum(count_stop.values()))

21219077
