In [2]:
import re 
import string
import timestring
from sklearn.linear_model import LinearRegression, LogisticRegression
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cross_validation import cross_val_score, KFold
import glob, os
import numpy as np
from sklearn import svm
from collections import defaultdict
from scipy.sparse import csr_matrix

In [90]:
def tokenize(text):
    punc_re = '[' + '\\!\\"\\#\\$\\%\\&\\\'\\(\\)\\*\\+\\,\\-\\.\\/\\:\\;\\<\\=\\>\\?\\@\\[\\\\\\]\\_\\{\\|\\}' + ']'
    text = text.lower()
    text = re.sub('#(\S+)', r'HASHTAG_\1', text)
    text = re.sub('@(\S+)', r'MENTION_\1', text)
    text = re.sub('http\S+', 'THIS_IS_A_URL', text)
    text = re.sub(r'(.)\1\1\1+', r'\1', text)
    text = re.sub(r'[0-9]', '9', text)
    toks = []
    for tok in text.split():
        tok = re.sub(r'^(' + punc_re + '+)', r'\1 ', tok)
        tok = re.sub(r'(' + punc_re + '+)$', r' \1', tok)
        for subtok in tok.split():
            if re.search('\w', subtok):
                toks.append(subtok)
    return toks

In [88]:
vectorizer = TfidfVectorizer(decode_error='ignore', ngram_range=(1, 2), max_df=1., min_df=2,
                             use_idf=True, tokenizer=tokenize, binary=False, norm='l2')

In [10]:
DIR = '/data/2/protest'

In [89]:
def read_keywords(path):
    return [s.strip().lower() for s in open(path)]
    
keywords = read_keywords(DIR + '/keywords.txt')
keywords

['foradilma', 'fora dilma', 'forapt', 'fora pt', 'vemprarua', 'vem pra rua']

In [91]:
import os, io, json, codecs

def matches_keywords(text, keywords):
    """ Return true if any keyword is a substring of this text, ignoring case. """
    text = text.lower()
    for kw in keywords:
        if kw in text:
            return True
    return False

def filename2user(fname):
    """Convert filename like this
      /data/2/protest/Timeline/MandinhaSimone.txt.txt
    into a username like
      MandinhaSimone
    """
    return re.sub(r'^([^\.]+)\..+', r'\1', os.path.basename(fname))

def iterate_instances(path, keywords, negative_window):
    """
    Return an iterator over tuples containing:
    (concatenated tweet text, label, username)
    For each user in path, we find the first tweet containing one of the specified keywords.
    We then create one positive instance, containing all tweets prior to the matched tweet.
    We also create one negative instance, which is the same as the positive instance, except
    the N most recent tweets are removed (where N is set by the negative_window parameter).
    We additionally filter users if they use one of the keywords in one of their first `negative_window`
    tweets. This is to we have enough tweets to make a negative example.
    """
    for fname in glob.glob(path + '/*.txt'):
        user = filename2user(fname)
        lines = []
        for i, line in enumerate(open(fname)):
            js = json.loads(line)
            # exclude people who use keyword within first `window` of tweets.
            if i <= negative_window and matches_keywords(js['text'], keywords):
                print('skipping', fname, 'because uses keyword in first', negative_window, 'tweets')
                break
            if i > negative_window and matches_keywords(js['text'], keywords):
                yield (' '.join(lines), 1, user)
                yield (' '.join(lines[:-negative_window]), 0, user)
                break
            lines.append(js['text'])
            
y = []
users = []
negative_window = 10
# The loop below iterates over each instance and vectorizes the text.
# Simulataneously, we append to the y (labels) and users lists.
# We do this to avoid having to store all the text in memory at once and to 
# only require one loop through the files.
iterator = iterate_instances(DIR + '/Timeline', keywords, negative_window)
X = vectorizer.fit_transform(x[0] for x in iterator if not users.append(x[2]) and not y.append(x[1]))
print('read %d instances into X matrix with shape %s' % (len(users), str(X.shape)))
print('label distribution=', Counter(y))
y = np.array(y)
users = np.array(users)

skipping /data/2/protest/Timeline/MariaFeistauer.txt.txt because uses keyword in first 10 tweets
skipping /data/2/protest/Timeline/Matredamandio.txt.txt because uses keyword in first 10 tweets
skipping /data/2/protest/Timeline/edmarmbastos.txt.txt because uses keyword in first 10 tweets
skipping /data/2/protest/Timeline/cajoso1.txt.txt because uses keyword in first 10 tweets
skipping /data/2/protest/Timeline/FOFURA1055.txt.txt because uses keyword in first 10 tweets
skipping /data/2/protest/Timeline/usiuva.txt.txt because uses keyword in first 10 tweets
skipping /data/2/protest/Timeline/ElMarriachi43.txt.txt because uses keyword in first 10 tweets
skipping /data/2/protest/Timeline/OslecMac74.txt.txt because uses keyword in first 10 tweets
skipping /data/2/protest/Timeline/CSobania.txt.txt because uses keyword in first 10 tweets
skipping /data/2/protest/Timeline/wmedsantos.txt.txt because uses keyword in first 10 tweets
skipping /data/2/protest/Timeline/herminioaneto.txt.txt because use

In [92]:
from sklearn.cross_validation import cross_val_score
from sklearn.cross_validation import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
model_mod = LogisticRegression(penalty='l2', C=2.6)
model_mod.fit(X, y)

# 10 Cross-validation accuracy
cv = KFold(len(y), 10, shuffle=False)  # Don't shuffle b/c we don't want a user in both training and testing set.
accuracies = []
for train_ind, test_ind in cv:
    model_mod.fit(X[train_ind],y[train_ind])   
    accuracies.append(accuracy_score(y[test_ind], model_mod.predict(X[test_ind])))
    
print('Average 10-fold cross validation accuracy=%.4f (std=%.2f)' % (np.mean(accuracies), np.std(accuracies)))

predicted = model_mod.predict(X)
print('accuracy on training data=%.4f' % accuracy(y, predicted))

Average 10-fold cross validation accuracy=0.5070 (std=0.02)
accuracy on training data=0.5912


In [93]:
from pprint import pprint 
coefs = sorted(zip(vectorizer.get_feature_names(), model_mod.coef_[0]),key=lambda x:x[1])
pprint(coefs[:30])
pprint(coefs[-30:])

[('MENTION_vixxavier acredito', -0.45075759660025527),
 ('MENTION_vixxavier', -0.41407313570559506),
 ('MENTION_blogdopim ninguem', -0.15640751111880824),
 ('MENTION_ingdoc MENTION_lobaoeletrico', -0.15640751111880824),
 ('ninguem gosta', -0.15640751111880824),
 ('um comunista', -0.14475965064777133),
 ('acredito', -0.14364939133552279),
 ('rt MENTION_ingdoc', -0.13806387255262972),
 ('THIS_IS_A_URL MENTION_blogdopim', -0.13745794246007784),
 ('MENTION_ingdoc', -0.13162607663537565),
 ('gosta mais', -0.12903761300571143),
 ('MENTION_coralnet', -0.118599809289055),
 ('vica', -0.11830596568532399),
 ('MENTION_lobaoeletrico THIS_IS_A_URL', -0.1127486502840931),
 ('MENTION_exame_com', -0.11026014328283816),
 ('comunista', -0.10503150541381466),
 ('via', -0.09271825578922839),
 ('dinheiro do', -0.088981057340257963),
 ('no', -0.086119766487644606),
 ('THIS_IS_A_URL via', -0.086096073671798098),
 ('da', -0.085941107074687467),
 ('MENTION_david_gds', -0.085875762703050251),
 ('MENTION_david_g

Results:

| negative_window      |   train acc  | test acc | n users |
| -------------------- | ------------ | -------- | ------- |
| 10                   | .591         | .507     | 285     |
| 20                   | .610         | .509     | 274     |
| 30                   | .614         | .515     | 264     |