In [157]:
from TwitterAPI import TwitterAPI
from collections import defaultdict
from scipy.sparse import lil_matrix
from StringIO import StringIO
from zipfile import ZipFile
from urllib import urlopen
from sklearn.cross_validation import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix

import requests
import ConfigParser
import sys
import pickle
import numpy as np
import re

In [158]:
def get_names(filename):
    return [l.strip().lower() for l in open(filename).readlines()]

names = get_names('chicago.txt')
print names

['art institute of chicago', 'millennium park', 'cloud gate', 'museum of science and industry', 'wrigley field', 'michigan avenue', 'adler planetarium', 'field museum', 'willis tower', 'u.s. cellular field', 'symphony center', 'chicago cultural center', 'lincoln park', 'holy name cathedral', 'maggie daley park', 'chicago theatre', 'buckingham fountain', 'grant park', 'john hancock', 'goodman theatre', 'shedd aquarium', 'oriental theatre', 'united center', 'university of chicago', 'tribune tower', 'garfield park', 'chicago history museum', 'navy pier', 'rockefeller memorial chapel', 'cadillac palace theatre']


In [159]:
def get_sight_names(names):
    return [(n, ''.join(n.split()), n.replace(' ', '_')) for n in names]

sight_names = get_sight_names(names)
print sight_names

[('art institute of chicago', 'artinstituteofchicago', 'art_institute_of_chicago'), ('millennium park', 'millenniumpark', 'millennium_park'), ('cloud gate', 'cloudgate', 'cloud_gate'), ('museum of science and industry', 'museumofscienceandindustry', 'museum_of_science_and_industry'), ('wrigley field', 'wrigleyfield', 'wrigley_field'), ('michigan avenue', 'michiganavenue', 'michigan_avenue'), ('adler planetarium', 'adlerplanetarium', 'adler_planetarium'), ('field museum', 'fieldmuseum', 'field_museum'), ('willis tower', 'willistower', 'willis_tower'), ('u.s. cellular field', 'u.s.cellularfield', 'u.s._cellular_field'), ('symphony center', 'symphonycenter', 'symphony_center'), ('chicago cultural center', 'chicagoculturalcenter', 'chicago_cultural_center'), ('lincoln park', 'lincolnpark', 'lincoln_park'), ('holy name cathedral', 'holynamecathedral', 'holy_name_cathedral'), ('maggie daley park', 'maggiedaleypark', 'maggie_daley_park'), ('chicago theatre', 'chicagotheatre', 'chicago_theatre

In [160]:
def get_twitter(config_file):
    config = ConfigParser.ConfigParser()
    config.read(config_file)
    twitter = TwitterAPI(
                   config.get('twitter', 'consumer_key'),
                   config.get('twitter', 'consumer_secret'),
                   config.get('twitter', 'access_token'),
                   config.get('twitter', 'access_token_secret'))
    return twitter

twitter = get_twitter('twitter.cfg')
print 'Established Twitter connection.'

Established Twitter connection.


In [161]:
def robust_request(twitter, resource, params, max_tries=5):
    """ If a Twitter request fails, sleep for 15 minutes.
    Do this at most max_tries times before quitting.
    Args:
      twitter .... A TwitterAPI object.
      resource ... A resource string to request.
      params ..... A parameter dictionary for the request.
      max_tries .. The maximum number of tries to attempt.
    Returns:
      A TwitterResponse object, or None if failed.
    """
    for i in range(max_tries):
        request = twitter.request(resource, params)
        if request.status_code == 200:
            return request
        else:
            print >> sys.stderr, 'Got error:', request.text, '\nsleeping for 15 minutes.'
            sys.stderr.flush()
            time.sleep(60 * 15)

In [6]:
def get_tweets(twitter, limit, location=False, search_term=None):
    tweets = []
    while True:
        try:
            paras = {}
            
            if location:
                paras['locations'] = '-87.932688, 41.638685, -87.517954, 42.021388'
            elif search_term:
                paras['track'] = search_term
            
            print paras
                    
            # Restrict to U.S.
            for response in twitter.request('statuses/filter', paras):
                tweets.append(response)
                if len(tweets) % 10 == 0:
                    print 'found %d tweets' % len(tweets)
                if len(tweets) >= limit:
                    return tweets
        except:
            print "Unexpected error:", sys.exc_info()[0]
        
    return tweets

In [None]:
tweets = get_tweets(twitter, 50, False, 'art institute of chicago')
print len(tweets)

In [None]:
pickle.dump(tweets, open('tweets.pkl', 'wb'))

In [7]:
tweets = pickle.load(open('tweets.pkl', 'rb'))

5000


In [101]:
print len(tweets)

12


In [8]:
def print_tweet(tweets, index):
    test_tweet = tweets[index]
    print('test tweet:\n\tscreen_name=%s\n\tname=%s\n\tdescr=%s\n\ttext=%s' %
          (test_tweet['user']['screen_name'],
           test_tweet['user']['name'],
           test_tweet['user']['description'],
           test_tweet['text']))

In [162]:
def tokenize(string,
             lowercase,
             keep_punctuation,
             collapse_urls,
             collapse_mentions):
    """ Split a tweet into tokens."""
    if not string:
        return []
    if lowercase:
        string = string.lower()
    tokens = []
    if collapse_urls:
        string = re.sub('http\S+', 'THIS_IS_A_URL', string)
    if collapse_mentions:
        string = re.sub('@\S+', 'THIS_IS_A_MENTION', string)
    if keep_punctuation:
        tokens = string.split()
    else:
        tokens = re.sub('\W+', ' ', string).split()
    return tokens

In [163]:
def tweet2tokens(tweet,
                 lowercase=True,
                 keep_punctuation=True,
                 collapse_urls=True,
                 collapse_mentions=True):
    """ Convert a tweet into a list of tokens, from the tweet text and optionally the
    user description. """
    tokens = tokenize(tweet['text'],
                      lowercase,
                      keep_punctuation,
                      collapse_urls,
                      collapse_mentions)
    return tokens

In [164]:
tokens_list = [tweet2tokens(t,
                            lowercase=True,
                            keep_punctuation=False,
                            collapse_urls=True,
                            collapse_mentions=False)
              for t in tweets]

TypeError: string indices must be integers, not str

In [16]:
def make_vocabulary(tokens_list):
    vocabulary = defaultdict(lambda: len(vocabulary))  # If term not present, assign next int.
    for tokens in tokens_list:
        for token in tokens:
            vocabulary[token]  # looking up a key; defaultdict takes care of assigning it a value.
    print '%d unique terms in vocabulary' % len(vocabulary)
    return vocabulary

In [17]:
vocabulary = make_vocabulary(tokens_list)

9580 unique terms in vocabulary


In [18]:
def make_feature_matrix(tokens_list, vocabulary):
    X = lil_matrix((len(tweets), len(vocabulary)))
    for i, tokens in enumerate(tokens_list):
        for token in tokens:
            j = vocabulary[token]
            X[i,j] += 1
    return X.tocsr()  # convert to CSR for more efficient random access.

In [19]:
X = make_feature_matrix(tokens_list, vocabulary)

In [20]:
index2term = dict((i, t) for t, i in vocabulary.items())
term2index = dict((t, i) for t, i in vocabulary.items())

In [21]:
def get_sight_indices(X, names):
    indices = []
    for i in range(X.shape[0]):
        for n in names:
                terms = n.split(' ')
                if len(terms) == 1 and term2index.has_key(n) and X[i, term2index[n]] != 0 and i not in indices:
                    indices.append(i)
                elif len(terms) > 1:
                    flag = True
                    for t in terms:
                        if not term2index.has_key(t) or X[i, term2index[t]] == 0:
                            flag = False
                    if flag == True and i not in indices:
                        indices.append(i)
    return indices

sight_indices = []
for n in sight_names:
        sight_indices.append(get_sight_indices(X, n))

for i in sight_indices:
    print i,

[2305, 2466] [2293, 3740] [1783, 3170] [4769] [] [1108] [] [606, 2419] [] [] [] [] [4530] [] [] [3377, 4366] [] [] [4572] [] [1660, 2634] [3055, 3377] [] [] [] [] [] [] [] []


This approach doesn't work when the amount of tweets is relatively small.

We implement a second approach for small amount of data.

In [165]:
def get_tweets2(twitter, limit=100, search_item=None):
    tweets = []
    for r in twitter.request('search/tweets', {'q':search_item, 'lang':'en', 'count':limit}):
        tweets.append(r)
    print "found %d tweets" % len(tweets)
        
    return tweets

sight_tweets = []
for n in names:
    print "Retrieving tweets for:", n
    sight_tweets.append(get_tweets2(twitter, 100, n))

Retrieving tweets for: art institute of chicago
found 100 tweets
Retrieving tweets for: millennium park
found 100 tweets
Retrieving tweets for: cloud gate
found 100 tweets
Retrieving tweets for: museum of science and industry
found 100 tweets
Retrieving tweets for: wrigley field
found 100 tweets
Retrieving tweets for: michigan avenue
found 100 tweets
Retrieving tweets for: adler planetarium
found 53 tweets
Retrieving tweets for: field museum
found 100 tweets
Retrieving tweets for: willis tower
found 100 tweets
Retrieving tweets for: u.s. cellular field
found 10 tweets
Retrieving tweets for: symphony center
found 100 tweets
Retrieving tweets for: chicago cultural center
found 93 tweets
Retrieving tweets for: lincoln park
found 100 tweets
Retrieving tweets for: holy name cathedral
found 33 tweets
Retrieving tweets for: maggie daley park
found 86 tweets
Retrieving tweets for: chicago theatre
found 100 tweets
Retrieving tweets for: buckingham fountain
found 33 tweets
Retrieving tweets for:

In [166]:
print len(sight_tweets)
print len(sight_tweets[0])

30
100


In [167]:
def sort_mention_times(sight_tweets):
    mention_times = ((index, len(tweets)) for index, tweets in enumerate(sight_tweets))
    return sorted(mention_times, key=lambda x:x[1], reverse=True)

sorted_sight_tweets = sort_mention_times(sight_tweets)
for i in sorted_sight_tweets:
    print i

(0, 100)
(1, 100)
(2, 100)
(3, 100)
(4, 100)
(5, 100)
(7, 100)
(8, 100)
(10, 100)
(12, 100)
(15, 100)
(17, 100)
(18, 100)
(20, 100)
(21, 100)
(22, 100)
(23, 100)
(24, 100)
(25, 100)
(26, 100)
(27, 100)
(11, 93)
(14, 86)
(19, 61)
(6, 53)
(13, 33)
(16, 33)
(29, 28)
(9, 10)
(28, 4)


In [168]:
url = urlopen('http://www2.compute.dtu.dk/~faan/data/AFINN.zip')
zipfile = ZipFile(StringIO(url.read()))
afinn_file = zipfile.open('AFINN/AFINN-111.txt')

afinn = dict()

for line in afinn_file:
    parts = line.strip().split()
    if len(parts) == 2:
        afinn[parts[0]] = int(parts[1])

print 'read', len(afinn), 'AFINN terms.\nE.g.:', afinn.items()[:10]

read 2462 AFINN terms.
E.g.: [('limited', -1), ('suicidal', -2), ('pardon', 2), ('desirable', 2), ('protest', -2), ('lurking', -1), ('controversial', -2), ('hating', -3), ('ridiculous', -3), ('hate', -3)]


In [170]:
def afinn_sentiment(terms, afinn, verbose=False):
    pos = 0
    neg = 0
    for t in terms:
        if t in afinn:
            if verbose:
                print '\t%s=%d' % (t, afinn[t])
            if afinn[t] > 0:
                pos += afinn[t]
            else:
                neg += -1 * afinn[t]
    return pos, neg

In [178]:
def afinn_sentiment2(terms, afinn):
    total = 0.
    for t in terms:
        if t in afinn:
            total += afinn[t]
    return total

In [172]:
sight_tokens_list = []
for tweets in sight_tweets:
    sight_tokens_list.append([tweet2tokens(t,
                                lowercase=True,
                                keep_punctuation=False,
                                collapse_urls=True,
                                collapse_mentions=False)
                  for t in tweets])
print len(sight_tokens_list)
print sight_tokens_list[0][4]

30
[u'rt', u'jvsiah_', u'really', u'hoping', u'school', u'of', u'the', u'art', u'institute', u'of', u'chicago', u'accepts', u'me']


In [105]:
sight_positives = []
sight_negatives = []
sight_neutrals = []
for tokens in sight_tokens_list:
    positives = []
    negatives = []
    neutrals = []
    for tweet in tokens:
        pos, neg = afinn_sentiment(tweet, afinn)
        if pos > neg:
            positives.append((pos, neg, ' '.join(tweet)))
        elif neg > pos:
            negatives.append((pos, neg, ' '.join(tweet)))
        else:
            neutrals.append((pos, neg, ' '.join(tweet)))
    sight_positives.append(positives)
    sight_negatives.append(negatives)
    sight_neutrals.append(neutrals)

print len(sight_positives)
print len(sight_positives)
print len(sight_neutrals)

30
30
30


In [180]:
sight_scores = []
for tokens in sight_tokens_list:
    scores= []
    for tweet in tokens:
        scores.append(afinn_sentiment2(tweet, afinn))
    sight_scores.append(scores)

print len(sight_scores)
print len(sight_scores[0])
print sight_scores[0]

30
100
[0.0, 0.0, 0.0, 0.0, 3.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, -1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 2.0, 0.0, 0.0, 0.0, 3.0, 0.0, 0.0, -1.0, 0.0, 0.0, 2.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 3.0, 0.0, 0.0, 0.0, 5.0, 5.0, 5.0, 0.0, 0.0, 0.0, 3.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 4.0, 0.0, 0.0]


In [193]:
sight_mean_scores = [(i, sum(s) / len(s)) for i, s in enumerate(sight_scores)]

In [204]:
top_neg = sorted(sight_mean_scores, key=lambda x:x[1])[:10]
for t in top_neg:
    print '%.5f %s' % (t[1], names[t[0]])

-4.18000 university of chicago
-2.05000 michigan avenue
-1.00000 cadillac palace theatre
-0.67000 tribune tower
-0.29000 garfield park
0.00000 rockefeller memorial chapel
0.20000 u.s. cellular field
0.21000 cloud gate
0.27273 holy name cathedral
0.34000 john hancock


In [205]:
top_pos = sorted(sight_mean_scores, key=lambda x:x[1], reverse=True)[:10]
for t in top_pos:
    print '%.5f %s' % (t[1], names[t[0]])

1.57000 grant park
1.53000 field museum
1.34000 wrigley field
1.21212 buckingham fountain
1.18000 united center
0.97000 shedd aquarium
0.88000 navy pier
0.75581 maggie daley park
0.71000 museum of science and industry
0.71000 chicago history museum


In [192]:
print names[5]

michigan avenue


In [106]:
for index, positives in enumerate(sight_positives):
    print '\n---Positive tweets of:', names[index]
    for pos, neg, tweet in sorted(positives, key=lambda x: x[1], reverse=True):
        print pos, neg, tweet


---Positive tweets of: art institute of chicago
3 1 eldzier cortor a forgotten great back in chicago after 60 years THIS_IS_A_URL
2 0 my kind of town the art institute of chicago THIS_IS_A_URL
3 0 rt marcartbrut un tres grand peintre francis picabia ecclesiastic 1913 oil on canvas art institute of chicago THIS_IS_A_URL
2 0 feel inspired at the art institute of chicago artphotography THIS_IS_A_URL
1 0 art institute of chicago in full christmas spirit chicago theartinstitute THIS_IS_A_URL THIS_IS_A_URL
1 0 i want this statue from the art institute of chicago where can i buy a replica THIS_IS_A_URL
3 0 it was nice running into actionbronson at the art institute of chicago today THIS_IS_A_URL
5 0 rt insideweddings a breathtaking blue wedding at the art institute of chicago THIS_IS_A_URL artisan events THIS_IS_A_URL
5 0 a breathtaking blue wedding at the art institute of chicago THIS_IS_A_URL artisan events THIS_IS_A_URL
5 0 a breathtaking blue wedding at the art institute of chicago THIS_

In [107]:
for index, negatives in enumerate(sight_negatives):
    print '\n---Negative tweets of:', names[index]
    for pos, neg, tweet in sorted(negatives, key=lambda x: x[1], reverse=True):
        print pos, neg, tweet


---Negative tweets of: art institute of chicago
0 1 wilhelmgustloff did nt he paint a rainy afternoon street scene in paris it is in the art institute of chicago
0 1 saralmacdonald kilttripusa rainy day was 1 of my faves at the art institute in chicago when i visited especially 2 see a seurat painting

---Negative tweets of: millennium park
0 3 millennium park city tree is very disappointing no ornaments cityofchicagotree
0 3 rt queen_kas pls tell anty oby let her goan cry in millennium park tomyboiz reports that boko haram just abducted more girls in borno
0 3 rt queen_kas pls tell anty oby let her goan cry in millennium park tomyboiz reports that boko haram just abducted more girls in borno
0 3 pls tell anty oby let her goan cry in millennium park tomyboiz reports that boko haram just abducted more girls in borno nigeria
0 2 rt andrewjpg sassleski make sure to spend some time at cindy s with an incredible view of millennium park and fire THIS_IS_A_URL
0 2 sassleski make sure to spen

In [108]:
for index, neutrals in enumerate(sight_neutrals):
    print '\n---Neutral tweets of:', names[index]
    for pos, neg, tweet in sorted(neutrals, key=lambda x: x[1], reverse=True):
        print pos, neg, tweet


---Neutral tweets of: art institute of chicago
0 0 apply now to work for the art institute of chicago as senior application developer in chicago jobs THIS_IS_A_URL
0 0 art institute af the art institute of chicago THIS_IS_A_URL
0 0 truetwit at the art institute of chicago bonnard basket and plate of fruit on a red checkered tablecloth 1939 THIS_IS_A_URL
0 0 rt to_work sonja_seear cuervoarte stainglass by mathias kyazze from uganda and marc chagall in chicago institute of art THIS_IS_A_URL
0 0 just posted a photo the art institute of chicago THIS_IS_A_URL
0 0 american gothic art institute of chicago THIS_IS_A_URL
0 0 just a rest liveartfully noddpottery the art institute of chicago the modern wing THIS_IS_A_URL
0 0 inside looking out the art institute of chicago THIS_IS_A_URL
0 0 rt fb_vz a day at artinstitutechi chicago artinstituteofchicago art seurat the art institute of chicago THIS_IS_A_URL
0 0 a day at artinstitutechi chicago artinstituteofchicago art seurat the art institute of 

In [None]:
for i in range(names):

In [98]:
f = open('pos.txt', 'w')
for positives in sight_positives:
    for pos, neg, tweet in positives:
        content = '\t'.join([str(pos), str(neg), tweet]) + '\n'
        f.write(content)
f.close()

In [99]:
f = open('neg.txt', 'w')
for negatives in sight_negatives:
    for pos, neg, tweet in negatives:
        content = '\t'.join([str(pos), str(neg), tweet]) + '\n'
        f.write(content)
f.close()

In [100]:
f = open('neu.txt', 'w')
for neutrals in sight_neutrals:
    for pos, neg, tweet in neutrals:
        content = '\t'.join([str(pos), str(neg), tweet]) + '\n'
        f.write(content)
f.close()

In [110]:
total_tweets = 0
for tweets in sight_positives:
    total_tweets += len(tweets)
for tweets in sight_negatives:
    total_tweets += len(tweets)
for tweets in sight_neutrals:
    total_tweets += len(tweets)
print total_tweets

2516


In [126]:
def get_labeled_tweets(filenames):
    labels = []
    tweets = []
    for f in filenames:
        for l in open(f).readlines():
            terms = l.strip().lower().split('\t')
            labels.append(terms[0])
            tweets.append(terms[3])
    return labels, tweets

In [146]:
labels, tweets = get_labeled_tweets(['pos_labeled.txt', 'neg_labeled.txt', 'neu_labeled.txt'])
y = np.array(labels)
print len(labels)
print len(tweets)
print tweets[0]

2489
2489
eldzier cortor a forgotten great back in chicago after 60 years this_is_a_url


In [147]:
def tweet2tokens(tweet,
                 lowercase=True,
                 keep_punctuation=True,
                 collapse_urls=True,
                 collapse_mentions=True):
    """ Convert a tweet into a list of tokens, from the tweet text and optionally the
    user description. """
    tokens = tokenize(tweet,
                      lowercase,
                      keep_punctuation,
                      collapse_urls,
                      collapse_mentions)
    return tokens

In [150]:
tokens_list = [tweet2tokens(t,
                            lowercase=True,
                            keep_punctuation=False,
                            collapse_urls=True,
                            collapse_mentions=False)
              for t in tweets]
print len(tokens_list)

2489


In [151]:
def make_vocabulary(tokens_list):
    vocabulary = defaultdict(lambda: len(vocabulary))  # If term not present, assign next int.
    for tokens in tokens_list:
        for token in tokens:
            vocabulary[token]  # looking up a key; defaultdict takes care of assigning it a value.
    print '%d unique terms in vocabulary' % len(vocabulary)
    return vocabulary

vocabulary = make_vocabulary(tokens_list)

4530 unique terms in vocabulary


In [152]:
def make_feature_matrix(tokens_list, vocabulary):
    X = lil_matrix((len(tweets), len(vocabulary)))
    for i, tokens in enumerate(tokens_list):
        for token in tokens:
            j = vocabulary[token]
            X[i,j] += 1
    return X.tocsr()  # convert to CSR for more efficient random access.

In [153]:
X = make_feature_matrix(tokens_list, vocabulary)

In [154]:
def do_cross_val(X, y, nfolds):
    """ Compute average cross-validation acccuracy."""
    cv = KFold(len(y), nfolds)
    accuracies = []
    for train_idx, test_idx in cv:
        clf = LogisticRegression()
        clf.fit(X[train_idx], y[train_idx])
        predicted = clf.predict(X[test_idx])
        acc = accuracy_score(y[test_idx], predicted)
        accuracies.append(acc)
    avg = np.mean(accuracies)
    return avg

In [208]:
print 'avg accuracy', do_cross_val(X, y, 10)

avg accuracy 0.707538217386
