In [1]:
from TwitterAPI import TwitterAPI
from collections import defaultdict, Counter
from scipy.sparse import lil_matrix
from StringIO import StringIO
from zipfile import ZipFile
from urllib import urlopen
from sklearn.cross_validation import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score #, confusion_matrix

import requests
import ConfigParser
import sys
import pickle
import re
import numpy as np

In [2]:
"""
Get location info for Streaming API use from the txt file using the city's name as the filename

Paras:
    city_name: city's name
    
Returns:
    a string to record the location info of the city
"""
def get_locations(city_name):
    return open(city_name.lower() + '.txt').readlines()[0].strip().lower()

locations = get_locations('Chicago')
print locations

-87.932688, 41.638685, -87.517954, 42.021388


In [3]:
"""
Get geo info for REST API use from the txt file using the city's name as the filename

Paras:
    city_name: city's name
    
Returns:
    a string to record the geo info of the city
"""
def get_geocode(city_name):
    return open(city_name.lower() + '.txt').readlines()[1].strip().lower()

geocode = get_geocode('Chicago')
print geocode

-87.627810, 41.882048, 50mi


In [4]:
"""
Get the sight names from the txt file using the city's name as the filename

Paras:
    city_name: city's name
    
Returns:
    a list of string to record the sight names of the city
"""
def get_sight_names(city_name):
    return [l.strip().lower() for l in open(city_name.lower() + '.txt').readlines()[2:]]

sight_names = get_sight_names('Chicago')
print 'INDEX\tNAME'
for i, n in enumerate(sight_names):
    print i, '\t', n

INDEX	NAME
0 	art institute of chicago
1 	millennium park
2 	cloud gate
3 	museum of science and industry
4 	wrigley field
5 	michigan avenue
6 	adler planetarium
7 	field museum
8 	willis tower
9 	u.s. cellular field
10 	symphony center
11 	chicago cultural center
12 	lincoln park
13 	holy name cathedral
14 	maggie daley park
15 	chicago theatre
16 	buckingham fountain
17 	grant park
18 	john hancock
19 	goodman theatre
20 	shedd aquarium
21 	oriental theatre
22 	united center
23 	university of chicago
24 	tribune tower
25 	garfield park
26 	chicago history museum
27 	navy pier
28 	rockefeller memorial chapel
29 	cadillac palace theatre


In [5]:
"""
Get multiple names of the sights.
    E.g. use “art institute of chicago”, “artinstituteofchicago” and “art_institute_of_chicago”
    for searching the tweets for the Art Institute of Chicago

Paras:
    sight_names: a sight's name
    
Returns:
    a list of tuple to record the multiple names of the sights
"""
def get_sight_multi_names(sight_names):
    return [(n, ''.join(n.split()), n.replace(' ', '_')) for n in sight_names]

sight_multi_names = get_sight_multi_names(sight_names)
for n in zip(sight_names, sight_multi_names):
    print n[0], '\n\t', ' | '.join(n[1][i] for i in range(len(n[1]))), '\n'

art institute of chicago 
	art institute of chicago | artinstituteofchicago | art_institute_of_chicago 

millennium park 
	millennium park | millenniumpark | millennium_park 

cloud gate 
	cloud gate | cloudgate | cloud_gate 

museum of science and industry 
	museum of science and industry | museumofscienceandindustry | museum_of_science_and_industry 

wrigley field 
	wrigley field | wrigleyfield | wrigley_field 

michigan avenue 
	michigan avenue | michiganavenue | michigan_avenue 

adler planetarium 
	adler planetarium | adlerplanetarium | adler_planetarium 

field museum 
	field museum | fieldmuseum | field_museum 

willis tower 
	willis tower | willistower | willis_tower 

u.s. cellular field 
	u.s. cellular field | u.s.cellularfield | u.s._cellular_field 

symphony center 
	symphony center | symphonycenter | symphony_center 

chicago cultural center 
	chicago cultural center | chicagoculturalcenter | chicago_cultural_center 

lincoln park 
	lincoln park | lincolnpark | lincoln_par

In [6]:
"""
Establish twitter connection

Paras:
    config_file: config file's name
    
Returns:
    twitter
"""
def get_twitter(config_file):
    config = ConfigParser.ConfigParser()
    config.read(config_file)
    twitter = TwitterAPI(
                   config.get('twitter', 'consumer_key'),
                   config.get('twitter', 'consumer_secret'),
                   config.get('twitter', 'access_token'),
                   config.get('twitter', 'access_token_secret'))
    return twitter

twitter = get_twitter('twitter.cfg')
print 'Established Twitter connection.'

Established Twitter connection.


In [7]:
"""
Get tweets using Stream API 'statuses/filter' within the limit amount with two modes:
    mode = 0: use track: keywords as paras
    mode = 1: use location: get_location(city_name) as paras

Paras:
    twitter: twitter connection
    limit: limit amount of the retrieved tweets
    city_name: city's name
    mode: when 0 use keywords as paras to search for tweets, when 1 use location
    keywords: used to search tweets
    lang='en': language option, default English
    verbose=False: if True, print log
    n=100: frequency of printing log
    
Returns:
    a list of dictionary to store all the tweets retrieved
"""
def get_tweets(twitter,
               limit,
               city_name,
               mode,
               keywords,
               lang='en',
               verbose=False,
               n=100):
    tweets = []
    paras = {}

    # statuses/filter
    # track: keywords
    if mode == 0:
        if keywords:
            paras['track'] = keywords
    
    # statuses/filter
    # location: get_location(city_name)
    elif mode == 1:
        paras['locations'] = get_locations(city_name)
        
    print 'mode =', mode
    print 'paras=' + str(paras)
    
    while True:
        try:
            if len(paras) != 0:
                for response in twitter.request('statuses/filter', paras):
                    tweets.append(response)
                    if verbose:
                        if len(tweets) % n == 0:
                            print 'found %d tweets' % len(tweets)
                    if len(tweets) >= limit:
                        return tweets
        except:
            print "Unexpected error:", sys.exc_info()[0]
        
    return tweets

# First approach - first try

In [None]:
"""
def get_tweets(twitter,
           limit,
           city_name,
           mode,
           keywords,
           lang='en',
           verbose=False,
           n=100)

Get tweets with Stream API using mode 0
"""
tweets = get_tweets(twitter,
                    50,
                    'Chicago',
                    0,
                    'art institute of chicagi',
                    'en',
                    True,
                    10)
print 'Get %d tweets.' % len(tweets)

This approach doesn't work.

We implement a second approach for smaller amount of data.

# First Approach - second try

In [None]:
"""
def get_tweets(twitter,
           limit,
           city_name,
           mode,
           keywords,
           lang='en',
           verbose=False,
           n=100)

Get tweets with Stream API using mode 0
"""
tweets = get_tweets(twitter,
                    50,
                    'Chicago',
                    1,
                    None,
                    'en',
                    True,
                    10)
print 'Get %d tweets.' % len(tweets)

In [None]:
"""
Dump the tweets into a file for archive
"""
pickle.dump(tweets, open('tweets.pkl', 'wb'))

In [8]:
"""
Read tweets from the archived file
"""
tweets = pickle.load(open('tweets.pkl', 'rb'))
print 'Get %d tweets from archive file.' % len(tweets)

Get 5000 tweets from archive file.


In [9]:
"""
Print a tweet's content

Paras:
    tweets: list of tweets
    index: index of the tweet to be printed
    
Returns:
    N/A
"""
def print_tweet(tweets, index):
    test_tweet = tweets[index]
    print('tweet:\n\tscreen_name = %s\n\tname = %s\n\tdescr = %s\n\ttext = %s' %
          (test_tweet['user']['screen_name'],
           test_tweet['user']['name'],
           test_tweet['user']['description'],
           test_tweet['text']))

print_tweet(tweets, 3377)
print_tweet(tweets, 4366)

tweet:
	screen_name = Lainey
	name = Lainey Canevaro
	descr = Small town NorCal girl, living in the Windy City; Wifey, Mom, Sis, Daughter. Sbux coffee & @FlacosTacos Sangarita lover! Thoughts my own (FBOW) #cl = client
	text = Sherlock Holmes. Very Interesting... (@ Oriental Theatre - @broadwaychicago for Sherlock Holmes (Chicago)) https://t.co/gS0aQ1LDsx
tweet:
	screen_name = OscarSanchezR_
	name = Oscar Sanchez
	descr = Ingeniero Industrial
	text = 🌠📽🚪 #chicago #theatre @ Chicago Theatre https://t.co/87XP0uyNEZ


In [10]:
"""
Get each sight's tweet indices that relates to this sight from the retrieved tweets list

Paras:
    tweets: list of tweets
    names: sights' names
    
Returns:
    a list of list of indices
"""
def get_sight_indices(tweets, names):
    indices = []
    for i, t in enumerate(tweets):
        for n in names:
            if i not in indices and n in t['text'].lower():
                indices.append(i)
    return indices

sight_indices = []
for names in sight_multi_names:
        sight_indices.append(get_sight_indices(tweets, names))

for i in sight_indices:
    print i,

[2305, 2466] [2293, 3740] [1783, 3170] [4769] [] [1108] [] [606, 2419] [] [] [] [] [4530] [] [] [4366] [] [] [4572] [] [1660, 2634] [3055, 3377] [] [] [] [] [] [] [] []


In [11]:
"""
Print the result from getting sights' indices
"""
print 'The tweets that has \'art institute of chicago\' or \'artinstituteofchicago\' or \'art_institute_of_chicago\':'
for i, t in enumerate(tweets):
    if 'art institute of chicago' in t['text'].lower():
        print i, ':', t['text']

The tweets that has 'art institute of chicago' or 'artinstituteofchicago' or 'art_institute_of_chicago':
2305 : A day at @artinstitutechi #chicago #artinstituteofchicago #art #seurat @ The Art Institute of Chicago https://t.co/gW8ZRY2kRx
2466 : Inside looking out @ The Art Institute of Chicago https://t.co/3NgNZWrCcn


This approach doesn't work when the amount of tweets is relatively small.

We implement a second approach for small amount of data.

# Second Approach

In [14]:
"""
Get tweets using REST API 'search/tweets' within the limit amount with two modes:
    mode = 0: use q: keywords, lang: lang as paras
    mode = 1: use q: keywords, lang: lang, geocode: get_geocode(city_name) as paras

Paras:
    twitter: twitter connection
    mode: when 0 use keywords and language as paras to search for tweets, when 1 use keywords language and location
    city_name: city's name
    keywords: used to search tweets
    limit: limit amount of the retrieved tweets
    lang='en': language option, default English
    
Returns:
    a list of dictionary to store all the tweets retrieved
"""
def get_tweets2(twitter,
                mode,
                city_name,
                keywords,
                limit=100,
                lang='en'):
    tweets = []
    paras = {}
    
    # search/tweets
    # q: keywords
    # lang: lang
    # geocode: get_geocode(city_name)
    if keywords:
        paras['q'] = keywords
        paras['count'] = limit
        paras['lang'] = lang
        if mode == 1:
            paras['geocode'] = get_geocode(city_name)
    
    print 'paras=' + str(paras)
    for r in twitter.request('search/tweets', paras):
        tweets.append(r)
    print "found %d tweets" % len(tweets)
    
    return tweets

In [15]:
"""
Get tweets with REST API using mode 1
"""
sight_tweets_geo = []
for n in sight_names:
    print "Retrieving tweets with geo info for:", n
    sight_tweets_geo.append(get_tweets2(twitter, 1, 'Chicago', n))

Retrieving tweets with geo info for: art institute of chicago
paras={'q': 'art institute of chicago', 'count': 100, 'geocode': '-87.627810, 41.882048, 50mi', 'lang': 'en'}
found 0 tweets
Retrieving tweets with geo info for: millennium park
paras={'q': 'millennium park', 'count': 100, 'geocode': '-87.627810, 41.882048, 50mi', 'lang': 'en'}
found 0 tweets
Retrieving tweets with geo info for: cloud gate
paras={'q': 'cloud gate', 'count': 100, 'geocode': '-87.627810, 41.882048, 50mi', 'lang': 'en'}
found 0 tweets
Retrieving tweets with geo info for: museum of science and industry
paras={'q': 'museum of science and industry', 'count': 100, 'geocode': '-87.627810, 41.882048, 50mi', 'lang': 'en'}
found 0 tweets
Retrieving tweets with geo info for: wrigley field
paras={'q': 'wrigley field', 'count': 100, 'geocode': '-87.627810, 41.882048, 50mi', 'lang': 'en'}
found 0 tweets
Retrieving tweets with geo info for: michigan avenue
paras={'q': 'michigan avenue', 'count': 100, 'geocode': '-87.627810,

This way return nealy nothing

In [16]:
"""
Get tweets with REST API using mode 0
"""
sight_tweets = []
for n in sight_names:
    print "Retrieving tweets for:", n
    sight_tweets.append(get_tweets2(twitter, 0, 'Chicago', n))

Retrieving tweets for: art institute of chicago
paras={'q': 'art institute of chicago', 'count': 100, 'lang': 'en'}
found 100 tweets
Retrieving tweets for: millennium park
paras={'q': 'millennium park', 'count': 100, 'lang': 'en'}
found 100 tweets
Retrieving tweets for: cloud gate
paras={'q': 'cloud gate', 'count': 100, 'lang': 'en'}
found 100 tweets
Retrieving tweets for: museum of science and industry
paras={'q': 'museum of science and industry', 'count': 100, 'lang': 'en'}
found 100 tweets
Retrieving tweets for: wrigley field
paras={'q': 'wrigley field', 'count': 100, 'lang': 'en'}
found 100 tweets
Retrieving tweets for: michigan avenue
paras={'q': 'michigan avenue', 'count': 100, 'lang': 'en'}
found 100 tweets
Retrieving tweets for: adler planetarium
paras={'q': 'adler planetarium', 'count': 100, 'lang': 'en'}
found 54 tweets
Retrieving tweets for: field museum
paras={'q': 'field museum', 'count': 100, 'lang': 'en'}
found 100 tweets
Retrieving tweets for: willis tower
paras={'q': '

In [17]:
print 'Get tweets for %d sights' % len(sight_tweets)
print 'sight 0 has %d tweets' % len(sight_tweets[0])

Get tweets for 30 sights
sight 0 has 100 tweets


In [18]:
"""
Sort the number of tweets retrieved for every sight

Paras:
    sight_tweets: list of tweets group by sight
    
Returns:
    a list of tuple that records (index of the sight, mention times)
"""
def sort_mention_times(sight_tweets):
    mention_times = ((index, len(tweets)) for index, tweets in enumerate(sight_tweets))
    return sorted(mention_times, key=lambda x:x[1], reverse=True)

sorted_sight_tweets = sort_mention_times(sight_tweets)
for i in sorted_sight_tweets:
    print i, sight_names[i[0]]

(0, 100) art institute of chicago
(1, 100) millennium park
(2, 100) cloud gate
(3, 100) museum of science and industry
(4, 100) wrigley field
(5, 100) michigan avenue
(7, 100) field museum
(8, 100) willis tower
(10, 100) symphony center
(11, 100) chicago cultural center
(15, 100) chicago theatre
(18, 100) john hancock
(20, 100) shedd aquarium
(21, 100) oriental theatre
(22, 100) united center
(23, 100) university of chicago
(24, 100) tribune tower
(25, 100) garfield park
(26, 100) chicago history museum
(17, 94) grant park
(12, 92) lincoln park
(27, 91) navy pier
(14, 79) maggie daley park
(19, 63) goodman theatre
(6, 54) adler planetarium
(29, 42) cadillac palace theatre
(16, 31) buckingham fountain
(13, 27) holy name cathedral
(9, 7) u.s. cellular field
(28, 3) rockefeller memorial chapel


The mention contributes little to the result.

In [19]:
"""
Get AFINN lexicon dataset
"""
url = urlopen('http://www2.compute.dtu.dk/~faan/data/AFINN.zip')
zipfile = ZipFile(StringIO(url.read()))
afinn_file = zipfile.open('AFINN/AFINN-111.txt')

afinn = dict()

for line in afinn_file:
    parts = line.strip().split()
    if len(parts) == 2:
        afinn[parts[0]] = int(parts[1])

print 'Read', len(afinn), 'AFINN terms'

Read 2462 AFINN terms


In [20]:
"""
Using AFINN to seperate positive and negtive terms

Paras:
    terms: terms of the tweets to be determined pos/neg
    afinn: AFINN lexicon dictionary
    verbose=False: print log if True
    
Returns:
    pos: score of total positive terms
    neg: score of total megative terms
"""
def afinn_sentiment(terms, afinn, verbose=False):
    pos = 0
    neg = 0
    for t in terms:
        if t in afinn:
            if verbose:
                print '\t%s=%d' % (t, afinn[t])
            if afinn[t] > 0:
                pos += afinn[t]
            else:
                neg += -1 * afinn[t]
    return pos, neg

In [21]:
"""
Tokenize a tweet

Paras:
    string: the tweet to be tokenized
    lowercase: change the tweet content to lowercase if True
    keep_punctuation: keep all punctuations when tokenizing the tweet if True
    collapse_urls: collapse all urls when tokenizing the tweet if True
    collapse_mentions: collapse all mentions when tokenizing the tweet if True
    
Returns:
    a list of tokens
"""
def tokenize(tweet,
             lowercase,
             keep_punctuation,
             collapse_urls,
             collapse_mentions):
    """ Split a tweet into tokens."""
    if not tweet:
        return []
    
    if lowercase:
        tweet = tweet.lower()
        
    tokens = []
    
    if collapse_urls:
        tweet = re.sub('http\S+', 'THIS_IS_A_URL', tweet)
        
    if collapse_mentions:
        tweet = re.sub('@\S+', 'THIS_IS_A_MENTION', tweet)
        
    if keep_punctuation:
        tokens = tweet.split()
    else:
        tokens = re.sub('\W+', ' ', tweet).split()
        
    return tokens

In [22]:
"""
Get tokens list for all sights
"""
sight_tokens_list = []
for ts in sight_tweets:
    sight_tokens_list.append([tokenize(t['text'],
                                       lowercase=True,
                                       keep_punctuation=False,
                                       collapse_urls=True,
                                       collapse_mentions=False)
                  for t in ts])
print 'Get tokens list for %d sights' % len(sight_tokens_list)
print 'Print an example of the tokens:'
print sight_tokens_list[0][4]

Get tokens list for 30 sights
Print an example of the tokens:
[u'rt', u'angellxn', u'when', u'whole', u'squad', u'gets', u'into', u'paris', u'college', u'of', u'art', u'and', u'art', u'institute', u'of', u'chicago', u'THIS_IS_A_URL']


In [23]:
"""
Using AFINN lexicon dict to seperate the tweets to three different sets of tweets
"""
sight_positives = []
sight_negatives = []
sight_neutrals = []
for tokens in sight_tokens_list:
    positives = []
    negatives = []
    neutrals = []
    for tweet in tokens:
        pos, neg = afinn_sentiment(tweet, afinn)
        if pos > neg:
            positives.append((pos, neg, ' '.join(tweet)))
        elif neg > pos:
            negatives.append((pos, neg, ' '.join(tweet)))
        else:
            neutrals.append((pos, neg, ' '.join(tweet)))
    sight_positives.append(positives)
    sight_negatives.append(negatives)
    sight_neutrals.append(neutrals)

print len(sight_positives)
print len(sight_positives)
print len(sight_neutrals)

30
30
30


In [24]:
"""
Print all positive tweets in order
"""
for index, positives in enumerate(sight_positives):
    print '\n---Positive tweets of:', sight_names[index]
    for pos, neg, tweet in sorted(positives, key=lambda x: x[1], reverse=True):
        print pos, neg, tweet


---Positive tweets of: art institute of chicago
2 0 got gloves from the school of the art institute of chicago in the mail today they re so cute saic THIS_IS_A_URL
3 0 rt emilythough on saturday adam and i are going to chicago and i am so excited to introduce him to the loves of my life in the art instit
3 0 on saturday adam and i are going to chicago and i am so excited to introduce him to the loves of my life in the art institute aka van gogh
4 0 mariidipierro if you re ever in the city of chicago you must check out the art institute amazing
2 0 i m not lion it feels like christmas chicago chicagoartinstitute the art institute of chicago THIS_IS_A_URL
2 0 top story ask vincent the art institute of chicago THIS_IS_A_URL see more THIS_IS_A_URL
2 0 top story ask vincent the art institute of chicago THIS_IS_A_URL see more THIS_IS_A_URL
3 0 i love this idea and i ll ask vincent a question too ask vincent the art institute of chicago THIS_IS_A_URL via addthis
2 0 small yet so powerful art

In [25]:
"""
Print all negative tweets in order
"""
for index, negatives in enumerate(sight_negatives):
    print '\n---Negative tweets of:', sight_names[index]
    for pos, neg, tweet in sorted(negatives, key=lambda x: x[1], reverse=True):
        print pos, neg, tweet


---Negative tweets of: art institute of chicago
2 3 he looks mad i don t think he likes that wreath him the art institute of chicago THIS_IS_A_URL
0 3 lost in a void art institute of chicago THIS_IS_A_URL
0 2 apparently this is a problem at the school of the art institute chicago learningphotography THIS_IS_A_URL
0 2 miss this day artmuseum the art institute of chicago THIS_IS_A_URL
0 1 what passed for daylight today gray the art institute of chicago THIS_IS_A_URL

---Negative tweets of: millennium park
1 4 rt ianermoian so protesters think that tearing down the christmas tree in millennium park will solve the problems of laquan mcdonald s sh
1 3 before the weather gets ugly with the snow and drops below 10 degrees i want to go see the tree in millennium park and the zoo lights
0 2 miss them millennium park THIS_IS_A_URL
0 2 only thing mayor emanuel did wrong is move the christmas tree to millennium park
0 1 made a stop at the tree at millennium park jinglearoundtheloop THIS_IS_A_URL


In [26]:
"""
Print all neutral tweets in order
"""
for index, neutrals in enumerate(sight_neutrals):
    print '\n---Neutral tweets of:', sight_names[index]
    for pos, neg, tweet in sorted(neutrals, key=lambda x: x[1], reverse=True):
        print pos, neg, tweet


---Neutral tweets of: art institute of chicago
2 2 insignificant trollstate russia what is important museim art institute of chicago stroller tour monthly on the 1st wednesday 11 30 12 30 pm
0 0 rt moderndayms seven things you must see at the art institute of chicago ups and downs of a yoga mom THIS_IS_A_URL upsdownsy
0 0 seven things you must see at the art institute of chicago ups and downs of a yoga mom THIS_IS_A_URL upsdownsyogamom chicago
0 0 ask vincent the art institute of chicago THIS_IS_A_URL via addthis
0 0 travel calling on legends at the art institute of chicago craveonline THIS_IS_A_URL via craveonline
0 0 rt angellxn when whole squad gets into paris college of art and art institute of chicago THIS_IS_A_URL
0 0 ask vincent the art institute of chicago THIS_IS_A_URL via addthis
0 0 who s been to the art institute of chicago
0 0 rt artinstitutechi a look at the new contemporary an unprecedented exhibition on view 12 13 via archdigest THIS_IS_A_URL htt
0 0 rt claytonguse the

In [27]:
"""
Calculate the number of tweets in each sets and the total number of tweets
"""
total_pos = 0
total_neg = 0
total_neu = 0
for pos_ts in sight_positives:
    total_pos += len(pos_ts)
for neg_ts in sight_negatives:
    total_neg += len(neg_ts)
for neu_ts in sight_neutrals:
    total_neu += len(neu_ts)
print 'Have %d tweets in total:' % (total_pos + total_neg + total_neu)
print 'positives=%d\nnegtives=%d\nneutral=%d' % (total_pos, total_neg, total_neu)

Have 2483 tweets in total:
positives=765
negtives=286
neutral=1432


In [28]:
"""
A second way using AFINN to seperate positive and negtive terms and calculate the total score of the terms

Paras:
    terms: terms of the tweets to be determined pos/neg
    afinn: AFINN lexicon dictionary
    
Returns:
    total: the total score of the terms
"""
def afinn_sentiment2(terms, afinn):
    total = 0.
    for t in terms:
        if t in afinn:
            total += afinn[t]
    return total

In [29]:
"""
Get the total score for each tweets of each sights
"""
sight_scores = []
for tokens in sight_tokens_list:
    scores= []
    for tweet in tokens:
        scores.append(afinn_sentiment2(tweet, afinn))
    sight_scores.append(scores)

print len(sight_scores)
print len(sight_scores[0])
print sight_scores[0]

30
100
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 0.0, 0.0, 3.0, 3.0, 4.0, 2.0, 0.0, -2.0, -1.0, 0.0, 0.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 0.0, 0.0, 0.0, 2.0, 0.0, 3.0, 2.0, 0.0, 0.0, 0.0, 0.0, 3.0, 0.0, 4.0, 4.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 0.0, 0.0, 0.0, 0.0, 2.0, 2.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 0.0, 0.0, 0.0, 0.0, -3.0, 0.0, 0.0, -2.0, 0.0, 2.0, 2.0, 1.0, 2.0, 0.0, 2.0, 0.0, 0.0, 2.0, 2.0, 0.0, 0.0]


In [30]:
"""
Calculate the mean score for each sight
"""
sight_mean_scores = [(i, sum(s) / len(s)) for i, s in enumerate(sight_scores)]
print sight_mean_scores

[(0, 0.48), (1, 0.54), (2, 0.22), (3, 0.94), (4, 0.52), (5, -0.03), (6, 0.5), (7, 1.24), (8, 0.17), (9, 0.5714285714285714), (10, 0.4), (11, 0.2), (12, 0.17391304347826086), (13, 0.3333333333333333), (14, 1.0886075949367089), (15, 0.1), (16, 1.2903225806451613), (17, 1.3936170212765957), (18, 0.76), (19, 0.4126984126984127), (20, 1.34), (21, 0.78), (22, 1.53), (23, -2.55), (24, 0.56), (25, -0.16), (26, 0.76), (27, 0.4945054945054945), (28, 0.0), (29, -0.5476190476190477)]


In [31]:
"""
Print the first 10 mean scores and sights' names
"""
top_pos = sorted(sight_mean_scores, key=lambda x:x[1], reverse=True)[:10]
for t in top_pos:
    print '%.5f %s' % (t[1], sight_names[t[0]])

1.53000 united center
1.39362 grant park
1.34000 shedd aquarium
1.29032 buckingham fountain
1.24000 field museum
1.08861 maggie daley park
0.94000 museum of science and industry
0.78000 oriental theatre
0.76000 john hancock
0.76000 chicago history museum


In [32]:
"""
Print the last 10 mean scores and sights' names
"""
top_neg = sorted(sight_mean_scores, key=lambda x:x[1])[:10]
for t in top_neg:
    print '%.5f %s' % (t[1], sight_names[t[0]])

-2.55000 university of chicago
-0.54762 cadillac palace theatre
-0.16000 garfield park
-0.03000 michigan avenue
0.00000 rockefeller memorial chapel
0.10000 chicago theatre
0.17000 willis tower
0.17391 lincoln park
0.20000 chicago cultural center
0.22000 cloud gate


In [33]:
"""
Get the most frequently-used terms a sight's tweets use

Paras:
    tokens_list: the tokens list of the tweets of a sight
    n=20: top n frequently-used terms to be returned
    
Returns:
    a Counter stores the top n frequentlt-used terms
"""
def get_most_frequent_terms(tokens_list, n=20):
    counts = Counter()
    for tokens in tokens_list:
        str_tokens = [str(t) for t in tokens]
        counts.update(str_tokens)
    return counts

In [34]:
"""
Get the top 30 frequently-used terms for each sight
"""
sight_term_counts = []
for tl in sight_tokens_list:
    sight_term_counts.append(get_most_frequent_terms(tl, 30))

In [35]:
"""
Print the top 30 frequently-used terms for the first 10 sight
"""
for p in top_pos:
    print 'The top 30 frequent terms for \'%s\':' % sight_names[p[0]]
    print sorted(sight_term_counts[p[0]].items(), key=lambda x:x[1], reverse=True)[:30], '\n'

The top 30 frequent terms for 'united center':
[('center', 93), ('united', 90), ('THIS_IS_A_URL', 73), ('the', 54), ('at', 49), ('to', 38), ('rt', 38), ('bulls', 38), ('1', 28), ('7', 27), ('kcjhoop', 21), ('improve', 20), ('nuggets', 19), ('of', 19), ('99', 17), ('90', 17), ('down', 17), ('a', 15), ('this', 14), ('and', 14), ('chicago', 13), ('states', 13), ('4', 11), ('game', 11), ('i', 10), ('now', 9), ('is', 9), ('in', 9), ('tickets', 8), ('2', 8)] 

The top 30 frequent terms for 'grant park':
[('park', 96), ('THIS_IS_A_URL', 92), ('grant', 91), ('in', 40), ('the', 39), ('at', 32), ('a', 22), ('rt', 21), ('agora', 16), ('s', 16), ('to', 14), ('are', 14), ('this', 13), ('for', 13), ('sculpture', 12), ('atlanta', 12), ('great', 11), ('male', 11), ('pyrenees', 11), ('man', 11), ('they', 11), ('found', 11), ('anyone', 11), ('national', 11), ('fseb2', 11), ('skinny', 11), ('shy', 11), ('missing', 11), ('elisemeansbiz', 11), ('will', 11)] 

The top 30 frequent terms for 'shedd aquarium':

In [36]:
"""
Print the top 30 frequently-used terms for the last 10 sight
"""
for p in top_neg:
    print 'The top 30 frequent terms for \'%s\':' % sight_names[p[0]]
    print sorted(sight_term_counts[p[0]].items(), key=lambda x:x[1], reverse=True)[:30], '\n'

The top 30 frequent terms for 'university of chicago':
[('of', 111), ('university', 100), ('chicago', 98), ('THIS_IS_A_URL', 65), ('the', 63), ('rt', 46), ('s', 36), ('threat', 35), ('onlineclasses', 33), ('classes', 33), ('online', 33), ('a', 32), ('and', 31), ('hell', 27), ('weeks', 27), ('t', 27), ('yikyakapp', 27), ('between', 27), ('christmas', 27), ('place', 27), ('isn', 27), ('three', 27), ('those', 27), ('it', 27), ('thanksgiving', 27), ('to', 23), ('cancels', 22), ('gun', 22), ('after', 22), ('at', 16)] 

The top 30 frequent terms for 'cadillac palace theatre':
[('THIS_IS_A_URL', 47), ('palace', 42), ('cadillac', 42), ('theatre', 40), ('the', 40), ('king', 31), ('ticket', 30), ('lion', 30), ('at', 25), ('in', 25), ('chicago', 22), ('tickets', 17), ('thelionking', 16), ('cheap', 15), ('down', 15), ('slashes', 15), ('broadwaychicago', 14), ('rt', 12), ('for', 12), ('until', 12), ('prices', 8), ('again', 8), ('one', 7), ('lives', 7), ('17', 7), ('week', 7), ('to', 6), ('of', 6), 

In [37]:
"""
Store the tweets into files to archive
"""
f = open('pos.txt', 'w')
for positives in sight_positives:
    for pos, neg, tweet in positives:
        content = '\t'.join([str(pos), str(neg), tweet]) + '\n'
        f.write(content)
f.close()

f = open('neg.txt', 'w')
for negatives in sight_negatives:
    for pos, neg, tweet in negatives:
        content = '\t'.join([str(pos), str(neg), tweet]) + '\n'
        f.write(content)
f.close()

f = open('neu.txt', 'w')
for neutrals in sight_neutrals:
    for pos, neg, tweet in neutrals:
        content = '\t'.join([str(pos), str(neg), tweet]) + '\n'
        f.write(content)
f.close()

In [38]:
"""
Get the labeled tweets from the archive files

Paras:
    filenames: the filenames that are used to retrieve labeled tweets from
    
Returns:
    labels: a list of all the labels
    tweets: a list of all the relative tweets
"""
def get_labeled_tweets(filenames):
    labels = []
    tweets = []
    length = 0
    for f in filenames:
        for l in open(f).readlines():
            terms = l.strip().lower().split('\t')
            labels.append(terms[0])
            tweets.append(terms[3])
        print 'Get %d labels and tweets from the archived file: %s' % (len(tweets) - length, f)
        length = len(tweets)
    return labels, tweets

In [39]:
"""
Get the labels and relative labeled tweets
"""
labels, labeled_tweets = get_labeled_tweets(['pos_labeled.txt', 'neg_labeled.txt', 'neu_labeled.txt'])
print len(labels)
print len(labeled_tweets)

Get 816 labels and tweets from the archived file: pos_labeled.txt
Get 430 labels and tweets from the archived file: neg_labeled.txt
Get 1243 labels and tweets from the archived file: neu_labeled.txt
2489
2489


In [40]:
"""
Get tokens list for all labeled tweets
"""
tokens_list = [tokenize(t,
                        lowercase=True,
                        keep_punctuation=False,
                        collapse_urls=True,
                        collapse_mentions=False)
              for t in labeled_tweets]

In [41]:
"""
Make vocabulary using the tokens list

Paras:
    tokens_list: the tokens list to be used to make the vocabulary
    
Returns:
    a dictionary of thee vocabulary
"""
def make_vocabulary(tokens_list):
    vocabulary = defaultdict(lambda: len(vocabulary))  # If term not present, assign next int.
    for tokens in tokens_list:
        for token in tokens:
            vocabulary[token]  # looking up a key; defaultdict takes care of assigning it a value.
    print '%d unique terms in vocabulary' % len(vocabulary)
    return vocabulary

vocabulary = make_vocabulary(tokens_list)

4530 unique terms in vocabulary


In [42]:
"""
Make feature matrix using tweets, tokens list and vocabulary

Paras:
    tweets
    tokens_list
    vocabulary
    
Returns:
    a csr matrix of the features
"""
def make_feature_matrix(tweets, tokens_list, vocabulary):
    X = lil_matrix((len(tweets), len(vocabulary)))
    for i, tokens in enumerate(tokens_list):
        for token in tokens:
            j = vocabulary[token]
            X[i,j] += 1
    return X.tocsr()  # convert to CSR for more efficient random access.

In [43]:
"""
Get the feature matrix and label array
"""
X = make_feature_matrix(labeled_tweets, tokens_list, vocabulary)
y = np.array(labels)

In [44]:
"""
Using logistic regression to do n-fold cross validation

Paras:
    X: csr matrix of feature
    y: list of labels
    nfolds: n-fold
    
Returns:
    avg: the average accuracy of the cross validation
"""
def do_cross_val(X, y, nfolds):
    """ Compute average cross-validation acccuracy."""
    cv = KFold(len(y), nfolds)
    accuracies = []
    for train_idx, test_idx in cv:
        clf = LogisticRegression()
        clf.fit(X[train_idx], y[train_idx])
        predicted = clf.predict(X[test_idx])
        acc = accuracy_score(y[test_idx], predicted)
        accuracies.append(acc)
    avg = np.mean(accuracies)
    return avg

print 'avg accuracy', do_cross_val(X, y, 10)

avg accuracy 0.707538217386
