In [None]:
# Basic stuff - don't worry too much about it, I'm just loading all the things you need
# If you have any problems running this first cell, run `pip install -r requirements.txt`

%matplotlib inline
import tweepy
import pandas as pd
import json
import numbers
import re
import os.path
import datetime

# Notebook display settings for pandas: show up to 400 characters per cell,
# at most 25 rows, and every column (None removes the column limit).
pd.options.display.max_colwidth = 400
pd.options.display.max_rows = 25
pd.options.display.max_columns = None

Create a Twitter app at https://apps.twitter.com, then generate an access token. Put your consumer keys and access tokens in the credentials.txt file.

In [None]:
# This reads the credentials you just saved in credentials.txt so they can be used to access Twitter and get tweets
def load_credentials():
    """Load the four Twitter API credentials from ``credentials.txt``.

    Every line of the file is checked for the marker names ``consumer_key``,
    ``consumer_secret``, ``access_token`` and ``access_secret``. The value
    taken for a marker is the first quoted string (single or double quotes)
    on that line. Lines that still contain the ``fill_in`` placeholder, and
    lines that have no quoted value at all, are ignored instead of raising.

    Returns:
        tuple: ``(consumer_key, consumer_key_secret, access_token,
        access_token_secret)``. An entry is ``None`` when the file does not
        exist or no usable line was found for that credential.
    """
    # Marker text searched for in each line, in the order of the returned tuple.
    markers = ("consumer_key", "consumer_secret", "access_token", "access_secret")
    found = dict.fromkeys(markers)

    if os.path.isfile('credentials.txt'):
        # Context manager so the file handle is always closed.
        with open('credentials.txt') as handle:
            lines = [line.rstrip('\n') for line in handle]

        quoted_value = re.compile(r'[\"\']([^\"\']*)[\"\']')
        for line in lines:
            if 'fill_in' in line:
                continue
            values = quoted_value.findall(line)
            if not values:
                # The line mentions no quoted value; nothing to extract from it.
                continue
            for marker in markers:
                if marker in line:
                    found[marker] = values[0]

    return (found["consumer_key"], found["consumer_secret"],
            found["access_token"], found["access_secret"])

In [None]:
# Log in to Twitter with the credentials from credentials.txt, then test that it works.
# This is pulling 20 tweets from your own Twitter timeline (tweets from ppl you follow).
# If tweets print out below, then your credentials are all set!

consumer_key, consumer_key_secret, access_token, access_token_secret = load_credentials()
if None in (consumer_key, consumer_key_secret, access_token, access_token_secret):
    raise ValueError("Could not read all four credentials - check credentials.txt")

# `auth` is reused further down to open the tweet stream; `api` is used for normal requests.
auth = tweepy.OAuthHandler(consumer_key, consumer_key_secret)
auth.set_access_token(access_token, access_token_secret)
api = tweepy.API(auth)

tweets_raw_data = []
public_tweets = api.home_timeline()
for tweet in public_tweets:
    print(tweet.text)
    tweets_raw_data.append(tweet)

In [None]:
# This is what the data you get back actually looks like (just the first tweet):
# `_json` is read from the first item collected in tweets_raw_data above.
# NOTE(review): presumably the raw API payload tweepy built the tweet object from - confirm.
tweets_raw_data[0]._json

In [None]:
# The meat of the project  - this is a function that gets tweets from twitter based on your search criteria
# It does some basic cleaning up and puts the tweets in a format that is easier to work with for stats stuff

class MyStreamListener(tweepy.StreamListener):
    """Stream listener that collects tweets into a DataFrame and a CSV file.

    Each incoming tweet is flattened into one row: selected tweet fields are
    stored as ``tweet_<field>``, selected fields of the nested ``user`` object
    keep their own name, and the same tweet fields of a nested
    ``retweeted_status`` are stored as ``retweet_<field>``. Numeric values are
    stored as strings.

    Args:
        limit (int): number of tweets to collect before stopping the stream.
        print_output (bool): print the text of every collected tweet.
        save_output (bool): write the collected rows to ``filename`` once
            ``limit`` is reached.
        filename (str): path of the CSV file to write.
        include_rts (bool): when False, retweets are dropped (a tweet counts
            as a retweet if it carries retweet text or starts with ``RT @``).
        strict_text_search (bool): when True, drop tweets whose text contains
            none of ``search_terms`` (case-insensitive substring match).
        search_terms (str or list): terms used by ``strict_text_search``. A
            string is split into words; ``None`` or an empty value disables
            the filter because there is nothing to match against.

    Attributes:
        df: DataFrame holding the rows collected so far.
        counter: number of rows collected so far.
    """

    # Fields copied from the tweet itself and from a nested ``retweeted_status``.
    TWEET_FIELDS = ['created_at','id','text','source','favorite_count','coordinates','lang','place','retweet_count','retweeted','truncated']
    # Fields copied from the nested ``user`` object.
    USER_FIELDS = ['name','screen_name','location','id_str','statuses_count','followers_count','friends_count','favourites_count','description']

    def __init__(self,limit=20,print_output=True,save_output=True,
                 filename='file.csv',include_rts=True,strict_text_search=False,
                 search_terms=None):
        # Let the tweepy base class set up whatever state it expects.
        super(MyStreamListener, self).__init__()
        self.df = pd.DataFrame()
        self.limit = limit
        self.counter = 0
        self.print_output = print_output
        self.header=False
        self.save_output=save_output
        self.filename=filename
        self.include_rts=include_rts
        self.strict_text_search = strict_text_search
        self.search_terms = search_terms

    @staticmethod
    def _pick_fields(source, wanted, prefix):
        """Return ``{prefix + key: value}`` for the ``wanted`` keys of ``source``; numbers become strings."""
        picked = {}
        for key, value in source.items():
            if key not in wanted:
                continue
            if isinstance(value, numbers.Number):
                value = str(value)
            picked[prefix + key.strip()] = value
        return picked

    def _search_term_list(self):
        """Normalise ``self.search_terms`` to a list of terms and return it."""
        terms = self.search_terms
        if isinstance(terms, list):
            return terms
        if terms is None:
            terms = []
        elif isinstance(terms, (tuple, set, frozenset)):
            terms = list(terms)
        else:
            # A plain query string is split into its individual words.
            terms = re.findall(r"[\w']+", terms)
        self.search_terms = terms
        return terms

    def on_data(self, data):
        """Handle one raw stream message.

        Args:
            data: JSON text of the message.

        Returns:
            bool: True to keep the stream open, False once ``limit`` tweets
            have been collected.
        """
        decoded = json.loads(data)
        # Messages without a 'text' field cannot be processed as tweets
        # (presumably stream notices rather than tweets); skip them.
        if not isinstance(decoded, dict) or 'text' not in decoded:
            return True
        text = decoded['text']

        if self.strict_text_search:
            terms = self._search_term_list()
            # With no terms there is nothing to match, so the filter is not applied.
            if terms and not any(term.lower() in text.lower() for term in terms):
                print("skipped")
                print(text)
                return True

        row = self._pick_fields(decoded, self.TWEET_FIELDS, 'tweet_')
        row.update(self._pick_fields(decoded.get('user') or {}, self.USER_FIELDS, ''))
        row.update(self._pick_fields(decoded.get('retweeted_status') or {},
                                     self.TWEET_FIELDS, 'retweet_'))

        if not self.include_rts:
            if row.get('retweet_text') or row['tweet_text'].startswith('RT @'):
                return True

        self.df = pd.concat([self.df, pd.DataFrame(row, index=[0])])
        self.counter+=1

        if self.print_output:
            try:
                print(text)
            except UnicodeError:
                # The console could not encode the text; fall back to plain ASCII.
                print("Failure outputting tweet text %s"
                      % text.encode('ascii', 'ignore').decode('ascii'))

        if self.counter < self.limit:
            return True

        print("finished collecting %s tweets, ending" % self.limit)
        columns = ['tweet_' + x for x in self.TWEET_FIELDS] + self.USER_FIELDS
        if self.include_rts and 'retweet_text' in self.df.columns:
            columns = columns + ['retweet_' + x for x in self.TWEET_FIELDS]
        # reindex keeps a fixed column order and fills fields that never
        # appeared with NaN instead of failing on a missing column.
        self.df = self.df.reindex(columns=columns).rename(columns={'id_str':'user_id'})
        if self.save_output:
            self.df.to_csv(self.filename, index=False, encoding='utf-8')
        return False

    def on_error(self, status_code):
        """Return False for status 420 (the rate-limit code in tweepy's streaming docs); otherwise return None."""
        if status_code == 420:
            return False

    def on_disconnect(self, notice):
        """Print the disconnect notice passed in by the stream."""
        print("disconnecting due to " + str(notice))

In [None]:
## This is where you put what you want to search for - e.g. BREXIT
## The same value is reused below for the CSV filename, the stream's track filter
## and the listener's search_terms, so fill it in before running the next cell.
search_query = ''

In [None]:
# This is where you actually get the tweets and save them to a file

# This is where you need to alter stuff to match what you need
# Options you can set:
#        limit: int, how many tweets to capture
#        print_output: bool, whether to print the tweet to screen
#        save_output: bool, meant to control whether the tweet data is saved to a csv file
#            (leave it on: the last line of this cell reads that file back)
#        filename: str, the filename to name the saved output, by default it's file.csv
#        include_rts: bool, whether to capture retweets
#        strict_text_search: bool, occasionally the stream will capture a tweet that doesn't actually include the search query
#            set to True to filter out these "accidental" tweets
#        search_terms: str or array, pass in the search query or an array of terms you want to use for filtering
#           if strict_text_search = True. Script checks and turns any string into an array of strings

# The output file is named after the search query plus the current date and time
filename = '%s_%s.csv' % (search_query, datetime.datetime.now().strftime("%Y-%m-%d_%H.%M.%S"))
myStreamListener = MyStreamListener(limit=50,
                                    print_output=False,
                                    filename=filename,
                                    search_terms=search_query,
                                    strict_text_search=True,
                                    include_rts=False)
# `auth` must already hold your Twitter login before this line runs
myStream = tweepy.Stream(auth, listener=myStreamListener)
myStream.filter(track=[search_query])
# Load the CSV written by the listener back in as a DataFrame for the cells below
df = pd.read_csv(filename)

In [None]:
# This is how you display what you have (df is the table read back from the CSV in the previous cell)
df

In [None]:
# This is how you display just the tweets.
# Rows without any tweet text are left out; the result is a one-column table.
df.loc[df['tweet_text'].notnull(), ['tweet_text']]

In [None]:
# This is some basic stats as an example
# It fits an ordinary least squares regression of statuses_count on followers_count,
# using only users who have fewer than 1000 followers, to show whether the two are related.
# (Spoiler alert - yes there is a correlation but there is also a lot of noise)

import statsmodels.formula.api as sm
import seaborn as sns
result = sm.ols(formula="statuses_count ~ followers_count", data=df.query("followers_count<1000")).fit()
result.summary()

In [None]:
# This is a pretty graph showing what the stats mean: followers_count on the x axis,
# statuses_count on the y axis, again only for users with fewer than 1000 followers.
# A straight diagonal line from low left to high right means a positive correlation
ax = sns.regplot(x="followers_count", y="statuses_count", data=df.query('followers_count<1000'))