In [1]:
# Exercise 3.1.1 define a listener which listens to tweets in real time


import tweepy
# to install tweepy, use: pip install tweepy

# import twitter authentication module
from tweepy import OAuthHandler

# import tweepy steam module
from tweepy import Stream

# import stream listener
from tweepy.streaming import StreamListener

# import the python package to handle datetime
import datetime

# set your keys to access tweets 
# you can find your keys in Twitter.
consumer_key = ''
consumer_secret = ''
access_token = ''
access_secret = ''
 
auth = OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_secret)
 
# Customize a tweet event listener 
# inherited from StreamListener provided by tweepy
# This listener reacts when a tweet arrives or an error happens

class MyListener(StreamListener):
    
    # constructor
    def __init__(self, output_file, time_limit):
        
            # attribute to get listener start time
            self.start_time=datetime.datetime.now()
            
            # attribute to set time limit for listening
            self.time_limit=time_limit
            
            # attribute to set the output file
            self.output_file=output_file
            
            # initiate superclass's constructor
            StreamListener.__init__(self)
    
    # on_data is invoked when a tweet comes in
    # overwrite this method inheritted from superclass
    # when a tweet comes in, the tweet is passed as "data"
    def on_data(self, data):
        
        # get running time
        running_time=datetime.datetime.now()-self.start_time
        print(running_time)
        
        # check if running time is over time_limit
        if running_time.seconds/60.0<self.time_limit:
            
            # ***Exception handling*** 
            # If an error is encountered, 
            # a try block code execution is stopped and transferred
            # down to the except block. 
            # If there is no error, "except" block is ignored
            try:
                # open file in "append" mode
                with open(self.output_file, 'a') as f:
                    # Write tweet string (in JSON format) into a file
                    f.write(data)
                    
                    # continue listening
                    return True
                
            # if an error is encountered
            # print out the error message and continue listening
            
            except BaseException as e:
                print("Error on_data:" , str(e))
                
                # if return "True", the listener continues
                return True
            
        else:  # timeout, return False to stop the listener
            print("time out")
            return False
 
    # on_error is invoked if there is anything wrong with the listener
    # error status is passed to this method
    def on_error(self, status):
        print(status)
        # continue listening by "return True"
        return True

In [None]:
# Exercise 3.1.2 Collect tweets with specific topics within 2 minute

# initiate an instance of MyListener 
tweet_listener=MyListener(output_file="srksalman.txt",time_limit=10)

# start a staeam instance using authentication and the listener
twitter_stream = Stream(auth, tweet_listener)
# filtering tweets by topics
twitter_stream.filter(track=['#SlapAFilm', '#ISurviveTwitterBy','Kylie Jenner'])

In [None]:
tweet_listener=MyListener(output_file="newsrksalman.txt",time_limit=10)
twitter_stream = Stream(auth, tweet_listener)
twitter_stream.filter(track=['#SlapAFilm', '#ISurviveTwitterBy','Kylie Jenner'])
#twitter_stream.sample()

In [14]:
# Exercise 4.1. Read/write JSON 
import json
tweets=[]

with open('newsrksalman.txt', 'r') as f:
    # each line is one tweet string in JSON format
    for line in f: 
        
        # load a string in JSON format as Python dictionary
        tweet = json.loads(line) 
              
        tweets.append(tweet)

# write the whole list back to JSON
json.dump(tweets, open("all_tweets.json",'w'))

# to load the whole list
# pay attention to json.load and json.loads
tweets=json.load(open("all_tweets.json",'r'))


In [42]:
# A tweet is a dictionary
# Some values are dictionaries too!
# for details, check https://dev.twitter.com/overview/api/tweets

print("# of tweets:", len(tweets))
first_tweet=tweets[0]

print("\nprint out first tweet nicely:")
print(json.dumps(first_tweet, indent=4)) 
print (tweets[0]["text"])



('# of tweets:', 10865)

print out first tweet nicely:
{
    "quote_count": 0, 
    "contributors": null, 
    "truncated": false, 
    "text": "SPAM WITH MUCH FRUIT AND TONIC DEEPIKA PADUKONE THEY WOULD BE LIKE MATH GRAN PRIX SHOW CAVEMAN AND I LIKE A UNINTERESTED, HILARIOUS", 
    "is_quote_status": false, 
    "in_reply_to_status_id": null, 
    "reply_count": 0, 
    "id": 911419330922897408, 
    "favorite_count": 0, 
    "source": "<a href=\"https://twitter.com/conqtip\" rel=\"nofollow\">You need Beano</a>", 
    "retweeted": false, 
    "coordinates": null, 
    "timestamp_ms": "1506134276501", 
    "entities": {
        "user_mentions": [], 
        "symbols": [], 
        "hashtags": [], 
        "urls": []
    }, 
    "in_reply_to_screen_name": null, 
    "in_reply_to_user_id": null, 
    "retweet_count": 0, 
    "id_str": "911419330922897408", 
    "favorited": false, 
    "user": {
        "follow_request_sent": null, 
        "profile_use_background_image": false, 
       

KeyError: 'text'

In [51]:

print(len(tweets))
text = ""
for i in range(0,400):
    text = text + tweets[i]["text"] 
    
print (text)



10865
SPAM WITH MUCH FRUIT AND TONIC DEEPIKA PADUKONE THEY WOULD BE LIKE MATH GRAN PRIX SHOW CAVEMAN AND I LIKE A UNINTERESTED, HILARIOUSRT @DeepikaPFC: [IG] Deepika's hairstylist for #Padmavati, Amit Thakur recalls how this scene from the poster gave him goosebumps.… @zareen_khan
#zareenkhan
#Actress
#Tollywood
#TollywoodActress
#Bollywood
#BollywoodActress
#Kollywood… https://t.co/PSSeZ1iwXJ#WaheedaRehman on "Khamoshi" #IndianCinema #Bollywood - Interesting! 'Asitda was a rare talent' https://t.co/KuOcQusyNL via @TOICitiesNewsRT @ActressDivya: @zareen_khan
#zareenkhan
#Actress
#Tollywood
#TollywoodActress
#Bollywood
#BollywoodActress
#Kollywood… RT @ActressDivya: @priyamani6
#PriyaMani
#Actress
#Tollywood
#TollywoodActress
#Bollywood
#BollywoodActress
#Kollywood… @iamsrk I hope everything goes well ...
@S1dharthM  
Thank you for your publication in the Instagram stories.
#SRK… https://t.co/D7uY9UNERhRT @BrownPeopleDo: Deepika Padukone | All About You 
A/W Collection 2017 https://t.co

In [55]:
noUnicode = text.encode('utf8')
print(type(noUnicode))
print(noUnicode)


<type 'str'>
SPAM WITH MUCH FRUIT AND TONIC DEEPIKA PADUKONE THEY WOULD BE LIKE MATH GRAN PRIX SHOW CAVEMAN AND I LIKE A UNINTERESTED, HILARIOUSRT @DeepikaPFC: [IG] Deepika's hairstylist for #Padmavati, Amit Thakur recalls how this scene from the poster gave him goosebumps.… @zareen_khan
#zareenkhan
#Actress
#Tollywood
#TollywoodActress
#Bollywood
#BollywoodActress
#Kollywood… https://t.co/PSSeZ1iwXJ#WaheedaRehman on "Khamoshi" #IndianCinema #Bollywood - Interesting! 'Asitda was a rare talent' https://t.co/KuOcQusyNL via @TOICitiesNewsRT @ActressDivya: @zareen_khan
#zareenkhan
#Actress
#Tollywood
#TollywoodActress
#Bollywood
#BollywoodActress
#Kollywood… RT @ActressDivya: @priyamani6
#PriyaMani
#Actress
#Tollywood
#TollywoodActress
#Bollywood
#BollywoodActress
#Kollywood… @iamsrk I hope everything goes well ...
@S1dharthM  
Thank you for your publication in the Instagram stories.
#SRK… https://t.co/D7uY9UNERhRT @BrownPeopleDo: Deepika Padukone | All About You 
A/W Collection 2017 https

In [71]:
text = text.replace(",","").lower()
a = text.split(" ")

print(a)

[u'spam', u'with', u'much', u'fruit', u'and', u'tonic', u'deepika', u'padukone', u'they', u'would', u'be', u'like', u'math', u'gran', u'prix', u'show', u'caveman', u'and', u'i', u'like', u'a', u'uninterested', u'hilariousrt', u'@deepikapfc:', u'[ig]', u"deepika's", u'hairstylist', u'for', u'#padmavati', u'amit', u'thakur', u'recalls', u'how', u'this', u'scene', u'from', u'the', u'poster', u'gave', u'him', u'goosebumps.\u2026', u'@zareen_khan\n#zareenkhan\n#actress\n#tollywood\n#tollywoodactress\n#bollywood\n#bollywoodactress\n#kollywood\u2026', u'https://t.co/pssez1iwxj#waheedarehman', u'on', u'"khamoshi"', u'#indiancinema', u'#bollywood', u'-', u'interesting!', u"'asitda", u'was', u'a', u'rare', u"talent'", u'https://t.co/kuocqusynl', u'via', u'@toicitiesnewsrt', u'@actressdivya:', u'@zareen_khan\n#zareenkhan\n#actress\n#tollywood\n#tollywoodactress\n#bollywood\n#bollywoodactress\n#kollywood\u2026', u'rt', u'@actressdivya:', u'@priyamani6\n#priyamani\n#actress\n#tollywood\n#tollywooda

In [72]:
for i,t in enumerate(a):
    a[i]=a[i].encode('utf8')
    print(type(a[i]))
    

<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>

<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>

<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>

<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>

In [73]:
count_per_topic={}

for word in a:
    if word in count_per_topic:
        count_per_topic[word]+=1
    else:
        count_per_topic[word]=1
    
print(count_per_topic)

{'': 36, 'hate': 2, 'd\xc3\xb3lares': 1, '@whosalexander:': 38, 'sorry': 1, 'serenidade': 1, 'dela': 1, 'https://t.co/x\xe2\x80\xa6rt': 1, '\n\n-kylie': 1, 'every': 1, 'https://t.co/p7dt2b6hvirt': 1, 'tired': 1, 'https://t.co/fxmrdrbchdrip': 1, 'nasceu': 1, 'https://t.co/1dfekbkrplrt': 7, 'me:': 21, 'even': 2, 'acordo': 2, 'nem': 1, 'https://t.\xe2\x80\xa6': 1, 'whatrt': 1, 'new': 12, 'ever': 14, 'told': 1, 'million': 1, 'never': 1, 'colour.\xe2\x80\xa6rt': 8, 'here': 1, '@fanu1farhan:': 1, 'met': 3, "aren't": 1, 'daughter': 1, 'kardashian/blac': 1, "i'm": 22, 'ba\xe2\x80\xa6': 3, 'embarazadaaa?': 1, '@trashyewest:': 1, 'https://t.co/nybbrlswmkrt': 1, 'https://t.co/derjd8twt1rt': 1, 'https://t.co/203qjb9umi': 1, 'would': 1, 'spears.rt': 1, '?\nrt': 1, 'tell': 29, 'saber': 1, 'phone': 17, 'me': 27, 'mo': 1, 'work': 4, '@complex:': 1, '#actress': 1, 'unibrow.': 1, 'ta': 1, 'my': 27, '#iheartfestival\n': 1, 'shook': 1, '@6upsidedownis9:': 1, 'climax': 2, '#savagert': 1, 'india': 4, 'want'

In [74]:
sorted_topics = sorted(count_per_topic.items(),key=lambda item:-item[1])
print(sorted_topics)

[('jenner', 270), ('kylie', 258), ('the', 226), ('is', 181), ('pregnant', 163), ('of', 117), ('a', 85), ('i', 84), ('and', 79), ('that', 77), ('rico', 64), ('to', 63), ('puerto', 63), ('now', 50), ('attention', 47), ('travis', 47), ('got', 46), ('in', 46), ('has', 46), ('your', 46), ('kris', 45), ('no', 44), ('country', 42), ('because', 42), ('\n\nok', 41), ('power', 41), ('entire', 41), ('with', 40), ('before', 39), ('@whosalexander:', 38), ('on', 37), ('', 36), ('baby', 36), ('tmz', 36), ('jenner:', 35), ('t\xe2\x80\xa6rt', 35), ('be', 35), ('gonna', 30), ('tell', 29), ('for', 29), ('me', 27), ('my', 27), ('after', 27), ("jenner's", 27), ('this', 25), ('works', 24), ('being', 23), ("i'm", 22), ('de', 22), ("don't", 22), ('leaving', 22), ('kardashian', 22), ('me:', 21), ('pregnancy', 21), ('than', 21), ('ya', 21), ('scott', 21), ('getting', 20), ("it's", 19), ('more', 19), ('from', 19), ('mum', 18), ('mexico', 18), ('west', 18), ('*on', 18), ('phone', 17), ('child', 17), ('"guess', 17

In [75]:
top_50_topics=sorted_topics[0:50]
print(top_50_topics)

[('jenner', 270), ('kylie', 258), ('the', 226), ('is', 181), ('pregnant', 163), ('of', 117), ('a', 85), ('i', 84), ('and', 79), ('that', 77), ('rico', 64), ('to', 63), ('puerto', 63), ('now', 50), ('attention', 47), ('travis', 47), ('got', 46), ('in', 46), ('has', 46), ('your', 46), ('kris', 45), ('no', 44), ('country', 42), ('because', 42), ('\n\nok', 41), ('power', 41), ('entire', 41), ('with', 40), ('before', 39), ('@whosalexander:', 38), ('on', 37), ('', 36), ('baby', 36), ('tmz', 36), ('jenner:', 35), ('t\xe2\x80\xa6rt', 35), ('be', 35), ('gonna', 30), ('tell', 29), ('for', 29), ('me', 27), ('my', 27), ('after', 27), ("jenner's", 27), ('this', 25), ('works', 24), ('being', 23), ("i'm", 22), ('de', 22), ("don't", 22)]


In [76]:
topics,count = zip(*top_50_topics)
print(topics,count)

(('jenner', 'kylie', 'the', 'is', 'pregnant', 'of', 'a', 'i', 'and', 'that', 'rico', 'to', 'puerto', 'now', 'attention', 'travis', 'got', 'in', 'has', 'your', 'kris', 'no', 'country', 'because', '\n\nok', 'power', 'entire', 'with', 'before', '@whosalexander:', 'on', '', 'baby', 'tmz', 'jenner:', 't\xe2\x80\xa6rt', 'be', 'gonna', 'tell', 'for', 'me', 'my', 'after', "jenner's", 'this', 'works', 'being', "i'm", 'de', "don't"), (270, 258, 226, 181, 163, 117, 85, 84, 79, 77, 64, 63, 63, 50, 47, 47, 46, 46, 46, 46, 45, 44, 42, 42, 41, 41, 41, 40, 39, 38, 37, 36, 36, 36, 35, 35, 35, 30, 29, 29, 27, 27, 27, 27, 25, 24, 23, 22, 22, 22))


In [77]:
import pandas as pd
import brunel

df = pd.DataFrame(top_50_topics,columns=["topic","count"])

%brunel data('df') label(topic) size(count) color(topic) bubble sort(count)\
tooltip(count)

<IPython.core.display.Javascript object>