In [None]:
import json
import tweepy
import sys
import traceback
from datetime import datetime, timezone
from kafka import KafkaProducer
from config import Keywords , event_fields, user_fields, consumer_key,\
                    consumer_secret, access_token, access_secret, poll_continiously

In [None]:
# You can probably compare the performance of opposing hashtags in such a way
#Keywords = "#OwenJonesIsAWankerDay #SolidarityWithOwenJonesDay"

In [None]:
class TwitterStreamListener(tweepy.StreamListener):
    """Stream listener that reshapes incoming tweets and publishes them to Kafka.

    For every non-retweet status received, the listener keeps only the
    tweet fields named in ``event_fields`` and the user fields named in
    ``user_fields``, converts the ``created_at`` values to UTC-based
    keys/timestamps and sends the result to the Kafka topic ``self.topic``.

    Attributes:
        producer: KafkaProducer used to publish the processed events.
        datefmt: ``strptime`` format of Twitter's ``created_at`` strings.
        topic: name of the Kafka topic the events are sent to.
        tweets: processed ``(key, events)`` tuples, kept for debugging.
        rawdata: raw JSON payloads as received, kept for debugging.
        count: number of tweet messages (retweets included) seen so far.
        ltweet: first 50 characters of the last relevant tweet.
        status: human-readable progress line.
    """

    def __init__(self):
        # Let the tweepy base class set up its own state before adding ours.
        super().__init__()

        self.producer = KafkaProducer(bootstrap_servers='localhost:9092',
                                      key_serializer=str.encode,
                                      value_serializer=lambda v: json.dumps(v).encode('utf-8'))

        self.datefmt = "%a %b %d %H:%M:%S %z %Y"
        self.topic = "TweeterArchive"
        # NOTE: both debug buffers grow without bound while the stream runs.
        self.tweets = []
        self.rawdata = []
        self.count = 0
        self.ltweet = ''
        self.status = 'waiting for tweets...'

    def on_data(self, data):
        """Handle one raw message delivered by the stream.

        Args:
            data: the full JSON payload of the message, as a string.

        Messages without a ``text`` field (i.e. anything that is not a
        tweet) are recorded in ``rawdata`` and otherwise ignored.  Retweets
        are counted but not forwarded to Kafka.
        """
        api_events = json.loads(data)

        # save raw json data in memory array for debugging
        self.rawdata.append(data)

        # Non-tweet messages carry no 'text'; skip them instead of letting a
        # KeyError propagate out of the callback.
        text = api_events.get('text')
        if text is None:
            return

        # Count first so the status line reports the tweet just received.
        self.count += 1

        # filter out retweets
        if not text.startswith('RT'):
            # Tweet-related values
            twitter_events = {k: v for k, v in api_events.items()
                              if k in event_fields}

            tweet_key, tweet_created_at = self.date_fmt_convert(twitter_events['created_at'])
            twitter_events['created_at'] = tweet_created_at

            # When tweets are longer than 140 symbols, the 'text' field gets
            # truncated, and the full text is in ['extended_tweet']['full_text']
            if 'extended_tweet' in api_events:
                twitter_events['text'] = api_events['extended_tweet']['full_text']

            # User-related values
            user_events = {k: v for k, v in api_events['user'].items()
                           if k in user_fields}

            user_events['created_ym'], user_events['created_at'] = \
                self.date_fmt_convert(user_events['created_at'], fmt="%Y%m")

            twitter_events['user_id'] = user_events['id']
            twitter_events['user_followers'] = user_events['followers_count']

            events = {'users': user_events, 'tweets': twitter_events}

            # save processed dictionary in memory array for debugging
            self.tweets.append((tweet_key, events))

            # Single-line preview of the last tweet for the status display;
            # fall back to the raw text if 'text' was not a selected field.
            self.ltweet = twitter_events.get('text', text)[:50].replace('\n', '')

            self.send_to_kafka(str(tweet_key), events)

        self.status = 'tweets received: ' + str(self.count) + '| last relevant tweet: ' + self.ltweet

    def send_to_kafka(self, key, data):
        """Publish ``data`` under ``key`` to ``self.topic`` and flush the producer.

        Args:
            key: message key (a string; encoded by the producer's key serializer).
            data: JSON-serializable payload.
        """
        self.producer.send(self.topic, key=key, value=data)
        # Flushing after every message pushes it out immediately.
        self.producer.flush()

    def date_fmt_convert(self, date, fmt="%Y%m%d%H%M"):
        """Convert a Twitter datetime string to a UTC key and a Unix timestamp.

        Example: ``"Fri Jan 04 11:39:13 +0500 2019"`` becomes
        ``("201901040639", 1546583953)``.

        Args:
            date: datetime string in the ``self.datefmt`` format.
            fmt: ``strftime`` format of the returned key.

        Returns:
            Tuple ``(key, timestamp)``: ``key`` is the UTC time formatted with
            ``fmt`` (a string), ``timestamp`` is the Unix time in whole seconds.
        """
        # astimezone() returns a new object; it must be kept for the key to
        # actually be expressed in UTC.
        utc_dt = datetime.strptime(date, self.datefmt).astimezone(timezone.utc)
        return utc_dt.strftime(fmt), int(utc_dt.timestamp())

    def on_error(self, status_code):
        """Stop the stream on HTTP 420 (rate limiting); keep going otherwise.

        Returns:
            False for status 420, which makes tweepy disconnect; True for
            any other status code.
        """
        if status_code == 420:
            return False
        return True

In [None]:
# Build the OAuth credentials from the values imported from config.
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_secret)
api = tweepy.API(auth)

streamListener = TwitterStreamListener()


def _stream_status(self):
    """Return the listener's status text while running, otherwise "Stopped"."""
    if not self.running:
        return "Stopped"
    return self.listener.status


# Inject a read-only `status` property into tweepy.Stream at runtime, so the
# stream object itself can be asked for its progress line.
tweepy.Stream.status = property(_stream_status)

twitter_stream = tweepy.Stream(auth=api.auth, listener=streamListener)
# Start filtering in the background (is_async=True) for English tweets
# matching the configured keywords.  # -filter:retweets
twitter_stream.filter(track=[Keywords], languages=['en'], is_async=True)

In [None]:
# Hand the stream and the name of its `status` property to the helper imported
# from config; presumably it keeps polling and displaying that attribute while
# the stream runs (the helper's behavior is defined in config -- verify there).
poll_continiously(twitter_stream,attrname='status')

In [None]:
twitter_stream.disconnect() # only uncomment this when you want to stop the stream