# Part - I : Crawling Twitter

### Import statements

In [1]:
import os

import jsonlines
import tweepy
from tweepy import OAuthHandler

### Authentication Keys

In [1]:
# Twitter API credentials.
# Real keys should be supplied through environment variables so that they are
# never pasted into (and accidentally shared with) the notebook. When a
# variable is unset the original '******' placeholder is kept, and it must be
# replaced with a real key for authentication to succeed.

c_key = os.environ.get('TWITTER_CONSUMER_KEY', '******')
c_secret = os.environ.get('TWITTER_CONSUMER_SECRET', '******')
a_token = os.environ.get('TWITTER_ACCESS_TOKEN', '******')
a_secret = os.environ.get('TWITTER_ACCESS_TOKEN_SECRET', '******')

### Function to authorize access

<b>Parameters:</b> c_key (Consumer Key), c_secret (Consumer Secret), a_token (Access Token), a_secret (Access Token Secret) <br>
<b>Return Value:</b> api => api object which provides access to Twitter content

In [3]:
def authUser(c_key, c_secret, a_token, a_secret):
    """Build a tweepy API client from the four OAuth credentials.

    Parameters:
        c_key: consumer key.
        c_secret: consumer secret.
        a_token: access token.
        a_secret: access token secret.

    Returns:
        The tweepy.API object created from the configured OAuth handler.
    """
    handler = OAuthHandler(c_key, c_secret)
    handler.set_access_token(a_token, a_secret)
    return tweepy.API(handler)

### Function to write each tweet into a .jsonl file

<b>Parameters:</b> tweet_dict => a tweet in the form of a dictionary <br>
<b>Return Value:</b> total_tweets => Total number of tweets by the user (useful in loop termination while crawling)

In [4]:
def storeTweet(tweet_dict):
    """Append one tweet to 'responses.jsonl' and report the user's tweet count.

    Parameters:
        tweet_dict: a tweet as a dictionary; must contain
            tweet_dict['user']['statuses_count'].

    Returns:
        The value of tweet_dict['user']['statuses_count'].
    """
    # Mode 'a' appends, so earlier tweets in the file are kept.
    with jsonlines.open('responses.jsonl', mode='a') as out:
        out.write(tweet_dict)
    return tweet_dict['user']['statuses_count']

### Function to retrieve tweets and store them in .jsonl file

<b>Parameters:</b> api => object to access Twitter content, s_name => screen name, maxid => value passed to the API as 'max_id' (None when no tweet has been fetched yet)  <br>
<b>Return Value:</b> maxid => one less than the id of the last tweet fetched (to be used as 'max_id' for the next call), total_tweets => total number of tweets by the user, loc_counter => number of fetched tweets written to 'responses.jsonl'

<div style="text-align: justify"> <b>Explanation:</b> The following function retrieves up to 200 tweets per call and obtains them as an iterable which is traversed using the for loop. The '._json' attribute is used to access the JSON response sent by the Twitter API. The variable 'maxid' is set to one less than the id of the most recently fetched tweet and is passed as 'max_id' on the next call, so that tweets which have already been fetched are not picked up again each time the script is run. The variable 'total_tweets' stores the total count of the user's tweets. 'loc_counter' keeps track of the number of tweets written to the 'responses.jsonl' file during the call. </div>

In [5]:
def fetchNStore(api, s_name, maxid):
    """Fetch up to 200 tweets from a user's timeline and store each of them.

    Parameters:
        api: tweepy API object used to query the timeline.
        s_name: screen name of the account whose timeline is read.
        maxid: value passed to the API as 'max_id'; when it is None the
            request is made without a 'max_id' argument.

    Returns:
        A tuple (maxid, total_tweets, loc_counter):
        maxid is one less than the id of the last tweet processed (the
        incoming value is returned unchanged if no tweet was processed),
        total_tweets is the value returned by storeTweet for the last tweet
        (0 if no tweet was processed), and loc_counter is the number of tweets
        handed to storeTweet during this call.
    """
    # Only include 'max_id' when a resume point is known.
    query = {'screen_name': s_name, 'tweet_mode': 'extended'}
    if maxid is not None:
        query['max_id'] = maxid

    stored = 0
    total_tweets = 0
    for status in tweepy.Cursor(api.user_timeline, **query).items(200):
        payload = status._json
        total_tweets = storeTweet(payload)
        # Step just below this tweet's id so it is not requested again.
        maxid = int(payload['id']) - 1
        stored += 1
    return maxid, total_tweets, stored

### Script to call the above-defined functions

<div style="text-align: justify"> <b>Explanation:</b> 'lastfetched.txt' stores the value of 'maxid' returned by the most recent fetch, i.e. the point from which fetching should resume. Keeping track of this id is important as it avoids writing duplicate tweets to 'responses.jsonl'. This is helpful in cases where the rate limits imposed by the Twitter API are exceeded while fetching data. The 'while' loop runs until the total number of tweets has been written to 'responses.jsonl'. A 'try-except' block is used to catch rate-limit errors or any other errors raised while fetching data. 'totalfetched.txt' keeps track of the total number of tweets written so far to 'responses.jsonl'. This is useful when we want to resume fetching tweets from a particular point after an interruption that occurred earlier. </div>

In [6]:
# Crawl the timeline of `screen_name` in batches, persisting progress so that
# an interrupted run can be resumed.
screen_name = 'midasIIITD'
api = authUser(c_key, c_secret, a_token, a_secret)

# Load the resume point. A missing or empty file means "start from the newest
# tweet", so the file no longer has to be created by hand before the first run.
try:
    with open('lastfetched.txt', 'r') as reader:
        saved_id = reader.readline().strip()
except FileNotFoundError:
    saved_id = ""
# "None" is accepted as empty because earlier versions of this cell could
# write that literal text when a run fetched nothing.
if saved_id in ("", "None"):
    maxid = None
else:
    maxid = int(saved_id)

# Bound before the try block so the except handler below can always write it.
glob_counter = 0
try:
    flag = True
    try:
        with open('totalfetched.txt', 'r') as r:
            glob_counter = int(r.readline())         # number of tweets already written by earlier runs
    except (FileNotFoundError, ValueError):
        # No usable saved counter (file missing, empty or corrupt): start from zero.
        glob_counter = 0
    total_tweets = 0
    while flag:
        maxid, new_tweet_count, loc_counter = fetchNStore(api, screen_name, maxid)
        total_tweets = new_tweet_count
        # Persist the resume point only when there is one; writing str(None)
        # would make the next run fail while parsing the file.
        if maxid is not None:
            with open('lastfetched.txt', 'w') as writer:
                writer.write(str(maxid))
        glob_counter += loc_counter
        # new_tweet_count is the user's tweet count reported with the last
        # stored tweet (0 when the batch was empty), so this also stops the
        # loop once a fetch returns nothing.
        flag = glob_counter < new_tweet_count
    print('All tweets (%d) extracted from %s.' % (total_tweets, screen_name))
except Exception as error:
    # Best effort: remember how many tweets were written so far, then report
    # the error instead of raising it.
    with open('totalfetched.txt', 'w') as w:
        w.write(str(glob_counter))
    print(error)

All tweets (296) extracted from midasIIITD.


# Part - II : Parsing JSON Files

### Import statements

In [7]:
import prettytable
from datetime import datetime

### Function which parses the .jsonl file and arranges content in a table

<b>Parameters:</b> file_name => file to be parsed  <br>
<b>Return Value:</b> table => object of class PrettyTable which contains data arranged in tabular format.

<b> Assumption: </b> The number of favourites/likes, retweets and images reported are those of the tweet as it appears on the user's timeline, and not those of the tweet that has been retweeted. 

In [8]:
def createTable(file_name):
    """Parse a .jsonl file of tweets into a PrettyTable.

    Parameters:
        file_name: path of the .jsonl file to read; each line is expected to
            be one tweet dictionary.

    Returns:
        A prettytable.PrettyTable with one row per tweet and the columns
        serial number, text, date-time, likes, retweets and image count.
        The image count is None for tweets whose 'entities' has no 'media'.
    """
    headers = ('S.No', 'Text', 'Date-Time', '# of likes', '# of retweets', '# of images present')
    table = prettytable.PrettyTable()
    for header in headers:
        table.add_column(header, [])

    with jsonlines.open(file_name) as reader:
        for serial, tweet in enumerate(reader, start=1):
            text = tweet['full_text']             # full_text holds the untruncated tweet (up to 280 characters)
            created = datetime.strptime(tweet['created_at'], "%a %b %d %H:%M:%S %z %Y")
            date_time = created.strftime("%d/%m/%Y %H:%M:%S")
            likes = tweet['favorite_count']
            retweets = tweet['retweet_count']
            if tweet['entities'].get('media') is None:
                image_count = None
            else:
                # Count only the media entries of type 'photo'.
                media_items = tweet['extended_entities']['media']
                image_count = sum(1 for item in media_items if item['type'] == 'photo')
            table.add_row([serial, text, date_time, likes, retweets, image_count])
    return table

### Script to call the above function and print the table

In [9]:
# Parse the stored tweets into a PrettyTable and print its text rendering.
table = createTable('responses.jsonl')
print(table)

+------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+---------------------+------------+---------------+---------------------+
| S.No |                                                                                                                                                       Text                                                                                                                                                       |      Date-Time      | # of likes | # of retweets | # of images present |
+------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

# References

1. Tweepy Documentation - http://docs.tweepy.org/en/3.7.0/index.html
2. Stack Overflow - https://stackoverflow.com/