# Python TASK

You have to write a Python script that fetches all the tweets (as many as the Twitter
API allows) posted by the midas@IIITD Twitter handle and dumps the responses into a JSON Lines file.
The other part of your script should parse these JSON Lines files and display the
following for every tweet in a tabular format.
1. The text of the tweet.
2. Date and time of the tweet.
3. The number of favorites/likes.
4. The number of retweets.
5. The number of images present in the tweet. If there are no images, return None.

### Submitted by
#### Utkarsh Sharma (flamingice2801@gmail.com)

## 1. Download Data from Twitter API

In [1]:
#Load Libraries
import keys
import requests
from requests_oauthlib import OAuth1
import json

#Build the OAuth1 credentials object that is passed along with every Twitter API request
auth = OAuth1(
    client_key=keys.api_key,
    client_secret=keys.api_secret_key,
    resource_owner_key=keys.access_token,
    resource_owner_secret=keys.access_token_secret,
)

#Function to json dump the data on to the disk
def dump_json(data, filename):
    """Write ``data`` to ``filename`` as a single JSON document.

    The file is opened in write mode, so any existing content is replaced.
    A confirmation message is printed once the file has been written.
    """
    with open(filename, 'w') as outfile:
        outfile.write(json.dumps(data))
    print("Successfully saved data")

#Function to download data from Twitter
def request_data(url_req, params, auth):
    """Download every page of results for a Twitter search request.

    Issues GET requests to ``url_req`` and keeps following the ``next``
    pagination token found in each response until a response arrives
    without one.

    Args:
        url_req: URL of the search endpoint to query.
        params: Query parameters for the request. The mapping is copied, so
            the caller's object is not modified while paginating.
        auth: Authentication object handed to ``requests.get``.

    Returns:
        A dict with the accumulated tweets under ``'results'`` and the
        ``fromDate``/``toDate`` reported by the last response under
        ``'requestParameters'`` (``None`` when the response omits them).

    Raises:
        requests.HTTPError: If a response comes back with an error status.
    """
    #Work on a copy so that adding the pagination token does not leak into the caller's dict
    params = dict(params)
    data = []

    while True:
        #Query the URL that was passed in (not a module-level global)
        response = requests.get(url_req, params=params, auth=auth)

        #Fail loudly on an error status instead of re-sending the same request forever
        response.raise_for_status()
        page = response.json()

        #Collect the tweets of this page
        data.extend(page.get('results', []))

        #A missing 'next' token means this was the last page
        if 'next' not in page:
            break
        params['next'] = page['next']

    request_parameters = page.get('requestParameters', {})
    return {
        'results': data,
        'requestParameters': {
            'fromDate': request_parameters.get('fromDate'),
            'toDate': request_parameters.get('toDate'),
        },
    }

if __name__ == '__main__':

    #Endpoint of the search request. The API used is the Twitter premium search API
    url_rest = "https://api.twitter.com/1.1/tweets/search/fullarchive/test.json"

    #All request parameters in one dictionary:
    #  query    - restricts the search to tweets from the MIDAS IIITD handle
    #  fromDate - start of the download window, here September 1, 2018 00:00;
    #             tweets from then up to the time of the request are returned
    #             (Mar 26, 2019 when this notebook was run)
    params = dict(
        query='from:midasIIITD',
        maxResults=100,
        next=None,
        fromDate=201809010000,
    )

    #Download the tweets and store the combined response in data.json
    data = request_data(url_rest, params, auth)
    dump_json(data, 'data.json')

Successfully saved data


## Load JSON file from disk and parse it to extract the required information

In [31]:
#Load libraries
import json
from collections import defaultdict
import pandas as pd
from texttable import Texttable

#Function to load the json file from disk
def load_json(filename):
    """Read ``filename`` and return the JSON document it contains as Python objects."""
    with open(filename, 'r') as infile:
        return json.loads(infile.read())

#Helper to count the media items of type 'photo' listed under container['entities']['media']
def _count_photos(container):
    """Return how many ``entities.media`` items of ``container`` are photos.

    A missing or empty ``entities``/``media`` entry counts as zero photos.
    """
    media = (container.get('entities') or {}).get('media') or []
    return sum(1 for med in media if med.get('type') == 'photo')


#Function to extract number of images from a tweet
def get_num_images(tweet_data):
    """Return the number of photos attached to a tweet, or None if there are none.

    Photos are looked up in two places: the ``entities`` of the tweet itself
    and the ``entities`` nested inside its ``extended_tweet`` (when present).
    The larger of the two counts is used, so a count found in
    ``extended_tweet`` is never overwritten by a smaller top-level one.

    Args:
        tweet_data: Dict holding one tweet as returned by the Twitter API.

    Returns:
        The photo count as an int, or ``None`` when the tweet has no photos.
    """
    #NOTE(review): Twitter also exposes an 'extended_entities' section that may
    #list more media than 'entities' - confirm whether it should be consulted too
    num_images = max(
        _count_photos(tweet_data.get('extended_tweet') or {}),
        _count_photos(tweet_data),
    )

    #The task asks for None rather than 0 when there is no image
    return num_images or None

#Function to parse the json file
def parse_json(filename):
    """Extract the fields to display from every tweet stored in ``filename``.

    The file is loaded with ``load_json`` and the tweets are read from its
    ``'results'`` list. For a retweet, all fields are taken from the embedded
    ``retweeted_status`` instead of the wrapping tweet.

    Returns:
        A ``defaultdict(list)`` with the columns ``text``, ``date_time``,
        ``num_favorites``, ``num_retweets`` and ``num_images``; position ``i``
        of every list belongs to the i-th tweet.
    """
    loaded = load_json(filename)

    #One list per column; lists are created on first use
    data_dict = defaultdict(list)

    for raw_tweet in loaded['results']:

        #Retweets carry the original tweet under 'retweeted_status'; report on that one
        tweet = raw_tweet.get('retweeted_status', raw_tweet)

        #Long tweets keep their complete text inside 'extended_tweet'
        text = tweet['extended_tweet']['full_text'] if 'extended_tweet' in tweet else tweet['text']

        #Number of photos in the tweet (None when there are none)
        num_images = get_num_images(tweet)

        #Gather the row first, then distribute it over the column lists
        row = {
            'text': text,
            'date_time': tweet['created_at'],
            'num_favorites': tweet['favorite_count'],
            'num_retweets': tweet['retweet_count'],
            'num_images': num_images,
        }
        for column, value in row.items():
            data_dict[column].append(value)

    return data_dict

#Function to draw table from extracted data
def draw_table(data_dict):
    """Print the extracted tweet data as a text table.

    The keys of ``data_dict`` become the header row. One row is added per
    entry of ``data_dict['text']``, holding the text, date/time, favorite
    count, retweet count and image count found at the same position.
    """
    columns = ('text', 'date_time', 'num_favorites', 'num_retweets', 'num_images')

    table = Texttable()
    #Column names come from the dictionary keys
    table.header(data_dict.keys())

    #Build each row by picking the same position out of every column list
    num_rows = len(data_dict['text'])
    for row_idx in range(num_rows):
        table.add_row([data_dict[column][row_idx] for column in columns])

    print(table.draw())

if __name__ == '__main__':

    #File written by the download step
    filename = 'data.json'

    #Parse the stored tweets into column lists and render them as a table
    data_dict = parse_json(filename)
    draw_table(data_dict)
    
#     print(pd.DataFrame(data_dict, dtype='int32'))

+-----------------+----------------+---------------+--------------+------------+
|      text       |   date_time    | num_favorites | num_retweets | num_images |
| Congratulations | Mon Mar 25     | 8             | 1            | 2          |
| @midasIIITD     | 13:01:57 +0000 |               |              |            |
| students Simra  | 2019           |               |              |            |
| Shahid @Simcyy  |                |               |              |            |
| and Nilay       |                |               |              |            |
| Shrivastava     |                |               |              |            |
| @NilayShri on   |                |               |              |            |
| getting         |                |               |              |            |
| selected for a  |                |               |              |            |
| research        |                |               |              |            |
| internship at   |         