# Download Twitter Data

#### Import Dependencies

In [None]:
# Tweepy library to connect to the Twitter API and process the response
import tweepy
# JSON to serialise JSON
import json
# Pandas to work with dataframes
import pandas as pd
# wget to retrieve profile images
import wget
# OS to manipulate file paths and names
import os
# Shutil to handle file duplicates
import shutil
# Time to handle API rate limiting
import time

# Read environment variables
from dotenv import load_dotenv

#### Load configuration file for environment variables

In [None]:
# Read the variables defined in the local 'configuration.env' file so the
# Twitter credentials can be fetched with os.getenv
load_dotenv('configuration.env')

#### Load Twitter authentication from environment variables

In [None]:
# Twitter App credentials, read from environment variables
consumer_key = os.getenv('TWITTER_CONSUMER_KEY')
consumer_secret = os.getenv('TWITTER_CONSUMER_SECRET')
access_token = os.getenv('TWITTER_ACCESS_TOKEN')
access_token_secret = os.getenv('TWITTER_ACCESS_TOKEN_SECRET')

# os.getenv returns None for unset variables, which would otherwise only show
# up later as an authentication failure; report the missing names up front
missing_credentials = [
    name
    for name in (
        'TWITTER_CONSUMER_KEY',
        'TWITTER_CONSUMER_SECRET',
        'TWITTER_ACCESS_TOKEN',
        'TWITTER_ACCESS_TOKEN_SECRET',
    )
    if not os.getenv(name)
]
if missing_credentials:
    print("Missing Twitter credentials: " + ", ".join(missing_credentials))

#### Twitter Authentication

In [None]:
# Build the OAuth handler from the app credentials and attach the access token
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)

# API client shared by every request in this notebook
api = tweepy.API(
    auth,
    wait_on_rate_limit=True,
    wait_on_rate_limit_notify=True,
    retry_count=3,
    retry_delay=5,
    retry_errors={401, 404, 500, 503},
)

#### Set API Rate Limit

In [None]:
# Pause between consecutive API requests; the value is passed to time.sleep,
# so it is a number of seconds
rate_limit = 5

#### Seed User

In [None]:
# Screen name of the seed user whose friends network is downloaded
screen_name = "BIM360"

#### Get User ID from Screen Name

In [None]:
# Look up the seed account and keep its ID in string form
user = api.get_user(screen_name)
seed_user = user.id_str
print(f"User ID for {screen_name} is {seed_user}")

### Download Twitter Friends List

Function for downloading a list of a user's friends (the accounts they follow). The function requires the user ID of a 'seed' user.

**Note:** Twitter accounts marked 'private' or `protected` will return an empty list.

In [None]:
def download_friends_list(user_id):
    """Download the IDs of the accounts a user follows and save them as JSON.

    The list is written to ``./01_friends_lists/<user_id>.json``; the
    directory is created if it does not exist. When the request fails, the
    error is printed and an empty list is saved and returned, so a file is
    produced for every user that was attempted.

    Args:
        user_id: ID of the user whose friends are requested, as a string
            (it is concatenated into the progress messages).

    Returns:
        The friend IDs returned by the API, or an empty list if the request
        raised ``tweepy.TweepError``.
    """
    friends = []

    try:
        # Request the IDs as strings so they match the string IDs used for
        # file names elsewhere in this notebook.
        # NOTE(review): a single request is made, so presumably only the
        # first page of IDs is captured for accounts following very many
        # users - confirm against the API documentation.
        friends = api.friends_ids(user_id, stringify_ids=True)

        print(user_id + " friends = " + str(len(friends)))

    except tweepy.TweepError as e:
        # Best effort: report the failure and fall through to save an empty list
        print(user_id + " encountered Tweepy Error: {}".format(e))

    # Make sure the output directory exists so a finished download is not
    # lost to a FileNotFoundError
    output_directory = "./01_friends_lists"
    os.makedirs(output_directory, exist_ok=True)

    output_path = os.path.join(output_directory, "{filename}.json".format(filename=user_id))
    # utf-8 matches the encoding used when the friends lists are read back
    with open(output_path, "w", encoding="utf-8") as file:
        json.dump(friends, file)

    return friends

#### Download friends of seed user

In [None]:
# Download and save the seed user's friends; the returned list of IDs is what
# the friends-of-friends loop iterates over
friends = download_friends_list(seed_user)

#### Download friends-of-friends

If the download fails at any point you can restart from a specific point by taking a slice. Amending the first line of the following code to `for friend in friends[100:]:` will process all remaining items in the list from the hundredth element to the end.

In [None]:
# Download and save the friends list of every account the seed user follows
for friend in friends:

    # download_friends_list concatenates the ID into its messages, so it is
    # passed as a string; the returned list is kept in `fof` but is not used
    # further in this loop (the function has already written it to disk)
    fof = download_friends_list(str(friend))

    # Wait for rate limiter
    time.sleep(rate_limit)

print("Downloading friends-of-friends complete!")

### Identify Unique Users

This code identifies a list of unique users contained in the friends lists captured above. Obtaining a list of unique users will speed up the process of downloading user details.

#### Set directory to read friends lists from

In [None]:
# Directory containing one JSON friends list per downloaded user
friends_directory = "./01_friends_lists/"

#### Identify unique users and calculate summary statistics

In [None]:
filecount = 0
users = []
uniqueIDs = []

# Loop through files of friends lists. The names are sorted so the order of
# uniqueIDs is reproducible between runs, which restarting from a slice
# relies on.
for file in sorted(os.listdir(friends_directory)):

    # Skip anything that is not a saved friends list (e.g. hidden files)
    if not file.endswith(".json"):
        continue

    # Count files processed
    filecount += 1

    # Open the file
    with open(os.path.join(friends_directory, file), encoding="utf-8") as f:

        # Each line of the file holds one JSON array of user IDs
        for friendlist in f:

            # Extend in place rather than rebuilding the list for every file,
            # and do not reuse the name `friends`: that global holds the seed
            # user's friends and is needed to re-run the download loop
            users.extend(json.loads(friendlist))

# Create a dictionary with keys from users list to remove duplicates
# (dictionary keys keep their first-seen order)
d = dict.fromkeys(users)
# Unpack the dictionary back to a list
uniqueIDs = [*d]

# NOTE(review): filecount counts every friends-list file, which presumably
# includes the seed user's own file - confirm the "Friends" and
# "Friends of Friends" figures are meant to be derived from it this way.
print("---------SUMMARY---------")
print("Seed User = 1")
print("Friends = " + str(filecount))
print("Friends of Friends = " + str(len(uniqueIDs) - filecount - 1)) #Excluding seed user
print("Unique users = " + str(len(uniqueIDs)))
print("Relationships = " + str(len(users)))

### Download Twitter User Details

Function for downloading public user information for each unique account identified in the friends list above. The function requires a list of user IDs. It also accepts an optional argument `store_images=True` which will download a user's profile image or logo if they have one.

In [None]:
def _build_user_record(user):
    """Return the JSON-serialisable details that are stored for one user.

    Args:
        user: A user object as returned by ``api.lookup_users``.

    Returns:
        A dictionary of the selected profile fields. ``expanded_url`` is the
        website URL from the profile's ``url`` entity, or an empty string
        when the profile has none.
    """
    record = {
        "id_str": user.id_str,
        "screen_name": user.screen_name,
        "name": user.name,
        "location": user.location,
        "profile_image_url": user.profile_image_url,
        "description": user.description,
        # Placeholder for website url
        "expanded_url": "",
        "followers_count": user.followers_count,
        "friends_count": user.friends_count,
        "statuses_count": user.statuses_count,
        # Format datetime object.
        # NOTE(review): %y is a two-digit year; left unchanged so new files
        # match the format of files that were already written.
        "created_at": user.created_at.strftime("%y-%m-%d, %H:%M:%S"),
        "protected": user.protected,
        "verified": user.verified,
    }

    # Check for website URL and add to data. Stored as a plain string: a
    # trailing comma here would turn the value into a one-element tuple,
    # which is serialised as a JSON array.
    if "url" in user.entities:
        record["expanded_url"] = user.entities["url"]["urls"][0]["expanded_url"]
    else:
        print("No associated url")

    return record


def _download_profile_image(user):
    """Save the user's profile image to ``03_logos/<id>.png`` if not already there."""
    file_destination = "03_logos/{filename}.png".format(filename=user.id_str)

    # Check if file already exists
    if os.path.exists(file_destination):
        print("Profile image already downloaded")
        return

    try:
        # Remove the '_normal' tag from the URL to get a full sized image
        link = user.profile_image_url.replace("_normal", "")
        # Use wget to download and save the image file
        wget.download(link, file_destination)
    except Exception:
        # Best effort: a missing image must not stop the run, but unlike a
        # bare except this still lets KeyboardInterrupt stop the notebook
        print("Profile image not available")


def download_users(userIDs, store_images=False):
    """Download and save the public details of the given Twitter users.

    The IDs are requested in batches of 100. Each returned user is written
    to ``./02_user_details/<id>.json``; users the API does not return are
    skipped. A batch that raises ``tweepy.TweepError`` is reported and
    skipped as well. Output directories are created when missing.

    Args:
        userIDs: List of user IDs to look up.
        store_images: When true, also save each user's profile image to
            ``03_logos/<id>.png`` unless that file already exists.

    Returns:
        None.
    """
    os.makedirs("./02_user_details", exist_ok=True)
    if store_images:
        os.makedirs("03_logos", exist_ok=True)

    # Count of processed users
    user_count = 0
    batch_size = 100

    for batch_count, start in enumerate(range(0, len(userIDs), batch_size), start=1):
        batch = userIDs[start:start + batch_size]
        print("---------BATCH {}---------".format(str(batch_count)))

        try:
            # Request users by ID in batches
            users = api.lookup_users(batch)
        except tweepy.TweepError as e:
            print("Encountered Tweepy Error: {}".format(e))
            continue

        for user in users:
            user_count += 1
            print(str(user_count) + ": " + user.id_str + "_" + user.screen_name)

            # Build the record before opening the file so a failure here
            # does not leave an empty details file behind
            record = _build_user_record(user)

            # Save user details
            details_path = "./02_user_details/{filename}.json".format(filename=user.id_str)
            with open(details_path, "w", encoding="utf-8") as file:
                json.dump(record, file)

            # Save the Twitter profile image
            if store_images:
                _download_profile_image(user)

        # Wait for rate limiter
        time.sleep(rate_limit)

    print("Processing complete!")

#### Download User Details

If the download fails at any point you can restart from a specific point by taking a slice. Amending the first line of the following code with `uniqueIDs[100:]` will process all remaining items in the list from the hundredth element to the end.

In [None]:
# Download details and profile images for every unique user ID
download_users(uniqueIDs, store_images=True)

###  Check Missed User Accounts

Because the download process takes time, the status of accounts may change. If a requested user is unknown, suspended, or deleted at the time of the request to the Twitter API, then that user will not be returned in the list of results. The following code checks for any missed records. By repeating the code you can ensure you have details of all available accounts and then store a reference list of those accounts that weren't available for download.

#### Set directory to read user details from

In [None]:
# Directory containing one JSON details file per downloaded user
user_directory = "./02_user_details/"

#### Get list of captured user IDs

In [None]:
# Every saved details file is named after its user ID, so the captured IDs
# are the file names with the extension stripped
capturedIDs = [os.path.splitext(entry)[0] for entry in os.listdir(user_directory)]

captured_total = len(capturedIDs)
expected_total = len(uniqueIDs)
print(f"Captured user details = {captured_total}/{expected_total}")
print(f"Missed records = {expected_total - captured_total}")

#### Generate list of missing IDs

In [None]:
# Keep the IDs that have no saved details file, in the same order as
# uniqueIDs. A set difference returns them in an arbitrary order, which makes
# restarting the download from a slice of missingIDs unreliable.
captured_lookup = set(capturedIDs)
missingIDs = [user_id for user_id in dict.fromkeys(uniqueIDs) if user_id not in captured_lookup]
print("Records to find = " + str(len(missingIDs)))

#### Check if missing user details can be downloaded

In [None]:
# Retry the download for the IDs that have no saved details file yet
download_users(missingIDs, store_images=True)

#### Store list of missing IDs

In [None]:
# missingIDs was computed before the retry download, so first remove any IDs
# whose details file now exists; only accounts that are still unavailable
# belong in the stored list
retrieved_ids = {os.path.splitext(name)[0] for name in os.listdir(user_directory)}
missingIDs = [user_id for user_id in missingIDs if user_id not in retrieved_ids]

# Create the output directory if needed so the list is not lost
os.makedirs("./04_inaccessible_users", exist_ok=True)
with open("./04_inaccessible_users/inaccessible_users.json", "w", encoding="utf-8") as file:
    json.dump(missingIDs, file)