# Twitter - Crime Database

In [1]:
import ast
import os
import time
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import tweepy as tp
from dotenv import load_dotenv

# from utils.db_utils import *
# from utils.twitter_utils import *

sns.set()

In [2]:
# Notebook-wide constants.
# Home directory of the current user, with a trailing slash.
PATH = '{}/'.format(os.path.expanduser('~'))
# File name of the dotenv file that is loaded from the working directory below.
env_file = '.env'
# Date on which this cell was executed, formatted yyyy-mm-dd.
today = f'{datetime.today():%Y-%m-%d}'

## Connect to Twitter API

In [3]:
def env_reader(env_path):
    '''
    Load environment variables from a .env file and report the outcome.

    params:
        env_path (String) : Location of the .env file to load

    returns:
        Always None. Prints 'Loaded!' when load_dotenv returns a truthy value,
        otherwise prints 'Environment File Not Found'.
    '''
    loaded = load_dotenv(env_path)
    message = 'Loaded!' if loaded else 'Environment File Not Found'
    # print() yields None, so that is what the caller receives
    return print(message)

# Load the .env file that sits in the current working directory.
env_reader(f'{os.getcwd()}/{env_file}')

Loaded!


In [4]:
def connect_twitter(api_key, secret_key, access_token, secret_access_token):
    '''
    This function will create a connection to the twitter api using the necessary
    credentials associated to the project, and report whether the credentials
    could be verified.

    params:
        api_key (String) : Taken from twitter developer account
        secret_key (String) : Taken from twitter developer account
        access_token (String) : Taken from twitter developer account
        secret_access_token (String) : Taken from twitter developer account

    returns:
        The tweepy API object built from the credentials. It is returned even
        when verification fails (a failure message and the reason are printed),
        so callers should read the printed status before relying on it.
    '''
    auth = tp.OAuthHandler(
        consumer_key = api_key,
        consumer_secret = secret_key
    )
    auth.set_access_token(
        key = access_token,
        secret = secret_access_token
    )
    api = tp.API(auth)

    try:
        api.verify_credentials()
    except Exception as err:
        # Exception (not a bare except) so Ctrl-C / SystemExit still propagate;
        # the reason is printed so a bad key is distinguishable from a network error
        print("Failed to connect to Twitter.")
        print(f"Reason: {err!r}")
    else:
        print("Connection to Twitter established.")
    return api

# Read the four credentials from the environment and open the API connection
# that later cells use; arguments are passed in the order
# (api_key, secret_key, access_token, secret_access_token).
api = connect_twitter(
    os.environ.get("twitter_api_key"),
    os.environ.get("twitter_secret"),
    os.environ.get("twitter_access_token"),
    os.environ.get("twitter_access_token_secret"),
)

Connection to Twitter established.


## Get Tweets

In [5]:
def mk_dir(path, date):
    '''
    The purpose of this function is to make a directory if one does not exist of the current
    date in the data folder.

    params:
        path (String) : The path to the data folder, with or without a trailing slash
        date (String) : The current date yyyy-mm-dd

    returns:
        None. This function will do nothing if the folder already exists, otherwise it
        will create the folder (including missing parents) and print
        "New Directory Created".

    example:
        mk_dir(path = './data/', date = '2021-09-23')
    '''
    # join instead of concatenating so './data' and './data/' name the same folder
    target = os.path.join(path, date)

    # already there: nothing to create and nothing to report
    if os.path.exists(target):
        return

    # exist_ok covers the folder being created between the check above and this call
    os.makedirs(target, exist_ok = True)
    print("New Directory Created")

In [6]:
def get_all_tweets(screen_name, api = api, today = today):
    '''
    This function will page through the timeline of a twitter screen name until the API
    returns no more tweets (the original author notes this is the latest ~3200 tweets),
    flatten every tweet into one row and store the rows in a df.
    It will save the associated results in a CSV file.

    params:
        screen_name (String) : The twitter handle associated to the user you want to get
                               tweets from
        api (API) : The tweepy API connection. The default is the `api` object that
                    exists when this cell is executed.
        today (String) : Todays date in string format; it names the output folder and
                         file. The default is the value of `today` when this cell is
                         executed.

    returns:
        This function will return a df with one row per tweet: author details, tweet id,
        created_at, content, hashtags and engagement counts. A handle with no tweets
        gives an empty df with the same columns.

    side effects:
        Creates ./data/<today>/ when missing, writes
        ./data/<today>/<screen_name>_tweets_<today>.csv and then sleeps for 10 seconds.

    source:
        [yanofsky](https://gist.github.com/yanofsky/5436496)
    '''
    cols = [
        'author_name', 'tweet_id', 'tweet_created_at', 'content', 'hashtags', 'location',
        'author_created_at', 'author_url', 'author_screen_name', 'tweet_favourite_count', 'tweet_favourited',
        'retweet_count', 'retweeted', 'author_followers_count', 'author_friends_count'
    ]

    # initialize a list to hold all the tweepy Tweets
    alltweets = []

    # 200 is the maximum allowed count per request
    request = {'screen_name': screen_name, 'count': 200}

    # keep grabbing tweets until there are no tweets left to grab; an empty first
    # page (handle without tweets) simply ends the loop instead of raising IndexError
    while True:
        new_tweets = api.user_timeline(**request)
        if not new_tweets:
            break

        alltweets.extend(new_tweets)

        # all subsequent requests use the max_id param (id of the oldest tweet
        # less one) to prevent duplicates
        request['max_id'] = new_tweets[-1].id - 1

    # transform the tweepy tweets into a 2D array that will populate the csv
    outtweets = [
        [
            t.author.name, t.id_str, t.created_at, t.text, t.entities.get('hashtags'), t.author.location,
            t.author.created_at, t.author.url, t.author.screen_name, t.favorite_count, t.favorited,
            t.retweet_count, t.retweeted, t.author.followers_count, t.author.friends_count
        ] for t in alltweets
    ]

    # write the csv
    df = pd.DataFrame(outtweets, columns = cols)
    mk_dir(path = './data/', date = today)
    df.to_csv('./data/{}/{}_tweets_{}.csv'.format(today, screen_name, today), index = False)

    # fixed pause before returning -- presumably to space out calls for
    # consecutive handles; TODO confirm against the API rate limits
    time.sleep(10)
    return df

In [7]:
def read_tweet_data(path):
    '''
    This function will read every scraped CSV stored in the (non hidden) sub folders of
    the data folder, i.e. the data from today and previous days, and combine them.
    Upon fetching all the data, it will drop duplicates on the tweet_id and tweet_created_at
    columns to remove duplicated tweets scraped from previous days.

    Folders and files are read in sorted name order, so for a tweet scraped on
    several days the row from the first folder in that order is the one kept.

    params:
        path (String) : The path to the data folder, with or without a trailing slash

    returns:
        This function will return the tweets_df associated to tweets from all handles
        found under the data folder. The hashtags column is parsed from its text form
        back into Python objects. The index is the one produced by pd.concat, so labels
        repeat across files.

    raises:
        ValueError : when no files are found under path

    example:
        read_tweet_data(
            path = './data/'
        )
    '''

    # get all non hidden subdirectories from the path; stray files are skipped so
    # they are never passed to os.listdir
    sub_dir = sorted(
        d for d in os.listdir(path)
        if not d.startswith('.') and os.path.isdir(os.path.join(path, d))
    )

    # collect every non hidden file inside those sub directories
    files = []
    for d in sub_dir:
        folder = os.path.join(path, d)
        for p in sorted(os.listdir(folder)):
            full_path = os.path.join(folder, p)
            if not p.startswith('.') and os.path.isfile(full_path):
                files.append(full_path)

    if not files:
        raise ValueError('No tweet files found under {}'.format(path))

    # NOTE(security): the hashtags cell used to be passed to eval(), which executes
    # whatever text the CSV contains. literal_eval only accepts Python literals
    # (the list-of-dicts text written by to_csv) and raises on anything else.
    read_csvs = [
        pd.read_csv(file, converters = {'hashtags': ast.literal_eval}, encoding = 'utf-8-sig')
        for file in files
    ]

    combined = pd.concat(read_csvs)
    tweets_df = combined.drop_duplicates(subset = ['tweet_id', 'tweet_created_at'])
    return tweets_df

In [13]:
# Twitter screen names whose timelines are scraped.
handles = ['CP24', 'TPSOperations', 'TorontoPolice']

In [14]:
%%time
# Scrape each handle at most once per day. The check is made per handle (its CSV
# for today, named as get_all_tweets writes it) rather than per day folder, so a
# run that stopped part-way picks up the handles that are still missing, and a
# missing ./data/ folder no longer raises.
for user in handles:
    todays_csv = './data/{}/{}_tweets_{}.csv'.format(today, user, today)
    if os.path.exists(todays_csv):
        continue
    print(user)
    get_all_tweets(user)

CP24
New Directory Created
TPSOperations
TorontoPolice
CPU times: user 4.27 s, sys: 623 ms, total: 4.89 s
Wall time: 1min 4s


In [15]:
%%time
# Combine every scraped CSV under ./data/ into one de-duplicated frame.
tweets_df = read_tweet_data('./data/')

CPU times: user 87.2 ms, sys: 13.2 ms, total: 100 ms
Wall time: 135 ms


In [16]:
# (rows, columns) of the combined, de-duplicated tweet frame
tweets_df.shape

(9749, 15)

## Preprocess Data

In [18]:
def parse_hashtags(row):
    '''
    Reduce the hashtag entities of one tweet to a plain list of hashtag texts.

    params:
        row (Series / dict) : One tweet. Must provide 'hashtags_count' (number of
                              hashtag entities) and 'hashtags' (the entities, each a
                              dict with a 'text' key)

    returns:
        A list with the 'text' of every hashtag entity, or None when the tweet has no
        hashtags. A tweet with exactly one hashtag returns a one element list.
    '''
    # the threshold used to be "> 1", which discarded the hashtag of every tweet
    # that carried exactly one
    if row['hashtags_count'] < 1:
        return None
    return [tag['text'] for tag in row['hashtags']]

In [19]:
# number of hashtag entities attached to each tweet
tweets_df['hashtags_count'] = tweets_df['hashtags'].apply(len)

In [20]:
%time tweets_df['hashtags'] = tweets_df.apply(lambda x : parse_hashtags(x), axis = 1)

CPU times: user 59.7 ms, sys: 15.5 ms, total: 75.2 ms
Wall time: 82.1 ms


In [27]:
# earliest stored tweet of the 'Toronto Police' author; the column holds the
# timestamps as text, so min() compares them as strings
tweets_df.loc[tweets_df['author_name'] == 'Toronto Police', 'tweet_created_at'].min()

'2021-01-15 21:24:00+00:00'

In [29]:
# tweets_df