# 02-twitter-prep-urls
**Purpose**: extract URLs from tweets
- filter out duplicate URLs and URLs internal to Twitter, since we are only interested in URLs that point to sites outside of Twitter

In [None]:
import datetime
import os
from pprint import pprint

from inca import Inca

# input and output folders; both live under the shared intermediate-data directory
_dir_intermediate = os.path.join('..', '..', 'data', '02-intermediate')
dir_inp = os.path.join(_dir_intermediate, '01-congress-legislators')
dir_out = os.path.join(_dir_intermediate, '02-twitter')
from collections import defaultdict
from urllib.parse import urlparse

import pandas as pd
from usrightmedia.shared.es_queries import *

In [None]:
# INCA client; its `database` attribute is used below to inspect and stream stored documents
myinca = Inca()

In [None]:
# inspect the 'tweets2' doctype before querying it
# NOTE(review): presumably reports which fields the stored documents have — confirm against INCA
myinca.database.doctype_inspect('tweets2')

- [Understanding the new tweet payload](https://blog.twitter.com/developer/en_us/topics/tips/2020/understanding-the-new-tweet-payload)
```
If a Tweet contains a URL, you can request information about it in the new v2 Tweet payload
using ‘entities’ as a value for the tweet.fields parameter. This will provide a ‘urls’ object
on your Tweet object (nested in the ‘data’ object). For a URL in the Tweet, you will be able
to get the URL, the title, description, and unwound_url as shown below: (see link)
```

- [API Reference](https://developer.twitter.com/en/docs/twitter-api/tweets/search/api-reference/get-tweets-search-all)


In [None]:
# tweets with at least one URL: keep only the stored payload (`_source`) of each hit
# (while debugging, stop early to work with a small sample instead of the full result set)
tweets = [
    hit['_source']
    for hit in myinca.database.document_generator(query_tw_field_exists('entities.urls'))
]

In [None]:
def extract_urls(tweet):
    """Extract the URL(s) included within a tweet.

    Args:
        tweet (dict): Twitter API v2 tweet payload. Must contain 'id',
            'created_at', 'text', 'author_id' and 'author' -> 'username'.
            'entities' -> 'urls' is optional; a tweet without it yields an
            empty list.

    Returns:
        extracted URLs (list of dictionaries), one per URL entity, in tweet order:
            - the tweet-level info is duplicated across every URL dict
            - at the URL level:
                - 'url_id' (tweet_id + URL index value within tweet)
                - 'url', 'expanded_url', 'display_url', 'unwound_url'
                  (None when the payload does not provide that version)
                - 'most_unrolled_url' is the URL for further processing and
                  'most_unrolled_field' names the version it was taken from
                  (both are None when no version is available, so that every
                  dict has the same keys)

    Raises:
        KeyError: if a required tweet-level field is missing.
        ValueError: if 'created_at' does not match '%Y-%m-%dT%H:%M:%S.%fZ'.

    """

    # tweet-level info, shared by every URL extracted from this tweet
    tweet_id = tweet['id']
    username = tweet['author']['username']
    tweet_info = {
        'tweet_id': tweet_id,
        'created_at': datetime.datetime.strptime(tweet['created_at'], '%Y-%m-%dT%H:%M:%S.%fZ'),
        'text': tweet['text'],
        'author_id': tweet['author_id'],
        'username': username,
        'tweet_url': f"https://twitter.com/{username}/status/{tweet_id}",
    }

    # list of URLs within tweet; 'entities' / 'urls' may be absent or null
    tweet_urls = (tweet.get('entities') or {}).get('urls') or []

    # preferred URL version, most unrolled first
    # similar to https://github.com/twitterdev/tweet_parser/blob/master/tweet_parser/getter_methods/tweet_links.py
    preferred_fields = ('unwound_url', 'expanded_url', 'url')

    extracted_urls = []

    for n, t in enumerate(tweet_urls):
        # for every URL, include info about the tweet it came from
        r = dict(tweet_info)

        # add any versions of URL which are available
        r['url_id'] = f"{tweet_id}_{n}"
        r['url'] = t.get('url')
        r['expanded_url'] = t.get('expanded_url')
        r['display_url'] = t.get('display_url')
        r['unwound_url'] = t.get('unwound_url')

        # first non-empty version in order of preference (None if there is none)
        best_field = next((field for field in preferred_fields if r[field]), None)
        r['most_unrolled_url'] = r[best_field] if best_field else None
        r['most_unrolled_field'] = best_field

        extracted_urls.append(r)

    return extracted_urls


In [None]:
# flatten the per-tweet URL lists into a single list of URL dicts
urls = [url_info for tweet in tweets for url_info in extract_urls(tweet)]

In [None]:
# one row per extracted URL; the columns are the keys of the dicts returned by extract_urls
df = pd.DataFrame(urls)

In [None]:
# preview the first rows
df.head()

In [None]:
# number of non-null values for each URL version
url_version_cols = ['url', 'expanded_url', 'display_url', 'unwound_url', 'most_unrolled_url']
df[url_version_cols].count()

In [None]:
# distinct tweets vs. extracted URLs (one tweet can contribute several URLs)
print(f"number of tweets: {df['tweet_id'].nunique()}") # same as len(tweets)
print(f"number of URLs extracted from tweets: {len(df)}")

In [None]:
# number of URLs each tweet object contains
# how to read a row, e.g.: "60,827 tweets contain 2 URLs each"
# The columns are named explicitly via rename_axis / reset_index(name=...) instead of
# renaming the default labels ('index' / 0), because those defaults differ between
# pandas versions (value_counts() names its result 'count' from pandas 2.0 onwards).
vc_smry = (
    df.groupby('tweet_id')
    .size()            # URLs per tweet
    .value_counts()    # number of tweets for each URLs-per-tweet value
    .rename_axis('count (# of URLs)')
    .reset_index(name='count (tweet_id)')
)
vc_smry

In [None]:
# helper for filtering: number of extracted URLs per tweet_id (index = tweet_id)
# https://stackoverflow.com/a/48628442
vc_helper = df['tweet_id'].value_counts()
vc_helper

In [None]:
# examples of tweet_ids which each had 2 URLs associated with them
ids_with_2_urls = vc_helper.index[vc_helper.eq(2)]
df.loc[df['tweet_id'].isin(ids_with_2_urls), 'tweet_id'].unique()[0:5]

In [None]:
# examples of tweet_ids which each had 5 URLs associated with them
ids_with_5_urls = vc_helper.index[vc_helper.eq(5)]
df.loc[df['tweet_id'].isin(ids_with_5_urls), 'tweet_id'].unique()[0:5]

In [None]:
def show_tweet(tweet_id):
    """ prints a tweet's URL, text, and info about URLs embedded in the text

    The tweet is looked up in the module-level `tweets` list.

    Args:
        tweet_id (str): value of the tweet's 'id' field

    Returns:
        None

    Raises:
        IndexError: if no tweet in `tweets` has the given id

    """
    # stop at the first match instead of filtering (and copying) the whole list
    t = next((tweet for tweet in tweets if tweet['id'] == tweet_id), None)
    if t is None:
        raise IndexError(f"no tweet with id {tweet_id!r} found in `tweets`")

    print(f"https://twitter.com/{t['author']['username']}/status/{t['id']}")
    print(t['text'])
    pprint(t['entities']['urls'])

In [None]:
# example of a tweet with two distinct URLs: one is internal to twitter.com and the other is not
show_tweet('686942249113022468')

In [None]:
# example of duplicated URLs within one tweet: the latter 4 URLs are a set of photos associated with the tweet.
# They share the same t.co URL and all show '...photo/1', although one would expect '...photo/1', '...photo/2', etc.
show_tweet('1040678926274715648')

In [None]:
# check: rows where the tweet object contained duplicate URLs
# (the first occurrence within a tweet is not flagged, only its repeats)
repeated_in_tweet = df.duplicated(subset=['tweet_id', 'most_unrolled_url'], keep='first')
df.loc[repeated_in_tweet].reset_index(drop=True)

In [None]:
def _url_netloc(url):
    """Return the network location of `url`, or None when `url` is not a string (e.g. missing)."""
    return urlparse(url).netloc if isinstance(url, str) else None


def _is_twitter_url(url):
    """Return True when `url` points to twitter.com or one of its subdomains."""
    if not isinstance(url, str):
        return False
    # `hostname` is lower-cased and excludes any port or credentials
    host = urlparse(url).hostname or ''
    return host == 'twitter.com' or host.endswith('.twitter.com')


# boolean filter conditions
# is_dupe: the same URL already appeared earlier within the same tweet object
df['is_dupe'] = df.duplicated(['tweet_id', 'most_unrolled_url'], keep='first')
# rows without any URL get a null netloc instead of making urlparse fail
df['url_netloc'] = df['most_unrolled_url'].map(_url_netloc)
# is_from_tw: URL is internal to Twitter, including subdomains such as
# 'www.twitter.com' or 'mobile.twitter.com' and regardless of letter case
df['is_from_tw'] = df['most_unrolled_url'].map(_is_twitter_url).astype(bool)
df

In [None]:
# we want to count URLs per tweet instance and avoid 'inflating' the count due to duplicates (is_dupe==False)
# we are also only interested in URLs shared from non-'twitter.com' (is_from_tw==False)
# -> number of rows for each combination of the two flags
filter_smry = (
    df.groupby(['is_dupe', 'is_from_tw'], dropna=False)
    .size()
    .reset_index(name='count')
)
filter_smry

In [None]:
# keep a URL if it is distinct within its tweet object and not from 'twitter.com',
# i.e. drop every row for which either flag is set
keep_mask = ~(df['is_dupe'] | df['is_from_tw'])
df_filtered = df.loc[keep_mask].reset_index(drop=True)
df_filtered

In [None]:
# check: seems like unwound_url is often unavailable from Twitter API v2
# (counts which URL version was used per row, before filtering; nulls are included)
df['most_unrolled_field'].value_counts(dropna=False)

In [None]:
# same breakdown for the rows that remain after filtering (nulls are included)
df_filtered['most_unrolled_field'].value_counts(dropna=False)

In [None]:
# check: which URLs got excluded?
# url_ids that have a usable URL version in df but are no longer present in df_filtered
ids_before = set(df.loc[df['most_unrolled_field'].notnull(), 'url_id'])
ids_after = set(df_filtered.loc[df_filtered['most_unrolled_field'].notnull(), 'url_id'])
excluded_url_ids = list(ids_before - ids_after)
df.loc[df['url_id'].isin(excluded_url_ids)].reset_index()

In [None]:
print(f"{len(df_filtered)} URLs shared by politicians on Twitter which should be processed further (URL expansion and URL matching)")
# make sure the output folder exists so the export does not fail on a fresh checkout
os.makedirs(dir_out, exist_ok=True)
# NOTE: pickle files should only ever be loaded from trusted sources
df_filtered.to_pickle(os.path.join(dir_out, 'politicians_tweeted_urls.pkl'))