## Expanded URLs
**Purpose**: 
- Process the expanded versions of the URLs that were (re-)tweeted by congressional Republicans during 2016-2020.
- Note: this project uses a [forked version of urlExpander](https://github.com/wlmwng/urlExpander).

In [1]:
import os
import json
import pandas as pd
import plotnine as p9
import urlexpander
from mizani.breaks import date_breaks
from mizani.formatters import date_format

In [2]:
# Both folders are addressed relative to the notebook: two levels up, then into
# the figures folder or the intermediate Twitter data folder.
two_levels_up = os.path.join(os.pardir, os.pardir)
dir_fig = os.path.join(two_levels_up, "figures")
dir_url = os.path.join(two_levels_up, "data", "02-intermediate", "02-twitter")

# Load the pickled frame of URLs tweeted by the politicians.
urls = pd.read_pickle(os.path.join(dir_url, "politicians_tweeted_urls.pkl"))

### Load default version of expanded URLs

In [None]:
# Replace `urls` with the pickled frame holding the default expansion results.
resolved_pickle_path = os.path.join(dir_url, "politicians_tweeted_urls_resolved.pkl")
urls = pd.read_pickle(resolved_pickle_path)

### TODO (after revised version of expansion finishes)
- Load expanded URLs from cache
- Select the URL version for URL matching
- Remove existing `tweets2_url` documents and then re-add
- Re-run the URL matching processor

In [3]:
# Read the urlExpander cache: a JSON Lines file with one JSON object per line.
cache = []
# JSON text is UTF-8; state it explicitly so the result does not depend on the
# platform's default encoding.
with open(os.path.join(dir_url, "urlexpander_cache.jsonl"), encoding="utf-8") as f:
    for line in f:
        # Skip blank lines (e.g. a trailing empty line), which json.loads rejects.
        if not line.strip():
            continue
        cache.append(json.loads(line))

In [4]:
# Show the last record read from the cache file to check which fields it carries.
cache[-1]

{'original_url': 'https://youtu.be/7_l8Rb7xMPc',
 'resolved_url': 'https://www.youtube.com/watch?v=7_l8Rb7xMPc&feature=youtu.be',
 'resolved_domain': 'youtube.com'}

In [5]:
# Build a DataFrame with one row per cache record.
# NOTE(review): this rebinds `urls`, replacing the frame loaded from the pickle
# earlier in the notebook - confirm that is intended before running later cells.
urls = pd.DataFrame(cache)
urls

Unnamed: 0,original_url,resolved_url,resolved_domain
0,https://mullin.house.gov/news/documentsingle.a...,https://mullin.house.gov/news/documentsingle.a...,house.gov
1,https://byrne.house.gov/spring-2017-internship,http://byrne.house.gov/__CONNECTIONPOOL_ERROR__,byrne.house.gov
2,https://www.instagram.com/p/BL4QdKqh9zG/,http://instagram.com/__CLIENT_ERROR__,instagram.com
3,https://www.politico.com/agenda/story/2018/01/...,https://www.politico.com/agenda/story/2018/01/...,politico.com
4,https://www.augustachronicle.com/opinion/20200...,https://eu.augustachronicle.com/,augustachronicle.com
...,...,...,...
2271,https://bit.ly/2JuhHWM,https://www.heartland.org/news-opinion/news/pr...,heartland.org
2272,http://facebook.com/RepBrianMast/videos/184795...,https://www.facebook.com/RepBrianMast/videos/1...,facebook.com
2273,http://bit.ly/2m7gy9a,https://biggs.house.gov/media/press-releases/c...,house.gov
2274,http://omaha.com/news/nation/trump-s-remarks-m...,http://omaha.com/__CLIENT_ERROR__,omaha.com


In [6]:
def select_url(row):
    """Select the URL which will be processed for URL matching.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row or dict)
        Must provide the keys ``"original_url"`` and ``"resolved_url"``.

    Returns
    -------
    The value of ``row["resolved_url"]`` when it is a string that does not
    contain ``"ERROR"``; otherwise the value of ``row["original_url"]``.
    """
    resolved_url = row["resolved_url"]
    # A missing expansion (None / NaN) cannot be matched, and testing
    # `"ERROR" in <non-string>` would raise TypeError, so treat it like a
    # failed expansion and fall back to the original URL.
    if not isinstance(resolved_url, str) or "ERROR" in resolved_url:
        return row["original_url"]
    return resolved_url

In [7]:
# Row-wise choice between the resolved and the original URL.
urls["selected_url"] = urls.apply(select_url, axis=1)

In [8]:
# Display the frame to inspect the newly added `selected_url` column.
urls

Unnamed: 0,original_url,resolved_url,resolved_domain,selected_url
0,https://mullin.house.gov/news/documentsingle.a...,https://mullin.house.gov/news/documentsingle.a...,house.gov,https://mullin.house.gov/news/documentsingle.a...
1,https://byrne.house.gov/spring-2017-internship,http://byrne.house.gov/__CONNECTIONPOOL_ERROR__,byrne.house.gov,https://byrne.house.gov/spring-2017-internship
2,https://www.instagram.com/p/BL4QdKqh9zG/,http://instagram.com/__CLIENT_ERROR__,instagram.com,https://www.instagram.com/p/BL4QdKqh9zG/
3,https://www.politico.com/agenda/story/2018/01/...,https://www.politico.com/agenda/story/2018/01/...,politico.com,https://www.politico.com/agenda/story/2018/01/...
4,https://www.augustachronicle.com/opinion/20200...,https://eu.augustachronicle.com/,augustachronicle.com,https://eu.augustachronicle.com/
...,...,...,...,...
2271,https://bit.ly/2JuhHWM,https://www.heartland.org/news-opinion/news/pr...,heartland.org,https://www.heartland.org/news-opinion/news/pr...
2272,http://facebook.com/RepBrianMast/videos/184795...,https://www.facebook.com/RepBrianMast/videos/1...,facebook.com,https://www.facebook.com/RepBrianMast/videos/1...
2273,http://bit.ly/2m7gy9a,https://biggs.house.gov/media/press-releases/c...,house.gov,https://biggs.house.gov/media/press-releases/c...
2274,http://omaha.com/news/nation/trump-s-remarks-m...,http://omaha.com/__CLIENT_ERROR__,omaha.com,http://omaha.com/news/nation/trump-s-remarks-m...


### Process selected URL

In [None]:
# List the columns available before deriving the standardized URL fields.
urls.columns

In [None]:
def _standardize_resolved(url, remove_path, remove_query):
    """Call urlexpander's standardize_url with the options shared by both derived columns.

    Only `remove_path` and `remove_query` vary between the two uses below; the
    scheme and fragment are always removed, the netloc is never replaced by the
    domain, and the result is lower-cased.
    """
    return urlexpander.url_utils.standardize_url(
        url=url,
        remove_scheme=True,
        replace_netloc_with_domain=False,
        remove_path=remove_path,
        remove_query=remove_query,
        remove_fragment=True,
        to_lowercase=True,
    )


resolved_urls = urls["resolved_url"]

# Path and query removed as well.
urls["resolved_netloc"] = resolved_urls.map(
    lambda u: _standardize_resolved(u, remove_path=True, remove_query=True)
)
urls["resolved_domain"] = resolved_urls.map(
    lambda u: urlexpander.url_utils.get_domain(url=u)
)
# Path and query kept.
urls["standardized_url"] = resolved_urls.map(
    lambda u: _standardize_resolved(u, remove_path=False, remove_query=False)
)
urls["is_generic_url"] = resolved_urls.map(urlexpander.url_utils.is_generic_url)

In [None]:
def extract_error(resolved_url):
    """Label a resolved URL with the error marker it contains, if any.

    Returns 'CLIENT_ERROR' or 'CONNECTIONPOOL_ERROR' when that text occurs in
    `resolved_url` (checked in that order), otherwise 'no error'.
    """
    # Order matters: the first marker found wins.
    for marker in ('CLIENT_ERROR', 'CONNECTIONPOOL_ERROR'):
        if marker in resolved_url:
            return marker
    return 'no error'

In [None]:
# Tag every row with the error label derived from its resolved URL.
urls['urlexpander_error'] = urls['resolved_url'].map(extract_error)

- export the URLs

In [None]:
# Columns written to the export, in output order.
# NOTE(review): the tweet-level columns listed here are not in the frame built
# from the cache earlier in the notebook; this cell presumably expects `urls`
# to come from the resolved pickle - confirm before running.
export_columns = [
    'tweet_id', 'created_at', 'created_week', 'created_month', 'created_year',
    'text', 'author_id', 'username', 'tweet_url', 'url_id',
    'url', 'expanded_url', 'display_url', 'unwound_url',
    'most_unrolled_url', 'most_unrolled_field', 'is_dupe', 'is_from_tw',
    'resolved_url', 'resolved_netloc', 'resolved_domain',
    'standardized_url', 'is_generic_url', 'urlexpander_error',
]
urls = urls[export_columns]

export_path = os.path.join(dir_url, 'politicians_tweeted_urls_urlexpander.pkl')
urls.to_pickle(export_path)

- spot-check the URLs

In [None]:
# Load the exported frame back from disk for spot-checking.
exported_pickle_path = os.path.join(dir_url, 'politicians_tweeted_urls_urlexpander.pkl')
urls = pd.read_pickle(exported_pickle_path)

In [None]:
# examples of generic URLs
generic_urls = urls.loc[
    urls["is_generic_url"],
    ["most_unrolled_url", "resolved_url", "standardized_url", "is_generic_url"],
]
with pd.option_context("display.max_rows", None, "display.max_colwidth", None):
    # DataFrame.sample raises ValueError when n exceeds the number of rows
    # (sampling without replacement), so cap the sample at what is available.
    # With 100 or more generic rows the output is the same as sampling n=100.
    display(generic_urls.sample(n=min(100, len(generic_urls)), random_state=42))


In [None]:
# Reproducible random sample across all URLs (generic or not) for spot-checking.
spot_check = urls[['resolved_url', 'standardized_url', 'is_generic_url']]
with pd.option_context('display.max_rows', None, 'display.max_colwidth', None):
    # Cap n at the frame length: DataFrame.sample raises ValueError when asked
    # for more rows than exist (sampling without replacement).
    display(spot_check.sample(n=min(100, len(spot_check)), random_state=42))

In [None]:
# Count of rows per `created_week`, drawn as one line per `urlexpander_error` value.
error_lines = p9.geom_line(p9.aes(color="urlexpander_error"), stat="count")
year_axis = p9.scale_x_datetime(
    name="",
    breaks=date_breaks("2 years"),
    labels=date_format("%Y"),
)
layout = p9.theme(
    figure_size=(6, 3),
    legend_position="right",
    subplots_adjust={"hspace": 0.5},
)

plt = (
    p9.ggplot(urls, p9.aes(x="created_week"))
    + error_lines
    + year_axis
    + layout
    + p9.labs(title="Expansion of tweeted URLs")
)

plt

In [None]:
# Optional: save the timeline figure built in the previous cell (left disabled).
# NOTE(review): the earlier caption here ("count of retrieved URLs per outlet by
# month from 2016 - 2020") does not match that figure, which counts rows per
# `created_week` split by `urlexpander_error` - confirm the intended caption.
# plt.save(os.path.join(dir_fig, 'c0204_urlExpander_timeline.svg'))