In [16]:
from pathlib import Path

import pandas as pd
import requests
import wmfdata as wmf
from wmfdata.utils import (
    pd_display_all, 
    print_err
)

In [3]:
def make_sql_tuple(i):
    """
    Render an iterable as a parenthesised, comma-separated list of Python
    literals, suitable for splicing after an SQL `IN`.

    Joining the values by hand means fiddling with quoting and escaping, and
    the repr of a Python tuple is unusable for a single element because of its
    trailing comma. The repr of a *list* has neither problem, so this takes
    that repr and swaps the square brackets for parentheses.

    Returns `None` when the iterable is empty (there is no valid empty SQL
    tuple), otherwise a string such as `('a', 'b')`.
    """
    # Materialise anything that is not already a plain list (tuples,
    # generators, pandas Series, ...).
    values = i if type(i) is list else list(i)

    if not values:
        return None

    # Strip the surrounding "[" and "]" from the list repr.
    inner = repr(values)[1:-1]
    return f"({inner})"

In [10]:
# Calendar date (year, month, day) this run of the notebook is parameterised on.
YEAR, MONTH, DAY = 2021, 3, 3

# Two-letter country codes to process.
COUNTRIES = [
    "NG",  # Nigeria
    "PK",  # Pakistan
    "TZ",  # Tanzania
    "UG",  # Uganda
]

# Article titles flagged as bad recommendations, pre-rendered as an SQL tuple
# string so the value can be substituted straight into a query.
BAD_RECOMMENDATIONS = make_sql_tuple(
    [
        "-",
        ".xxx",
        "Brazzers",
        "Main_Page",
        "News",
        "Pornography",
        "Sex",
        "XHamster",
        "XVideos",
        "XXX",
        "XXX_(film_series)",
        "XXX:_Return_of_Xander_Cage",
        "XXXTentacion",
        "XXXX",
    ]
)

In [11]:
# Load the accumulated history of trending articles if an earlier run saved
# one. "No history yet" is recorded as an explicit None instead of leaving the
# name undefined, so the cell does not silently pick up a stale
# `trending_articles` left in the kernel by a previous execution.
try:
    trending_articles = pd.read_csv("trending_articles.csv", parse_dates=["date"])
except FileNotFoundError:
    trending_articles = None

date = pd.Timestamp(YEAR, MONTH, DAY)
# The look-back window is the same for every country, so compute it once.
recent_start = date - pd.DateOffset(days=7)
query = Path("trending_articles_for_country_day.sql").read_text()

# Per-country results are collected here and concatenated once after the loop
# (DataFrame.append was removed in pandas 2.0, and growing a frame inside a
# loop re-copies it on every iteration).
new_results = []

for country in COUNTRIES:
    # Titles that ranked in the top 5 for this country during the previous
    # seven days, rendered as an SQL tuple. make_sql_tuple returns None when
    # there are none, in which case no extra clause is added to the query.
    recently_trending = None
    if trending_articles is not None:
        recent_top = trending_articles.query(
            "country == @country "
            "& date >= @recent_start "
            "& date < @date "
            "& rank <= 5"
        )
        recently_trending = make_sql_tuple(recent_top["article"])

    if recently_trending:
        not_recently_trending_clause = f"AND canonical_title NOT IN {recently_trending}"
    else:
        not_recently_trending_clause = ""

    formatted_query = query.format(
        country=country,
        year=YEAR,
        month=MONTH,
        day=DAY,
        bad_recommendations=BAD_RECOMMENDATIONS,
        not_recently_trending_clause=not_recently_trending_clause
    )

    results = wmf.spark.run(formatted_query, session_type="yarn-large")
    results["date"] = pd.to_datetime(results["date"])
    new_results.append(results)

# Existing history (if any) first, followed by this run's rows.
history = [] if trending_articles is None else [trending_articles]
trending_articles = pd.concat(history + new_results)

trending_articles.sort_values(["date", "country", "rank"]).to_csv("trending_articles.csv", index=False)

PySpark executors will use /usr/bin/python3.7.
PySpark executors will use /usr/bin/python3.7.
PySpark executors will use /usr/bin/python3.7.
PySpark executors will use /usr/bin/python3.7.


In [31]:
# Print every article title in the table, one per line.
for title in trending_articles["article"]:
    print(title)

List_of_states_and_territories_of_the_United_States
Nigeria
List_of_capitals_of_states_of_Nigeria
List_of_Lagos_State_local_government_areas_by_population
Coat_of_arms_of_Nigeria
World_Trade_Organization
2021_Pakistan_Super_League
Pakistan_Super_League
Pakistan
2021_Pakistani_Senate_election
Ejaz_Durrani
Senate_of_Pakistan
Bashiru_Ally
Tanzania
2020–21_CAF_Champions_League
Mikocheni
History_of_Tanzania
Regions_of_Tanzania
2021_Africa_U-20_Cup_of_Nations
List_of_members_of_the_eleventh_Parliament_of_Uganda
Daniel_Kaluuya
Uganda
Africa_U-20_Cup_of_Nations
List_of_Ugandans_by_net_worth
The_Flash_(season_7)
Geopolitical_zones_of_Nigeria
States_of_Nigeria
Lagos
List_of_government_agencies_of_Nigeria
List_of_the_most_popular_names_in_the_1960s_in_the_United_States
List_of_Pakistan_Super_League_records_and_statistics
Senate_of_Pakistan
National_Assembly_of_Pakistan
Noor_Jehan
Muhammad_Ali_Jinnah
List_of_serving_generals_of_the_Pakistan_Army
Simba_S.C.
Yanga_SC
Dar_es_salaam
CAF_Champions_Leag

In [17]:
article = "Dar_es_salaam"

# Fetch the REST "page summary" for the article. The timeout keeps a stalled
# connection from hanging the kernel indefinitely, and raise_for_status()
# reports an HTTP error (e.g. an unknown title) at the request itself rather
# than as a confusing KeyError on the lookups below.
summary_response = requests.get(
    "https://en.wikipedia.org/api/rest_v1/page/summary/" + article,
    timeout=30,
)
summary_response.raise_for_status()
summary_data = summary_response.json()

title = summary_data["title"]
description = summary_data["extract_html"]
# A summary may come back without a "thumbnail" entry; use None in that case
# instead of raising KeyError.
image_url = summary_data.get("thumbnail", {}).get("source")

{'type': 'standard',
 'title': 'Dar es Salaam',
 'displaytitle': 'Dar es Salaam',
 'namespace': {'id': 0, 'text': ''},
 'wikibase_item': 'Q1960',
 'titles': {'canonical': 'Dar_es_Salaam',
  'normalized': 'Dar es Salaam',
  'display': 'Dar es Salaam'},
 'pageid': 8500,
 'thumbnail': {'source': 'https://upload.wikimedia.org/wikipedia/commons/thumb/a/ab/Dar_es_Salaam_-_Posta.jpg/320px-Dar_es_Salaam_-_Posta.jpg',
  'width': 320,
  'height': 174},
 'originalimage': {'source': 'https://upload.wikimedia.org/wikipedia/commons/a/ab/Dar_es_Salaam_-_Posta.jpg',
  'width': 4268,
  'height': 2327},
 'lang': 'en',
 'dir': 'ltr',
 'revision': '1013634446',
 'tid': 'ed5fc950-8b34-11eb-98eb-d7e87a3f4bc9',
 'timestamp': '2021-03-22T17:34:49Z',
 'description': 'Largest city in Tanzania',
 'description_source': 'local',
 'coordinates': {'lat': -6.8, 'lon': 39.28333333},
 'content_urls': {'desktop': {'page': 'https://en.wikipedia.org/wiki/Dar_es_Salaam',
   'revisions': 'https://en.wikipedia.org/wiki/Dar_e

In [15]:
# Display only the rows whose country code is "TZ".
trending_articles[trending_articles["country"] == "TZ"]

Unnamed: 0,date,country,rank,article,views,views_computer_proportion
12,2021-03-01,TZ,1,Bashiru_Ally,367,0.288828
13,2021-03-01,TZ,2,Tanzania,276,0.246377
14,2021-03-01,TZ,3,2020–21_CAF_Champions_League,226,0.119469
15,2021-03-01,TZ,4,Mikocheni,148,0.222973
16,2021-03-01,TZ,5,History_of_Tanzania,129,0.217054
17,2021-03-01,TZ,6,Regions_of_Tanzania,119,0.235294
36,2021-03-02,TZ,1,Simba_S.C.,242,0.11157
37,2021-03-02,TZ,2,Yanga_SC,147,0.115646
38,2021-03-02,TZ,3,Dar_es_salaam,130,0.161538
39,2021-03-02,TZ,4,CAF_Champions_League,110,0.154545


In [None]:
import mwapi
import datetime as dt
from requests_oauthlib import OAuth1
import secrets.oauth as oauth_cfg
import re
import time
import html
import IPython.display as ipyd

In [None]:
# User-Agent string passed to the API session: a short client name followed
# by the URL of the project repository.
USER_AGENT = (
    "CE Insights survey bot -- "
    "https://github.com/wikimedia-research/Community-Engagement-Insights-sampling"
)

In [None]:
# API session against meta.wikimedia.org, identified by USER_AGENT.
sess = mwapi.Session("https://meta.wikimedia.org", user_agent=USER_AGENT)

# OAuth 1 credentials, read from the local `oauth_cfg` module so that no
# secret is written into the notebook itself.
auth = OAuth1(
    client_key=oauth_cfg.consumer_token,
    client_secret=oauth_cfg.consumer_secret,
    resource_owner_key=oauth_cfg.access_token,
    resource_owner_secret=oauth_cfg.access_secret,
)

def get_token():
    """
    Request a CSRF token through the module-level session `sess`,
    authenticating with `auth`, and return the token string found at
    `query.tokens.csrftoken` in the response.
    """
    token_params = {"action": "query", "meta": "tokens", "type": "csrf"}
    response = sess.get(auth=auth, **token_params)

    tokens = response["query"]["tokens"]
    return tokens["csrftoken"]

def api_get(*args, **kwargs):
    """
    Issue an authenticated GET through the module-level session `sess`.

    Positional and keyword arguments are forwarded unchanged; `format="json"`,
    `formatversion=2` and `auth=auth` are always added, so callers must not
    pass those keywords themselves.
    """
    response = sess.get(*args, format="json", formatversion=2, auth=auth, **kwargs)
    return response
    
def api_post(*args, **kwargs):
    """
    Issue an authenticated POST through the module-level session `sess`.

    A token is obtained from `get_token()` on every call and sent as `token`.
    `format="json"`, `formatversion=2` and `auth=auth` are always added as
    well, so callers must not pass any of those keywords themselves. All other
    arguments are forwarded unchanged.
    """
    csrf_token = get_token()
    response = sess.post(
        *args,
        format="json",
        formatversion=2,
        auth=auth,
        token=csrf_token,
        **kwargs
    )
    return response
