In [None]:
from pathlib import Path

import pandas as pd
import wmfdata as wmf
from wmfdata.utils import (
    pd_display_all, 
    print_err
)

In [None]:
def make_sql_tuple(i):
    """
    Render an iterable as a parenthesised, comma-separated string suitable for
    splicing after an SQL `IN`.

    The text is the `repr` of a Python list with the surrounding square brackets
    swapped for parentheses. Going through a list rather than a tuple avoids the
    trailing comma Python prints for one-element tuples, and leaning on `repr`
    avoids hand-written quoting: every element is rendered exactly as Python's
    `repr` renders it.

    Any iterable is accepted; anything that is not already a plain list is first
    copied into one. An empty iterable produces `"()"`.
    """
    # Exact type check: only a genuine `list` is used as-is, everything else is
    # materialised so that its repr is a list repr.
    items = i if type(i) is list else list(i)

    # Drop the leading "[" and trailing "]" of the list repr and wrap in parens.
    return "({})".format(repr(items)[1:-1])

In [None]:
# Calendar day the notebook processes; the three parts are combined into a
# timestamp and also substituted individually into the SQL template.
YEAR, MONTH, DAY = 2021, 3, 3

# Two-letter codes of the countries to run the query for, one query per country.
COUNTRIES = [
    "NG",  # Nigeria
    "PK",  # Pakistan
    "TZ",  # Tanzania
    "UG",  # Uganda
]

# Page titles handed to the SQL template as `bad_recommendations`, already
# rendered as an SQL tuple string.
BAD_RECOMMENDATIONS = make_sql_tuple((
    "-",
    ".xxx",
    "Brazzers",
    "Main_Page",
    "News",
    "Pornography",
    "Sex",
    "XHamster",
    "XVideos",
    "XXX",
    "XXX_(film_series)",
    "XXX:_Return_of_Xander_Cage",
    "XXXTentacion",
    "XXXX",
))

In [None]:
# Load the history written by earlier runs, if there is one. The "no history"
# state is an explicit None so the cell behaves the same on a fresh kernel and
# when re-run in a live one (no reliance on a leftover variable).
try:
    trending_articles = pd.read_csv("trending_articles.csv", parse_dates=["date"])
except FileNotFoundError:
    trending_articles = None

date = pd.Timestamp(YEAR, MONTH, DAY)
# Articles ranked in the top 5 during the 7 days before `date` are excluded.
recent_start = date - pd.DateOffset(days=7)
query = Path("trending_articles_for_country_day.sql").read_text()

# Per-country result frames, concatenated once after the loop.
new_results = []

for country in COUNTRIES:
    recently_trending = []

    if trending_articles is not None:
        # Adjacent string literals are joined verbatim, so every condition
        # carries its own "&" and a trailing space.
        recently_trending = (
            trending_articles
            .query(
                "country == @country "
                "& date >= @recent_start "
                "& date < @date "
                "& rank <= 5"
            )["article"]
            .dropna()           # NaN cannot be rendered as an SQL string literal
            .drop_duplicates()
            .tolist()
        )

    # Decide on the list itself: an empty selection must not produce
    # "NOT IN ()", which is not valid SQL.
    if recently_trending:
        not_recently_trending_clause = (
            f"AND canonical_title NOT IN {make_sql_tuple(recently_trending)}"
        )
    else:
        not_recently_trending_clause = ""

    formatted_query = query.format(
        country=country,
        year=YEAR,
        month=MONTH,
        day=DAY,
        bad_recommendations=BAD_RECOMMENDATIONS,
        not_recently_trending_clause=not_recently_trending_clause
    )

    results = wmf.spark.run(formatted_query, session_type="yarn-large")
    results["date"] = pd.to_datetime(results["date"])
    new_results.append(results)

# History (when present) followed by this run's results, in country order.
# pd.concat replaces DataFrame.append, which no longer exists in pandas 2.x.
if trending_articles is not None:
    new_results.insert(0, trending_articles)
trending_articles = pd.concat(new_results)

trending_articles.sort_values(["date", "country", "rank"]).to_csv("trending_articles.csv", index=False)