In [1]:
import json
import subprocess

import pandas as pd
from lxml import etree


pd.options.display.float_format = '{:,.2f}'.format


def page(page):
    body = subprocess.check_output(f'''
        curl 'https://us4bg.org/?projects_ajax=true&lang=en&pagesize=10&paged={page}&project_year=all&project_area=all&project_search=' \
            -H 'authority: us4bg.org' \
            -H 'accept: */*' \
            -H 'accept-language: en-US,en;q=0.9' \
            -H 'cookie: _icl_visitor_lang_js=en_us; wp-wpml_current_language=en; cookielawinfo-checkbox-necessary=yes; cookielawinfo-checkbox-functional=yes; cookielawinfo-checkbox-analytics=yes; cookielawinfo-checkbox-advertisement=yes; viewed_cookie_policy=yes; cli_user_preference=en-cli-yes-checkbox-necessary-yes-checkbox-functional-yes-checkbox-analytics-yes-checkbox-advertisement-yes; CookieLawInfoConsent=eyJ2ZXIiOiIxIiwibmVjZXNzYXJ5IjoidHJ1ZSIsImZ1bmN0aW9uYWwiOiJ0cnVlIiwiYW5hbHl0aWNzIjoidHJ1ZSIsImFkdmVydGlzZW1lbnQiOiJ0cnVlIn0=; _ga=GA1.2.2046832275.1665519683; _gid=GA1.2.1169763692.1665519683; _gat_UA-89039721-1=1; _fbp=fb.1.1665519682781.2060814506; wpml_browser_redirect_test=0' \
            -H 'referer: https://us4bg.org/our-projects/?project_year=all&project_area=all&project_search=' \
            -H 'sec-ch-ua: "Not;A=Brand";v="99", "Chromium";v="106"' \
            -H 'sec-ch-ua-mobile: ?0' \
            -H 'sec-ch-ua-platform: "Linux"' \
            -H 'sec-fetch-dest: empty' \
            -H 'sec-fetch-mode: cors' \
            -H 'sec-fetch-site: same-origin' \
            -H 'user-agent: Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36' \
            -H 'x-requested-with: XMLHttpRequest' \
            --compressed \
            2>/dev/null
    ''', shell=True)
    return json.loads(body)


def parse(html):

    dom = etree.HTML(html)
    if dom is None:
        return None
    dates = dom.xpath('//div[contains(@class, "us4bg-project-area-title")]/p/text()')[::2]
    categories = dom.xpath('//div[contains(@class, "us4bg-project-area-title")]/p/text()')[1::2]
    grantees = dom.xpath('//div[@class="col-md-9"]//span[2]/text()')
    projects = dom.xpath('//div[@class="col-md-9"]//span[1]/text()')
    amounts = dom.xpath('//div[@class="col-md-3"]//span/text()')
    logos = dom.xpath('//img[not(@class)]/@src')
    urls = [f'https://us4bg.org/{href}' for href in etree.HTML(html).xpath('//article/a/@href')]
    xs = pd.DataFrame({
        'Date': dates,
        'Category': categories,
        'Grantee': grantees,
        'Project': projects,
        'Amount': amounts,
        'Logo': logos,
        'URL': urls,
    })
    
    return xs

In [2]:
from itertools import count


xs = pd.DataFrame()
for i in count(1):
    resp = page(i)
    html = resp['data']['html']
    ys = parse(html)
    if ys is None:
        break
    xs = pd.concat([xs, ys])


KeyboardInterrupt



In [None]:
def f(s):
    if s.startswith('BGN'):
        factor = 1
    elif s.startswith('EUR'):
        factor = 1.95583
    elif s.startswith('USD'):
        factor = 2.0153181  # as of 2022-10-12
    else:
        raise ValueError(s)
    stripped = s[4:]
    sanitized = stripped.replace(',', '')
    parsed = int(sanitized)
    ans = parsed * factor
    return ans
xs.Amount = xs.Amount.map(f)

In [21]:
# Now I need to cherry-pick media projects.
columns = ['Grantee', 'Project']
keywords = ['media', 'journalis']
sensitive = ['TV']

zs = pd.DataFrame()
for c in columns:
    for k in keywords:
        zs = pd.concat([
            zs,
            xs[xs[c].str.lower().str.contains(k)],
        ])
    for k in sensitive:
        zs = pd.concat([
            zs,
            xs[xs[c].str.contains(k)],
        ])
zs = zs.drop_duplicates()

In [22]:
aliases = {
    'Economedia AD (Capital Weekly)': 'Economedia',
    'Economedia AD': 'Economedia',
    'Economedia AD (capital.bg and dnevnik.bg)': 'Economedia',
    'Info Space Foundation': 'Info Space',
    'Info Space Foundation (mediapool.bg)': 'Info Space',
    'InfoSpace Foundation': 'Info Space',
    'Mediapool.bg (InfoSpace Foundation)': 'Info Space',
    'ProVeritas Association': 'ProVeritas',
    'Pro Veritas Foundation': 'ProVeritas',
    'Association of European Journalists – Bulgaria': 'AEJ Bulgaria',
    'Association of European Journalists - Bulgaria': 'AEJ Bulgaria',
    'Bulgarion Association': 'Bulgarian Association',  # LOL
    'OffMedia Foundation': 'Off Media',
    'Off Media Foundation': 'Off Media',
    'Reduta.bg Association': 'Reduta.bg',
    'Reduta.bg Foundation': 'Reduta.bg',
    'bTV Media Group': 'bTV',
    'bTV Media Group EAD': 'bTV',
    'bTV Media Group, Documentary Unit': 'bTV',
    'Vagabond Media Ltd': 'Vagabond',
    'Vagabond Media Ltd.': 'Vagabond',
    '"Project Yes" Association': 'Project Yes',
    'Criticism and Humanism Association': 'Площад Славейков',
    'Culture Association': 'Площад Славейков',
}
zs.Grantee = zs.Grantee.str.strip().map(lambda x: aliases.get(x, x))

aliases = {
}
zs.Project = zs.Project.str.strip().map(lambda x: aliases.get(x, x))

# Manually drop some wrongly picked grants.
zs = zs[zs.Grantee != 'Professional Association of Mediators in Bulgaria']
zs = zs[zs.Grantee != 'Center for Dispute Resolution']

zs = zs.sort_values('Grantee').reset_index().drop('index', axis=1)

In [26]:
zs.to_csv('/tmp/afb.csv')

In [44]:
#########################

In [64]:
# TODO: At this point websites should be added manually!
zs = pd.read_csv('afb.csv', index_col='index')

In [65]:
import numpy as np

In [66]:
zs = zs.dropna()
zs.Amount = np.round(zs.Amount).astype(int)

In [67]:
months = {
    'January': 'януари',
    'February': 'февруари',
    'March': 'март',
    'April': 'април',
    'May': 'май',
    'June': 'юни',
    'July': 'юли',
    'August': 'август',
    'September': 'септември',
    'October': 'октомври',
    'November': 'ноември',
    'December': 'декември',
}

In [70]:
ans = {}

for (websites, row) in zs.set_index('Websites').iterrows():
    for website in websites.split():
        ans.setdefault(website, [])
        value = {k.lower(): v for (k, v) in row.to_dict().items()}
        value['grantor'] = 'Америка за България'
        value['date'] = value['date'].replace('\u2013', '-')
        for (e, b) in months.items():
            value['date'] = value['date'].replace(e, b)
        del value['category']
        del value['project']
        del value['logo']
        ans[website].append(value)

In [71]:
print(json.dumps(ans, indent=2))

{
  "aej-bulgaria.org": [
    {
      "date": "\u0430\u043f\u0440\u0438\u043b 2014 - \u0434\u0435\u043a\u0435\u043c\u0432\u0440\u0438 2015",
      "grantee": "AEJ Bulgaria",
      "amount": 178500,
      "url": "https://us4bg.org//our-projects/?project_id=668&project_year=all&project_area=all&project_search=",
      "grantor": "\u0410\u043c\u0435\u0440\u0438\u043a\u0430 \u0437\u0430 \u0411\u044a\u043b\u0433\u0430\u0440\u0438\u044f"
    },
    {
      "date": "\u043d\u043e\u0435\u043c\u0432\u0440\u0438 2016 - \u0444\u0435\u0432\u0440\u0443\u0430\u0440\u0438 2019",
      "grantee": "AEJ Bulgaria",
      "amount": 363420,
      "url": "https://us4bg.org//our-projects/?project_id=1720&project_year=all&project_area=all&project_search=",
      "grantor": "\u0410\u043c\u0435\u0440\u0438\u043a\u0430 \u0437\u0430 \u0411\u044a\u043b\u0433\u0430\u0440\u0438\u044f"
    },
    {
      "date": "\u044e\u043d\u0438 2018 - \u043e\u043a\u0442\u043e\u043c\u0432\u0440\u0438 2019",
      "grantee": "AEJ Bu