In [1]:
import ast
from pathlib import Path

import dataset
import pycountry
import requests
from cleantext import clean
from tqdm import tqdm

In [2]:
import gettext

# Load the German catalogue of the 'iso3166' domain from the locale directory
# that ships with pycountry.
german = gettext.translation(
    domain='iso3166',
    localedir=pycountry.LOCALES_DIR,
    languages=['de'],
)
# install() makes the translation function available as the builtin `_`.
german.install()
# Usage example: _("Germany")

In [3]:
# Credentials for the geocoding service: every whitespace-separated token of
# secrets.txt except the first one.
# NOTE(review): presumably the file holds "<label> <user> <password>" so that
# this yields a (user, password) pair -- confirm against the real file.
auth = tuple(
    Path('secrets.txt')
    .read_text()
    .split()[1:]
)

In [4]:
# Collect the rows of every scraper database found in data/ into three
# in-memory lists, one per source table.
all_incidents = []
all_src = []
all_chronicle = []

for p in Path('data').glob('*.db'):
    print(p)
    try:
        db = dataset.connect('sqlite:///' + str(p))
        # Read all three tables completely before touching the accumulators,
        # so a failure half-way through never leaves a partially imported file
        # (e.g. incidents without their sources).
        incidents = list(db['incidents'].all())
        sources = list(db['sources'].all())
        chronicles = list(db['chronicles'].all())
    except Exception as exc:
        # Still best effort (a broken file must not stop the import), but say
        # which file failed and why, and let KeyboardInterrupt/SystemExit
        # propagate instead of swallowing them with a bare `except:`.
        print(f'error reading {p}: {exc!r}')
        continue

    all_incidents += incidents
    all_src += sources
    all_chronicle += chronicles

data/mobile-opferberatung-scraper.db
data/raa-sachsen-scraper.db
data/opferperspektive-scraper.db


In [5]:
# Write the collected rows into the combined database.
# NOTE(review): insert_many() adds rows, so running this cell against an
# already populated rechtegewalt.db presumably duplicates them -- confirm the
# file is removed before a full re-run.
db = dataset.connect('sqlite:///rechtegewalt.db')

tab_incidents = db['incidents']
tab_incidents.insert_many(all_incidents)

tab_src = db['sources']
tab_src.insert_many(all_src)

# The source databases call this table 'chronicles'; here it is 'chronicle'.
tab_chro = db['chronicle']
tab_chro.insert_many(all_chronicle)

# Index the columns used for lookups and updates.
for table, column in (
    (tab_incidents, 'rg_id'),
    (tab_src, 'rg_id'),
    (tab_incidents, 'id'),
):
    table.create_index([column])

In [6]:
def construct_location_string(row):
    """Build the comma-separated location string for one incident row.

    The incident's own subdivisions come first, followed by the name of the
    ISO 3166-2 subdivision recorded for the incident's chronicle and the
    translated name of that subdivision's country.

    Args:
        row: Mapping with a 'subdivisions' entry (the string form of a Python
            list whose items are either strings or sequences whose second
            element is the place name) and a 'chronicler_name' entry.

    Returns:
        The parts joined with ', '.

    Raises:
        ValueError, SyntaxError: If 'subdivisions' is not a plain Python
            literal.
        LookupError: If no chronicle matches 'chronicler_name' or its
            ISO 3166-2 code is unknown to pycountry.
    """
    # SECURITY: this used to be eval(). The value originates from scraped web
    # pages, so it is parsed as a literal only; anything that is not a plain
    # literal is rejected instead of being executed.
    sub_array = ast.literal_eval(row['subdivisions'])

    chro = tab_chro.find_one(chronicler_name=row['chronicler_name'])
    if chro is None:
        raise LookupError(
            'no chronicle found for chronicler_name=%r' % (row['chronicler_name'],)
        )

    sub = pycountry.subdivisions.get(code=chro['iso3166_2'])
    if sub is None:
        raise LookupError('unknown ISO 3166-2 code %r' % (chro['iso3166_2'],))

    # Non-string entries are pairs; the place name is their second element.
    names = [x if isinstance(x, str) else x[1] for x in sub_array]
    # `_` is the gettext translation function installed as a builtin.
    names += [sub.name, _(sub.country.name)]

    return ', '.join(names)

In [7]:
def clean_subdivisions(sub):
    """Normalise the text of a stringified subdivisions list.

    Args:
        sub: String form of a Python list whose items are either strings or
            sequences of strings.

    Returns:
        The string form of a list of lists: a plain string item becomes
        ['', cleaned_text], a sequence item becomes the list of its cleaned
        elements.

    Raises:
        ValueError, SyntaxError: If `sub` is not a plain Python literal.
    """
    def cl(x):
        return clean(x, lang='de', lower=False)

    cleaned = []
    # SECURITY: this used to be eval(). The value originates from scraped web
    # pages, so it is parsed as a literal only and never executed.
    for entry in ast.literal_eval(sub):
        if isinstance(entry, str):
            # Pad bare strings to the same two-element shape as the pairs.
            cleaned.append(['', cl(entry)])
        else:
            cleaned.append([cl(part) for part in entry])
    return str(cleaned)

In [8]:
# Clean the location text of every incident (it still contained some errors)
# and derive its location string.
for incident in tqdm(tab_incidents.all()):
    cleaned = clean_subdivisions(incident['subdivisions'])
    incident['subdivisions'] = cleaned
    # Built from the cleaned subdivisions stored on the row just above.
    incident['location_string'] = construct_location_string(incident)
    tab_incidents.update(incident, ['id'])

8236it [00:33, 246.87it/s]


In [9]:
# Every distinct location string that occurs in the incidents table.
subs = [
    row['location_string']
    for row in tab_incidents.distinct('location_string')
]

In [11]:
# Geocode all distinct location strings with a single request.
# NOTE(review): no timeout is passed, so a stalled server would presumably
# block this cell indefinitely -- consider adding one.
payload = {'locations': [{'location': x} for x in subs]}
r = requests.post('https://geocode.app.vis.one/', auth=auth, json=payload)
r.raise_for_status()
subs_location = r.json()['locations']

In [12]:
# Split the geocoder results: usable ones are collected for the locations
# table, the rest cause their incidents to be deleted.
good_locations = []

for x in subs_location:
    # An entry carrying nothing but its 'location' key has no geocoding data
    # (presumably the service found no match -- confirm against its API).
    if len(x) == 1:
        print('error here, deleting for now')
        print(x)
        tab_incidents.delete(location_string=x['location'])
    else:
        # Rename 'location' -> 'location_string' on a copy. Popping the key
        # from the entry itself mutated subs_location, so running this cell a
        # second time failed with KeyError: 'location'.
        renamed = dict(x)
        renamed['location_string'] = renamed.pop('location')
        good_locations.append(renamed)

error here, deleting for now
{'location': 'Bon Courage fassungslos über Naziübergriffe auf dem Bornaer Stadtfest, Leipzig, Sachsen, Deutschland'}


In [13]:
# Store the successfully geocoded locations in their own table.
tab_loc = db['locations']
tab_loc.insert_many(good_locations)

In [14]:
# Index the columns used to look up and update location rows.
for column in ('location_string', 'id'):
    tab_loc.create_index([column])

In [15]:
# Not really sure whether a separate table for locations is needed. It was
# introduced because in some cases multiple locations are associated with an
# incident.
for incident in tqdm(tab_incidents.all()):
    location = tab_loc.find_one(location_string=incident['location_string'])

    # Copy the location columns onto the incident; on key clashes (including
    # 'id') the incident's own values win.
    combined = dict(location)
    combined.update(incident)
    tab_incidents.update(combined, ['id'])

    # Record the incident's subdivisions on its location row as well.
    location['subdivisions'] = incident['subdivisions']
    tab_loc.update(location, ['id'])

8235it [00:49, 165.61it/s]
