In [1]:
%load_ext lab_black

In [24]:
from pathlib import Path
import json
import gettext

from cleantext import clean
import dataset
import requests
import pycountry
from tqdm import tqdm
from datetime import datetime

In [25]:
german = gettext.translation("iso3166", pycountry.LOCALES_DIR, languages=["de"])
german.install()
# _("Germany")

In [26]:
with open("regions.json") as json_file:
    regions = json.load(json_file)

In [27]:
manual_fixes = {
    "ca0dee4be1029c2ab24cdea755b88086": {"city": "Halle (Saale)"},
    "https://www.raa-sachsen.de/support/chronik/vorfaelle/bon-courage-fassungslos-ueber-naziuebergriffe-auf-dem-bornaer-stadtfest-3219": {
        "city": "Borna"
    },
    "a10d62daa23594df92c9704e1606dcfc": {
        "city": "Oranienbaum-Wörlitz",
        "county": "Wittenberg",
    },
    "000dad6aace24b3d8ebae940a8de8e72": {"city": "Dessau"},
    "https://www.raa-sachsen.de/support/chronik/vorfaelle/goerlitz-2779": {
        "date": datetime(2010, 11, 14)
    },
    "https://www.raa-sachsen.de/support/chronik/vorfaelle/leipzig-reudnitz-4976": {
        "date": datetime(2020, 11, 13)
    },
}

In [28]:
regions = list(regions.values())

In [29]:
regions_counties = [x for x in regions if x["level"] == 3]

In [30]:
def is_valid_county(county):
    return (
        len(
            [
                x
                for x in regions_counties
                if x["name"].startswith(county)
                and x["duration"]["until"] == "2019-12-31T00:00:00.000Z"
            ]
        )
        != 0
    )

In [31]:
auth = tuple(Path("secrets.txt").read_text().split()[1:])

In [32]:
all_incidents = []
all_src = []
all_chronicle = []

for p in Path("data").glob("*.db"):
    print(p)
    try:
        db = dataset.connect("sqlite:///" + str(p))
        all_incidents += db["incidents"].all()
        all_src += db["sources"].all()
        all_chronicle += db["chronicle"].all()

    except:
        print("error")

data/mobile-opferberatung-scraper.db
data/raa-sachsen-scraper.db
data/opferperspektive-scraper.db


In [33]:
for x in all_chronicle:
    if "region" in x and len(x["region"]) > 0:
        continue
    if "iso3166_2" in x and x["iso3166_2"] != None and len(x["iso3166_2"]) > 0:
        x["region"] = pycountry.subdivisions.get(code=x["iso3166_2"]).name
    elif "iso3166_1" in x and x["iso3166_1"] != None and len(x["iso3166_1"]) > 0:
        x["region"] = pycountry.countries.get(alpha_2=x["iso3166_1"]).name
    else:
        raise ValueError("Need to specify region somehow")

In [34]:
db = dataset.connect("sqlite:///rechtegewalt.db")

tab_incidents = db["incidents"]
tab_incidents.insert_many(all_incidents)

tab_src = db["sources"]
tab_src.insert_many(all_src)

tab_chro = db["chronicles"]
tab_chro.insert_many(all_chronicle)

tab_incidents.create_index(["rg_id"])
tab_src.create_index(["rg_id"])

tab_incidents.create_index(["id"])

In [35]:
def add_state_country(row):
    chro = tab_chro.find_one(chronicler_name=row["chronicler_name"])
    sub = pycountry.subdivisions.get(code=chro["iso3166_2"])
    row["state"] = sub.name
    row["country"] = _(sub.country.name)
    return row

In [36]:
county_words = ["Landkreis", "Landkeis", "Kreis", "LK"]


def clean_county(x):
    if x is None or x == "None":
        return None
    x = clean_string(x)

    for w in county_words:
        w += " "
        if x.startswith(w):
            x = x[len(w) :]

    if not is_valid_county(x):
        print("removing", x)
        return None
    return x


def clean_city(x):
    x = clean_string(x)
    if x is None:
        return None
    assert len(x) > 0
    return x


def clean_string(x):
    x = clean(x, lang="de", lower=False)
    if len(x) == 0:
        return None
    return x

In [37]:
list(db.query('SELECT * FROM incidents where city=""'))

[OrderedDict([('id', 25980),
              ('description',
               'Zwei Männer beschimpfen gegen 19 Uhr in einer Straßenbahn an der Vogelweide einen 35-jährigen Mann aus Libyen. Als dieser am Marktplatz umsteigen will, schlagen die Unbekannten ihm gegen den Oberkörper. Dann versucht der 35-Jährige gleichzeitig mit den Angreifern in eine andere Bahn einzusteigen, wird aber von den beiden aus dem Fahrzeug gestoßen. Der Betroffene bleibt unverletzt. Der polizeiliche Staatsschutz ermittelt.'),
              ('date', '2015-09-22 00:00:00.000000'),
              ('url',
               'http://www.mobile-opferberatung.de/monitoring/chronik2015/'),
              ('rg_id', 'ca0dee4be1029c2ab24cdea755b88086'),
              ('city', ''),
              ('county', None),
              ('chronicler_name', 'Mobile Opferberatung'),
              ('title', None),
              ('orig_county', None),
              ('orig_city', None),
              ('state', None),
              ('country', Non

In [38]:
def fill_missing_county():
    statement = "SELECT * FROM incidents GROUP BY city, state having count(*) > 1"
    for row in db.query(statement):
        dupli = list(tab_incidents.find(city=row["city"], state=row["state"]))
        county_can = []
        contains_none = False
        for d in dupli:
            if d["county"] is not None:
                county_can.append(d["county"])
            else:
                contains_none = True

        unique_can = list(set(county_can))
        if contains_none and len(unique_can) == 1:
            tab_incidents.update(
                {"city": row["city"], "county": unique_can[0]}, ["city"]
            )
            print(unique_can)

In [None]:
# clean location text because there were still some errors
for x in tqdm(tab_incidents.all()):
    if x["rg_id"] in manual_fixes:
        x = {**x, **manual_fixes[x["rg_id"]]}

    x["orig_county"] = clean_string(x["county"])
    x["orig_city"] = clean_string(x["city"])

    x["county"] = clean_county(x["county"])
    x["city"] = clean_string(x["city"])

    x = add_state_country(x)

    if x["date"] is None:
        print(x)
        raise ValueError

    if x["date"].year < 1990:
        tab_incidents.delete(id=x["id"])

    #   manual fix
    if x["city"] == "Zerbst" and x["state"] == "Sachsen-Anhalt":
        x["city"] = "Zerbst/Anhalt"

    tab_incidents.update(x, ["id"])

4767it [00:08, 545.19it/s]

In [None]:
# fill_missing_county()

In [None]:
auth

In [21]:
def geocode_all():
    statement = "SELECT DISTINCT city, county, state, country FROM incidents"
    subs = list(db.query(statement))
    r = requests.post(
        "https://geocode.app.vis.one/",
        auth=auth,
        json={"provider": "here", "locations": [{"query": dict(x)} for x in subs]},
    )
    r.raise_for_status()
    subs_location = r.json()["locations"]

    return subs_location

In [22]:
subs_location = geocode_all()

TypeError: 'tuple' object is not callable

In [None]:
subs_location[0]

In [None]:
def geocode_second(subs_location):
    second_check = []
    second_check_ids = []

    for i, x in enumerate(subs_location):
        if len(x) == 1 and 'county' in x['query'] and x['query']['county'] is not None:
            x['query']['city'] = x['query']['city'] + ', ' + x['query']['county']
            x['query']['county'] = None
            second_check.append(x)
            second_check_ids.append(i)
            
    print(len(second_check))
    if len(second_check) == 0:
        return subs_location
    r = requests.post('https://geocode.app.vis.one/', auth=auth, json={'provider': 'here', 'locations': [{'query': x['query']} for x in second_check]})
    r.raise_for_status()
    
    for i, x in enumerate(r.json()['locations']):
        if len(x) != 1:
            subs_location[second_check_ids[i]] = x
            print('found!', x)
    return subs_location

In [None]:
# subs_location = geocode_second(subs_location)

In [None]:
good_locations = []

for x in subs_location:
    if len(x) == 1:
        print('error here, deleting for now', x)
#         tab_incidents.delete(**x['query'])
    else:
#         rename
        query = x.pop('query')
        x['query_county'] = query['county']
        x['query_city'] = query['city']
        good_locations.append(x)

In [None]:
tab_loc = db['locations']
tab_loc.insert_many(good_locations)

In [None]:
tab_loc.create_index(['id'])

In [None]:
# Not really sure whether a seperate table for location is needed. It was introduced because in some cases, multiple locations are associated with incident.
for x in tqdm(tab_incidents.all()):
    x_query = {name: x[name] for name in ['state', 'country']}
    row_loc = tab_loc.find_one(query_county=x['orig_county'], query_city=x['orig_city'], **x_query)
    if row_loc is None:
        row_loc = tab_loc.find_one(query_county=x['county'], query_city=x['orig_city'], **x_query)
        if row_loc is None:
            row_loc = tab_loc.find_one(query_county=x['orig_county'], query_city=x['city'], **x_query)
            if row_loc is None:
                row_loc = tab_loc.find_one(query_county=x['county'], query_city=x['city'], **x_query)
                if row_loc is None:
                    print(x)
                    print(x_query)
                    continue
    row_loc_geo = {name: row_loc[name] for name in ['latitude', 'longitude', 'postal_code', 'street', 'house_number', 'district', 'city', 'county', 'state', 'country']}
    merged = {**x, **row_loc_geo}
    tab_incidents.update(merged, ['id'])

In [None]:
final_loc =list(tab_loc.distinct(*['latitude', 'longitude', 'postal_code', 'street', 'house_number', 'district', 'city', 'county', 'state', 'country']))

In [None]:
len(final_loc)

In [None]:
tab_loc.drop()

In [None]:
# tab_loc_final = db['locations_final']
tab_loc.insert_many(final_loc)

In [None]:
len(list(tab_loc.all()))