# Combine and Clean Data Sources

This notebook is used to combine all data into a single SQLite database. The cleaning evolved heavily, so at some point all of this cleaning should be refactored into a separate module.

In [None]:
%load_ext lab_black

In [None]:
from pathlib import Path
import json
import gettext

from cleantext import clean
import dataset
import requests
import pycountry
from tqdm import tqdm
from datetime import date, datetime

In [None]:
# Load the German ("de") translation catalogue for pycountry's ISO 3166 names
# and install it, which makes the `_()` translation function globally available.
german = gettext.translation("iso3166", pycountry.LOCALES_DIR, languages=["de"])
german.install()

## Manual Fixes

There are sometimes errors in the data. Since each incident has a unique id, fix them here.

In [None]:
# Manual corrections, keyed by the incident's `rg_id`. Each value holds the
# fields whose values replace the ones stored for that incident.
manual_fixes = {
    "ca0dee4be1029c2ab24cdea755b88086": {"city": "Halle (Saale)"},
    "https://www.raa-sachsen.de/support/chronik/vorfaelle/bon-courage-fassungslos-ueber-naziuebergriffe-auf-dem-bornaer-stadtfest-3219": {
        "city": "Borna"
    },
    "mobile-beratung-a10d62daa23594df92c9704e1606dcfc": {
        # Was stored as mojibake (UTF-8 bytes decoded as Latin-1).
        "city": "Oranienbaum-Wörlitz",
        "county": "Wittenberg",
    },
    "mobile-beratung-000dad6aace24b3d8ebae940a8de8e72": {"city": "Dessau"},
    "https://www.raa-sachsen.de/support/chronik/vorfaelle/goerlitz-2779": {
        "date": date(2010, 11, 14)
    },
    "https://www.raa-sachsen.de/support/chronik/vorfaelle/leipzig-reudnitz-4976": {
        "date": date(2020, 11, 13)
    },
}

## Valid Regions

`regions.json` is taken from <https://github.com/datenguide/metadata>.

This file contains a list of all valid regions. If a region is not in there, it is filtered out. This greatly improves the performance of the geocoding API. It is not optimal that we actually throw this information away; it should be kept. (TODO)

In [None]:
# Load the region metadata. The encoding is given explicitly so that decoding
# does not depend on the platform's default encoding (German region names may
# contain non-ASCII characters such as umlauts).
with open("regions.json", encoding="utf-8") as json_file:
    regions = json.load(json_file)

# Only the region records are needed, not the keys they are stored under.
regions = list(regions.values())
# Entries with level 3 are the candidates checked by `is_valid_county`.
regions_counties = [x for x in regions if x["level"] == 3]


def is_valid_county(county):
    """Return whether `county` is a name prefix of a known county region.

    A region in `regions_counties` matches when its name starts with `county`
    and its `duration.until` equals "2019-12-31T00:00:00.000Z".
    NOTE(review): that timestamp presumably marks regions that are still
    valid in the metadata snapshot - confirm against regions.json.

    Args:
        county: County name (or name prefix) to look up; may be None or empty.

    Returns:
        True if at least one region matches, otherwise False. None and the
        empty string are never valid.
    """
    # An empty prefix would match every region and None would make
    # `str.startswith` raise, so neither counts as a valid county.
    if not county:
        return False
    return any(
        region["name"].startswith(county)
        and region["duration"]["until"] == "2019-12-31T00:00:00.000Z"
        for region in regions_counties
    )

Read in some secrets that are needed later to communicate with an internal API.

In [None]:
# Credentials for the internal API: every whitespace-separated token of
# secrets.txt except the first one.
secret_tokens = Path("secrets.txt").read_text().split()
auth = tuple(secret_tokens[1:])

In [None]:
all_incidents = []
all_src = []
all_chronicle = []

# Collect the rows of every SQLite database found directly in data/.
for db_path in Path("data").glob("*.db"):
    print(db_path)
    db = dataset.connect(f"sqlite:///{db_path}")
    all_incidents.extend(db["incidents"].all())
    all_src.extend(db["sources"].all())

    # Try the "chronicle" table first and fall back to "chronicles" when it
    # yields no rows.
    chronicle_rows = list(db["chronicle"].all())
    if not chronicle_rows:
        chronicle_rows = db["chronicles"].all()

    all_chronicle.extend(chronicle_rows)

In [None]:
# Make sure every chronicle row has a non-empty "region" name, deriving it
# from the ISO 3166-2 subdivision code or, failing that, the ISO 3166-1
# country code.
for x in all_chronicle:
    # Truthiness covers a missing key, a None value and an empty string alike;
    # calling len() on a None region would raise a TypeError.
    if x.get("region"):
        continue
    if x.get("iso3166_2"):
        subdivision = pycountry.subdivisions.get(code=x["iso3166_2"])
        if subdivision is None:
            raise ValueError(f"Unknown ISO 3166-2 code: {x['iso3166_2']!r}")
        x["region"] = subdivision.name
    elif x.get("iso3166_1"):
        country = pycountry.countries.get(alpha_2=x["iso3166_1"])
        if country is None:
            raise ValueError(f"Unknown ISO 3166-1 code: {x['iso3166_1']!r}")
        x["region"] = country.name
    else:
        raise ValueError("Need to specify region somehow")

In [None]:
# only using date (and without time (hour/minute)) for now


def ensure_date(x):
    """Reduce a date-like value to a plain `date`.

    Args:
        x: None, a `datetime` or a `date`.

    Returns:
        None if `x` is None, the date part of a `datetime`, or `x` itself if
        it already is a `date`.

    Raises:
        ValueError: If `x` is neither None, a `date` nor a `datetime`.
    """
    if x is None:
        return x
    # datetime is a subclass of date, so it has to be checked first.
    if isinstance(x, datetime):
        return x.date()
    if isinstance(x, date):
        return x
    # The exception used to be constructed but never raised, which silently
    # turned unsupported values into None.
    raise ValueError("neither date nor datetime")


# Normalise the "date" of every source row; rows without the key get None.
for row in all_src:
    row["date"] = ensure_date(row["date"]) if "date" in row else None

In [None]:
# Connect to the combined output database; the tables below are written to it.
db = dataset.connect("sqlite:///rechtegewalt.db")

In [None]:
def _insert_without_ids(table, rows):
    """Remove any "id" key from `rows`, bulk-insert them and return `table`."""
    for row in rows:
        row.pop("id", None)
    table.insert_many(rows)
    return table


tab_incidents = _insert_without_ids(db["incidents"], all_incidents)
tab_src = _insert_without_ids(db["sources"], all_src)
tab_chro = _insert_without_ids(db["chronicles"], all_chronicle)

# Incidents and sources are both indexed on rg_id.
tab_incidents.create_index(["rg_id"])
tab_src.create_index(["rg_id"])

# tab_incidents.create_index(["id"])

In [None]:
def add_state_country(row):
    """Set "country" (and "state" where known) on an incident row.

    The chronicle with the row's `chronicler_name` is looked up. Without an
    ISO 3166-2 code on that chronicle only the country is set, to
    "Deutschland". Otherwise the state is the subdivision's name and the
    country is the subdivision's country name passed through `_()`.

    Args:
        row: Incident row (mapping) with a "chronicler_name" key; modified
            in place.

    Returns:
        The same `row` object.
    """
    chronicle = tab_chro.find_one(chronicler_name=row["chronicler_name"])
    has_subdivision = (
        "iso3166_2" in chronicle and chronicle["iso3166_2"] is not None
    )

    if has_subdivision:
        subdivision = pycountry.subdivisions.get(code=chronicle["iso3166_2"])
        assert subdivision is not None
        row["state"] = subdivision.name
        row["country"] = _(subdivision.country.name)
    else:
        row["country"] = "Deutschland"

    return row

In [None]:
# Leading words that are stripped from county names. "Landkeis" is a
# misspelling that is presumably present in the source data.
county_words = ["Landkreis", "Landkeis", "Kreis", "LK"]


def clean_county(x):
    """Clean a county name and validate it against the known counties.

    Args:
        x: Raw county text; may be None or the literal string "None".

    Returns:
        The cleaned name with any leading county word removed, or None if the
        input is missing, empty after cleaning, or rejected by
        `is_valid_county`.
    """
    if x is None or x == "None":
        return None
    x = clean_string(x)
    # clean_string returns None when nothing is left after cleaning; without
    # this guard the startswith call below would raise.
    if x is None:
        return None

    for w in county_words:
        prefix = w + " "
        if x.startswith(prefix):
            x = x[len(prefix) :]

    if not is_valid_county(x):
        print("removing", x)
        return None
    return x


def clean_city(x):
    """Return the cleaned city name, or None if cleaning leaves nothing."""
    cleaned = clean_string(x)
    if cleaned is not None:
        assert len(cleaned) > 0
    return cleaned


def clean_string(x):
    """Clean `x` with cleantext (German, case preserved).

    Returns:
        The cleaned text, or None when the cleaned text is empty.
    """
    cleaned = clean(x, lang="de", lower=False)
    return cleaned if len(cleaned) != 0 else None

In [None]:
def fill_missing_county():
    """Fill missing counties from other incidents in the same city and state.

    For every (city, state) combination that occurs more than once, the
    counties of its incidents are collected. If at least one incident has no
    county and all the others agree on exactly one, that county is written to
    the incidents of that city *and* state.
    """
    statement = "SELECT * FROM incidents GROUP BY city, state having count(*) > 1"
    for row in db.query(statement):
        counties = [
            d["county"]
            for d in tab_incidents.find(city=row["city"], state=row["state"])
        ]
        known_counties = {c for c in counties if c is not None}
        contains_none = any(c is None for c in counties)

        if contains_none and len(known_counties) == 1:
            (county,) = known_counties
            # Key on the state as well: cities with the same name in another
            # state must not receive this county.
            tab_incidents.update(
                {"city": row["city"], "state": row["state"], "county": county},
                ["city", "state"],
            )
            print([county])

In [None]:
# clean location text because there were still some errors
# The rows are materialised up front because the table is updated and rows are
# deleted inside the loop.
for x in tqdm(list(tab_incidents.all())):
    # Apply the hand-written corrections for this incident, if any.
    if x["rg_id"] in manual_fixes:
        x = {**x, **manual_fixes[x["rg_id"]]}

    if "county" not in x:
        x["county"] = None

    # Keep the cleaned value under orig_* as well; the merge step further down
    # matches locations against these.
    x["county"] = x["orig_county"] = clean_string(x["county"])
    x["city"] = x["orig_city"] = clean_string(x["city"])

    if "address" in x and x["address"] is not None:
        x["address"] = x["orig_address"] = clean_string(x["address"])

    x = add_state_country(x)

    if x["date"] is None:
        print("date is broken, skipping")
        print(x)
        tab_incidents.delete(id=x["id"])
        continue

    # ignore older data
    if x["date"].year < 2000:
        tab_incidents.delete(id=x["id"])
        # The row is gone, so there is nothing left to update.
        continue

    x["date"] = ensure_date(x["date"])

    #   manual fix
    # add_state_country does not set "state" for chronicles without an
    # ISO 3166-2 code, hence .get().
    if x["city"] == "Zerbst" and x.get("state") == "Sachsen-Anhalt":
        x["city"] = "Zerbst/Anhalt"

    tab_incidents.update(x, ["id"])

In [None]:
# fill_missing_county()

In [None]:
def geocode_all():
    """Geocode every distinct incident location through the internal API.

    Makes sure the location columns exist on the incidents table, sends all
    distinct (address, city, district, county, state, country) combinations
    to the geocoding service in a single request and returns the "locations"
    list of the response.

    Returns:
        The list under "locations" in the JSON response, with the county
        "Leipzig" restored in the "query" of the entries it was removed from.

    Raises:
        requests.HTTPError: If the service answers with an error status.
    """
    tab_incidents.create_column("district", db.types.text)
    optional_columns = (
        ("address", db.types.text),
        ("state", db.types.text),
        ("longitude", db.types.float),
        ("latitude", db.types.float),
    )
    for column_name, column_type in optional_columns:
        if column_name not in tab_incidents.columns:
            tab_incidents.create_column(column_name, column_type)

    statement = (
        "SELECT DISTINCT address, city, district, county, state, country FROM incidents"
    )
    subs = list(db.query(statement))

    # The geocoding API has problems with Leipzig as county (there is a
    # Landkreis Leipzig and a separate city of Leipzig), so the county is
    # blanked for the request and remembered together with its position.
    removed_counties_with_ids = []
    for index, location in enumerate(subs):
        if location["state"] is None:
            print(location)
        if location["county"] == "Leipzig":
            removed_counties_with_ids.append([location.pop("county"), index])
            location["county"] = None

    payload = {
        "provider": "here",
        "locations": [{"query": dict(location)} for location in subs],
    }
    response = requests.post("https://geocode.app.vis.one/", auth=auth, json=payload)
    response.raise_for_status()
    subs_location = response.json()["locations"]

    # add back county since it was correct
    for county, index in removed_counties_with_ids:
        subs_location[index]["query"]["county"] = county

    return subs_location

In [None]:
# Geocode all distinct incident locations (a single request to the internal API).
subs_location = geocode_all()

In [None]:
def geocode_second(subs_location):
    """Retry failed lookups with the county folded into the city name.

    An entry with exactly one key is treated as not geocoded. If its query
    has a county, the city becomes "<city>, <county>", the county is cleared
    and the query is sent to the geocoding API again. Entries for which the
    retry returns more than one key replace the originals in `subs_location`.

    Args:
        subs_location: List of geocoding results; modified in place.

    Returns:
        The same `subs_location` list.

    Raises:
        requests.HTTPError: If the service answers with an error status.
    """
    retry_entries = []
    retry_indices = []

    for index, entry in enumerate(subs_location):
        if len(entry) != 1:
            continue
        query = entry["query"]
        if "county" not in query or query["county"] is None:
            continue
        query["city"] = query["city"] + ", " + query["county"]
        query["county"] = None
        retry_entries.append(entry)
        retry_indices.append(index)

    print(len(retry_entries))
    if len(retry_entries) == 0:
        return subs_location

    response = requests.post(
        "https://geocode.app.vis.one/",
        auth=auth,
        json={
            "provider": "here",
            "locations": [{"query": entry["query"]} for entry in retry_entries],
        },
    )
    response.raise_for_status()

    for position, result in enumerate(response.json()["locations"]):
        if len(result) != 1:
            subs_location[retry_indices[position]] = result
            print("found!", result)
    return subs_location

In [None]:
# not used right now?
# subs_location = geocode_second(subs_location)

In [None]:
good_locations = []

for location in subs_location:
    if len(location) == 1:
        # could not geocode these locations
        print("error here, deleting for now", location)
        # tab_incidents.delete(**x['query'])
        continue

    # Replace the nested query with flat query_* keys.
    query = location.pop("query")
    for field in ("county", "city", "address"):
        location["query_" + field] = query[field]
    good_locations.append(location)

In [None]:
# Store the successfully geocoded locations in the "locations" table.
tab_loc = db["locations"]
tab_loc.insert_many(good_locations)

In [None]:
# Index the locations table on its id column.
tab_loc.create_index(["id"])

## Merging Locations

We are trying out several ways to merge the geocoded locations back into the original incidents. Since we manipulated the county etc., we have to try various ways to merge. This should be improved (TODO).


Not really sure whether a separate table for locations is needed. It was introduced because in some cases multiple locations are associated with an incident.

In [None]:
# Columns copied from the matched location row onto the incident.
geo_columns = [
    "latitude",
    "longitude",
    "postal_code",
    "street",
    "house_number",
    "district",
    "city",
    "county",
    "state",
    "country",
]

# Incident keys to match against (query_county, query_city), tried in this
# order until a location row is found.
lookup_order = [
    ("orig_county", "orig_city"),
    ("county", "orig_city"),
    ("orig_county", "city"),
    ("county", "city"),
]

for incident in tqdm(tab_incidents.all()):
    known_lat, known_long = incident["latitude"], incident["longitude"]
    region_filter = {name: incident[name] for name in ["state", "country"]}

    row_loc = None
    for county_key, city_key in lookup_order:
        row_loc = tab_loc.find_one(
            query_county=incident[county_key],
            query_city=incident[city_key],
            query_address=incident["address"],
            **region_filter
        )
        if row_loc is not None:
            break

    if row_loc is None:
        # No location matched under any combination; report and move on.
        print(incident)
        print(region_filter)
        continue

    geo = {name: row_loc[name] for name in geo_columns}

    # Coordinates the incident already had take precedence over geocoded ones.
    if known_lat is not None and known_long is not None:
        geo["latitude"] = known_lat
        geo["longitude"] = known_long
        print("added again!")
        print(incident)

    tab_incidents.update({**incident, **geo}, ["id"])

In [None]:
# We use the final output locations of the incidents because we actually
# manipulated / discarded some locations (if they already had a geolocation).
location_columns = (
    "latitude",
    "longitude",
    "postal_code",
    "street",
    "house_number",
    "district",
    "city",
    "county",
    "state",
    "country",
)
final_loc = list(tab_incidents.distinct(*location_columns))

In [None]:
# Number of distinct final locations.
len(final_loc)

In [None]:
# Drop the intermediate locations table; it is refilled from final_loc below.
tab_loc.drop()

In [None]:
# tab_loc_final = db['locations_final']
# Write the distinct incident locations into the locations table.
tab_loc.insert_many(final_loc)

In [None]:
# Number of rows now in the locations table.
len(list(tab_loc.all()))