### Data manipulations after first plot
When I plotted the criminal cases on the map, I realized that this representation of the data is not informative: all streets and districts seem to have an equal share of cases, without any clear structure.
<img src="picture1.PNG">
That is why I decided to plot city districts instead of separate cases. For this, I extracted the district polygons from Wikimapia and calculated the number of cases for each district.

In [52]:
import http.client as http
from urllib.parse import quote
import json
from time import sleep

def wikimapia_request(function, additional_query):
    """
    Send a GET request to the Wikimapia API and return the parsed JSON reply.

    function:          value of the ``function`` query parameter
                       (for example "place.getbyid").
    additional_query:  query-string fragment appended verbatim after the
                       common parameters (for example "id=123"); the caller
                       is responsible for any URL-encoding it needs.

    Returns whatever ``json.loads`` produces for the response body.
    """
    # NOTE(review): the API key is hard-coded in the source; consider moving
    # it to configuration before sharing this notebook.
    key = "B0B82BBD-14BF66A1-570E2639-F470C417-38844BAF-BDB44776-EDE430B8-E7E4F573"

    query = "/?function={0}&key={1}&format=json&".format(function, key) + additional_query

    conn = http.HTTPConnection("api.wikimapia.org")
    try:
        conn.request("GET", query)
        res = conn.getresponse()
        res_str = res.read().decode("utf-8")
    finally:
        # Release the socket whether or not the request/read succeeded.
        conn.close()
    return json.loads(res_str)

# District name -> numeric object id that is later passed to Wikimapia's
# "place.getbyid" function to download the district outline.
districts = {"Сихівський район": 4556239, 
             "Франківський район": 5130168, 
             "Залізничний район": 5028866, 
             "Шевченківський район": 5130576, 
             "Галицький район": 5130483, 
             "Личаківський район": 5130668}

def get_object_polygon(object_id, object_name):
    """
    Download a Wikimapia object and wrap its outline in a GeoJSON-style Feature.

    The object is requested through "place.getbyid". Each vertex of the
    response's "polygon" list becomes an [x, y] pair, and the pairs form the
    single ring of a "Polygon" geometry. ``object_name`` is stored as the
    feature's "name" property.
    """
    response = wikimapia_request("place.getbyid", "id={0}".format(object_id))
    ring = [[vertex["x"], vertex["y"]] for vertex in response["polygon"]]
    return {
        "properties": {"name": object_name},
        "type": "Feature",
        "geometry": {"type": "Polygon", "coordinates": [ring]},
    }

# Collect one polygon feature per district into a single FeatureCollection.
districts_geojson = {"type": "FeatureCollection", "features": []}
for dis_name, dis_id in districts.items():
    districts_geojson["features"].append(get_object_polygon(dis_id, dis_name))

In [53]:
import codecs
# Save the district features as UTF-8 JSON, keeping non-ASCII names readable.
with codecs.open("./district.geo.json", mode="w", encoding="utf8") as geojson_file:
    json.dump(districts_geojson, geojson_file, indent=2, ensure_ascii=False)

In [54]:
def inside_polygon(x, y, points):
    """
    Return True if a coordinate (x, y) is inside a polygon defined by
    a list of vertices [(x1, y1), (x2, y2), ... , (xN, yN)].

    The polygon is treated as closed: the last vertex is connected back to
    the first one. An empty vertex list describes no polygon at all, so the
    result is False.

    Reference: http://www.ariel.com.au/a/python-point-int-poly.html
    """
    n = len(points)
    if n == 0:
        return False

    inside = False
    p1x, p1y = points[0]
    for i in range(1, n + 1):
        # i % n wraps around so the closing edge (last -> first) is tested too.
        p2x, p2y = points[i % n]
        # Only edges whose y-range contains y (lower end exclusive) and that
        # are not entirely to the left of the point can be crossed by the
        # horizontal ray cast from (x, y).
        if min(p1y, p2y) < y <= max(p1y, p2y) and x <= max(p1x, p2x):
            # p1y != p2y is guaranteed here because min(...) < max(...),
            # so the division below is safe.
            if p1x == p2x or x <= (y - p1y) * (p2x - p1x) / (p2y - p1y) + p1x:
                inside = not inside
        p1x, p1y = p2x, p2y
    return inside

In [10]:
import pandas as pd
# Load the table of criminal cases from the UTF-8 encoded CSV file.
criminal = pd.read_csv("criminal.csv", encoding='utf8')

In [55]:
def detect_district(lon, lat):
    """
    Return the name of the first district whose polygon contains (lon, lat).

    Districts are checked in the order they appear in ``districts_geojson``;
    only the first coordinate ring of each geometry is tested. Returns None
    when the point lies in none of them.
    """
    for feature in districts_geojson["features"]:
        ring = feature["geometry"]["coordinates"][0]
        if not inside_polygon(lon, lat, ring):
            continue
        return feature["properties"]["name"]
    return None

# Label every case with the district containing its coordinates, then write
# the extended table back to the same CSV file.
criminal["Район"] = criminal.apply(
    lambda case: detect_district(case["Longitude"], case["Latitude"]),
    axis=1,
)
criminal.to_csv("criminal.csv", index=False, encoding='utf8')

I already had information about the police department that registered each case, but I wondered whether it matches the geographical position of the case. Also, a lot of cases were registered outside the Lviv boundaries. Thus, I decided to check the quality of the obtained statistics.

First, I checked the number of empty records, i.e. cases that fall into none of the district polygons.

In [56]:
# Count the cases for which no district was detected (null "Район" value).
len(criminal[criminal["Район"].isnull()])

1474

1474 unassigned records is an acceptable number. Next, let's check the number of cases where the police department differs from the actual location.

In [57]:
def district_is_not_department(district, department):
    """
    Return 1 when the department name does not mention the district, else 0.

    Only the first space-separated word of ``district`` is used, and the
    comparison is a case-insensitive substring test against ``department``.
    """
    first_word = district.split(" ")[0].lower()
    return int(first_word not in department.lower())

# Flag (1/0) every case with a detected district whose police department
# name does not mention that district, then count the flagged cases.
wrong_dep = criminal[criminal["Район"].notnull()].apply(
    lambda case: district_is_not_department(case["Район"], case["Орган"]),
    axis=1,
)
sum(wrong_dep)

1118

A small number of cases have a police department that differs from the actual location. Let's check the visualization of cases by department to be sure that this is not an error in our computations.
<img src="picture2.PNG">
It shows that many cases were registered in a police department that does not belong to their location, so this is not an error.

Next, let's calculate the district crime rate as the ratio of the number of cases to the district area and to the district population.

In [8]:
import numpy as np
import pyproj    
import shapely
import shapely.ops as ops
from shapely.geometry.polygon import Polygon
from functools import partial

def poly_area(xy):
    """
    Return the area of a polygon in square kilometers.

    ``xy`` is a sequence of coordinate pairs that are interpreted as
    EPSG:4326 coordinates (presumably [longitude, latitude], which is how
    the district rings in this notebook are built). The polygon is
    re-projected to an Albers equal-area conic projection whose standard
    parallels are the polygon's minimum and maximum y bounds, and the planar
    area of the projected shape is divided by 1,000,000.
    """
    geom = Polygon([tuple(c) for c in xy])
    # shapely bounds are (minx, miny, maxx, maxy).
    min_y, max_y = geom.bounds[1], geom.bounds[3]
    # NOTE(review): pyproj.transform and Proj(init=...) are the legacy pyproj
    # API (deprecated in pyproj 2+); kept to match the version this notebook
    # was written against.
    to_equal_area = partial(
        pyproj.transform,
        pyproj.Proj(init='EPSG:4326'),
        # PROJ names the standard parallels lat_1 / lat_2; "lat1" / "lat2"
        # are not PROJ parameters, so the intended parallels were not applied.
        pyproj.Proj(proj='aea', lat_1=min_y, lat_2=max_y),
    )
    geom_area = ops.transform(to_equal_area, geom)
    return geom_area.area / 1000000  # square kilometers

In [11]:
# Population of each district, keyed by the district name stored in the
# "name" property of the features in districts_geojson.
popuplations = {"Сихівський район": 151371,
                "Франківський район": 151100,
                "Галицький район": 58812,
                "Шевченківський район": 144242,
                "Личаківський район": 102639,
                "Залізничний район": 126020}

# Property-name prefix -> value of the "Тип" column it counts.
case_types = (("fraud", "Шахрайство"),
              ("theft", "Крадіжка"),
              ("robbery", "Пограбування"))

for district in districts_geojson["features"]:
    properties = district["properties"]
    name = properties["name"]
    area = poly_area(district["geometry"]["coordinates"][0])
    population = popuplations[name]

    properties["area"] = area
    properties["population"] = population

    # Build the district mask once and combine it with the type mask using
    # "&". Chaining two boolean selections (criminal[a][b]) applies a mask
    # whose index no longer matches the filtered frame and makes pandas emit
    # a reindexing warning.
    in_district = criminal["Район"] == name
    properties["total_cases"] = len(criminal[in_district])
    for prefix, type_label in case_types:
        of_type = criminal["Тип"] == type_label
        properties[prefix + "_cases"] = len(criminal[in_district & of_type])

    # Rates: case counts divided by area, then by population.
    for suffix, denominator in (("area", area), ("population", population)):
        for prefix in ("total", "fraud", "theft", "robbery"):
            rate_key = "{0}_by_{1}".format(prefix, suffix)
            properties[rate_key] = properties[prefix + "_cases"] / denominator

# Overwrite the GeoJSON file with the enriched features.
with codecs.open("./district.geo.json", 'w', 'utf8') as f:
    json.dump(districts_geojson, f, indent=2, ensure_ascii=False)



In [12]:
# Inspect the properties computed for the first district feature.
districts_geojson["features"][0]["properties"]

{'area': 33.82083009859082,
 'fraud_by_area': 6.564002106182778,
 'fraud_by_population': 0.0017616251388668466,
 'fraud_cases': 222,
 'name': 'Залізничний район',
 'population': 126020,
 'robbery_by_area': 3.222865898981634,
 'robbery_by_population': 0.0008649420726868751,
 'robbery_cases': 109,
 'theft_by_area': 39.91622902408446,
 'theft_by_population': 0.010712585303920013,
 'theft_cases': 1350,
 'total_by_area': 49.703097029248866,
 'total_by_population': 0.013339152515473735,
 'total_cases': 1681}

Besides the map, I also want to show ordinary bar charts with the rates for each district. To do this, I need to save this information as CSV so that it can be used as a data frame in the Shiny application.

In [1]:
import codecs
import json

# Read the district FeatureCollection back from the UTF-8 JSON file.
with codecs.open("./district.geo.json", mode="r", encoding="utf8") as geojson_file:
    districts_geojson = json.load(geojson_file)

In [6]:
import pandas as pd
    
# Properties exported for every district, in CSV column order.
cols = ["name", "area", "population", "total_cases", "fraud_cases", "theft_cases", "robbery_cases",
        "total_by_area", "fraud_by_area", "theft_by_area", "robbery_by_area", "total_by_population",
        "fraud_by_population", "theft_by_population", "robbery_by_population"]

# Collect all rows first and build the frame in one step. Growing an empty
# DataFrame one .loc row at a time is slow and leaves the columns without
# proper numeric dtypes.
rows = [[feature["properties"][col] for col in cols]
        for feature in districts_geojson["features"]]
districts = pd.DataFrame(rows, columns=cols)
districts.to_csv("districts.csv", index=False, encoding='utf8')