<span style="font-family:Lucida Bright;">

<hr style="border:2px solid black"> </hr>

<p style="margin-bottom:1cm"></p>

<center>
<font size="7"><b>Social Data Analysis and Visualization</b></font>
<p style="margin-bottom:1cm"></p>
<font size="6.8"><b>Final Project</b></font>   
<p style="margin-bottom:0.8cm"></p>
<font size="3"><b>Wojciech Mazurkiewicz, DTU, 14 May 2021</b></font>
<br>
<font size="3"><b></b></font>

</center>

<p style="margin-bottom:0.7cm"></p>

<hr style="border:2px solid black"> </hr>

<hr style="border:2px solid black"> </hr>

<span style="font-family:Lucida Bright;">

# Initialization

## How to read this notebook

In this notebook, the questions are either specified in the section title, or marked with

> __bold quote__

The answers are marked with <span style="font-family:Lucida Bright;">*Lucida Bright italics*</span>.

Please note that the pre-rendered outputs will only display properly once the notebook is __trusted__.
    
</span>

## Imports

In [1]:
%matplotlib inline

import bokeh.plotting as bplt
import calendar
import datetime
import folium
import json
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import scipy.stats
import seaborn as sns
import urllib.request

from bokeh.io import output_file
from bokeh.io import output_notebook
from bokeh.io import show
from bokeh.models import Legend
from bokeh.models.ranges import FactorRange
from bokeh.models.sources import ColumnDataSource
from folium.map import FeatureGroup
from folium.plugins import HeatMap, HeatMapWithTime
from IPython.core.interactiveshell import InteractiveShell
from IPython.display import display
from IPython.display import Markdown
from IPython.display import YouTubeVideo
from matplotlib.colors import Normalize
from matplotlib.image import NonUniformImage
from matplotlib import cm
from mpl_toolkits.axes_grid1 import make_axes_locatable
from pathlib import Path
from scipy import stats
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

from sklearn.datasets import fetch_20newsgroups
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import precision_recall_fscore_support
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler

## Configuration

In [2]:
# Show bokeh figures in the notebook.
output_notebook()

# Decide which expression values IPython echoes automatically below a cell.
# With "none" nothing is echoed, so every output in this notebook has to be
# produced by an explicit display()/print() call.
InteractiveShell.ast_node_interactivity = "none"

# Decide how to handle the "SettingWithCopyWarning" warning:
# None switches pandas' chained-assignment check off entirely.
pd.options.mode.chained_assignment = None  # default='warn'

## Function definitions

In [3]:
def printmd(string):
    """Render ``string`` as Markdown in the cell output."""
    rendered = Markdown(string)
    display(rendered)


def format_axes(axes: plt.Axes,
                keep_box=False):
    """Apply the notebook's default formatting to an axes.

    Parameters
    ----------
    axes : plt.Axes
        The axes to format.
    keep_box : bool, optional
        When false (the default), the top and right spines are coloured
        white. When true, the spines are left untouched.

    The face colour of the axes is set to white in both cases.
    """
    spines_to_blank = () if keep_box else ('top', 'right')
    for side in spines_to_blank:
        axes.spines[side].set_color('white')

    axes.set_facecolor("white")


def format_axes_annotation(axes: plt.Axes):
    """Apply the default font sizes to the annotation of an axes.

    Both axis labels get font size 14 and the title gets font size 16.
    """
    label_size, title_size = 14, 16

    for axis in (axes.xaxis, axes.yaxis):
        axis.label.set_fontsize(label_size)

    axes.title.set_fontsize(title_size)


def figure_x_label(figure: plt.Figure,
                   label: str,
                   y_position=0.04,
                   font_size=16):
    """Add a common x-label to a figure.

    The text is placed at x = 0.5 and y = ``y_position`` (as passed to
    ``figure.text``), horizontally centred, using font size ``font_size``.
    """
    text_style = {'size': font_size}
    figure.text(0.5, y_position, label, ha='center', fontdict=text_style)


def figure_y_label(figure: plt.Figure,
                   label: str,
                   x_position=0.04,
                   font_size=16):
    """Add a common y-label to a figure.

    The text is placed at x = ``x_position`` and y = 0.5 (as passed to
    ``figure.text``), vertically centred and rotated to read vertically,
    using font size ``font_size``.
    """
    text_style = {'size': font_size}
    figure.text(x_position, 0.5, label,
                va='center', rotation='vertical', fontdict=text_style)


def balance_dataframe(df: pd.DataFrame, column_name, random_state=None):
    """Down-sample a dataframe so every value of a column is equally frequent.

    For each distinct (non-missing) value in ``column_name``, as many rows
    are drawn at random as the least frequent value has.

    Parameters
    ----------
    df : pd.DataFrame
        The dataframe to balance. It is not modified.
    column_name
        Label of the column whose values should be balanced.
    random_state : optional
        Passed on to ``DataFrame.sample`` to make the draw reproducible.
        The default (None) keeps the sampling unseeded.

    Returns
    -------
    pd.DataFrame
        The sampled rows with their original index labels, grouped by value
        in order of first appearance. Rows whose value in ``column_name`` is
        missing are not included. An empty frame with the same columns is
        returned when there is nothing to balance.
    """
    # Frequencies must come from the designated column (not a fixed
    # 'Category' column), otherwise the sample size is wrong or too large.
    counts = df[column_name].value_counts()

    # Nothing to balance: empty input or a column holding only missing values.
    if counts.empty:
        return df.iloc[0:0].copy()

    lowest_frequency = int(counts.min())

    # Collect one sample per value and concatenate once; DataFrame.append
    # is no longer available in pandas 2.x and was quadratic in a loop.
    samples = [
        df.loc[df[column_name] == value]
        .sample(lowest_frequency, random_state=random_state)
        for value in df[column_name].dropna().unique()
    ]

    return pd.concat(samples)


def evaluate_models(models: dict,
                    df: pd.DataFrame,
                    predictor_labels: list,
                    target_label: str,
                    test_size=0.33):
    """Fit each model on a train split and print its test-set report.

    Parameters
    ----------
    models : dict
        Maps a display name to a model object offering ``fit`` and
        ``predict``.
    df : pd.DataFrame
        The dataframe holding predictors and target.
    predictor_labels : list
        Column labels used as predictors.
    target_label : str
        Column label used as target.
    test_size : optional
        Forwarded to ``train_test_split`` (default 0.33).

    For every model, its name is shown as Markdown and a classification
    report (4 digits, zero_division=0) is printed. Nothing is returned.
    """
    # Pull the predictors and the target out of the dataframe as arrays.
    features = df.loc[:, predictor_labels].values
    targets = df.loc[:, target_label].values

    # Fixed random_state so every call uses the same split.
    features_train, features_test, targets_train, targets_test = \
        train_test_split(features, targets,
                         test_size=test_size, random_state=32)

    for model_name in models:
        model = models[model_name]

        # Heading with the name of the model.
        printmd(f'*__{model_name}:__*')

        # Train on the training set, then score on the held-out test set.
        model.fit(features_train, targets_train)
        report = classification_report(
            targets_test,
            model.predict(features_test),
            zero_division=0,
            digits=4,
        )
        print(report)

<!-- ## Paths -->

In [4]:
# Project root on the original author's machine. The absolute path is kept
# so that an existing setup continues to work unchanged.
path_root = Path(r'C:\GDrive\DTU\Kurser\Social_Data_Analysis_and_Visualization_02806\final_project')

# Fall back to the current working directory when that location does not
# exist (e.g. on another machine), so the notebook can be run from a folder
# that contains the 'resources' directory.
if not path_root.exists():
    path_root = Path.cwd()

# Folder holding the input CSV files.
path_resources_root = path_root / 'resources'

<hr style="border:2px solid black"> </hr>

# Data Cleaning



In [5]:
class Data:
    """Container for the datasets loaded by this notebook."""

    def __init__(self):
        # Read the trash bin CSV from the resources folder.
        trash_bins_csv = path_resources_root / 'affaldskurve_puma.csv'
        self.df_trash_bins = pd.read_csv(trash_bins_csv)

# Load the data: the constructor reads the trash bin CSV.
data = Data()

## Trash bins

### Load data

In [19]:
# Read the raw trash bin CSV from the resources folder.
df_trash_bins_raw = pd.read_csv(path_resources_root / 'affaldskurve_puma.csv')

### Show data

In [20]:
# Show the raw trash bin dataframe.
display(df_trash_bins_raw)

Unnamed: 0,FID,id,wkb_geometry,status,driftsplan_navn,arbejdssted_navn,arbejdssted_id,tekst_id,lokaludvalg,byrumstype,underordnet_byrumstype,stednavn,underordnet_stednavn,kgb_kategori,geoobjekttypeid,kommentar,har_sensor,puma_geoobjektid
0,affaldskurve_puma.1,1,MULTIPOINT ((12.565692814694078 55.67932485271...,IDrift,Ørstedsparken,Ørstedsparken,144.0,Ørste_Ørste_144,Indre By,Parker,Klassisk park,Ørstedsparken,,12.2.3 Richard Müller T5,a591f7eb-3cae-4aa3-bedf-a907000222ee,,False,c03654e3-60e6-4408-bf2a-99eef82c91a9
1,affaldskurve_puma.2,2,MULTIPOINT ((12.56586038890235 55.679331097100...,IDrift,Ørstedsparken,Ørstedsparken,144.0,Ørste_Ørste_144,Indre By,Parker,Klassisk park,Ørstedsparken,,12.2.3 Richard Müller T5,a591f7eb-3cae-4aa3-bedf-a907000222ee,,False,10f099de-b107-45b6-8e94-17506cb7254f
2,affaldskurve_puma.3,3,MULTIPOINT ((12.566933919761878 55.68263944689...,IDrift,Ørstedsparken,Ørstedsparken,144.0,Ørste_Ørste_144,Indre By,Parker,Klassisk park,Ørstedsparken,,12.2.2 Københavnerkurv,5ec05775-cc06-4dd6-8208-0a6148437eeb,,False,ef13707a-7fe3-420e-855f-f43be1afb796
3,affaldskurve_puma.4,4,MULTIPOINT ((12.566245988511534 55.68270722067...,IDrift,Ørstedsparken,Ørstedsparken,144.0,Ørste_Ørste_144,Indre By,Parker,Klassisk park,Ørstedsparken,,12.2.2 Københavnerkurv,5ec05775-cc06-4dd6-8208-0a6148437eeb,,False,f74b90a1-8942-451c-a474-6400f2bd15cb
4,affaldskurve_puma.5,5,MULTIPOINT ((12.566669489138766 55.68256451356...,IDrift,Ørstedsparken,Ørstedsparken,144.0,Ørste_Ørste_144,Indre By,Parker,Lege- og aktivitetsområde,Ørstedsparken,,12.2.2 Københavnerkurv,5ec05775-cc06-4dd6-8208-0a6148437eeb,,False,ed4afaac-6fc1-403c-b052-9f205134486a
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5241,affaldskurve_puma.7323,7323,MULTIPOINT ((12.612868979515607 55.68517836069...,IDrift,Bydækkende drift,,,,Christianshavn,Gader og pladser,Øvrig gade,Gade i Indre By,,12.2.3 Richard Müller T5,a591f7eb-3cae-4aa3-bedf-a907000222ee,,False,23ca2957-573e-4eaf-ae8b-8c33695da5c5
5242,affaldskurve_puma.7325,7325,MULTIPOINT ((12.560076688652794 55.71294009667...,IDrift,Bydækkende drift,,,,Østerbro,Gader og pladser,Øvrig gade,Kvarteret ved Skt. Kjelds Plads,,12.2.2 Københavnerkurv,5ec05775-cc06-4dd6-8208-0a6148437eeb,,True,56ccc289-2ab7-436d-914b-54e0a10c9528
5243,affaldskurve_puma.7326,7326,MULTIPOINT ((12.603868260188998 55.66482177747...,IDrift,Bydækkende drift,,,,Amager Øst,Gader og pladser,Gade med grønt,Kvarteret ved Amagerbro Torv,,12.2.2 Københavnerkurv,5ec05775-cc06-4dd6-8208-0a6148437eeb,,False,9ed1a77e-b47e-455f-b0c4-a0cdb41be9b5
5244,affaldskurve_puma.7327,7327,MULTIPOINT ((12.587134919160713 55.67407987871...,IDrift,Bydækkende drift,,,,,,,,,12.2.8 Pizzakurv,8755aaa9-70d1-49d7-b3aa-415dcf01d0d2,,True,a1934124-5378-4df1-a99c-1eec227cd795


### Clean data

In [55]:
# Get the longitude and latitude of each bin from the 'wkb_geometry' text,
# which looks like 'MULTIPOINT ((<longitude> <latitude>))'.
df_trash_bins_clean = pd.DataFrame(
    df_trash_bins_raw
    .loc[:, 'wkb_geometry']
    .str.replace('MULTIPOINT', '', regex=False)
    .str.replace('(', '', regex=False)
    .str.replace(')', '', regex=False)
    .str.strip()
    .str.split()
    .to_list(),
    columns=['Longitude', 'Latitude']
).astype(float)

# Add other columns of interest. They are taken from the same raw dataframe
# as the coordinates (rather than from a second copy of the CSV), so the
# rows of both parts come from one source.
df_trash_bins_clean = pd.concat(
    [df_trash_bins_clean,
     df_trash_bins_raw.loc[:, ['lokaludvalg',
                               'underordnet_byrumstype',
                               'kgb_kategori']]],
    axis='columns'
).rename(columns={'lokaludvalg': 'District',
                  'underordnet_byrumstype': 'Topology',
                  'kgb_kategori': 'Bin category'})

# Trim the bin categories: drop the first 7 characters (the code prefix,
# e.g. '12.2.3 ') and any surrounding whitespace.
# NOTE(review): assumes every category starts with such a prefix — confirm.
df_trash_bins_clean['Bin category'] = \
    df_trash_bins_clean['Bin category'].str[7:].str.strip()


# Show the dataframe.
display(df_trash_bins_clean)

Unnamed: 0,Longitude,Latitude,District,Topology,Bin category
0,12.565693,55.679325,Indre By,Klassisk park,Richard Müller T5
1,12.565860,55.679331,Indre By,Klassisk park,Richard Müller T5
2,12.566934,55.682639,Indre By,Klassisk park,Københavnerkurv
3,12.566246,55.682707,Indre By,Klassisk park,Københavnerkurv
4,12.566669,55.682565,Indre By,Lege- og aktivitetsområde,Københavnerkurv
...,...,...,...,...,...
5241,12.612869,55.685178,Christianshavn,Øvrig gade,Richard Müller T5
5242,12.560077,55.712940,Østerbro,Øvrig gade,Københavnerkurv
5243,12.603868,55.664822,Amager Øst,Gade med grønt,Københavnerkurv
5244,12.587135,55.674080,,,Pizzakurv


### Statistics

#### Data types

In [60]:
# Show the data type of each column as a one-column table.
display(df_trash_bins_clean.dtypes.to_frame('Data types'))

Unnamed: 0,Data types
Longitude,float64
Latitude,float64
District,object
Topology,object
Bin category,object


#### Value overview

In [56]:
# Show summary statistics for all columns, numeric and non-numeric alike.
display(df_trash_bins_clean.describe(include='all'))

Unnamed: 0,Longitude,Latitude,District,Topology,Bin category
count,5246.0,5246.0,4929,4627,5230
unique,,,12,12,21
top,,,Indre By,Klassisk park,Københavnerkurv
freq,,,1272,1243,2684
mean,12.556653,55.681802,,,
std,0.036906,0.020691,,,
min,12.453209,55.614237,,,
25%,12.532132,55.66698,,,
50%,12.563074,55.681257,,,
75%,12.581039,55.697867,,,


#### Missing values

In [57]:
# Count the missing values in each column.
display(df_trash_bins_clean.isna().sum().to_frame('Number of missing values'))

Unnamed: 0,Number of missing values
Longitude,0
Latitude,0
District,317
Topology,619
Bin category,16


## Sickness benefits

### Load data

In [52]:
# Read the raw sickness benefits CSV from the resources folder.
df_sickness_benefits_raw = pd.read_csv(
    path_resources_root / 'sygedagpenge-201501---201605-ydelsesmodtageregeo.csv')

### Show data

In [42]:
# Show the raw sickness benefits dataframe.
display(df_sickness_benefits_raw)

Unnamed: 0,Bydele Navn,Roder,År,Måned,Gennemsnitligt antal sager
0,Amager Vest,325,2015,Januar,1875
1,Amager Vest,325,2015,Februar,1575
2,Amager Vest,325,2015,Marts,166
3,Amager Vest,325,2015,April,1725
4,Amager Vest,325,2015,Maj,155
...,...,...,...,...,...
5394,Ukendt,999,2015,November,186
5395,Ukendt,999,2015,December,18
5396,Ukendt,999,2016,Januar,21
5397,Ukendt,999,2016,Februar,226


### Clean data

In [54]:
# Drop the 'Roder' column and translate the Danish column names to English.
# NOTE(review): 'Average number of cases' is kept exactly as loaded; the
# data type overview lists it as object rather than numeric — confirm
# whether it needs converting (possibly decimal commas in the CSV).
df_sickness_benefits_clean = (
    df_sickness_benefits_raw
    .drop(['Roder'], axis=1)
    .rename(columns={'Bydele Navn': 'District',
                     'År': 'Year',
                     'Måned': 'Month',
                     'Gennemsnitligt antal sager': 'Average number of cases'}) 
)

# Show the dataframe.
display(df_sickness_benefits_clean)

Unnamed: 0,District,Year,Month,Average number of cases
0,Amager Vest,2015,Januar,1875
1,Amager Vest,2015,Februar,1575
2,Amager Vest,2015,Marts,166
3,Amager Vest,2015,April,1725
4,Amager Vest,2015,Maj,155
...,...,...,...,...
5394,Ukendt,2015,November,186
5395,Ukendt,2015,December,18
5396,Ukendt,2016,Januar,21
5397,Ukendt,2016,Februar,226


### Statistics

#### Data types

In [61]:
# Show the data type of each column as a one-column table.
display(df_sickness_benefits_clean.dtypes.to_frame('Data types'))

Unnamed: 0,Data types
District,object
Year,int64
Month,object
Average number of cases,object


#### Value overview

In [62]:
# Show summary statistics for all columns, numeric and non-numeric alike.
display(df_sickness_benefits_clean.describe(include='all'))

Unnamed: 0,District,Year,Month,Average number of cases
count,5399,5399.0,5399,5399.0
unique,11,,12,366.0
top,Nørrebro,,Januar,15.0
freq,784,,721,82.0
mean,,2015.20226,,
std,,0.401722,,
min,,2015.0,,
25%,,2015.0,,
50%,,2015.0,,
75%,,2015.0,,


#### Value counts

In [68]:
# Count how many rows are listed for each district.
# NOTE(review): the output spells the district 'Indre by', while the trash
# bin data shows 'Indre By' — the names presumably need normalising before
# the two datasets are combined on district; confirm.
display(
    df_sickness_benefits_clean['District']
    .value_counts()
    .to_frame('Number of listings')
)

Unnamed: 0,Number of listings
Nørrebro,784
Østerbro,659
Indre by,653
Vesterbro/Kgs. Enghave,606
Amager Vest,508
Bispebjerg,499
Amager Øst,492
Valby,448
Brønshøj-Husum,375
Vanløse,345


#### Missing values

In [65]:
# Count the missing values in each column.
display(df_sickness_benefits_clean.isna().sum().to_frame('Number of missing values'))

Unnamed: 0,Number of missing values
District,0
Year,0
Month,0
Average number of cases,0


<hr style="border:2px solid black"> </hr>

# Template



XXX asdfasdfa dasdf    
    
