In [1]:
### Read healthcare cost data from vektis https://www.vektis.nl/intelligence/open-data ###
### Please read the data description before using the data ###

import json
import func
import numpy as np
import pandas as pd

In [2]:
# "Vektis2012.csv", "Vektis2013.csv", "Vektis2014.csv", "Vektis2015.csv","Vektis2016.csv","Vektis2017.csv" #
# Name of the yearly Vektis export to analyse.
file = "Vektis2011.csv"
# The year is whatever sits between the "Vektis" prefix and the ".csv" suffix.
year = file[len("Vektis"):-len(".csv")]
# The export is a semicolon-separated text file.
df = pd.read_csv(file, sep=';')

In [3]:
### Features (original Vektis column names) to include in the analysis ###
### Feature descriptions: https://www.vektis.nl/intelligence/open-data ###
col = [
    "GESLACHT",
    "AANTAL_BSN",
    "KOSTEN_MEDISCH_SPECIALISTISCHE_ZORG",
    "KOSTEN_HUISARTS_INSCHRIJFTARIEF",
    "KOSTEN_HUISARTS_CONSULT",
    "KOSTEN_HUISARTS_OVERIG",
    "KOSTEN_FARMACIE",
    "KOSTEN_MONDZORG",
    "KOSTEN_ZIEKENVERVOER_ZITTEND",
    "KOSTEN_ZIEKENVERVOER_LIGGEND",
    "KOSTEN_GRENSOVERSCHRIJDENDE_ZORG",
    "KOSTEN_PARAMEDISCHE_ZORG_FYSIOTHERAPIE",
    "KOSTEN_PARAMEDISCHE_ZORG_OVERIG",
    "KOSTEN_OVERIG",
    "KOSTEN_GERIATRISCHE_REVALIDATIEZORG",
    "KOSTEN_VERPLEGING_EN_VERZORGING",
    "KOSTEN_EERSTELIJNS_PSYCHOLOGISCHE_ZORG",
    "KOSTEN_TWEEDELIJNS_GGZ",
    "KOSTEN_SPECIALISTISCHE_GGZ",
    "KOSTEN_GENERALISTISCHE_BASIS_GGZ",
    "KOSTEN_LANGDURIGE_GGZ",
]

In [4]:
### Some features are only available in certain years, so check which of the ###
### requested features exist in this file before selecting them ###
data_col = df.columns

# Positions (into `col`) of the requested features that exist in this file.
# enumerate() yields each position directly, avoiding a repeated col.index()
# search, which would also return the wrong position for a duplicated name.
present = [position for position, feature in enumerate(col) if feature in data_col]

# Take an explicit copy so that columns added to df_vektis afterwards are
# written to an independent frame instead of a possible slice of `df`
# (which makes pandas emit SettingWithCopyWarning).
df_vektis = df[np.array(col)[present]].copy()

In [5]:
### Readable replacement names, listed position-for-position with `col` ###
name_col = [
    "SEX",
    "BSNs",
    "medical_specialist",
    "GP_registration",
    "GP_consult",
    "GP_others",
    "pharmacy",
    "dental",
    "transport_seat",
    "transport_land",
    "abroad",
    "paramedical_phy",
    "paramedical_others",
    "others",
    "rehabilitation",
    "nursing",
    "firstLinePsy",
    "secondLineGGZ",
    "specialGGZ",
    "basicGGZ",
    "longGGZ",
]

# Pick the names at the positions stored in `present`, then apply them as the
# column labels of the selected frame.
new_col = np.array(name_col)[present]
df_vektis.columns = new_col

In [6]:
### Change the types (int,float,str --> float) of values in the AGE column ###
def _age_to_float(value):
    """Return one LEEFTIJDSKLASSE entry as a float.

    Numeric entries are converted directly. A string that does not parse as a
    number is retried without its last character (presumably an open-ended
    class written with a trailing marker such as "90+" -- confirm against the
    Vektis data description).

    Raises ValueError/TypeError for entries that cannot be converted, instead
    of silently skipping them.
    """
    if isinstance(value, str):
        try:
            return float(value)
        except ValueError:
            # Only a parse failure triggers the fallback; other errors propagate.
            return float(value[:-1])
    return float(value)


age = [_age_to_float(value) for value in df['LEEFTIJDSKLASSE']]

### Add new age column ###
# assign() returns a new frame, so nothing is written into a possible slice of
# `df`; the Series carries df's index so the ages are matched to rows by label.
df_vektis = df_vektis.assign(AGE=pd.Series(age, index=df.index))
### Remove the first row (sum) ###
# Drop by the label of df's first row and ignore it when already gone, so the
# cell can be re-run without removing a further row.
df_vektis = df_vektis.drop(index=df.index[:1], errors='ignore')

In [7]:
### For getting some basic info ###
# All helpers below come from the project-local `func` module; their exact
# behaviour is not visible in this notebook.
# Check missings #
# NOTE(review): this receives the raw `df` and the full `col` list, not the
# filtered `df_vektis` / present columns -- confirm func.check_missing copes
# with features that are absent from this year's file.
func.check_missing(df, col, year)

# Export description of data #
# NOTE(review): same inputs as above (raw `df`, full `col`).
func.data_describe(df, col, year)

### Plotting ###
# Age groups #
# Each entry is a [lower, upper] age pair handed to func.groupAgeRange
# (presumably inclusive bounds -- verify in func).
loop = [[0,4],[5,12],[13,18],[19,29],[30,39],[40,49],[50,59],[60,69],[70,79],[80,90]]
for i in loop:
    # Third argument is the literal 0 here, while the stacked-plot loop below
    # passes a DataFrame; its meaning is defined in func.groupAgeRange.
    df_avg = func.groupAgeRange(df_vektis, i, 0)
    
    # Correlation matrix #
    func.corr_Matrix(df_avg, i, year)

    # Pie chart #
    func.pie_Chart(df_avg, i, year)

    # Distribution plot #
    # 'SUM' is passed as the name of the column to plot; that column is not
    # created in this notebook (presumably added by func.groupAgeRange -- verify).
    func.dist_Plot(df_avg,'SUM', i, year)

### Stack area plot ###
# `loop` is rebound: it now holds the single ages 0..89 (range excludes 90).
# NOTE(review): the age-group plots above go up to 90 -- confirm that leaving
# out age 90 here is intended.
loop = list(range(0,90,1))
# Collects one column per age, filled inside the loop below.
df_stack = pd.DataFrame()
for i in loop:
    # Here the second argument is a single age (not a pair) and the third is
    # the partially filled df_stack.
    df_avg = func.groupAgeRange(df_vektis, i, df_stack)
    # Column-wise mean of the rows returned for this age, ignoring NaN.
    df_stack[i] = df_avg.mean(axis=0, skipna=True)
    # Recomputed on every iteration; only the value from the final iteration
    # is used by func.stacked_Plot after the loop.
    # NOTE(review): these two lines could likely move below the loop if
    # func.merge has no side effects -- confirm before changing.
    df_stack_trans = df_stack.transpose()
    df_stack_trans = func.merge(df_stack_trans)
func.stacked_Plot(df_stack_trans, loop, year)

Totally, 3 features have missing values (blanks).
Check missing outcome is saved to Output/2011_missings.csv
Missing values check is done!
There is 136143 rows and 21 columns
Data description is done!
The number of insured people between 0 to 4:  7578
Correlation Matrix plot is done
Pie plot is done
Distribution plot is done
The number of insured people between 5 to 12:  12332
Correlation Matrix plot is done
Pie plot is done
Distribution plot is done
The number of insured people between 13 to 18:  9261
Correlation Matrix plot is done
Pie plot is done
Distribution plot is done
The number of insured people between 19 to 29:  16602
Correlation Matrix plot is done
Pie plot is done
Distribution plot is done
The number of insured people between 30 to 39:  15164
Correlation Matrix plot is done
Pie plot is done
Distribution plot is done
The number of insured people between 40 to 49:  15593
Correlation Matrix plot is done
Pie plot is done
Distribution plot is done
The number of insured people b