In [1]:
import geopandas as gpd
import pandas as pd

In [2]:
def get_data(path, columns_to_delete=None, geo=False, sep=','):
    '''
    Load a data file with pandas or geopandas and drop unwanted columns.
    Geospatial files are read with geopandas.read_file regardless of the
    file extension. Non-geospatial data is limited to CSV files with a
    definable separator.
    Inputs:
        path: file path to the data file
        columns_to_delete: string or list with column names to drop
            (default: None, meaning no column is dropped)
        geo: bool to define whether the file contains geospatial data (default: False)
        sep: CSV separator, only used when geo is False (default: ',')
    Returns:
        A pandas.DataFrame object or a geopandas.GeoDataFrame
        depending on the geo parameter
    '''
    # None replaces the former mutable [] default; both mean "drop nothing".
    if columns_to_delete is None:
        columns_to_delete = []

    # Only the reader differs between the two kinds of file; the column
    # clean-up afterwards is identical.
    data = gpd.read_file(path) if geo else pd.read_csv(path, sep=sep)
    return data.drop(columns=columns_to_delete)

In [3]:
# Load every raw input from the data_raw folder.
# Paths use forward slashes: a single backslash followed by an arbitrary
# letter (e.g. '\j', '\g', '\H') is an invalid escape sequence in a normal
# string literal, and forward slashes are accepted on Windows as well as on
# POSIX systems.
df_geo = get_data('data_raw/utils.geojson', geo=True)
df_jobs = get_data('data_raw/jobs_by_buurt.csv')
df_green_neighborhood = get_data('data_raw/green_live_buurt_complete.csv', 'Unnamed: 0')
df_price = get_data('data_raw/Housing Prices 2015-2022.csv')
df_prox = get_data('data_raw/proximity.csv')
# The nuisance export is semicolon-separated.
df_nuisance = get_data('data_raw/nuisance.csv', sep=';')
df_crime = get_data('data_raw/crime_by_type.csv', 'Unnamed: 0')
# Drop the listed descriptive columns from the drug-store table right away.
df_drugs = get_data(
    'data_raw/drugscords.csv',
    [
        'totalScore',
        'reviewsCount',
        'street',
        'city',
        'state',
        'countryCode',
        'website',
        'phone',
        'categoryName',
        'url',
    ],
)

In [4]:
# Turn the longitude/latitude columns into an EPSG:4326 point geometry and
# then discard the raw coordinate columns.
df_drugs = (
    gpd.GeoDataFrame(
        df_drugs,
        geometry=gpd.points_from_xy(
            x=df_drugs['longitude'],
            y=df_drugs['latitude'],
            crs="EPSG:4326",
        ),
    )
    .drop(columns=['latitude', 'longitude'])
)

In [5]:
# Reshape the proximity table to one row per zone and one column per category.
# 'catetgory' is the spelling used by the column in the source file, so it
# must be kept as is.
df_prox = (
    df_prox
    .loc[df_prox['within'] == 'Within 3 km']        # keep only the 3 km rows
    .drop(columns=['Unnamed: 0', 'within'])
    .groupby(['zones', 'catetgory'], as_index=False)['Value']
    .sum()                                          # one value per (zone, category)
    .pivot_table(index='zones', columns='catetgory', values='Value')
)

In [6]:
# Preview one row of the neighbourhood polygons; `neighborhood` is the key
# used by the merges further down.
df_geo.head(1)

Unnamed: 0,regions,neighborhood,light_count,sport_building_count,workplace_count,inhabitants,light_per_1000,sport_building_per_1000,area_sqkm,distance_from_centre_km,geometry
0,Bavel,Bavel,1253,2,2,5445,230.119376,0.367309,1.750374,4.59422,"POLYGON ((4.83844 51.56120, 4.83911 51.56024, ..."


In [7]:
# Preview the jobs table; it is joined on `Buurt` and `Value` is later
# renamed to `jobs_count`.
df_jobs.head(1)

Unnamed: 0,Buurt,Value
0,Chassé,1920


In [8]:
# Preview the green/livability scores; this table is joined on `BUURT`.
df_green_neighborhood.head(1)

Unnamed: 0,BUURT,green_score,livability_score
0,Bavel,30.375317,2.35557


In [9]:
# Prefix the yearly columns '2015' ... '2022' with 'price_' so they stay
# self-describing once merged into the combined table.
df_price = df_price.rename(
    columns={str(year): f'price_{year}' for year in range(2015, 2023)}
)
df_price.head(1)

Unnamed: 0,neighborhood,price_2015,price_2016,price_2017,price_2018,price_2019,price_2020,price_2021,price_2022
0,Centrum,175000,173000,181000,195000,212000,231000,244000,264000


In [10]:
# Preview the pivoted proximity table: `zones` is the index and each
# category is a column.
df_prox.head(1)

catetgory,Childcare,Education,Health and well-being,Hospitality,Retail
zones,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Bavel,11.3,2.6,3.5,10.7,15.8


In [11]:
# Preview the nuisance registrations; this table is joined on
# 'Districts and neighbourhoods'.
df_nuisance.head(1)

Unnamed: 0,Districts and neighbourhoods,Total nuisance registrations,Nuisance by confused person,Youth nuisance report,Nuisance due to alcohol/drugs,Nuisance drifters,Public intoxication,Noise nuisance catering,Noise nuisance event,Other noise nuisance
0,Breda,5672,1800,889,675,498,49,27,15,1719


In [12]:
# Preview the crime counts per type; this table is joined on `neighborhood`.
df_crime.head(1)

Unnamed: 0,neighborhood,Accidents (road),Encroachment on public order,Fraud (other),Horizontal Fraud,Human trafficking,Nature and landscape,Quality of life (other),Road (other),Spatial planning,...,Theft/burglary box/garage/shed,"Theft/burglary of companies, etc.",Thefts (water),Threat,Total felonies,Under the influence (air),Under the influence (away),Vertical Fraud,Waste,Water
0,Bavel,37,0,0,54,0,0,0,5,0,...,4,5,0,16,366,0,12,0,0,0


In [13]:
# Preview the drug-store point layer built from the coordinate columns.
df_drugs.head(1)

Unnamed: 0,title,geometry
0,the Baron,POINT (4.78469 51.59150)


In [14]:
# Spatially attach drug-store points to the neighbourhood polygons. The left
# join keeps every polygon, including those without a matching point.
df_join = gpd.sjoin(df_geo, df_drugs, how='left')

In [15]:
# Collapse the joined rows back to one row per neighbourhood.
# Grouping on every attribute column of df_geo keeps those attributes in the
# result; the columns that came from the drug-store join are reduced with
# 'count', which yields the number of matched stores per neighbourhood.
df_join = df_join.dissolve(
    by=[
        'regions',
        'neighborhood',
        'light_count',
        'sport_building_count',
        'workplace_count',
        'inhabitants',
        'light_per_1000',
        'sport_building_per_1000',
        'area_sqkm',
        'distance_from_centre_km',
    ],
    aggfunc='count',
    as_index=False,
)

In [16]:
# Add green/livability scores. The inner join keeps only neighbourhoods that
# have a matching `BUURT` entry.
df_join = df_join.merge(df_green_neighborhood, how='inner', left_on='neighborhood', right_on='BUURT')

In [17]:
# Add job counts. The inner join keeps only neighbourhoods that have a
# matching `Buurt` entry.
df_join = df_join.merge(df_jobs, how='inner', left_on='neighborhood', right_on='Buurt')

In [18]:
# Remove the duplicated join keys and the spatial-join helper column, then
# give the counted/merged value columns descriptive names.
df_join = (
    df_join
    .drop(columns=['BUURT', 'Buurt', 'index_right'])
    .rename(columns={'title': 'drug_store_count', 'Value': 'jobs_count'})
)

In [19]:
# Add yearly housing prices. The left join keeps neighbourhoods without price data.
df_join = df_join.merge(df_price, how='left', on='neighborhood')

In [20]:
# Add crime counts per type. The left join keeps neighbourhoods without crime data.
df_join = df_join.merge(df_crime, how='left', on='neighborhood')

In [21]:
# Add nuisance registrations (left join) and drop the right-hand key column,
# which duplicates `neighborhood` after the merge.
df_join = (
    df_join
    .merge(
        df_nuisance,
        how='left',
        left_on='neighborhood',
        right_on='Districts and neighbourhoods',
    )
    .drop(columns='Districts and neighbourhoods')
)

In [22]:
# Add the proximity columns (left join). 'zones' is the index name of the
# pivoted df_prox, so right_on matches against that index level.
df_join = df_join.merge(df_prox, how='left', left_on='neighborhood', right_on='zones')

In [23]:
# proximity_score is the plain average of the five proximity categories.
# A missing value in any category leaves the score missing for that row.
df_join['proximity_score'] = (
    df_join['Childcare']
    .add(df_join['Education'])
    .add(df_join['Health and well-being'])
    .add(df_join['Hospitality'])
    .add(df_join['Retail'])
    .div(5)
)

In [24]:
# List every column after the merges; used to pick the final column selection.
df_join.columns

Index(['regions', 'neighborhood', 'light_count', 'sport_building_count',
       'workplace_count', 'inhabitants', 'light_per_1000',
       'sport_building_per_1000', 'area_sqkm', 'distance_from_centre_km',
       'geometry', 'drug_store_count', 'green_score', 'livability_score',
       'jobs_count', 'price_2015', 'price_2016', 'price_2017', 'price_2018',
       'price_2019', 'price_2020', 'price_2021', 'price_2022',
       'Accidents (road)', 'Encroachment on public order', 'Fraud (other)',
       'Horizontal Fraud', 'Human trafficking', 'Nature and landscape',
       'Quality of life (other)', 'Road (other)', 'Spatial planning',
       'Special Laws', 'Transport of hazardous substances',
       'Under the influence (water)', 'Abuse', 'Air (other)', 'Animals',
       'Arms Trade', 'Building materials', 'Cybercrime',
       'Destruction or. property damage', 'Discrimination',
       'Domestic Violation', 'Drug trafficking', 'Drugs/drink nuisance',
       'Fire/Explosion', 'Fireworks', '

In [25]:
# Fix the final column set and order; `geometry` is moved to the last position.
# NOTE(review): 'Destruction or. property damage' appears in the
# df_join.columns listing but is not selected here -- confirm that dropping
# it is intentional.
df_join = df_join[['regions', 'neighborhood', 'light_count', 'sport_building_count',
       'workplace_count', 'inhabitants', 'light_per_1000',
       'sport_building_per_1000', 'area_sqkm', 'distance_from_centre_km',
       'drug_store_count', 'green_score', 'livability_score',
       'jobs_count', 'price_2015', 'price_2016', 'price_2017', 'price_2018',
       'price_2019', 'price_2020', 'price_2021', 'price_2022',
       'Accidents (road)', 'Encroachment on public order', 'Fraud (other)',
       'Horizontal Fraud', 'Human trafficking', 'Nature and landscape',
       'Quality of life (other)', 'Road (other)', 'Spatial planning',
       'Special Laws', 'Transport of hazardous substances',
       'Under the influence (water)', 'Abuse', 'Air (other)', 'Animals',
       'Arms Trade', 'Building materials', 'Cybercrime',
        'Discrimination',
       'Domestic Violation', 'Drug trafficking', 'Drugs/drink nuisance',
       'Fire/Explosion', 'Fireworks', 'Food safety', 'Home theft/burglary',
       'Immigration care', 'Most', 'Motor Vehicle Theft',
       'Murder, Manslaughter', 'Neighbor rumor (relationship problems)',
       'Open violence (person)', 'Other property crimes', 'People smuggling',
       'Pesticides', 'Pickpocketing', 'Robbery', 'Shoplifting', 'Soil',
       'Street robbery', 'Structure of the Environmental Management Act',
       'Theft from/from motor vehicles',
       'Theft of mopeds, mopeds and bicycles',
       'Theft/burglary box/garage/shed', 'Theft/burglary of companies, etc.',
       'Thefts (water)', 'Threat', 'Total felonies',
       'Under the influence (air)', 'Under the influence (away)',
       'Vertical Fraud', 'Waste', 'Water', 'Total nuisance registrations',
       'Nuisance by confused person', 'Youth nuisance report',
       'Nuisance due to alcohol/drugs', 'Nuisance drifters',
       'Public intoxication', 'Noise nuisance catering',
       'Noise nuisance event', 'Other noise nuisance', 'Childcare',
       'Education', 'Health and well-being', 'Hospitality', 'Retail', 'proximity_score', 'geometry']]

In [26]:
# Relabel '(away)' as '(road)' so the column lines up with its siblings
# 'Under the influence (air)' and 'Under the influence (water)'.
df_join = df_join.rename(columns={'Under the influence (away)': 'Under the influence (road)'})

In [27]:
# Preview one row of the final merged table.
df_join.head(1)

Unnamed: 0,regions,neighborhood,light_count,sport_building_count,workplace_count,inhabitants,light_per_1000,sport_building_per_1000,area_sqkm,distance_from_centre_km,...,Noise nuisance catering,Noise nuisance event,Other noise nuisance,Childcare,Education,Health and well-being,Hospitality,Retail,proximity_score,geometry
0,Bavel,Bavel,1253,2,2,5445,230.119376,0.367309,1.750374,4.59422,...,0.0,3.0,24.0,11.3,2.6,3.5,10.7,15.8,8.78,"POLYGON ((4.83844 51.56120, 4.83911 51.56024, ..."


In [28]:
# Sanity check: (rows, columns) of the final merged table.
df_join.shape

(56, 91)

In [29]:
# Persist the merged table as GeoJSON. A forward slash keeps the path valid
# on Windows and on POSIX systems, where a backslash would become part of the
# file name instead of separating the folder.
df_join.to_file("data_merged/full_join.geojson", driver='GeoJSON')