In [None]:
import pandas
import geopandas
import numpy as np
import matplotlib.pyplot as plt

In [None]:
def load_frame(type, path):
    """Read a data file into a dataframe.

    Parameters
    ----------
    type : str
        'pandas' reads ``path`` as CSV with ``pandas.read_csv``; any other
        value reads it with ``geopandas.read_file``. (The name shadows the
        builtin ``type``; it is kept because callers may pass it by keyword.)
    path : str
        Location of the file to read.

    Returns
    -------
    The frame produced by the selected reader.
    """
    # Guard clause: everything that is not explicitly 'pandas' goes to geopandas.
    if type != 'pandas':
        return geopandas.read_file(path)
    return pandas.read_csv(path)

In [None]:
# Read the pre-merged GeoJSON and the two raw CSV tables.
full_join = load_frame(type='geopandas', path='data_merged/full_join.geojson')
livability_green_index = load_frame(type='pandas', path='data_raw/green_live_buurt_complete.csv')
house_price = load_frame(type='pandas', path='data_raw/housing_prices.csv')

In [None]:
# Show the column labels of the merged frame.
full_join.columns

In [None]:
# Continue the analysis on the pre-merged frame under the name `data`.
# An older, disabled approach (right-merging house_price and
# livability_green_index onto the frame by 'BUURT') used to be parked here as a
# bare string literal; the notebook evaluated that string and echoed it as the
# cell output, so it has been removed.
data = full_join

In [None]:
# Display the working frame.
data

In [None]:
# Keep only rows that have a green_score.
# `data` was bound to the unfiltered frame in an earlier cell, and every later
# cell reads `data`, so the same filter is applied to it here; filtering
# `full_join` alone never reached the analysis.
full_join = full_join[full_join['green_score'].notna()]
data = data[data['green_score'].notna()]

In [None]:
# Harmonise the column names in a single pass:
# price_2022 -> price, neighborhood -> BUURT.
data = data.rename(columns={'price_2022': 'price', 'neighborhood': 'BUURT'})

# A price of 0 is treated as missing, then every missing price is imputed with
# the mean of the remaining prices.
price_known = data['price'].replace(0, np.nan)
data['price'] = price_known.fillna(price_known.mean())

In [None]:
def plot_below(data, column, top_n=20):
    """Plot price and ``column`` for the rows with the largest ``column`` values.

    The frame is sorted by ``column`` in descending order and the first
    ``top_n`` rows are kept. Two stacked line charts are drawn over those rows
    (price on top, ``column`` below), with one x position per row labelled by
    the row's 'BUURT' value. Spearman's rank correlation between 'price' and
    ``column`` is then printed; it is computed on the plotted rows only, not
    on the whole frame.

    Parameters
    ----------
    data : DataFrame
        Must contain the columns 'BUURT', 'price' and ``column``.
    column : str
        Column to rank by and to compare against price.
    top_n : int, default 20
        Number of highest-ranked rows to plot and correlate.

    Returns
    -------
    None
        The figure is drawn and the coefficient printed as side effects.
    """
    ranked = data.sort_values(by=column, ascending=False).reset_index(drop=True)
    top_rows = ranked.head(top_n)

    # Plot against explicit integer positions and pin the ticks to them before
    # labelling. Labelling a categorical axis with set_xticklabels alone can
    # misalign names when a label repeats and triggers a matplotlib warning.
    positions = list(range(len(top_rows)))
    labels = top_rows['BUURT']

    fig, ax = plt.subplots(2)
    panels = ((ax[0], 'price', 'Price'), (ax[1], column, column))
    for axis, series_name, title in panels:
        axis.plot(positions, top_rows[series_name])
        axis.set_xticks(positions)
        axis.set_xticklabels(labels, rotation=45, ha='right', rotation_mode='anchor')
        axis.set_title(title)
    fig.tight_layout()

    coefficient = top_rows['price'].corr(top_rows[column], method='spearman')
    print("Spearman's Corr for price and", column, ' =', coefficient)

In statistics, Spearman's rank correlation coefficient is a nonparametric measure of rank correlation (statistical dependence between the rankings of two variables). It assesses how well the relationship between two variables can be described using a monotonic function.

Translation for non-nerds: Spearman's correlation coefficient tells us how consistently two variables move together: if x increases, does y reliably increase (or reliably decrease) as well?

In [None]:
# Load the Spearman illustration stored next to the notebook and render it.
image = plt.imread(fname='Spearman.png')
plt.imshow(X=image)

First Section | house price vs neighbourhood size

In [None]:
# Price against neighbourhood area.
plot_below(data, column='area_sqkm')

In [None]:
# Price against number of jobs.
plot_below(data, column='jobs_count')

In [None]:
# Price against number of inhabitants.
plot_below(data, column='inhabitants')

Second Section | house price vs safety/criminality

In [None]:
# Price against drug store count.
plot_below(data, column='drug_store_count')

In [None]:
# Price against total felonies.
plot_below(data, column='Total felonies')

In [None]:
# Price against moped and bicycle theft.
plot_below(data, column='Theft of mopeds, mopeds and bicycles')

Third Section | house price vs livability/others

In [None]:
# Price against green score.
plot_below(data, column='green_score')

In [None]:
# Price against livability score.
plot_below(data, column='livability_score')

In [None]:
# Price against proximity score.
plot_below(data, column='proximity_score')

In [None]:
# Display the jobs_count column.
data['jobs_count']

Keep in mind: house prices of 0 have been replaced with the mean house price. (This affects the correlation coefficients.)

In [None]:
# Column labels of interest, one per line.
columns = [
    'BUURT',
    'light_count',
    'sport_building_count',
    'inhabitants',
    'light_per_1000',
    'sport_building_per_1000',
    'area_sqkm',
    'green_score',
    'livability_score',
    'price',
    'jobs_count',
    'Nuisance by confused person',
    'Youth nuisance report',
    'Nuisance due to alcohol/drugs',
    'Public intoxication',
    'Accidents (road)',
    'Abuse',
    'Discrimination',
    'Drugs/drink nuisance',
    'Home theft/burglary',
    'Pickpocketing',
    'Robbery',
    'Shoplifting',
    'Street robbery',
    'Theft of mopeds, mopeds and bicycles',
    'Theft/burglary box/garage/shed',
    'Total felonies',
    'drug_store_count',
    'Childcare',
    'proximity_score',
    'geometry',
]

In [None]:
# Work on an independent copy. A plain assignment would only alias `data`, so
# the in-place edits made to data_stripped in later cells (NaN fill, the
# 'ultimate' column, label-encoded BUURT) would silently rewrite `data` too and
# break re-running the plotting cells.
data_stripped = data.copy()

In [None]:
# Treat a missing drug store count as zero.
# The result is assigned back explicitly: calling an inplace method on a column
# selection is a chained assignment, which is not guaranteed to update the frame
# (and never does under pandas Copy-on-Write).
data_stripped['drug_store_count'] = data_stripped['drug_store_count'].fillna(0)

In [None]:
# List the dtype of every column.
data_stripped.dtypes

In [None]:
# Columns averaged into the composite 'ultimate' indicator.
ultimate_columns = [
    'light_count',
    'sport_building_count',
    'inhabitants',
    'light_per_1000',
    'sport_building_per_1000',
    'area_sqkm',
    'green_score',
    'livability_score',
    'jobs_count',
    'Nuisance by confused person',
    'Youth nuisance report',
    'Nuisance due to alcohol/drugs',
    'Public intoxication',
    'Accidents (road)',
    'Abuse',
    'Discrimination',
    'Drugs/drink nuisance',
    'Home theft/burglary',
    'Pickpocketing',
    'Robbery',
    'Shoplifting',
    'Street robbery',
    'Theft of mopeds, mopeds and bicycles',
    'Theft/burglary box/garage/shed',
    'Total felonies',
    'drug_store_count',
    'proximity_score',
]

# Row-wise mean of the columns above. The divisor now follows the list: the
# previous expression summed these 27 columns but divided by a hard-coded 28.
# skipna=False keeps the behaviour of the original chained `+`: a row with any
# missing input gets NaN instead of a partial average.
data_stripped['ultimate'] = data_stripped[ultimate_columns].mean(axis=1, skipna=False)

In [None]:
# Spearman rank correlation between price and the composite 'ultimate' column.
mega_corr = data_stripped['price'].corr(
    other=data_stripped['ultimate'],
    method='spearman',
)

In [None]:
# Print the coefficient stored in mega_corr.
print(mega_corr)

In [None]:
# Show the column labels of data_stripped.
data_stripped.columns

In [None]:
from sklearn import preprocessing

In [None]:
# Encode the BUURT names as integer codes. The fitted encoder stays available as
# `le` so the codes can be mapped back to names later.
le = preprocessing.LabelEncoder()
data_stripped['BUURT'] = le.fit_transform(data_stripped['BUURT'])

In [None]:
# Replace every remaining NaN with the sentinel value -1.
data_stripped = data_stripped.replace(to_replace=np.nan, value=-1)

In [None]:
# Write the prepared table to disk. index=False keeps the row index out of the
# file; otherwise it is stored as an extra unnamed column that comes back as
# 'Unnamed: 0' when the CSV is read again and becomes part of the table the
# synthesizer is trained on.
data_stripped.to_csv('data_merged/full_join_test.csv', index=False)

In [None]:
from sdv.datasets.local import load_csvs

# load the CSV files found in data_merged/; the folder is expected to contain full_join_test.csv
datasets = load_csvs(folder_name='data_merged/')

# the table is looked up under its file name without the .csv extension
guests_table = datasets['full_join_test']

In [None]:
from sdv.single_table import CTGANSynthesizer
from sdv.metadata import SingleTableMetadata

# Build the table metadata by letting SDV inspect data_merged/full_join_test.csv.
# NOTE(review): the detected column types are not inspected anywhere in this
# notebook; confirm they are sensible before training.
metadata = SingleTableMetadata()
metadata.detect_from_csv(filepath='data_merged/full_join_test.csv')

In [None]:
# Configure and train a CTGAN synthesizer on the real table, persist the fitted
# model, then draw a first batch of 200 synthetic rows and preview it.
# NOTE(review): no random seed is set, so results may differ between runs.
ctgan = CTGANSynthesizer(
    metadata,
    batch_size=10,
    epochs=1000,
    pac=5,
    verbose=False,
)
ctgan.fit(guests_table)
ctgan.save('ctgan_full_join.pkl')

samples = ctgan.sample(200)
samples.head()

In [None]:
from sdv.evaluation.single_table import evaluate_quality

# Score the synthetic rows held in `samples` against the real table.
quality_report = evaluate_quality(
    real_data=guests_table,
    synthetic_data=samples,
    metadata=metadata)

In [None]:
# Draw 500 synthetic rows from the synthesizer.
synthetic_data = ctgan.sample(num_rows=500)

In [None]:
# Display the synthetic rows.
synthetic_data

In [None]:
# Evaluate the rows held in `synthetic_data` against the real table.
# The result is stored in quality_report, replacing any earlier value of that name.
quality_report = evaluate_quality(
    real_data=guests_table,
    synthetic_data=synthetic_data,
    metadata=metadata)

In [None]:
# Map the BUURT codes back to names using the encoder `le`.
# NOTE(review): presumably every synthetic code is one the encoder was fitted
# on; verify, since unseen codes cannot be decoded.
synthetic_data['BUURT'] = le.inverse_transform(synthetic_data['BUURT'])

In [None]:
# Persist the synthetic table. index=False keeps the meaningless row index out
# of the file, so it does not reappear as an 'Unnamed: 0' column when the CSV
# is read back.
synthetic_data.to_csv('data_merged/fake_full_join.csv', index=False)