### Importing packages

In [2]:
import pandas as pd
import numpy as np
from shapely.wkt import loads as load_wkt
import geopandas as gpd

### Importing data


In [3]:
zipcodes = pd.read_csv('../data/ZIPCODES.csv')

### Generating centroids based on zipcodes

In [7]:
# Convert the_geom to Shapely geometry
zipcodes['geometry'] = zipcodes['the_geom'].apply(load_wkt)

In [8]:
# Create a GeoDataFrame
gdf = gpd.GeoDataFrame(zipcodes, geometry='geometry')

In [9]:
gdf['centroid'] = gdf['geometry'].centroid

In [10]:
gdf

Unnamed: 0,MAIL_CITY,the_geom,ZIPCODE,Shape_Leng,Shape_Area,geometry,centroid
0,GLEN ECHO,MULTIPOLYGON (((-77.14303625193239 38.97166593...,20812,14504.506513,9.810937e+06,"MULTIPOLYGON (((-77.14304 38.97167, -77.14287 ...",POINT (-77.14294 38.96663)
1,BETHESDA,MULTIPOLYGON (((-77.13906620687646 38.96881414...,20816,58716.124845,1.175655e+08,"MULTIPOLYGON (((-77.13907 38.96881, -77.13927 ...",POINT (-77.11806 38.95648)
2,CABIN JOHN,MULTIPOLYGON (((-77.15860566618579 38.98484318...,20818,31103.235156,3.599683e+07,"MULTIPOLYGON (((-77.15861 38.98484, -77.15859 ...",POINT (-77.16111 38.97314)
3,BETHESDA,MULTIPOLYGON (((-77.09628232676539 38.99377978...,20894,1298.401522,2.547457e+04,"MULTIPOLYGON (((-77.09628 38.99378, -77.09625 ...",POINT (-77.09644 38.99422)
4,BETHESDA,MULTIPOLYGON (((-77.09646148390605 38.99533342...,20892,140.742936,7.448763e+02,"MULTIPOLYGON (((-77.09646 38.99533, -77.09643 ...",POINT (-77.09647 38.99526)
...,...,...,...,...,...,...,...
92,BOYDS,MULTIPOLYGON (((-77.31723216974609 39.24504591...,20841,250533.236842,7.723518e+08,"MULTIPOLYGON (((-77.31723 39.24505, -77.31713 ...",POINT (-77.32652 39.18828)
93,GERMANTOWN,MULTIPOLYGON (((-77.22902654924928 39.25281687...,20876,135940.688675,3.258196e+08,"MULTIPOLYGON (((-77.22903 39.25282, -77.22884 ...",POINT (-77.23272 39.20877)
94,BROOKEVILLE,MULTIPOLYGON (((-77.09290544438616 39.26236508...,20833,207931.092330,5.585343e+08,"MULTIPOLYGON (((-77.09291 39.26237, -77.09266 ...",POINT (-77.05685 39.20370)
95,DICKERSON,MULTIPOLYGON (((-77.32545880875539 39.27754835...,20842,357054.654503,1.131929e+09,"MULTIPOLYGON (((-77.32546 39.27755, -77.32559 ...",POINT (-77.43622 39.19001)


In [11]:
gdf['centroid_latitude'] = gdf['centroid'].y
gdf['centroid_longitude'] = gdf['centroid'].x

In [12]:
gdf

Unnamed: 0,MAIL_CITY,the_geom,ZIPCODE,Shape_Leng,Shape_Area,geometry,centroid,centroid_latitude,centroid_longitude
0,GLEN ECHO,MULTIPOLYGON (((-77.14303625193239 38.97166593...,20812,14504.506513,9.810937e+06,"MULTIPOLYGON (((-77.14304 38.97167, -77.14287 ...",POINT (-77.14294 38.96663),38.966627,-77.142937
1,BETHESDA,MULTIPOLYGON (((-77.13906620687646 38.96881414...,20816,58716.124845,1.175655e+08,"MULTIPOLYGON (((-77.13907 38.96881, -77.13927 ...",POINT (-77.11806 38.95648),38.956480,-77.118062
2,CABIN JOHN,MULTIPOLYGON (((-77.15860566618579 38.98484318...,20818,31103.235156,3.599683e+07,"MULTIPOLYGON (((-77.15861 38.98484, -77.15859 ...",POINT (-77.16111 38.97314),38.973140,-77.161105
3,BETHESDA,MULTIPOLYGON (((-77.09628232676539 38.99377978...,20894,1298.401522,2.547457e+04,"MULTIPOLYGON (((-77.09628 38.99378, -77.09625 ...",POINT (-77.09644 38.99422),38.994224,-77.096444
4,BETHESDA,MULTIPOLYGON (((-77.09646148390605 38.99533342...,20892,140.742936,7.448763e+02,"MULTIPOLYGON (((-77.09646 38.99533, -77.09643 ...",POINT (-77.09647 38.99526),38.995261,-77.096473
...,...,...,...,...,...,...,...,...,...
92,BOYDS,MULTIPOLYGON (((-77.31723216974609 39.24504591...,20841,250533.236842,7.723518e+08,"MULTIPOLYGON (((-77.31723 39.24505, -77.31713 ...",POINT (-77.32652 39.18828),39.188277,-77.326518
93,GERMANTOWN,MULTIPOLYGON (((-77.22902654924928 39.25281687...,20876,135940.688675,3.258196e+08,"MULTIPOLYGON (((-77.22903 39.25282, -77.22884 ...",POINT (-77.23272 39.20877),39.208771,-77.232719
94,BROOKEVILLE,MULTIPOLYGON (((-77.09290544438616 39.26236508...,20833,207931.092330,5.585343e+08,"MULTIPOLYGON (((-77.09291 39.26237, -77.09266 ...",POINT (-77.05685 39.20370),39.203696,-77.056854
95,DICKERSON,MULTIPOLYGON (((-77.32545880875539 39.27754835...,20842,357054.654503,1.131929e+09,"MULTIPOLYGON (((-77.32546 39.27755, -77.32559 ...",POINT (-77.43622 39.19001),39.190007,-77.436218


### Generating dataframe for data warehouse

In [13]:
# Create the LocationAreaKey
gdf['LocationAreaKey'] = (gdf['centroid_longitude'].astype(str) + '_' + gdf['centroid_latitude'].astype(str)).apply(lambda x: str(abs(hash(x))))

# Create the LocationAreaDim DataFrame
location_area_dim = pd.DataFrame({
    'LocationAreaKey': gdf['LocationAreaKey'],
    'Zipcode': gdf['ZIPCODE'],
    'MailCity': gdf['MAIL_CITY'],
    'ShapeLength': gdf['Shape_Leng'],
    'ShapeArea': gdf['Shape_Area'],
    'CeontroidLatitude': gdf['centroid_latitude'],
    'CeontroidLongitude': gdf['centroid_longitude']
})

        LocationAreaKey  Zipcode     MailCity    ShapeLength     ShapeArea  \
0   1765469966564180105    20812    GLEN ECHO   14504.506513  9.810937e+06   
1   7961571706959143536    20816     BETHESDA   58716.124845  1.175655e+08   
2   3540836994776163547    20818   CABIN JOHN   31103.235156  3.599683e+07   
3   3573323627920319747    20894     BETHESDA    1298.401522  2.547457e+04   
4   4724237814791786877    20892     BETHESDA     140.742936  7.448763e+02   
..                  ...      ...          ...            ...           ...   
92  1615070481990266122    20841        BOYDS  250533.236842  7.723518e+08   
93  9058900681168067254    20876   GERMANTOWN  135940.688675  3.258196e+08   
94  1879160579523463433    20833  BROOKEVILLE  207931.092330  5.585343e+08   
95  6008151881358816533    20842    DICKERSON  357054.654503  1.131929e+09   
96  4903927293200828750    20842    DICKERSON    8182.016222  3.119688e+06   

    CeontroidLatitude  CeontroidLongitude  
0           38.9666

In [14]:
location_area_dim

Unnamed: 0,LocationAreaKey,Zipcode,MailCity,ShapeLength,ShapeArea,CeontroidLatitude,CeontroidLongitude
0,1765469966564180105,20812,GLEN ECHO,14504.506513,9.810937e+06,38.966627,-77.142937
1,7961571706959143536,20816,BETHESDA,58716.124845,1.175655e+08,38.956480,-77.118062
2,3540836994776163547,20818,CABIN JOHN,31103.235156,3.599683e+07,38.973140,-77.161105
3,3573323627920319747,20894,BETHESDA,1298.401522,2.547457e+04,38.994224,-77.096444
4,4724237814791786877,20892,BETHESDA,140.742936,7.448763e+02,38.995261,-77.096473
...,...,...,...,...,...,...,...
92,1615070481990266122,20841,BOYDS,250533.236842,7.723518e+08,39.188277,-77.326518
93,9058900681168067254,20876,GERMANTOWN,135940.688675,3.258196e+08,39.208771,-77.232719
94,1879160579523463433,20833,BROOKEVILLE,207931.092330,5.585343e+08,39.203696,-77.056854
95,6008151881358816533,20842,DICKERSON,357054.654503,1.131929e+09,39.190007,-77.436218


In [15]:
LocationAreaDim = location_area_dim

In [16]:
LocationAreaDim

Unnamed: 0,LocationAreaKey,Zipcode,MailCity,ShapeLength,ShapeArea,CeontroidLatitude,CeontroidLongitude
0,1765469966564180105,20812,GLEN ECHO,14504.506513,9.810937e+06,38.966627,-77.142937
1,7961571706959143536,20816,BETHESDA,58716.124845,1.175655e+08,38.956480,-77.118062
2,3540836994776163547,20818,CABIN JOHN,31103.235156,3.599683e+07,38.973140,-77.161105
3,3573323627920319747,20894,BETHESDA,1298.401522,2.547457e+04,38.994224,-77.096444
4,4724237814791786877,20892,BETHESDA,140.742936,7.448763e+02,38.995261,-77.096473
...,...,...,...,...,...,...,...
92,1615070481990266122,20841,BOYDS,250533.236842,7.723518e+08,39.188277,-77.326518
93,9058900681168067254,20876,GERMANTOWN,135940.688675,3.258196e+08,39.208771,-77.232719
94,1879160579523463433,20833,BROOKEVILLE,207931.092330,5.585343e+08,39.203696,-77.056854
95,6008151881358816533,20842,DICKERSON,357054.654503,1.131929e+09,39.190007,-77.436218


In [30]:
def generate_location_area_dim(zipcodes):
    zipcodes['geometry'] = zipcodes['the_geom'].apply(load_wkt)
    gdf = gpd.GeoDataFrame(zipcodes, geometry='geometry')
    gdf['centroid'] = gdf['geometry'].centroid
    gdf['centroid_latitude'] = gdf['centroid'].y
    gdf['centroid_longitude'] = gdf['centroid'].x
    gdf['LocationAreaKey'] = (gdf['centroid_longitude'].astype(str) + '_' + gdf['centroid_latitude'].astype(str)).apply(lambda x: str(abs(hash(x))))
    location_area_dim = pd.DataFrame({
        'LocationAreaKey': gdf['LocationAreaKey'],
        'Zipcode': gdf['ZIPCODE'],
        'MailCity': gdf['MAIL_CITY'],
        'ShapeLength': gdf['Shape_Leng'],
        'ShapeArea': gdf['Shape_Area'],
        'CeontroidLatitude': gdf['centroid_latitude'],
        'CeontroidLongitude': gdf['centroid_longitude']
    })
    return location_area_dim

In [31]:
test = generate_location_area_dim(zipcodes)