# Set Up

In [1]:
# Install module needed for the project
!pip install geoalchemy2
!pip install geopandas
!pip install psycopg2-binary



In [2]:
# Import statements needed for the project
import json
import pathlib
import urllib.parse
import os

import geoalchemy2 as gdb
import geopandas as gpd
import matplotlib.pyplot as plt
import pandas as pd
import requests
import shapely
import sqlalchemy as db

from sqlalchemy.orm import declarative_base
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker
from sqlalchemy import create_engine, Column, String, Integer, Date, Float, MetaData, Table, create_engine
import psycopg2

# Part 1: Data Preprocessing

## 1.1) NYC Open Data

In [3]:
## Store URL and application token
url_complaint = "https://data.cityofnewyork.us/resource/erm2-nwe9.geojson"
url_tree = "https://data.cityofnewyork.us/resource/5rq2-4hqu.geojson"
app_token = "YgXUUmVq41Z9433qxe5qpLOMG"

# Define date range
start_date = "2022-10-01"
end_date = "2023-09-30"

## Define parameter
params_complaint = {"$$app_token": app_token,
                    "$select": "incident_zip,created_date,location",
                    "$where": f"created_date between '{start_date}T00:00:00' and '{end_date}T23:59:59'",
                    "$limit": 1000}

params_tree = {"$$app_token": app_token,
               "$select": "zipcode,created_at, the_geom, spc_common,health,status",
               "$limit": 1000}

## Import NYC Open Data
raw_data_complaint = requests.get(url_complaint, params=params_complaint).json()
raw_data_tree = requests.get(url_tree, params=params_tree).json()

# Convert JSON to DataFrame
gdf_complaint = gpd.GeoDataFrame.from_features(raw_data_complaint['features'])
gdf_tree = gpd.GeoDataFrame.from_features(raw_data_tree['features'])

In [31]:
# Rename columns to have consistent naming
gdf_complaint.rename(columns={'incident_zip': 'zipcode'}, inplace=True)

# Rename columns to have consistent naming
gdf_complaint.rename(columns={'created_date': 'created_at'}, inplace=True)


# Convert data types if necessary (for example, ensuring zip_code is a string)
gdf_complaint['zipcode'] = gdf_complaint['zipcode'].astype(str)
gdf_tree['zipcode'] = gdf_tree['zipcode'].astype(str)


# Convert 'created_at' to datetime data type in both dataframes
gdf_complaint['created_at'] = pd.to_datetime(gdf_complaint['created_at'])
gdf_tree['created_at'] = pd.to_datetime(gdf_tree['created_at'])

In [32]:
gdf_complaint

Unnamed: 0,geometry,zipcode,created_at,geometry_wkb
0,POINT (-73.95918 40.65567),11226,2023-09-30 23:59:58,01010000009b6a5727637d52c0ac06650fed534440
1,POINT (-73.78752 40.76676),11361,2023-09-30 23:59:38,01010000006e75e4b3667252c037a92a0f25624440
2,POINT (-73.98487 40.71950),10002,2023-09-30 23:59:35,0101000000e0fa870d087f52c090f78e90185c4440
3,POINT (-73.79729 40.68750),11435,2023-09-30 23:59:34,01010000000518efcd067352c0cb1428faff574440
4,POINT (-73.95795 40.65220),11226,2023-09-30 23:59:28,0101000000049babfe4e7d52c071d4335c7b534440
...,...,...,...,...
995,POINT (-73.98164 40.76330),10019,2023-09-30 22:35:17,010100000073c21322d37e52c0e640d4cfb3614440
996,POINT (-73.89735 40.75889),11370,2023-09-30 22:35:17,010100000064cfb9396e7952c00fa1624423614440
997,POINT (-73.95548 40.80428),10026,2023-09-30 22:35:07,0101000000b94d43a8267d52c0eff9cea4f2664440
998,POINT (-73.91600 40.81874),10455,2023-09-30 22:35:04,0101000000a160d7b09f7a52c0bde5f487cc684440


In [33]:
gdf_tree

Unnamed: 0,geometry,health,zipcode,spc_common,status,created_at
0,POINT (-73.84422 40.72309),Fair,11375,red maple,Alive,2015-08-27
1,POINT (-73.81868 40.79411),Fair,11357,pin oak,Alive,2015-09-03
2,POINT (-73.93661 40.71758),Good,11211,honeylocust,Alive,2015-09-05
3,POINT (-73.93446 40.71354),Good,11211,honeylocust,Alive,2015-09-05
4,POINT (-73.97598 40.66678),Good,11215,American linden,Alive,2015-08-30
...,...,...,...,...,...,...
995,POINT (-74.12944 40.56929),,10306,,Dead,2015-08-31
996,POINT (-74.00015 40.68505),Poor,11231,Norway maple,Alive,2015-08-26
997,POINT (-73.95752 40.62479),Good,11230,Norway maple,Alive,2015-09-01
998,POINT (-74.09660 40.59259),Good,10304,Norway maple,Alive,2015-08-26


## 1.2) Geometric boundary data 

In [34]:
### Define the path to the SHP file (and related files)
# Path for Irene 'Documents', 'GitHub', 'Tools-for-Analytic-Project', 'data', 'nyc_zipcodes.shp'
# Path for Sahaphon r'C:\Users\USER\Documents\GitHub\Tools-for-Analytic-Project\data\nyc_zipcodes.shp'
Geom_file_path = os.path.join(os.getcwd(),r'C:\Users\USER\Documents\GitHub\Tools-for-Analytic-Project\data\nyc_zipcodes.shp')

### Load the shapefile using geopandas
gdf_zipcode = gpd.read_file(Geom_file_path)
gdf_zipcode


Unnamed: 0,ZIPCODE,BLDGZIP,PO_NAME,POPULATION,AREA,STATE,COUNTY,ST_FIPS,CTY_FIPS,URL,SHAPE_AREA,SHAPE_LEN,geometry
0,11436,0,Jamaica,18681.0,2.269930e+07,NY,Queens,36,081,http://www.usps.com/,0.0,0.0,"POLYGON ((1038098.252 188138.380, 1038141.936 ..."
1,11213,0,Brooklyn,62426.0,2.963100e+07,NY,Kings,36,047,http://www.usps.com/,0.0,0.0,"POLYGON ((1001613.713 186926.440, 1002314.243 ..."
2,11212,0,Brooklyn,83866.0,4.197210e+07,NY,Kings,36,047,http://www.usps.com/,0.0,0.0,"POLYGON ((1011174.276 183696.338, 1011373.584 ..."
3,11225,0,Brooklyn,56527.0,2.369863e+07,NY,Kings,36,047,http://www.usps.com/,0.0,0.0,"POLYGON ((995908.365 183617.613, 996522.848 18..."
4,11218,0,Brooklyn,72280.0,3.686880e+07,NY,Kings,36,047,http://www.usps.com/,0.0,0.0,"POLYGON ((991997.113 176307.496, 992042.798 17..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...
258,10310,0,Staten Island,25003.0,5.346328e+07,NY,Richmond,36,085,http://www.usps.com/,0.0,0.0,"POLYGON ((950767.507 172848.969, 950787.510 17..."
259,11693,0,Far Rockaway,11052.0,3.497516e+06,NY,Kings,36,047,http://www.usps.com/,0.0,0.0,"POLYGON ((1028453.995 167153.410, 1027813.010 ..."
260,11249,0,Brooklyn,28481.0,1.777221e+07,NY,Kings,36,047,http://www.usps.com/,0.0,0.0,"POLYGON ((995877.318 203206.075, 995968.511 20..."
261,10162,1,New York,0.0,2.103489e+04,NY,New York,36,061,http://www.usps.com/,0.0,0.0,"POLYGON ((997731.761 219560.922, 997641.948 21..."


In [35]:
# Reproject gdf to EPSG:4326
gdf_zipcode = gdf_zipcode.to_crs(epsg=4326)

# Check the new CRS to confirm the change
print(gdf_zipcode.crs)

EPSG:4326


## 1.3) Rent data 

In [36]:
### Define selected column (date)
rent_selecte_column = ['2023-08-31', '2023-01-31']

### Define the path to the CSV file
# Path for Irene 'data', 'zillow_rent_data.csv'
# Path for Sahaphon r'C:\Users\USER\Documents\GitHub\Tools-for-Analytic-Project\data\zillow_rent_data.csv'
rent_file_path = os.path.join(os.getcwd(),r'C:\Users\USER\Documents\GitHub\Tools-for-Analytic-Project\data\zillow_rent_data.csv')

### Convert CSV to DataFrame
df = pd.read_csv(rent_file_path,usecols=rent_selecte_column)
df

Unnamed: 0,2023-01-31,2023-08-31
0,2027.438438,2053.486247
1,1738.217986,1795.384582
2,1706.900064,1757.602011
3,1458.063897,1488.180414
4,2895.699421,3064.476503
...,...,...
6717,3509.210744,3310.302151
6718,,2639.938102
6719,,
6720,2169.143026,2383.185013


In [37]:
## Part 2: Storing Data

In [38]:
## Create a new database in PostgreSQL
#!createdb group48project

^C


In [39]:
## Turn on the PostGIS extension
#!psql --dbname group48project -c 'CREATE EXTENSION postgis;'

^C


In [42]:
## Create Schema File (schema.sql):

ZIPCODE_SCHEMA = """
CREATE TABLE IF NOT EXISTS nyc_zip_codes (
    zip_code VARCHAR(10) PRIMARY KEY,
    borough VARCHAR(255),
    neighborhood VARCHAR(255)
);
"""

NYC_311_SCHEMA = """
CREATE TABLE IF NOT EXISTS complaints_311 (
    complaint_id SERIAL PRIMARY KEY,
    geometry GEOMETRY(Point),
    incident_zip VARCHAR(10),
    created_date TIMESTAMP
);
"""

NYC_TREE_SCHEMA = """
CREATE TABLE IF NOT EXISTS trees (
    tree_id SERIAL PRIMARY KEY,
    zipcode VARCHAR(10),
    created_at DATE,
    spc_common VARCHAR(255),
    health VARCHAR(50),
    status VARCHAR(50)
);
"""

ZILLOW_SCHEMA = """
CREATE TABLE IF NOT EXISTS historical_rents (
    id SERIAL PRIMARY KEY,
    zip_code VARCHAR(10),
    date DATE,
    average_rent NUMERIC
);  
"""

In [43]:
# create that required schema.sql file
with open('schema.sql', 'w') as file:
    file.write(ZIPCODE_SCHEMA)
    file.write('\n')
    file.write(NYC_311_SCHEMA)
    file.write('\n')
    file.write(NYC_TREE_SCHEMA)
    file.write('\n')
    file.write(ZILLOW_SCHEMA)

In [44]:
from sqlalchemy import text

In [45]:
from sqlalchemy import create_engine
from geoalchemy2 import Geometry
import geopandas as gpd

# Database credentials
dbname = 'group48project'
user = 'postgres'
host = 'localhost'
port = '5432'
password = 'postgres'

# Create an SQLAlchemy engine
engine = create_engine(f'postgresql+psycopg2://{user}:{password}@{host}:{port}/{dbname}')

# Filter Out Invalid Geometries
gdf_complaint = gdf_complaint[gdf_complaint['geometry'].notnull()]

# Use the known SRID since we have normalized the data
srid = 4326

# Define the table name
table_name='complaints_311'
print(engine)


# Directly insert the GeoDataFrame into the database 
gdf_complaint.to_sql(table_name, engine, if_exists='append', index=False, 
                     dtype={'geometry': Geometry('POINT', srid=4326)})

# Query to view the contents of the table
with engine.connect() as connection:
    query = text(f"SELECT * FROM {table_name};")
    result_df = pd.read_sql_query(query, connection)

# Display the first few rows of the DataFrame
print(result_df.head())

# Close the engine
engine.dispose()


Engine(postgresql+psycopg2://postgres:***@localhost:5432/group48project)


ProgrammingError: (psycopg2.ProgrammingError) can't adapt type 'Point'
[SQL: INSERT INTO complaints_311 (geometry, zipcode, created_at, geometry_wkb) VALUES (ST_GeomFromEWKT(%(geometry)s), %(zipcode)s, %(created_at)s, %(geometry_wkb)s)]
[parameters: ({'geometry': <POINT (-73.959 40.656)>, 'zipcode': '11226', 'created_at': datetime.datetime(2023, 9, 30, 23, 59, 58), 'geometry_wkb': '01010000009b6a5727637d52c0ac06650fed534440'}, {'geometry': <POINT (-73.788 40.767)>, 'zipcode': '11361', 'created_at': datetime.datetime(2023, 9, 30, 23, 59, 38), 'geometry_wkb': '01010000006e75e4b3667252c037a92a0f25624440'}, {'geometry': <POINT (-73.985 40.719)>, 'zipcode': '10002', 'created_at': datetime.datetime(2023, 9, 30, 23, 59, 35), 'geometry_wkb': '0101000000e0fa870d087f52c090f78e90185c4440'}, {'geometry': <POINT (-73.797 40.687)>, 'zipcode': '11435', 'created_at': datetime.datetime(2023, 9, 30, 23, 59, 34), 'geometry_wkb': '01010000000518efcd067352c0cb1428faff574440'}, {'geometry': <POINT (-73.958 40.652)>, 'zipcode': '11226', 'created_at': datetime.datetime(2023, 9, 30, 23, 59, 28), 'geometry_wkb': '0101000000049babfe4e7d52c071d4335c7b534440'}, {'geometry': <POINT (-73.827 40.686)>, 'zipcode': '11419', 'created_at': datetime.datetime(2023, 9, 30, 23, 59, 23), 'geometry_wkb': '010100000074835412eb7452c052589886c9574440'}, {'geometry': <POINT (-73.824 40.829)>, 'zipcode': '10465', 'created_at': datetime.datetime(2023, 9, 30, 23, 59, 22), 'geometry_wkb': '0101000000a6cfdbf0c37452c084102061166a4440'}, {'geometry': <POINT (-73.906 40.683)>, 'zipcode': '11207', 'created_at': datetime.datetime(2023, 9, 30, 23, 59, 6), 'geometry_wkb': '0101000000d39209ddfd7952c092a7a39961574440'}  ... displaying 10 of 986 total bound parameter sets ...  {'geometry': <POINT (-73.916 40.819)>, 'zipcode': '10455', 'created_at': datetime.datetime(2023, 9, 30, 22, 35, 4), 'geometry_wkb': '0101000000a160d7b09f7a52c0bde5f487cc684440'}, {'geometry': <POINT (-73.94 40.707)>, 'zipcode': '11206', 'created_at': datetime.datetime(2023, 9, 30, 22, 35, 2), 'geometry_wkb': '0101000000ec07fe4b257c52c01b7d7fb98a5a4440'})]
(Background on this error at: https://sqlalche.me/e/14/f405)

In [49]:
from sqlalchemy import create_engine, text
from geoalchemy2 import Geometry
import geopandas as gpd
import pandas as pd
from shapely import wkb

# Database credentials
dbname = 'group48project'
user = 'postgres'
host = 'localhost'
port = '5432'
password = 'postgres'

# Create an SQLAlchemy engine
engine = create_engine(f'postgresql+psycopg2://{user}:{password}@{host}:{port}/{dbname}')

# Filter Out Invalid Geometries
gdf_complaint = gdf_complaint[gdf_complaint['geometry'].notnull()]

# Use the known SRID since we have normalized the data
srid = 4326

# Convert the 'geometry' column to WKB format
gdf_complaint['geometry_wkb'] = gdf_complaint['geometry'].apply(lambda geom: wkb.dumps(geom).hex())

# Define the table name
table_name = 'complaints_311'
print(engine)

# Directly insert the GeoDataFrame into the database 
insert_query = f"INSERT INTO {table_name} (geometry, incident_zip, created_date) VALUES (ST_GeomFromWKB(:geometry_wkb), :incident_zip, :created_date)"
with engine.connect() as connection:
    for index, row in gdf_complaint.iterrows():
        params = {
            'geometry_wkb': bytes.fromhex(row['geometry_wkb']),  # Convert WKB hex string to bytes
            'incident_zip': row['zipcode'],  # Fix the parameter name here
            'created_date': row['created_at']
        }
        connection.execute(text(insert_query), params)
        
# Query to view the contents of the table
with engine.connect() as connection:
    query = text(f"SELECT ST_AsText(ST_GeomFromWKB(geometry::geometry)) as geometry, incident_zip, created_date FROM {table_name};")
    result_df = pd.read_sql_query(query, connection)

# Display the first few rows of the DataFrame
print(result_df.head())

# Close the engine
engine.dispose()


Engine(postgresql+psycopg2://postgres:***@localhost:5432/group48project)
                                       geometry incident_zip  \
0  POINT(-73.95917686020623 40.655672001198894)        11226   
1   POINT(-73.78751847563191 40.76675595839554)        11361   
2   POINT(-73.98486650733275 40.71949965458691)        10002   
3   POINT(-73.7972903094197 40.687499303408536)        11435   
4    POINT(-73.9579464603267 40.65220215349917)        11226   

              created_date  
0  2023-09-30T23:59:58.000  
1  2023-09-30T23:59:38.000  
2  2023-09-30T23:59:35.000  
3  2023-09-30T23:59:34.000  
4  2023-09-30T23:59:28.000  


In [None]:
from sqlalchemy import create_engine, text
from geoalchemy2 import Geometry
import geopandas as gpd
import pandas as pd

# Database credentials
dbname = 'group48project'
user = 'postgres'
host = 'localhost'
port = '5432'
password = 'postgres'

# Create an SQLAlchemy engine
engine = create_engine(f'postgresql+psycopg2://{user}:{password}@{host}:{port}/{dbname}')

# Filter Out Invalid Geometries
gdf_complaint = gdf_complaint[gdf_complaint['geometry'].notnull()]

# Use the known SRID since we have normalized the data
srid = 4326

# Define the table name
table_name = 'complaints_311'
print(engine)

# Directly insert the GeoDataFrame into the database 
insert_query = f"INSERT INTO {table_name} (geometry, incident_zip, created_date) VALUES (ST_SetSRID(ST_MakePoint(:lon, :lat), {srid}), :incident_zip, :created_date)"
with engine.connect() as connection:
    for index, row in gdf_complaint.iterrows():
        params = {
            'lon': row['geometry'].x,
            'lat': row['geometry'].y,
            'incident_zip': row['incident_zip'],
            'created_date': row['created_date']
        }
        connection.execute(text(insert_query), params)

# Query to view the contents of the table
with engine.connect() as connection:
    query = text(f"SELECT * FROM {table_name};")
    result_df = pd.read_sql_query(query, connection)

# Display the first few rows of the DataFrame
print(result_df.head())

# Close the engine
engine.dispose()
