# NYC Apartment Search

_[Project prompt](https://docs.google.com/document/d/1BYVyFBDcTywdUlanH0ysfOrNWPgl7UkqXA7NeewTzxA/edit#heading=h.bpxu7uvknnbk)_

_This scaffolding notebook may be used to help set up your final project. It's **totally optional** whether you make use of this or not._

_If you do use this notebook, everything provided is optional as well - you may remove or add code as you wish._

_**All code below should be considered "pseudo-code" - not functional by itself, and only an idea of a possible approach.**_

## Setup

In [15]:
# All import statements needed for the project, for example:
#!pip install geoalchemy2
#!pip install geopandas
import json
import pathlib
import urllib.parse

import geoalchemy2 as gdb
import geopandas as gpd
import matplotlib.pyplot as plt
import pandas as pd
import requests
import shapely
import sqlalchemy as db

from sqlalchemy.orm import declarative_base

In [16]:
# Any constants you might need; some have been added for you

# Where data files will be read from/written to - this should already exist
DATA_DIR = pathlib.Path("data")
ZIPCODE_DATA_FILE = DATA_DIR / "zipcodes" / "nyc_zipcodes.shp"
ZILLOW_DATA_FILE = DATA_DIR / "zillow_rent_data.csv"

# NOTE(review): this app token is committed in plain text; consider loading it
# from an environment variable and rotating the value.
NYC_DATA_APP_TOKEN = "aQ9WaK19vkxI27LB8CNNI6E7Y"
BASE_NYC_DATA_URL = "https://data.cityofnewyork.us/resource"
NYC_DATA_311 = "erm2-nwe9.geojson"
NYC_DATA_TREES = "5rq2-4hqu.geojson"

DB_NAME = "real_estate_nyc"  # Replace with your actual database name
DB_USER = "4501_project_team"  # Replace with your actual database user
# Alternative local user: DB_USER = "jt3467"
# SQLAlchemy's dialect name is "postgresql"; the "postgres" spelling is not
# accepted by current SQLAlchemy releases.
DB_URL = f"postgresql+psycopg2://{DB_USER}@localhost/{DB_NAME}"
DB_SCHEMA_FILE = "schema.sql"

# directory where DB queries for Part 3 will be saved
QUERY_DIR = pathlib.Path("queries")

In [17]:
# Make sure QUERY_DIR exists. exist_ok=True avoids the check-then-create race
# of exists() followed by mkdir(); parents=True also covers a nested QUERY_DIR.
QUERY_DIR.mkdir(parents=True, exist_ok=True)

## Part 1: Data Preprocessing

In [18]:
import json
import urllib.parse
import requests
from pathlib import Path

def download_nyc_geojson_data(url, force=False, timeout=60):
    """Download the JSON served at ``url`` into ``DATA_DIR`` unless already cached.

    The local file is ``DATA_DIR`` joined with the path component of ``url``
    (surrounding slashes stripped); the query string is not part of the name.

    NOTE: a later cell defines a function with this same name that takes an
    extra ``app_token`` argument, so this version is replaced once that cell runs.

    Args:
        url: Address to fetch with an HTTP GET.
        force: When true, download even if the local file already exists.
        timeout: Seconds handed to ``requests.get`` so a stalled connection
            cannot block forever.

    Returns:
        Path of the local file (``DATA_DIR / <url path>``).

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    url_path = urllib.parse.urlparse(url).path.strip("/")
    filename = DATA_DIR / url_path

    # Cached copy is good enough unless the caller forces a refresh.
    if not force and filename.exists():
        print(f"Reading from {filename}...")
        return filename

    print(f"Downloading {url} to {filename}...")
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()

    # Decode the body BEFORE opening the file: opening in "w" mode truncates
    # it, and an undecodable response would otherwise leave an empty file that
    # later calls would treat as a valid cache entry.
    payload = response.json()

    filename.parent.mkdir(parents=True, exist_ok=True)
    with open(filename, "w", encoding="utf-8") as f:
        json.dump(payload, f)
    print(f"Done downloading {url}.")

    return filename


In [19]:
def download_nyc_geojson_data(url, app_token, force=False, timeout=60):
    """Download the JSON served at ``url`` into ``DATA_DIR`` unless already cached.

    The request carries ``app_token`` in the ``X-App-Token`` header and asks
    for ``application/json``. The local file is ``DATA_DIR`` joined with the
    path component of ``url`` (surrounding slashes stripped); the query string
    is not part of the name.

    Args:
        url: Address to fetch with an HTTP GET.
        app_token: Value sent as the ``X-App-Token`` request header.
        force: When true, download even if the local file already exists.
        timeout: Seconds handed to ``requests.get`` so a stalled connection
            cannot block forever.

    Returns:
        Path of the local file (``DATA_DIR / <url path>``).

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    url_path = urllib.parse.urlparse(url).path.strip("/")
    filename = DATA_DIR / url_path

    # Cached copy is good enough unless the caller forces a refresh.
    if not force and filename.exists():
        print(f"Reading from {filename}...")
        return filename

    headers = {
        'X-App-Token': app_token,
        'Accept': 'application/json',
    }

    print(f"Downloading {url} to {filename}...")
    response = requests.get(url, headers=headers, timeout=timeout)
    response.raise_for_status()

    # Decode the body BEFORE opening the file: opening in "w" mode truncates
    # it, and an undecodable response would otherwise leave an empty file that
    # later calls would treat as a valid cache entry.
    payload = response.json()

    filename.parent.mkdir(parents=True, exist_ok=True)
    with open(filename, "w", encoding="utf-8") as f:
        json.dump(payload, f)
    print(f"Done downloading {url}.")

    return filename


In [20]:
def load_and_clean_zipcodes(ZIPCODE_DATA_FILE):
    """Read the ZIP code shapefile and return a trimmed, filtered frame.

    Steps: read the file with geopandas, keep seven columns (including
    ``geometry``), drop rows whose geometry is empty or not valid, and
    lower-case the column labels (spaces become underscores).

    Args:
        ZIPCODE_DATA_FILE: Location of the shapefile, passed straight to
            ``gpd.read_file``.

    Returns:
        The filtered frame with normalized column names.
    """
    wanted_columns = [
        'ZIPCODE', 'PO_NAME', 'POPULATION', 'AREA', 'STATE', 'COUNTY', 'geometry',
    ]

    # Load, then immediately narrow to the columns of interest.
    zipcodes = gpd.read_file(ZIPCODE_DATA_FILE)[wanted_columns]

    # Keep only rows that are both non-empty and valid geometries.
    has_shape = ~zipcodes.is_empty
    shape_ok = zipcodes.is_valid
    zipcodes = zipcodes[has_shape & shape_ok]

    # Normalize labels: lower case, underscores instead of spaces.
    zipcodes.columns = zipcodes.columns.str.lower().str.replace(' ', '_')

    return zipcodes

In [21]:
def download_and_clean_311_data(data_311_file=None):
    """Load the 311 request CSV and return a cleaned ``pandas.DataFrame``.

    The file is parsed with ``pandas.read_csv`` so that blank cells become
    real missing values and ``dropna`` can remove incomplete rows. Reading a
    CSV through ``gpd.read_file`` returns text fields, which makes the
    missing-value filter ineffective.

    Args:
        data_311_file: Path of the CSV export. Defaults to
            ``DATA_DIR / "311_request_data.csv"`` instead of a path that only
            exists on one developer's machine.

    Returns:
        DataFrame restricted to the eleven columns listed below, without rows
        missing any of them, and with lower-case, underscore-separated column
        names (e.g. ``incident_zip``).
    """
    if data_311_file is None:
        data_311_file = DATA_DIR / "311_request_data.csv"

    df_311_columns_to_keep = [
        'Unique Key', 'Created Date', 'Agency', 'Complaint Type',
        'Descriptor', 'Location Type', 'Incident Zip', 'City', 'Borough',
        'Latitude', 'Longitude',
    ]

    df_311 = pd.read_csv(
        data_311_file,
        usecols=df_311_columns_to_keep,
        # Keep ZIP codes as text so they are not turned into floats (10027.0).
        dtype={'Incident Zip': str},
        low_memory=False,
    )

    # usecols keeps the file's column order; re-select to get the listed order.
    df_311_cleaned = df_311[df_311_columns_to_keep]
    df_311_cleaned = df_311_cleaned.dropna(subset=df_311_columns_to_keep)

    # Normalize labels: lower case, underscores instead of spaces.
    df_311_cleaned.columns = df_311_cleaned.columns.str.lower().str.replace(' ', '_')

    return df_311_cleaned

In [22]:
def download_and_clean_tree_data(tree_datafile=None):
    """Load the street-tree census CSV and return a cleaned GeoDataFrame.

    A CSV carries no geometry that ``gpd.read_file`` can use, so the previous
    ``is_valid`` filter could not keep any row. Here the table is read with
    pandas and a point geometry is built from the ``longitude``/``Latitude``
    columns.

    Args:
        tree_datafile: Path of the CSV export. Defaults to
            ``DATA_DIR / "2015StreetTreesCensus_TREES.csv"`` instead of a path
            that only exists on one developer's machine.

    Returns:
        GeoDataFrame with the ten attribute columns below plus ``geometry``,
        without rows lacking coordinates or having empty/invalid geometry, and
        with lower-case column names.
    """
    if tree_datafile is None:
        tree_datafile = DATA_DIR / "2015StreetTreesCensus_TREES.csv"

    # Attribute columns to keep; the header spelling ('Latitude' capitalised,
    # 'longitude' not) is taken over from the original column list.
    attribute_columns = [
        'tree_id', 'block_id', 'status', 'address', 'zipcode', 'zip_city',
        'Latitude', 'longitude', 'x_sp', 'y_sp',
    ]

    df_tree = pd.read_csv(
        tree_datafile,
        usecols=attribute_columns,
        # Keep ZIP codes as text so they are not turned into numbers.
        dtype={'zipcode': str},
        low_memory=False,
    )

    # A point needs both coordinates; drop rows where either is missing.
    df_tree = df_tree[attribute_columns].dropna(subset=['Latitude', 'longitude'])

    # Coordinates are assumed to be WGS84 degrees (EPSG:4326), matching the
    # SRID declared for nyc_trees.geometry in the schema - TODO confirm.
    gdf_tree_cleaned = gpd.GeoDataFrame(
        df_tree,
        geometry=gpd.points_from_xy(df_tree['longitude'], df_tree['Latitude']),
        crs="EPSG:4326",
    )

    # Check for and remove any empty or invalid geometries.
    gdf_tree_cleaned = gdf_tree_cleaned[
        ~gdf_tree_cleaned.is_empty & gdf_tree_cleaned.is_valid
    ]

    # Normalize labels: lower case, underscores instead of spaces.
    gdf_tree_cleaned.columns = gdf_tree_cleaned.columns.str.lower().str.replace(' ', '_')

    return gdf_tree_cleaned
    

In [23]:
def load_and_clean_zillow_data(zillow_data_file=None):
    """Load the Zillow rent CSV and return the 2023 columns, cleaned.

    Missing rents are forward-filled from the previous month of the same row.
    The fill is limited to the monthly rent columns: filling across the whole
    row would copy text such as the county name into a missing first-month
    rent. A month with no earlier value in the row stays missing.

    Args:
        zillow_data_file: Path of the CSV. Defaults to ``ZILLOW_DATA_FILE``.

    Returns:
        DataFrame with six identifier columns followed by nine monthly rent
        columns; labels are lower-cased and ``-`` is replaced by ``_``
        (e.g. ``2023_01_31``).
    """
    if zillow_data_file is None:
        zillow_data_file = ZILLOW_DATA_FILE

    identifier_columns = [
        'RegionID', 'RegionName', 'City', 'State', 'Metro', 'CountyName',
    ]
    rent_columns = [
        "2023-01-31", "2023-02-28", "2023-03-31", "2023-04-30", "2023-05-31",
        "2023-06-30", "2023-07-31", "2023-08-31", "2023-09-30",
    ]

    df_zillow = pd.read_csv(zillow_data_file)

    # Copy so the fill below never writes into the frame read from disk.
    df_zillow_cleaned = df_zillow[identifier_columns + rent_columns].copy()

    # Simple imputation: carry the previous month's rent forward.
    # DataFrame.ffill replaces the deprecated fillna(method='ffill').
    df_zillow_cleaned[rent_columns] = df_zillow_cleaned[rent_columns].ffill(axis=1)

    # Normalize labels: lower case, underscores instead of dashes.
    df_zillow_cleaned.columns = df_zillow_cleaned.columns.str.lower().str.replace('-', '_')

    return df_zillow_cleaned

    

In [24]:
def load_all_data():
    """Run the four loaders and return their results as one tuple.

    Returns:
        ``(zipcodes, 311 complaints, trees, zillow rents)``, in that order.
    """
    zipcodes = load_and_clean_zipcodes(ZIPCODE_DATA_FILE)
    complaints = download_and_clean_311_data()
    trees = download_and_clean_tree_data()
    rents = load_and_clean_zillow_data()
    return zipcodes, complaints, trees, rents

In [None]:
# Run every loader once; results are unpacked in the order load_all_data returns them.
geodf_zipcode_data, geodf_311_data, geodf_tree_data, df_zillow_data = load_all_data()

In [None]:
# Reload the ZIP code data and print its summary.
# (The summary was previously requested on `zipcode`, a name defined nowhere.)
geodf_zipcode_data = load_and_clean_zipcodes(ZIPCODE_DATA_FILE)
geodf_zipcode_data.info()

In [None]:
# Reload the ZIP code data and show basic info about that dataframe
geodf_zipcode_data = load_and_clean_zipcodes(ZIPCODE_DATA_FILE)
geodf_zipcode_data.info()

In [None]:
# Show the first 5 entries of the ZIP code dataframe
geodf_zipcode_data.head()

In [None]:
# Reload the 311 data and show basic info about that dataframe
geodf_311_data = download_and_clean_311_data()
geodf_311_data.info()

In [None]:
# Show the first 5 entries of the 311 dataframe
geodf_311_data.head()

In [None]:
 geodf_tree_data= download_and_clean_tree_data()
geodf_tree_data.info()

In [None]:
# Show the first 5 entries of the tree dataframe
geodf_tree_data.head()

In [None]:
# Reload the Zillow rent data and show basic info about that dataframe
df_zillow_data = load_and_clean_zillow_data()
df_zillow_data.info()

In [None]:
# Show the first 5 entries of the Zillow rent dataframe
df_zillow_data.head()

## Part 2: Storing Data

In [None]:
def setup_new_postgis_database(username, db_name):
    """
    Create a new PostgreSQL database and enable the PostGIS extension.

    Runs the ``createdb`` and ``psql`` command-line tools and prints whatever
    they write to stdout/stderr. A failing command is reported through that
    output rather than raised, so re-running the cell against an existing
    database does not abort the notebook.

    - username (str): The username of the PostgreSQL user.
    - db_name (str): The name of the new database created.
    """
    # Imported here so the function works even if the notebook's import cell
    # does not list subprocess.
    import subprocess

    # Arguments are passed as lists (no shell), so names containing spaces or
    # quotes cannot break or alter the command line.
    commands = [
        # Create a new PostgreSQL database
        ["createdb", db_name, f"--username={username}"],
        # Enable the PostGIS extension; IF NOT EXISTS keeps re-runs quiet
        [
            "psql",
            f"--dbname={db_name}",
            f"--username={username}",
            "-c",
            "CREATE EXTENSION IF NOT EXISTS postgis;",
        ],
    ]

    for command in commands:
        try:
            completed = subprocess.run(
                command, capture_output=True, text=True, check=False
            )
        except OSError as error:
            # e.g. the PostgreSQL client tools are not on PATH
            print(f"Could not run {command[0]}: {error}")
            continue
        # Echo the tool's output so it shows up in the notebook.
        print(completed.stdout + completed.stderr, end="")

In [None]:
# Create the project database and enable PostGIS using the configured user and name.
setup_new_postgis_database(DB_USER, DB_NAME)

### Creating Tables


These are just a couple of options for creating your tables; you can use one or the other, a different method, or a combination.

In [None]:
# SQLAlchemy engine for the database described by DB_URL; later cells connect through it.
engine = db.create_engine(DB_URL)

#### Using SQL

In [None]:
# if using SQL (as opposed to SQLAlchemy), define the SQL statements to create your 4 tables
# Every statement ends with ';' so the four strings form a valid script when
# written one after another, and uses IF NOT EXISTS so re-running is harmless.

# ZIP code areas are not points, so the geometry column accepts any geometry
# type (assumes the shapes are polygons or multi-polygons - TODO confirm and
# narrow the type if desired).
ZIPCODE_SCHEMA = """
CREATE TABLE IF NOT EXISTS nyc_zipcodes (
    zipcodes_id SERIAL PRIMARY KEY,
    zipcode VARCHAR(10),
    po_name VARCHAR(255),
    population INTEGER,
    area FLOAT,
    state VARCHAR(255),
    county VARCHAR(255),
    geometry GEOMETRY(Geometry, 4326)
);
"""

NYC_311_SCHEMA = """
CREATE TABLE IF NOT EXISTS nyc_311 (
    complaints_id SERIAL PRIMARY KEY,
    unique_key VARCHAR(255),
    created_date TIMESTAMP,
    agency VARCHAR(255),
    complaint_type VARCHAR(255),
    descriptor VARCHAR(255),
    location_type VARCHAR(255),
    incident_zip VARCHAR(10),
    city VARCHAR(255),
    borough VARCHAR(255),
    latitude FLOAT,
    longitude FLOAT
);
"""

NYC_TREE_SCHEMA = """
CREATE TABLE IF NOT EXISTS nyc_trees (
    trees_id SERIAL PRIMARY KEY,
    tree_id INTEGER,
    block_id INTEGER,
    status VARCHAR(255),
    address VARCHAR(255),
    zipcode VARCHAR(10),
    zip_city VARCHAR(255),
    latitude FLOAT,
    longitude FLOAT,
    x_sp FLOAT,
    y_sp FLOAT,
    geometry GEOMETRY(Point, 4326)
);
"""

# NOTE(review): the month columns here (january_2023, ...) are named
# differently from the cleaned Zillow dataframe columns (2023_01_31, ...);
# confirm which naming the queries should use.
ZILLOW_SCHEMA = """
CREATE TABLE IF NOT EXISTS nyc_historical_average_rents (
    zillow_id SERIAL PRIMARY KEY,
    region_id INTEGER,
    region_name VARCHAR(255),
    city VARCHAR(255),
    state VARCHAR(255),
    metro VARCHAR(255),
    county_name VARCHAR(255),
    january_2023 FLOAT,
    february_2023 FLOAT,
    march_2023 FLOAT,
    april_2023 FLOAT,
    may_2023 FLOAT,
    june_2023 FLOAT,
    july_2023 FLOAT,
    august_2023 FLOAT,
    september_2023 FLOAT
);
"""

In [None]:
# create that required schema.sql file
# Each statement is written with exactly one trailing ';' (added if missing,
# not duplicated if present) so the file runs as a single SQL script.
schema_statements = [ZIPCODE_SCHEMA, NYC_311_SCHEMA, NYC_TREE_SCHEMA, ZILLOW_SCHEMA]
with open(DB_SCHEMA_FILE, "w", encoding="utf-8") as f:
    for statement in schema_statements:
        f.write(statement.strip().rstrip(";") + ";\n\n")

In [None]:
# If using SQL (as opposed to SQLAlchemy), execute the schema files to create tables
# NOTE(review): placeholder - this only opens and closes a connection; nothing
# from DB_SCHEMA_FILE is executed here yet.
with engine.connect() as connection:
    pass

### Add Data to Database

These are just a couple of options to write data to your tables; you can use one or the other, a different method, or a combination.

#### Using SQL

In [None]:
def write_dataframes_to_table(tablename_to_dataframe):
    """Write each dataframe to the database table named by its key.

    An existing table of the same name is replaced. GeoDataFrames are written
    with ``to_postgis`` so their geometry column is stored as a PostGIS
    geometry; plain DataFrames go through ``to_sql``.

    Args:
        tablename_to_dataframe: Mapping of table name -> DataFrame or
            GeoDataFrame.
    """
    # The module imports SQLAlchemy as `db`; a bare `create_engine` is undefined.
    engine = db.create_engine(DB_URL)
    try:
        for table_name, dataframe in tablename_to_dataframe.items():
            if isinstance(dataframe, gpd.GeoDataFrame):
                dataframe.to_postgis(
                    table_name, engine, if_exists="replace", index=False
                )
            else:
                dataframe.to_sql(
                    table_name, engine, if_exists="replace", index=False
                )
    finally:
        # Release the pooled connections held by this short-lived engine.
        engine.dispose()

In [None]:
# Target table name -> dataframe to store in it.
# NOTE(review): these names differ from the CREATE TABLE names in the schema
# strings (nyc_zipcodes, nyc_311, nyc_trees, nyc_historical_average_rents).
tablename_to_dataframe = dict(
    zipcodes=geodf_zipcode_data,
    complaints=geodf_311_data,
    trees=geodf_tree_data,
    rents=df_zillow_data,
)

In [None]:
# Store every dataframe in its table.
write_dataframes_to_table(tablename_to_dataframe)

## Part 3: Understanding the Data

### Query 1

In [None]:
# Helper function to write the queries to file
def write_query_to_file(query, outfile):
    """Save the text of ``query`` to ``outfile``, replacing any existing file.

    Args:
        query: SQL text to store; written exactly as given.
        outfile: Destination path (``str`` or ``pathlib.Path``). Missing
            parent directories are created.
    """
    destination = pathlib.Path(outfile)
    destination.parent.mkdir(parents=True, exist_ok=True)
    destination.write_text(query, encoding="utf-8")

In [None]:
# Destination file and SQL text for query 1.
# NOTE(review): both "FILL_ME_IN" values are placeholders that still need real content.
QUERY_1_FILENAME = QUERY_DIR / "FILL_ME_IN"

QUERY_1 = """
FILL_ME_IN
"""

In [None]:
# Run QUERY_1 on a fresh connection and print each returned row.
with engine.connect() as conn:
    result = conn.execute(db.text(QUERY_1))
    for row in result:
        print(row)

In [None]:
# Save the text of query 1 under QUERY_1_FILENAME.
write_query_to_file(QUERY_1, QUERY_1_FILENAME)

## Part 4: Visualizing the Data

### Visualization 1

In [None]:
# use a more descriptive name for your function
def plot_visual_1(dataframe):
    """Scaffold for the first visualization: draw a line plot and show it.

    Creates a 20x10 figure, plots placeholder values, sets a title and calls
    ``plt.show()``. The ``dataframe`` argument is not read yet; the
    placeholder ``"..."`` strings are meant to be replaced with real values.
    """
    figure, axes = plt.subplots(figsize=(20, 10))
    
    values = "..."  # use the dataframe to pull out values needed to plot
    
    # you may want to use matplotlib to plot your visualizations;
    # there are also many other plot types (other 
    # than axes.plot) you can use
    axes.plot(values, "...")
    # there are other methods to use to label your axes, to style 
    # and set up axes labels, etc
    axes.set_title("Some Descriptive Title")
    
    plt.show()

In [None]:
def get_data_for_visual_1():
    """Placeholder for fetching the data behind visualization 1.

    Raises:
        NotImplementedError: Always; the query has not been written yet.
    """
    # Query your database for the data needed.
    # You can put the data queried into a pandas/geopandas dataframe, if you wish
    raise NotImplementedError()

In [None]:
# Fetch the data for visualization 1 and plot it.
some_dataframe = get_data_for_visual_1()
plot_visual_1(some_dataframe)