In [13]:
import pandas as pd
import seaborn as sns 
import matplotlib.pyplot as plt 
import numpy as np 
import math
import geopandas as gpd
import csv

#import geoplot as gplt


from datetime import datetime
from pylab import *
from shapely.ops import nearest_points

In [18]:
# Load the raw inputs: the Airbnb listings and the citywide property sales
# (both CSV), plus the individual-landmark lots shapefile.
df, df2 = (
    pd.read_csv(csv_path)
    for csv_path in (
        'nyc_airbnb.csv',
        'NYC_Citywide_Annualized_Calendar_Sales_Update.csv',
    )
)
geo_ny = gpd.read_file('./Individual_Landmark_Lots/Individual_Landmark_Lots.shp')


In [3]:
#CLEAN AIRBNB DATASET (df)

# One pipeline: drop the unused host/name columns, keep only listings with a
# strictly positive price, then remove exact duplicate rows.
df = (
    df.drop(columns=['host_name', 'host_id', 'name'])
      .loc[lambda listings: listings['price'] > 0]
      .drop_duplicates()
)

# Per-column count of missing values (shown as the cell output).
df.isnull().sum()

id                                    0
neighbourhood_group                   0
neighbourhood                         0
latitude                              0
longitude                             0
room_type                             0
price                                 0
minimum_nights                        0
number_of_reviews                     0
last_review                       10051
reviews_per_month                 10051
calculated_host_listings_count        0
availability_365                      0
dtype: int64

In [4]:
#EDIT/ADD COLUMNS TO AIRBNB DATASET (df)

# Add an 'activity' column to df:
#   'No Record' - the listing has no last_review date
#   'Inactive'  - last_review is earlier than time_threshold
#   'Active'    - last_review is on or after time_threshold

# Convert to datetime so the comparisons below are date comparisons.
df['last_review'] = pd.to_datetime(df['last_review'])

# Hard-coded cut-off date.
# NOTE(review): the original comment described this as "within 1 year of the
# latest last_review" - confirm that 2018/12/06 really is that date.
time_threshold = pd.to_datetime('2018/12/06')

df.loc[df['last_review'] >= time_threshold, 'activity'] = 'Active'
df.loc[df['last_review'] < time_threshold, 'activity'] = 'Inactive'
df.loc[df['last_review'].isnull(), 'activity'] = 'No Record'

# Take an explicit copy of the active listings so the column added below is
# written to an independent frame (no SettingWithCopyWarning), and renumber
# the rows 0..n-1 without keeping the old index as a column.
df_active = df.loc[df['activity'] == 'Active'].copy().reset_index(drop=True)

# occupancy_% = share of the year the listing is NOT available, rounded to a
# whole percent and stored as float.
df_active['occupancy_%'] = (
    (100 - df_active['availability_365'] / 365 * 100).round().astype('float')
)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [5]:
#DETERMINE HOW FAR, ON AVERAGE, EACH AIRBNB IS FROM ALL THE LANDMARKS OF NYC

def haversine_distance(lat1, lon1, lat2, lon2):
    '''
    Great-circle (haversine) distance in km between two lat/lon positions.

    All four arguments are angles in degrees; they are converted with
    np.radians, so scalars and NumPy arrays are both accepted.

    Returns the distance rounded to 2 decimal places.
    '''
    earth_radius_km = 6371
    lat1_rad, lat2_rad = np.radians(lat1), np.radians(lat2)
    half_dlat = np.radians(lat2 - lat1) / 2
    half_dlon = np.radians(lon2 - lon1) / 2

    # Haversine term: sin^2(dlat/2) + cos(lat1) * cos(lat2) * sin^2(dlon/2)
    hav = (
        np.sin(half_dlat)**2
        + np.cos(lat1_rad) * np.cos(lat2_rad) * np.sin(half_dlon)**2
    )
    central_angle = 2 * np.arctan2(np.sqrt(hav), np.sqrt(1 - hav))
    return np.round(earth_radius_km * central_angle, 2)

def find_distance(df, df2):
    '''
    Mean haversine distance (km) from each point in df to every point in df2.

    Both frames need a 'coords' column whose items are indexable as
    (longitude, latitude): item[0] is passed as the longitude and item[1] as
    the latitude to haversine_distance.

    The index label of df is printed whenever it is a multiple of 1000, as a
    progress indicator.

    Returns a list with one mean distance per row of df, in row order.
    '''
    # The df2 coordinates do not change between iterations, so unpack them
    # once into arrays and let haversine_distance work on all of them at once.
    target_lons = np.array([point[0] for point in df2['coords']], dtype=float)
    target_lats = np.array([point[1] for point in df2['coords']], dtype=float)

    how_far_list = []
    # Series.items() replaces Series.iteritems(), which pandas 2.0 removed.
    for idx, item in df['coords'].items():
        distances = haversine_distance(item[1], item[0], target_lats, target_lons)
        # np.mean is used explicitly instead of the bare `mean` that was only
        # available through `from pylab import *`.
        how_far_list.append(np.mean(distances))
        if idx % 1000 == 0:
            print(idx)
    return how_far_list

   
# geo_ny = geo_ny[['OBJECTID', 'geometry']]

# geo_ny = geo_ny.to_crs("EPSG:4326")  #convert to correct projection
# geo_ny['coords'] = geo_ny['geometry'].apply(lambda x: x.representative_point().coords[:]) 
# geo_ny['coords'] = [coords[0] for coords in geo_ny['coords']]

# df_geo = df_active[['latitude','longitude']]
# df_geo = gpd.GeoDataFrame(df_geo, geometry=gpd.points_from_xy(df_geo.longitude, df_geo.latitude))

# df_geo['coords'] = df_geo['geometry'].apply(lambda x: x.representative_point().coords[:]) 
# df_geo['coords'] = [coords[0] for coords in df_geo['coords']]


# how_far_km = find_distance(df_geo, geo_ny)

#save the result to csv so it can be re-imported later instead of recomputed
#how_far_df = pd.DataFrame(how_far_km, columns=["how_far_km"])
#how_far_df.to_csv('df_active_how_far_km2.csv', index=False)


In [6]:
#read how_far_km_csv_file and add to df_active
df3 = pd.read_csv('df_active_how_far_km.csv')

# pd.concat(axis=1) aligns on index labels: if the cached file does not have
# exactly one row per listing, the result would be silently padded with NaN.
assert len(df3) == len(df_active), (
    'df_active_how_far_km.csv has %d rows but df_active has %d'
    % (len(df3), len(df_active))
)

df_active = pd.concat([df_active, df3], axis = 1)
df_active.to_csv('clean_nyc_airbnb.csv', index=False)

In [19]:
#CLEAN SALE_PRICE DATASET (df2)

# Keep only the columns used below, then drop exact duplicate rows and rows
# with any missing value.
df2 = (
    df2[['BOROUGH','BUILDING CLASS CATEGORY','NEIGHBORHOOD','SALE PRICE','SALE DATE','Latitude','Longitude']]
    .drop_duplicates()
    .dropna()
)

# Rename the columns to the names used by the Airbnb frame.
df2.columns = ['neighbourhood_group','building_class','neighbourhood','sale_price','sale_date','latitude','longitude']

# Translate borough codes to borough names.
# NYC borough codes are 1 = Manhattan, 2 = Bronx, 3 = Brooklyn, 4 = Queens,
# 5 = Staten Island; the previous mapping labelled 2/3/4 as
# Brooklyn/Queens/Bronx. The column is assigned back explicitly because
# `df2[col].replace(..., inplace=True)` acts on a column selection and is not
# guaranteed to update df2. Values that are not one of the five codes are
# left unchanged.
df2['neighbourhood_group'] = (
    df2['neighbourhood_group']
    .astype('str')
    .replace({
        '1': 'Manhattan',
        '2': 'Bronx',
        '3': 'Brooklyn',
        '4': 'Queens',
        '5': 'Staten Island',
    })
)

In [20]:
#FILTER OUT BUILDING CLASS (df2)

# Building-class categories that are kept; every other category is removed
# from df2.
airbnb_options = [
    '01 ONE FAMILY DWELLINGS', '02 TWO FAMILY DWELLINGS',
    '10 COOPS - ELEVATOR APARTMENTS', '13 CONDOS - ELEVATOR APARTMENTS',
    '03 THREE FAMILY DWELLINGS', '07 RENTALS - WALKUP APARTMENTS',
    '09 COOPS - WALKUP APARTMENTS', '04 TAX CLASS 1 CONDOS',
    '15 CONDOS - 2-10 UNIT RESIDENTIAL', '12 CONDOS - WALKUP APARTMENTS',
    '17 CONDO COOPS', '14 RENTALS - 4-10 UNIT',
    '08 RENTALS - ELEVATOR APARTMENTS',
    '16 CONDOS - 2-10 UNIT WITH COMMERCIAL UNIT',
]

# Row-wise membership test against the list above.
df2 = df2.loc[df2['building_class'].isin(airbnb_options)]

In [21]:
# Distinct neighbourhood names in the sales data, in order of first appearance.
pd.unique(df2['neighbourhood'])

array(['CHELSEA', 'UPPER EAST SIDE (79-96)', 'BAY RIDGE', 'BOROUGH PARK',
       'BROOKLYN HEIGHTS', 'GREENPOINT', 'SHEEPSHEAD BAY', 'SUNSET PARK',
       'WILLIAMSBURG-CENTRAL', 'ELMHURST', 'LONG ISLAND CITY',
       'MIDDLE VILLAGE', 'REGO PARK', 'CASTLETON CORNERS', 'NEW BRIGHTON',
       'GRAMERCY', 'MURRAY HILL', 'UPPER WEST SIDE (59-79)', 'BAYCHESTER',
       'BATH BEACH', 'BUSHWICK', 'OCEAN HILL', 'FLUSHING-NORTH',
       'ROSEDALE', 'ARROCHAR-SHORE ACRES', 'GREAT KILLS', 'CLINTON',
       'FINANCIAL', 'FLATIRON', 'GREENWICH VILLAGE-WEST',
       'HARLEM-CENTRAL', 'INWOOD', 'LOWER EAST SIDE', 'MANHATTAN VALLEY',
       'MIDTOWN CBD', 'MIDTOWN EAST', 'UPPER EAST SIDE (59-79)',
       'UPPER WEST SIDE (79-96)', 'BEDFORD PARK/NORWOOD', 'BRONXDALE',
       'MORRISANIA/LONGWOOD', 'RIVERDALE', 'SCHUYLERVILLE/PELHAM BAY',
       'WILLIAMSBRIDGE', 'BEDFORD STUYVESANT', 'BENSONHURST',
       'BRIGHTON BEACH', 'CANARSIE', 'CARROLL GARDENS', 'CLINTON HILL',
       'CONEY ISLAND', 'CROWN HE

In [17]:
#EDIT/ADD COLUMNS TO SALE_PRICE DATASET (df2)

# Parse sale_date. assign() builds a new frame, so the column is not written
# into a slice of an earlier frame (no SettingWithCopyWarning).
df2 = df2.assign(sale_date=pd.to_datetime(df2['sale_date']))

lower_time_threshold = pd.to_datetime('2018/12/06')
upper_time_threshold = pd.to_datetime('2019/12/06')

# df2 keeps every sale on or after the lower bound (unchanged behaviour, so
# later cells that read df2 see the same rows as before).
df2 = df2.loc[df2['sale_date'] >= lower_time_threshold]

# df_sale is additionally capped at the upper bound (inclusive).
# reset_index(drop=True) returns a fresh frame renumbered 0..n-1, replacing
# the in-place reset_index + drop(['index']) that triggered the warning.
df_sale = df2.loc[df2['sale_date'] <= upper_time_threshold].reset_index(drop=True)



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [10]:
# #DETERMINE HOW FAR, ON AVERAGE, EACH SALE PROPERTY IS FROM ALL THE LANDMARKS OF NYC

# geo_ny = geo_ny[['OBJECTID', 'geometry']]
# geo_ny = geo_ny.to_crs("EPSG:4326")  #convert to correct projection
# geo_ny['coords'] = geo_ny['geometry'].apply(lambda x: x.representative_point().coords[:]) 
# geo_ny['coords'] = [coords[0] for coords in geo_ny['coords']]

# df_geo = df_sale[['latitude','longitude']]

# df_geo = gpd.GeoDataFrame(df_geo, geometry=gpd.points_from_xy(df_geo.longitude, df_geo.latitude))
# df_geo['coords'] = df_geo['geometry'].apply(lambda x: x.representative_point().coords[:]) 
# df_geo['coords'] = [coords[0] for coords in df_geo['coords']]

# how_far_km = find_distance(df_geo, geo_ny)

# print(df_geo['coords'].count)
# print(len(how_far_km))

# #save to csv so can import csv later to save time
# how_far_df = pd.DataFrame(how_far_km, columns=["how_far_km"])
# how_far_df.to_csv('df_sale_how_far_km2.csv', index=False)

In [11]:
#read the cached how_far_km csv file and add it to df_sale
df4 = pd.read_csv('df_sale_how_far_km.csv')

# pd.concat(axis=1) aligns on index labels: if the cached file does not have
# exactly one row per sale, the result would be silently padded with NaN.
assert len(df4) == len(df_sale), (
    'df_sale_how_far_km.csv has %d rows but df_sale has %d'
    % (len(df4), len(df_sale))
)

df_sale = pd.concat([df_sale, df4], axis = 1)
df_sale.to_csv('clean_nyc_sale.csv', index = False)

In [12]:
# Show df_sale as the cell output.
df_sale


Unnamed: 0,neighbourhood_group,building_class,neighbourhood,sale_price,sale_date,latitude,longitude,how_far_km
0,Manhattan,13 CONDOS - ELEVATOR APARTMENTS,CHELSEA,5200000,2018-12-06,40.742107,-73.998510,7.454557
1,Manhattan,17 CONDO COOPS,CHELSEA,540000,2018-12-06,40.745466,-73.998917,7.501481
2,Manhattan,10 COOPS - ELEVATOR APARTMENTS,CLINTON,336648,2018-12-06,40.768225,-73.986419,7.868896
3,Manhattan,13 CONDOS - ELEVATOR APARTMENTS,FASHION,10,2018-12-06,40.750038,-73.983409,7.270112
4,Manhattan,13 CONDOS - ELEVATOR APARTMENTS,FLATIRON,545000,2018-12-06,40.742021,-73.986742,7.239504
...,...,...,...,...,...,...,...,...
73239,Staten Island,01 ONE FAMILY DWELLINGS,WOODROW,0,2019-01-25,40.539191,-74.222818,30.246898
73240,Staten Island,02 TWO FAMILY DWELLINGS,WOODROW,590000,2019-01-11,40.536236,-74.209270,29.721766
73241,Staten Island,02 TWO FAMILY DWELLINGS,WOODROW,0,2019-01-31,40.535069,-74.215249,30.141944
73242,Staten Island,02 TWO FAMILY DWELLINGS,WOODROW,0,2019-01-24,40.539712,-74.215793,29.808690


In [None]:
#Define a new neighbourhood column to compare with neighbourhood data from sale
df['neighbourhood'] = df['neighbourhood'].str.upper()
df['neigh_comp'] = df['neighbourhood']

#rename neighbourhoods to match the sale data
#some neighbourhoods are not on the sale list, so they take the name of a nearby neighbourhood
#each entry is [text to find, replacement]; entries are applied in order as
#substring replacements, so a later entry sees the result of earlier ones
#NOTE(review): because matching is by substring, 'CLAREMONT' also rewrites the
#start of longer names such as 'CLAREMONT VILLAGE' - confirm this is intended
replace = [
    ['CONCOURSE VILLAGE', 'CONCOURSE'],['WEST BRIGHTON', 'WEST NEW BRIGHTON'],
    ['KEW GARDENS HILLS', 'KEW GARDENS'], ['BAY TERRACE, STATEN ISLAND', 'BAY TERRACE'],
    ['WESTCHESTER SQUARE', 'WESTCHESTER'],["BULL'S HEAD", 'BULLS HEAD'],
    ["NEW DORP BEACH", 'NEW DORP-BEACH'],["BEDFORD-STUYVESANT", 'BEDFORD STUYVESANT'],
    ["EAST HARLEM", 'HARLEM-EAST'],["FLATIRON DISTRICT", 'FLATIRON'],
    ["NORTH RIVERDALE", 'RIVERDALE'],["EAST MORRISANIA", 'MORRISANIA'],
    ["EAST FLATBUSH", 'FLATBUSH-EAST'],["PRINCE'S BAY", 'PRINCES BAY'],
    ["FINANCIAL DISTRICT", 'FINANCIAL'],
    ["SOUTH SLOPE", 'PARK SLOPE SOUTH'],["HELL'S KITCHEN", 'MIDTOWN WEST'],
    ["WEST VILLAGE", 'GREENWICH VILLAGE'],['NOLITA','LITTLE ITALY'],
    ['PROSPECT-LEFFERTS GARDENS','PROSPECT HEIGHTS'],['ROCKAWAY BEACH','ROCKAWAY PARK'],
    ['BAYSWATER','FAR ROCKAWAY'], ['EASTCHESTER', 'WESTCHESTER'],
    ['DITMARS STEINWAY','ASTORIA'],['THEATER DISTRICT','MIDTOWN WEST'],
    ['EDGEMERE','ROCKAWAY PARK'], ['COLUMBIA ST', 'LOWER EAST SIDE'],
    ['BATTERY PARK CITY','FINANCIAL'],['TWO BRIDGES','SOUTHBRIDGE'],
    ['STUYVESANT TOWN','GRAMERCY'],['UNIVERSITY HEIGHTS',"KINGSBRIDGE HTS/UNIV HTS"],
    ['MARBLE HILL', 'KINGSBRIDGE/JEROME PARK'],['NOHO','EAST VILLAGE'],
    ['RANDALL MANOR','WEST NEW BRIGHTON'],['GRANITEVILLE','MARINERS HARBOR'],
    ['HOWLAND HOOK','BLOOMFIELD'],['LIGHTHOUSE HILL','RICHMONDTOWN-LIGHTHS HILL'],
    ['VINEGAR HILL','NAVY YARD'], ['DUMBO','DOWNTOWN-FULTON FERRY'],
    ['SEA GATE', 'CONEY ISLAND'],['DOWNTOWN BROOKLYN','DOWNTOWN-FULTON MALL'],
    ['FORT HAMILTON','DYKER HEIGHTS'],['OLINVILLE', 'WILLIAMSBRIDGE'],
    ['CLAREMONT','VILLAGE MORRISANIA'], ['ALLERTON','PELHAM PARKWAY NORTH'],
    ['SPUYTEN DUYVIL','RIVERDALE'],['EDENWALD','WESTCHESTER'],
    ['WEST FARMS','CROTONA PARK'],['CLASON POINT', 'SOUNDVIEW']
]

# regex=False makes the literal-substring behaviour explicit: the default of
# str.replace's `regex` argument differs between pandas versions, and none of
# the search strings is meant as a regular expression.
for old_name, new_name in replace:
    df['neigh_comp'] = df['neigh_comp'].str.replace(old_name, new_name, regex=False)

In [None]:
#DETERMINE MEDIAN PRICE OF PROPERTY IN EACH NEIGHBOURHOOD
# This cell only collects the sale prices per neighbourhood; the median itself
# is not computed here.

# Distinct comparison names taken from the Airbnb data.
neighbour_list = df['neigh_comp'].to_list()
neighbour_list = set(neighbour_list)

# neighbour_dict maps each comparison name to the list of sale prices of every
# df2 row whose neighbourhood contains that name as a substring.
neighbour_dict = {}

sale_neighbourhoods = df2['neighbourhood']
sale_prices = df2['sale_price']

# Iterate in sorted order so the dict order and the printed names are the same
# on every run (iterating the raw set is not order-stable).
for item in sorted(neighbour_list):
    # Vectorised literal substring test; replaces a Python-level iterrows()
    # scan of all sale rows for every neighbourhood.
    matches = sale_neighbourhoods.str.contains(item, regex=False, na=False)
    temp_list = sale_prices[matches].tolist()

    neighbour_dict[item] = temp_list
    # Report names that matched no sale record at all.
    if len(temp_list) == 0:
        print(item)
