<a href="https://colab.research.google.com/github/thomreid11/DAS7000_PORT1/blob/main/DAS7000_port1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Thomas Reid - DAS7000 - PORT1**

Exploratory Data Analysis (EDA) of the Manhattan property dataset, investigating property types and prices within Manhattan, New York.

#### **Importing necessary libraries**

In [None]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns

#### **Mounting Google Drive to Google Colab and importing dataset**

In [None]:
# Mount Google Drive at /content/drive so the dataset stored there can be read
# by path. If the drive is already mounted, Colab only prints a notice.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Location of the dataset CSV underneath the Google Drive mount point
drive_path: str = "/content/drive/MyDrive/Manhattan property dataset csv.csv"

In [None]:
# Load the CSV at `drive_path` into a pandas DataFrame; every later cell works on `df`
df = pd.read_csv(drive_path)

In [None]:
# Preview the first five rows to sanity-check that the file loaded as expected
df.head()

Unnamed: 0,BOROUGH,NEIGHBORHOOD,BUILDING CLASS CATEGORY,TAX CLASS AT PRESENT,BLOCK,LOT,EASEMENT,BUILDING CLASS AT PRESENT,ADDRESS,APARTMENT NUMBER,...,RESIDENTIAL UNITS,COMMERCIAL UNITS,TOTAL UNITS,LAND SQUARE FEET,GROSS SQUARE FEET,YEAR BUILT,TAX CLASS AT TIME OF SALE,BUILDING CLASS AT TIME OF SALE,SALE PRICE,SALE DATE
0,1,ALPHABET CITY,01 ONE FAMILY DWELLINGS,1,376,43,,S1,743 EAST 6 STREET,,...,1.0,1.0,2.0,2090,3680,1940.0,1,S1,0,01/23/2025
1,1,ALPHABET CITY,07 RENTALS - WALKUP APARTMENTS,2B,372,19,,C7,"292 EAST 3 STREET, 4B",,...,9.0,1.0,10.0,2401,6920,1920.0,2,C7,0,02/07/2025
2,1,ALPHABET CITY,07 RENTALS - WALKUP APARTMENTS,2,396,26,,C4,638 EAST 14 STREET,,...,20.0,0.0,20.0,2779,10740,1920.0,2,C4,0,05/01/2025
3,1,ALPHABET CITY,07 RENTALS - WALKUP APARTMENTS,2,396,27,,C4,640 EAST 14 STREET,,...,20.0,0.0,20.0,2831,10900,1920.0,2,C4,0,05/01/2025
4,1,ALPHABET CITY,07 RENTALS - WALKUP APARTMENTS,2,400,11,,C4,"510 EAST 5TH STREET, 9",,...,18.0,0.0,18.0,2404,6875,1900.0,2,C4,280000,04/29/2025


In [None]:
# List each column with its non-null count and dtype to spot missing data and wrong types
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18491 entries, 0 to 18490
Data columns (total 21 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   BOROUGH                         18491 non-null  int64  
 1   NEIGHBORHOOD                    18491 non-null  object 
 2   BUILDING CLASS CATEGORY         18491 non-null  object 
 3   TAX CLASS AT PRESENT            18491 non-null  object 
 4   BLOCK                           18491 non-null  int64  
 5   LOT                             18491 non-null  int64  
 6   EASEMENT                        0 non-null      float64
 7   BUILDING CLASS AT PRESENT       18491 non-null  object 
 8   ADDRESS                         18491 non-null  object 
 9   APARTMENT NUMBER                8634 non-null   object 
 10  ZIP CODE                        18491 non-null  int64  
 11  RESIDENTIAL UNITS               9366 non-null   float64
 12  COMMERCIAL UNITS                

#### **Data Preprocessing**

In [None]:
# Standardise the headers to Title_Case_With_Underscores,
# e.g. "ZIP CODE" becomes "Zip_Code".
df.columns = (
    df.columns
    .str.strip()                        # remove leading/trailing whitespace
    .str.lower()
    .str.replace(' ', '_', regex=False) # spaces -> underscores
    .str.title()                        # capitalise each underscore-separated word
)

In [None]:
# Easement contains no values at all (0 non-null entries), so it carries no information
df = df.drop("Easement", axis="columns")

In [None]:
# Correcting the data types
# The three measures below are cleaned by stripping commas from their text
# values (presumably thousands separators - confirm against the raw CSV) and
# casting to float. The dtype check keeps the cell re-runnable: after the first
# run the columns are already float, and `.str` on a numeric column raises
# AttributeError.
numeric_text_cols = ['Land_Square_Feet', 'Gross_Square_Feet', 'Sale_Price']
for col in numeric_text_cols:
    values = df[col]
    if not pd.api.types.is_numeric_dtype(values):
        values = values.str.replace(',', '', regex=False)
    df[col] = values.astype(float)

# Changing the remaining data types
# Sale dates appear as month/day/year (e.g. 01/23/2025); naming the format
# makes parsing explicit instead of relying on inference.
df['Sale_Date'] = pd.to_datetime(df['Sale_Date'], format='%m/%d/%Y')

# Tax classes are labels rather than quantities (values such as "2B" occur),
# so store them with the string dtype.
for col in ["Tax_Class_At_Time_Of_Sale", "Tax_Class_At_Present"]:
    df[col] = df[col].astype("string")

# Unit counts are whole numbers with missing values; the nullable Int64 dtype
# keeps them as integers without dropping the missing entries.
unit_cols = ["Residential_Units", "Commercial_Units", "Total_Units"]
for col in unit_cols:
    df[col] = df[col].astype("Int64")


In [None]:
# Re-inspect the frame to confirm the column names and dtypes after preprocessing
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18491 entries, 0 to 18490
Data columns (total 20 columns):
 #   Column                          Non-Null Count  Dtype         
---  ------                          --------------  -----         
 0   Borough                         18491 non-null  int64         
 1   Neighborhood                    18491 non-null  object        
 2   Building_Class_Category         18491 non-null  object        
 3   Tax_Class_At_Present            18491 non-null  string        
 4   Block                           18491 non-null  int64         
 5   Lot                             18491 non-null  int64         
 6   Building_Class_At_Present       18491 non-null  object        
 7   Address                         18491 non-null  object        
 8   Apartment_Number                8634 non-null   object        
 9   Zip_Code                        18491 non-null  int64         
 10  Residential_Units               9366 non-null   Int64         
 11  Co