# Imports

Includes all the packages used to run the notebook.

In [1]:
import pandas as pd 
import numpy as np
import ssl

# WARNING: this swaps the stdlib's default HTTPS context factory for the
# unverified one, so certificate verification is disabled for every HTTPS
# request that relies on the default context for the rest of the kernel session.
# NOTE(review): presumably added to work around local certificate errors when
# fetching the CSV URLs -- confirm, and prefer fixing the certificate store.
ssl._create_default_https_context = ssl._create_unverified_context

# Load Datasets (Q1)

Each of the datasets is loaded using the `pd.read_csv()` function below. 

In [2]:
# Source CSV locations, one per dataset.
url1 = 'https://raw.githubusercontent.com/data602sps/datasetspractice/main/salarydata.csv'  # salary data
url2 = 'https://raw.githubusercontent.com/data602sps/datasetspractice/main/weather.csv'  # weather data
url3 = 'https://data.cityofnewyork.us/resource/xywu-7bv9.csv'  # nyc population data
url4 = 'https://data.cityofnewyork.us/resource/hg8x-zxpr.csv'  # nyc building data

# Salary, population and building data are read with default options.
df1, df3, df4 = (pd.read_csv(url) for url in (url1, url3, url4))
# For the weather data, index_col=0 makes the first CSV column the dataframe index.
df2 = pd.read_csv(url2, index_col=0)
# NOTE(review): df4 comes back with exactly 1000 rows, which looks like a default
# page size on the API side -- confirm whether a row-limit parameter is needed.

# Clean and View Datasets (Q2, Q5, EC pt. 2)

After any necessary data cleaning steps, the cells below use the `head()` method to show the first 5 rows of each dataframe. This allows us to get a quick glance into the contents of each. Similarly, the `tail()` method could also be used, which would instead show the last 5 rows of a dataframe. 

In [3]:
# clean and view df1
# Keep only the first two columns. The explicit .copy() yields an independent
# frame, so the column assignment below cannot raise SettingWithCopyWarning.
df1 = df1.iloc[:, :2].copy()
df1.columns = ['state', 'avg_salary']
# Strip thousands separators and convert to int. Casting to str first keeps the
# cell re-runnable: on a second run avg_salary is already int, and calling
# str.replace on each raw value (as a lambda would) raises AttributeError.
df1['avg_salary'] = (
    df1['avg_salary']
    .astype(str)
    .str.replace(',', '', regex=False)
    .astype(int)
)
df1.head()

Unnamed: 0,state,avg_salary
0,Alabama,90620
1,Alaska,99180
2,Arizona,88800
3,Arkansas,81430
4,California,97110


In [4]:
# Preview the first five rows of df2; it is displayed as loaded (no cleaning necessary).
df2.head(n=5)

Unnamed: 0,origin,year,month,day,hour,temp,dewp,humid,wind_dir,wind_speed,wind_gust,precip,pressure,visib
1,EWR,2013,1.0,1.0,0.0,37.04,21.92,53.97,230.0,10.35702,11.918651,0.0,1013.9,10.0
2,EWR,2013,1.0,1.0,1.0,37.04,21.92,53.97,230.0,13.80936,15.891535,0.0,1013.0,10.0
3,EWR,2013,1.0,1.0,2.0,37.94,21.92,52.09,230.0,12.65858,14.567241,0.0,1012.6,10.0
4,EWR,2013,1.0,1.0,3.0,37.94,23.0,54.51,230.0,13.80936,15.891535,0.0,1012.7,10.0
5,EWR,2013,1.0,1.0,4.0,37.94,24.08,57.04,240.0,14.96014,17.21583,0.0,1012.8,10.0


In [5]:
# clean and view df3 (attempt for extra credit cleaning)
# Drop the first column, leaving 'borough' followed by alternating
# total / percentage columns.
df3 = df3.iloc[:, 1:]
pop_cols = df3.columns[1::2]    # odd positions  -> values become 'total_pop'
share_cols = df3.columns[2::2]  # even positions -> values become 'perc_of_pop'

# Reshape each column family from wide to long, one row per (borough, source column).
tmp1 = pd.melt(df3, id_vars='borough', value_vars=pop_cols,
               var_name='year', value_name='total_pop')
tmp2 = pd.melt(df3, id_vars='borough', value_vars=share_cols,
               var_name='year', value_name='perc_of_pop')

# Reduce each source column name to its first four non-underscore characters
# so both long frames share the same 'year' key.
for long_df in (tmp1, tmp2):
    long_df['year'] = long_df['year'].str.replace('_', '', regex=False).str[:4]

# Join totals and percentages on their shared keys.
df3 = tmp1.merge(tmp2, on=['borough', 'year'])
df3.head()

Unnamed: 0,borough,year,total_pop,perc_of_pop
0,NYC Total,1950,7891957,100.0
1,Bronx,1950,1451277,18.39
2,Brooklyn,1950,2738175,34.7
3,Manhattan,1950,1960101,24.84
4,Queens,1950,1550849,19.65


In [6]:
# Preview the first five rows of df4; it is displayed as loaded (no cleaning necessary).
df4.head(n=5)

Unnamed: 0,project_id,project_name,project_start_date,project_completion_date,building_id,house_number,street_name,borough,postcode,bbl,...,_2_br_units,_3_br_units,_4_br_units,_5_br_units,_6_br_units,unknown_br_units,counted_rental_units,counted_homeownership_units,all_counted_units,total_units
0,73260,CONFIDENTIAL,2022-12-30T00:00:00.000,,,----,----,Brooklyn,,,...,0,0,0,0,0,1,0,1,1,1
1,73326,2316 CLARENDON ROAD,2022-12-30T00:00:00.000,,220777.0,2316,CLARENDON ROAD,Brooklyn,11226.0,3051890000.0,...,3,0,0,0,0,0,5,0,5,15
2,73325,61 CLARKSON AVE,2022-12-29T00:00:00.000,,221432.0,61,CLARKSON AVENUE,Brooklyn,11226.0,3050550000.0,...,0,0,0,0,0,0,4,0,4,11
3,66225,MORNINGSIDE CLUSTER,2022-12-28T00:00:00.000,,25246.0,384,MANHATTAN AVENUE,Manhattan,10026.0,1019430000.0,...,1,8,0,0,0,0,0,9,9,9
4,66225,MORNINGSIDE CLUSTER,2022-12-28T00:00:00.000,,25300.0,494,MANHATTAN AVENUE,Manhattan,10027.0,1019470000.0,...,0,0,0,0,0,0,0,9,9,9


# Summarize Data (Q3, Q4)

We can use the `.shape` attribute of each dataframe to get the number of rows and columns in each.

In [7]:
# Collect the four dataframes in load order so they can be summarised in a loop.
dfs = [df1, df2, df3, df4]

# enumerate(..., start=1) labels the frames df1..df4 without a hand-maintained counter.
for i, df in enumerate(dfs, start=1):
    n_rows, n_cols = df.shape  # .shape is (rows, columns)
    print('Dataframe: df%d\nNumber of rows: %d\nNumber of Columns: %d\n' % (i, n_rows, n_cols))

Dataframe: df1
Number of rows: 51
Number of Columns: 2

Dataframe: df2
Number of rows: 8719
Number of Columns: 14

Dataframe: df3
Number of rows: 60
Number of Columns: 4

Dataframe: df4
Number of rows: 1000
Number of Columns: 41



The `dtypes` attribute can be used to inspect the data type of each field in each dataframe.

In [8]:
# Print the column dtypes of every dataframe in dfs, labelled df1..dfN.
# enumerate(..., start=1) replaces the manually incremented counter.
for i, df in enumerate(dfs, start=1):
    print('Dataframe: df%d\n%s\n' % (i, df.dtypes))

Dataframe: df1
state         object
avg_salary     int64
dtype: object

Dataframe: df2
origin         object
year            int64
month         float64
day           float64
hour          float64
temp          float64
dewp          float64
humid         float64
wind_dir      float64
wind_speed    float64
wind_gust     float64
precip        float64
pressure      float64
visib         float64
dtype: object

Dataframe: df3
borough         object
year            object
total_pop        int64
perc_of_pop    float64
dtype: object

Dataframe: df4
project_id                         int64
project_name                      object
project_start_date                object
project_completion_date           object
building_id                      float64
house_number                      object
street_name                       object
borough                           object
postcode                         float64
bbl                              float64
bin                              float64
co

The `describe()` method provides even more information about a dataframe: summary statistics (count, mean, standard deviation, min, max, and quartiles) for each of its numeric fields. 

In [9]:
# Print describe() summary statistics for every dataframe in dfs, labelled df1..dfN.
# enumerate(..., start=1) replaces the manually incremented counter.
for i, df in enumerate(dfs, start=1):
    print('Dataframe: df%d\n%s\n' % (i, df.describe()))

Dataframe: df1
          avg_salary
count      51.000000
mean    87563.333333
std      6117.818456
min     75010.000000
25%     83800.000000
50%     86690.000000
75%     90655.000000
max    107920.000000

Dataframe: df2
         year        month          day         hour         temp  \
count  8719.0  8718.000000  8718.000000  8718.000000  8718.000000   
mean   2013.0     6.506538    15.679628    11.515715    55.484942   
std       0.0     3.439961     8.768250     6.916846    18.345817   
min    2013.0     1.000000     1.000000     0.000000    10.940000   
25%    2013.0     4.000000     8.000000     6.000000    39.920000   
50%    2013.0     7.000000    16.000000    12.000000    55.940000   
75%    2013.0     9.000000    23.000000    18.000000    71.060000   
max    2013.0    12.000000    31.000000    23.000000   100.040000   

              dewp        humid     wind_dir   wind_speed    wind_gust  \
count  8718.000000  8718.000000  8486.000000  8718.000000  8718.000000   
mean     4