In [120]:
import warnings
from datetime import datetime,date
from pathlib import Path

import numpy as np
import pandas as pd

warnings.filterwarnings('ignore')

In [121]:
# Load the raw NYC weather export; every later cell derives from this frame
all_nyc_weather = pd.read_csv(filepath_or_buffer='resources/nyc_weather.csv')

## 1. Initial exploration of dataset

In [122]:
# Preview the first rows of the raw weather table
all_nyc_weather.head()

Unnamed: 0.1,Unnamed: 0,Date,Max.TemperatureF,Mean.TemperatureF,Min.TemperatureF,Max.Dew.PointF,MeanDew.PointF,Min.DewpointF,Max.Humidity,Mean.Humidity,...,Min.VisibilityMiles,Max.Wind.SpeedMPH,Mean.Wind.SpeedMPH,Max.Gust.SpeedMPH,PrecipitationIn,CloudCover,Events,WindDirDegrees.br...,city,season
0,1,1948-07-01,84,78.0,72,71,65,58,93,65,...,2.0,16,8,,0.0,0.0,Fog,264<br />,New York City (USA),Summer
1,2,1948-07-02,82,72.0,63,62,53,49,76,51,...,10.0,16,10,,0.0,0.0,,315<br />,New York City (USA),Summer
2,3,1948-07-03,78,71.0,64,66,58,53,84,62,...,5.0,14,6,,0.0,0.0,,203<br />,New York City (USA),Summer
3,4,1948-07-04,84,76.0,68,68,63,56,90,67,...,2.0,12,5,,0.0,0.0,Fog,198<br />,New York City (USA),Summer
4,5,1948-07-05,93,82.0,70,74,69,65,93,71,...,3.0,18,8,,0.0,0.0,Fog-Rain-Thunderstorm,218<br />,New York City (USA),Summer


In [123]:
# Summarise column dtypes and non-null counts of the raw table
all_nyc_weather.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24560 entries, 0 to 24559
Data columns (total 26 columns):
Unnamed: 0                   24560 non-null int64
Date                         24560 non-null object
Max.TemperatureF             24560 non-null int64
Mean.TemperatureF            24558 non-null float64
Min.TemperatureF             24560 non-null int64
Max.Dew.PointF               24560 non-null int64
MeanDew.PointF               24560 non-null int64
Min.DewpointF                24560 non-null int64
Max.Humidity                 24560 non-null int64
Mean.Humidity                24560 non-null int64
Min.Humidity                 24560 non-null int64
Max.Sea.Level.PressureIn     24560 non-null float64
Mean.Sea.Level.PressureIn    24560 non-null float64
Min.Sea.Level.PressureIn     24560 non-null float64
Max.VisibilityMiles          24545 non-null float64
Mean.VisibilityMiles         24545 non-null float64
Min.VisibilityMiles          24545 non-null float64
Max.Wind.SpeedMPH         

In [124]:
# Count missing values per column in the raw table
all_nyc_weather.isna().sum()

Unnamed: 0                       0
Date                             0
Max.TemperatureF                 0
Mean.TemperatureF                2
Min.TemperatureF                 0
Max.Dew.PointF                   0
MeanDew.PointF                   0
Min.DewpointF                    0
Max.Humidity                     0
Mean.Humidity                    0
Min.Humidity                     0
Max.Sea.Level.PressureIn         0
Mean.Sea.Level.PressureIn        0
Min.Sea.Level.PressureIn         0
Max.VisibilityMiles             15
Mean.VisibilityMiles            15
Min.VisibilityMiles             15
Max.Wind.SpeedMPH                0
Mean.Wind.SpeedMPH               0
Max.Gust.SpeedMPH            15538
PrecipitationIn                  0
CloudCover                    2781
Events                       12839
WindDirDegrees.br...             0
city                             0
season                           0
dtype: int64

In [125]:
# Earliest and latest report dates in the raw data.
# NOTE(review): 'Date' appears to hold ISO 'YYYY-MM-DD' strings, in which case
# min()/max() on the strings match chronological order — confirm.
report_dates = all_nyc_weather['Date']
min_date, max_date = report_dates.min(), report_dates.max()
print(f'Dataset starts from {min_date} to {max_date}')

Dataset starts from 1948-07-01 to 2015-12-31


In [126]:
# List every column name available in the raw table
all_nyc_weather.columns

Index(['Unnamed: 0', 'Date', 'Max.TemperatureF', 'Mean.TemperatureF',
       'Min.TemperatureF', 'Max.Dew.PointF', 'MeanDew.PointF', 'Min.DewpointF',
       'Max.Humidity', 'Mean.Humidity', 'Min.Humidity',
       'Max.Sea.Level.PressureIn', 'Mean.Sea.Level.PressureIn',
       'Min.Sea.Level.PressureIn', 'Max.VisibilityMiles',
       'Mean.VisibilityMiles', 'Min.VisibilityMiles', 'Max.Wind.SpeedMPH',
       'Mean.Wind.SpeedMPH', 'Max.Gust.SpeedMPH', 'PrecipitationIn',
       'CloudCover', 'Events', 'WindDirDegrees.br...', 'city', 'season'],
      dtype='object')

We're interested in the following columns in the NYC weather data:

- 'Max.TemperatureF'
- 'Min.TemperatureF'
- 'Mean.TemperatureF'
- 'Max.Humidity'
- 'Min.Humidity'
- 'Mean.Humidity'
- 'Max.Sea.Level.PressureIn'
- 'Min.Sea.Level.PressureIn'
- 'Mean.Sea.Level.PressureIn'
- 'PrecipitationIn'
- 'Events' (dropped during cleanup below because too many entries are missing)
- 'season'

**Also we'll remove all rows with missing or incorrectly formatted values**

## 2. Data Cleanup

In [127]:
# Keep only reports from 2010-01-01 onwards (the raw data ends on 2015-12-31).
# 'Date' is compared as a string, which is chronological for 'YYYY-MM-DD' values.
# `>=` keeps 2010-01-01 itself; the previous strict `>` silently dropped that day.
unused_columns = ['Unnamed: 0',
                  'Max.Dew.PointF',
                  'MeanDew.PointF',
                  'Min.DewpointF',
                  'Max.VisibilityMiles',
                  'Mean.VisibilityMiles',
                  'Min.VisibilityMiles',
                  'Max.Wind.SpeedMPH',
                  'Mean.Wind.SpeedMPH',
                  'Max.Gust.SpeedMPH',
                  'CloudCover',
                  'WindDirDegrees.br...',
                  'Events']  # We'll remove this as it's missing too many entries

# Build the cleaned frame as one chain instead of calling inplace=True methods on
# a filtered slice: each step returns a new frame, so the raw table is never
# touched and re-running this cell always gives the same result.
nyc_weather = (
    all_nyc_weather
    .loc[all_nyc_weather['Date'] >= '2010-01-01']
    .drop(columns=unused_columns)
    .reset_index(drop=True)
)
nyc_weather.head()

Unnamed: 0,Date,Max.TemperatureF,Mean.TemperatureF,Min.TemperatureF,Max.Humidity,Mean.Humidity,Min.Humidity,Max.Sea.Level.PressureIn,Mean.Sea.Level.PressureIn,Min.Sea.Level.PressureIn,PrecipitationIn,city,season
0,2010-01-02,33,25.0,17,89,68,46,29.7,29.67,29.62,0.01,New York City (USA),Winter
1,2010-01-03,24,21.0,18,62,53,44,29.64,29.55,29.47,T,New York City (USA),Winter
2,2010-01-04,32,26.0,20,62,53,43,29.76,29.71,29.66,0.00,New York City (USA),Winter
3,2010-01-05,32,27.0,21,62,53,43,29.76,29.73,29.68,0.00,New York City (USA),Winter
4,2010-01-06,36,31.0,26,63,52,40,29.8,29.73,29.68,0.00,New York City (USA),Winter


In [128]:
# Checking missing entries per column in the trimmed table
nyc_weather.isna().sum()

Date                         0
Max.TemperatureF             0
Mean.TemperatureF            0
Min.TemperatureF             0
Max.Humidity                 0
Mean.Humidity                0
Min.Humidity                 0
Max.Sea.Level.PressureIn     0
Mean.Sea.Level.PressureIn    0
Min.Sea.Level.PressureIn     0
PrecipitationIn              0
city                         0
season                       0
dtype: int64

In [129]:
# The precipitation column mixes numeric amounts with the marker 'T' (trace amount),
# so it cannot be used as a number as loaded.
# Treat trace amounts as 0 and convert the whole column to float.
# assign() returns a new frame instead of mutating nyc_weather in place, which keeps
# the cell safe to re-run and avoids writing into a frame derived from a slice.
nyc_weather = nyc_weather.assign(
    PrecipitationIn=nyc_weather['PrecipitationIn'].replace('T', 0).astype(float)
)

In [130]:
# Confirming change
# Once the column is float, the 'T' filter below is necessarily empty, so also
# assert what could actually be wrong: the dtype and stray missing values.
assert pd.api.types.is_float_dtype(nyc_weather['PrecipitationIn'])
assert nyc_weather['PrecipitationIn'].notna().all()
nyc_weather[nyc_weather['PrecipitationIn'] == 'T'].count()

Date                         0
Max.TemperatureF             0
Mean.TemperatureF            0
Min.TemperatureF             0
Max.Humidity                 0
Mean.Humidity                0
Min.Humidity                 0
Max.Sea.Level.PressureIn     0
Mean.Sea.Level.PressureIn    0
Min.Sea.Level.PressureIn     0
PrecipitationIn              0
city                         0
season                       0
dtype: int64

In [131]:
# Renaming columns for clarity: map each raw dotted header to a readable label
readable_labels = {
    'Max.TemperatureF': 'Max Temp (F)',
    'Mean.TemperatureF': 'Mean Temp (F)',
    'Min.TemperatureF': 'Min Temp (F)',
    'Max.Humidity': 'Max Humidity',
    'Mean.Humidity': 'Mean Humidity',
    'Min.Humidity': 'Min Humidity',
    'Max.Sea.Level.PressureIn': 'Max Pressure (In)',
    'Mean.Sea.Level.PressureIn': 'Mean Pressure (In)',
    'Min.Sea.Level.PressureIn': 'Min Pressure (In)',
    'PrecipitationIn': 'Precipitation (In)',
    'city': 'City',
    'season': 'Season',
}
nyc_weather.rename(columns=readable_labels, inplace=True)

In [132]:
# Preview the cleaned table with its renamed columns
nyc_weather.head()

Unnamed: 0,Date,Max Temp (F),Mean Temp (F),Min Temp (F),Max Humidity,Mean Humidity,Min Humidity,Max Pressure (In),Mean Pressure (In),Min Pressure (In),Precipitation (In),City,Season
0,2010-01-02,33,25.0,17,89,68,46,29.7,29.67,29.62,0.01,New York City (USA),Winter
1,2010-01-03,24,21.0,18,62,53,44,29.64,29.55,29.47,0.0,New York City (USA),Winter
2,2010-01-04,32,26.0,20,62,53,43,29.76,29.71,29.66,0.0,New York City (USA),Winter
3,2010-01-05,32,27.0,21,62,53,43,29.76,29.73,29.68,0.0,New York City (USA),Winter
4,2010-01-06,36,31.0,26,63,52,40,29.8,29.73,29.68,0.0,New York City (USA),Winter


In [139]:
# Description of cleaned dataset we'll use for our analysis
min_date = nyc_weather['Date'].min()
max_date = nyc_weather['Date'].max()

# shape is (rows, columns); row_col is kept in case other cells read it
row_col = nyc_weather.shape
n_rows, n_cols = row_col

print('Cleaned NYC Weather')
print('-----------------------')
# The original message stopped after the number and never said "columns"
print(f'Table contains {n_rows} rows and {n_cols} columns')
print(f'Dates range from {min_date} to {max_date}')

Cleaned NYC Weather
-----------------------
Table contains 2190 rows and 13
Dates range from 2010-01-02 to 2015-12-31


In [140]:
# Saving final csv for analysis
# to_csv() does not create missing folders, so make sure the target directory
# exists first; otherwise a fresh checkout fails on this last cell.
output_path = Path(r'clean_csv/final_weather.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)

# NOTE(review): the default index=True also writes the row index as an unnamed
# first column. Left unchanged in case downstream readers rely on it — confirm.
nyc_weather.to_csv(output_path)