In [23]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

import datetime as dt

In [2]:
# Numeric codes that read_csv should parse as NaN instead of real measurements.
# NOTE(review): the sorted preview further down shows prcp == 99.99 on the first
# row, which looks like another "missing" code that is not covered here -- confirm
# against the data dictionary before relying on prcp.
MISSING_VALUE_CODES = [9999.9, 999.9]

df = pd.read_csv("bq-results.csv", na_values=MISSING_VALUE_CODES)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 109897 entries, 0 to 109896
Data columns (total 21 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   year                  109897 non-null  int64  
 1   mo                    109897 non-null  int64  
 2   da                    109897 non-null  int64  
 3   temp                  109897 non-null  float64
 4   dewp                  105381 non-null  float64
 5   slp                   81761 non-null   float64
 6   stp                   61606 non-null   float64
 7   visib                 106936 non-null  float64
 8   wdsp                  109419 non-null  float64
 9   mxpsd                 108153 non-null  float64
 10  gust                  44051 non-null   float64
 11  max                   109880 non-null  float64
 12  min                   109884 non-null  float64
 13  prcp                  109897 non-null  float64
 14  sndp                  77 non-null      float64
 15  

In [4]:
# Remove four columns up front, then keep only the rows that are complete in
# every remaining column. (stp, gust and sndp have the lowest non-null counts
# in the df.info() listing above.)
df_drop = df.drop(
    columns=["stp", "gust", "tornado_funnel_cloud", "sndp"]
).dropna()
df_drop.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 79409 entries, 56 to 109896
Data columns (total 17 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   year              79409 non-null  int64  
 1   mo                79409 non-null  int64  
 2   da                79409 non-null  int64  
 3   temp              79409 non-null  float64
 4   dewp              79409 non-null  float64
 5   slp               79409 non-null  float64
 6   visib             79409 non-null  float64
 7   wdsp              79409 non-null  float64
 8   mxpsd             79409 non-null  float64
 9   max               79409 non-null  float64
 10  min               79409 non-null  float64
 11  prcp              79409 non-null  float64
 12  fog               79409 non-null  int64  
 13  rain_drizzle      79409 non-null  int64  
 14  snow_ice_pellets  79409 non-null  int64  
 15  hail              79409 non-null  int64  
 16  thunder           79409 non-null  int6

In [5]:
# Preview the first rows of the frame left after the column/NaN drop; the index
# keeps the original row labels because dropna() does not renumber rows.
df_drop.head()

Unnamed: 0,year,mo,da,temp,dewp,slp,visib,wdsp,mxpsd,max,min,prcp,fog,rain_drizzle,snow_ice_pellets,hail,thunder
56,2016,10,27,71.7,59.5,1021.3,8.9,4.2,8.9,84.0,60.1,0.0,1,0,0,0,0
59,2016,3,30,73.2,71.1,1011.1,6.3,10.2,17.1,77.0,71.1,0.06,0,1,0,0,0
70,2016,6,28,84.2,74.9,1017.0,8.1,3.0,15.9,93.9,77.0,0.0,1,1,0,0,1
197,2016,9,18,86.3,75.9,1014.6,9.8,2.6,8.9,99.0,77.0,0.0,0,0,0,0,0
198,2016,12,18,50.3,35.6,1022.0,9.1,13.5,21.0,78.1,37.9,0.02,0,1,0,0,0


In [6]:
# pd.to_datetime can assemble a timestamp from columns named year/month/day,
# so the abbreviated "mo"/"da" columns are renamed first and the combined
# timestamp is appended as a new "Date" column.
dfr = (
    df_drop
    .rename(columns={"mo": "month", "da": "day"})
    .assign(Date=lambda frame: pd.to_datetime(frame[["year", "month", "day"]]))
)
dfr.head()

Unnamed: 0,year,month,day,temp,dewp,slp,visib,wdsp,mxpsd,max,min,prcp,fog,rain_drizzle,snow_ice_pellets,hail,thunder,Date
56,2016,10,27,71.7,59.5,1021.3,8.9,4.2,8.9,84.0,60.1,0.0,1,0,0,0,0,2016-10-27
59,2016,3,30,73.2,71.1,1011.1,6.3,10.2,17.1,77.0,71.1,0.06,0,1,0,0,0,2016-03-30
70,2016,6,28,84.2,74.9,1017.0,8.1,3.0,15.9,93.9,77.0,0.0,1,1,0,0,1,2016-06-28
197,2016,9,18,86.3,75.9,1014.6,9.8,2.6,8.9,99.0,77.0,0.0,0,0,0,0,0,2016-09-18
198,2016,12,18,50.3,35.6,1022.0,9.1,13.5,21.0,78.1,37.9,0.02,0,1,0,0,0,2016-12-18


In [8]:
# Put the observations in chronological order; ignore_index renumbers the rows
# 0..n-1 so the old row labels from the raw file are discarded.
dfr = dfr.sort_values(by=["Date"], ascending=True, ignore_index=True)
dfr.head()


Unnamed: 0,year,month,day,temp,dewp,slp,visib,wdsp,mxpsd,max,min,prcp,fog,rain_drizzle,snow_ice_pellets,hail,thunder,Date
0,1941,7,1,84.3,75.0,1015.4,83.7,6.4,16.9,93.4,75.4,99.99,0,1,0,0,1,1941-07-01
1,1941,7,2,83.3,75.1,1016.6,75.8,7.0,15.0,92.3,76.3,0.0,0,0,0,0,1,1941-07-02
2,1941,7,3,85.6,73.8,1016.5,19.8,7.5,13.0,93.4,76.3,0.0,0,0,0,0,0,1941-07-03
3,1941,7,4,86.6,71.5,1014.4,87.2,9.0,21.0,92.3,75.4,0.0,0,0,0,0,0,1941-07-04
4,1941,7,5,84.6,69.7,1012.6,14.3,10.3,13.0,91.2,74.3,0.0,0,0,0,0,0,1941-07-05


In [11]:
# Collapse all rows that share a Date into one row holding the mean of every
# other column. as_index=False keeps "Date" as an ordinary column. Note that
# the integer columns (year/month/day and the 0/1 event flags) come out as
# floats after averaging.
rows_by_date = dfr.groupby("Date", as_index=False)
df_stgrp = rows_by_date.mean()
df_stgrp.head()

Unnamed: 0,Date,year,month,day,temp,dewp,slp,visib,wdsp,mxpsd,max,min,prcp,fog,rain_drizzle,snow_ice_pellets,hail,thunder
0,1941-07-01,1941.0,7.0,1.0,84.3,75.0,1015.4,83.7,6.4,16.9,93.4,75.4,99.99,0.0,1.0,0.0,0.0,1.0
1,1941-07-02,1941.0,7.0,2.0,83.3,75.1,1016.6,75.8,7.0,15.0,92.3,76.3,0.0,0.0,0.0,0.0,0.0,1.0
2,1941-07-03,1941.0,7.0,3.0,85.6,73.8,1016.5,19.8,7.5,13.0,93.4,76.3,0.0,0.0,0.0,0.0,0.0,0.0
3,1941-07-04,1941.0,7.0,4.0,86.6,71.5,1014.4,87.2,9.0,21.0,92.3,75.4,0.0,0.0,0.0,0.0,0.0,0.0
4,1941-07-05,1941.0,7.0,5.0,84.6,69.7,1012.6,14.3,10.3,13.0,91.2,74.3,0.0,0.0,0.0,0.0,0.0,0.0


In [30]:
# Whole days elapsed between each row's Date and the earliest Date in the frame.
dates = df_stgrp["Date"]

# Vectorised subtraction: the minimum is evaluated once rather than once per
# row (the previous Python loop recomputed dates.min() on every iteration).
day_count = (dates - dates.min()).dt.days

# assign() returns a new frame, so df_stgrp keeps the exact shape that the
# earlier cell displayed; every later column is added to df_days only.
df_days = df_stgrp.assign(days=day_count)

In [31]:
def seasonality(day_counts, period_days=365.2425):
    """Encode day offsets as sine/cosine coordinates on a repeating cycle.

    Two offsets that differ by a whole number of periods map to the same
    (sin, cos) pair, which lets a model treat the cycle as continuous instead
    of seeing a jump where one period ends and the next begins.

    Parameters
    ----------
    day_counts : scalar, list/tuple, numpy.ndarray or pandas.Series
        Offsets in days from an arbitrary reference day.
    period_days : float, default 365.2425
        Length of one full cycle in days. The default is the year length this
        function previously hard-coded.

    Returns
    -------
    tuple
        ``(year_sin, year_cos)``. A pandas Series input yields Series with the
        same index; lists and arrays yield numpy arrays.
    """
    radians_per_day = 2 * np.pi / period_days
    # np.multiply (instead of the `*` operator) lets plain Python sequences
    # through while still returning a Series when a Series is passed in.
    phase = np.multiply(day_counts, radians_per_day)
    return np.sin(phase), np.cos(phase)


In [36]:
# Turn the running day offset into its two cyclical coordinates and store
# each one as its own column on df_days.
seasonal_terms = seasonality(df_days["days"])
year_sin, year_cos = seasonal_terms
df_days["year sin"], df_days["year cos"] = year_sin, year_cos
df_days.head()

Unnamed: 0,Date,year,month,day,temp,dewp,slp,visib,wdsp,mxpsd,...,min,prcp,fog,rain_drizzle,snow_ice_pellets,hail,thunder,days,year sin,year cos
0,1941-07-01,1941.0,7.0,1.0,84.3,75.0,1015.4,83.7,6.4,16.9,...,75.4,99.99,0.0,1.0,0.0,0.0,1.0,0,0.0,1.0
1,1941-07-02,1941.0,7.0,2.0,83.3,75.1,1016.6,75.8,7.0,15.0,...,76.3,0.0,0.0,0.0,0.0,0.0,1.0,1,0.017202,0.999852
2,1941-07-03,1941.0,7.0,3.0,85.6,73.8,1016.5,19.8,7.5,13.0,...,76.3,0.0,0.0,0.0,0.0,0.0,0.0,2,0.034399,0.999408
3,1941-07-04,1941.0,7.0,4.0,86.6,71.5,1014.4,87.2,9.0,21.0,...,75.4,0.0,0.0,0.0,0.0,0.0,0.0,3,0.051585,0.998669
4,1941-07-05,1941.0,7.0,5.0,84.6,69.7,1012.6,14.3,10.3,13.0,...,74.3,0.0,0.0,0.0,0.0,0.0,0.0,4,0.068757,0.997633


In [46]:
# Discard every row whose year is 1945 or earlier. "year" is a float here
# because it went through the per-date mean, but the comparison is unaffected.
is_1945_or_earlier = df_days["year"] <= 1945
df_short = df_days.drop(index=df_days.index[is_1945_or_earlier])


In [49]:

# Columns removed before export: the calendar parts and the day offset (the
# Date and year sin/cos columns remain), plus mxpsd and prcp.
columns_to_remove = ["month", "day", "mxpsd", "prcp", "days", "year"]
df_clean = df_short.drop(columns=columns_to_remove)
df_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 27140 entries, 1643 to 28782
Data columns (total 15 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   Date              27140 non-null  datetime64[ns]
 1   temp              27140 non-null  float64       
 2   dewp              27140 non-null  float64       
 3   slp               27140 non-null  float64       
 4   visib             27140 non-null  float64       
 5   wdsp              27140 non-null  float64       
 6   max               27140 non-null  float64       
 7   min               27140 non-null  float64       
 8   fog               27140 non-null  float64       
 9   rain_drizzle      27140 non-null  float64       
 10  snow_ice_pellets  27140 non-null  float64       
 11  hail              27140 non-null  float64       
 12  thunder           27140 non-null  float64       
 13  year sin          27140 non-null  float64       
 14  year cos          2

In [50]:
# Persist the cleaned frame. The row index is written as the first, unnamed
# CSV column (pandas' default), so readers should load it with index_col=0.
df_clean.to_csv("model_data.csv", index=True)