In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [2]:
df = pd.read_csv("bq-results.csv", na_values=[9999.9, 999.9])
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 109897 entries, 0 to 109896
Data columns (total 21 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   year                  109897 non-null  int64  
 1   mo                    109897 non-null  int64  
 2   da                    109897 non-null  int64  
 3   temp                  109897 non-null  float64
 4   dewp                  105381 non-null  float64
 5   slp                   81761 non-null   float64
 6   stp                   61606 non-null   float64
 7   visib                 106936 non-null  float64
 8   wdsp                  109419 non-null  float64
 9   mxpsd                 108153 non-null  float64
 10  gust                  44051 non-null   float64
 11  max                   109880 non-null  float64
 12  min                   109884 non-null  float64
 13  prcp                  109897 non-null  float64
 14  sndp                  77 non-null      float64
 15  

In [3]:
df_drop = df.drop(["stp", "gust", "tornado_funnel_cloud", "sndp"], axis=1)
df_drop = df_drop.dropna()
df_drop.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 79409 entries, 56 to 109896
Data columns (total 17 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   year              79409 non-null  int64  
 1   mo                79409 non-null  int64  
 2   da                79409 non-null  int64  
 3   temp              79409 non-null  float64
 4   dewp              79409 non-null  float64
 5   slp               79409 non-null  float64
 6   visib             79409 non-null  float64
 7   wdsp              79409 non-null  float64
 8   mxpsd             79409 non-null  float64
 9   max               79409 non-null  float64
 10  min               79409 non-null  float64
 11  prcp              79409 non-null  float64
 12  fog               79409 non-null  int64  
 13  rain_drizzle      79409 non-null  int64  
 14  snow_ice_pellets  79409 non-null  int64  
 15  hail              79409 non-null  int64  
 16  thunder           79409 non-null  int6

In [4]:
df_drop.head()

Unnamed: 0,year,mo,da,temp,dewp,slp,visib,wdsp,mxpsd,max,min,prcp,fog,rain_drizzle,snow_ice_pellets,hail,thunder
56,2016,10,27,71.7,59.5,1021.3,8.9,4.2,8.9,84.0,60.1,0.0,1,0,0,0,0
59,2016,3,30,73.2,71.1,1011.1,6.3,10.2,17.1,77.0,71.1,0.06,0,1,0,0,0
70,2016,6,28,84.2,74.9,1017.0,8.1,3.0,15.9,93.9,77.0,0.0,1,1,0,0,1
197,2016,9,18,86.3,75.9,1014.6,9.8,2.6,8.9,99.0,77.0,0.0,0,0,0,0,0
198,2016,12,18,50.3,35.6,1022.0,9.1,13.5,21.0,78.1,37.9,0.02,0,1,0,0,0


In [5]:
dfr = df_drop.rename(columns={"mo":"month", "da":"day"})
dfr["Date"] = pd.to_datetime(dfr[["year", "month", "day"]])
dfr.head()

Unnamed: 0,year,month,day,temp,dewp,slp,visib,wdsp,mxpsd,max,min,prcp,fog,rain_drizzle,snow_ice_pellets,hail,thunder,Date
56,2016,10,27,71.7,59.5,1021.3,8.9,4.2,8.9,84.0,60.1,0.0,1,0,0,0,0,2016-10-27
59,2016,3,30,73.2,71.1,1011.1,6.3,10.2,17.1,77.0,71.1,0.06,0,1,0,0,0,2016-03-30
70,2016,6,28,84.2,74.9,1017.0,8.1,3.0,15.9,93.9,77.0,0.0,1,1,0,0,1,2016-06-28
197,2016,9,18,86.3,75.9,1014.6,9.8,2.6,8.9,99.0,77.0,0.0,0,0,0,0,0,2016-09-18
198,2016,12,18,50.3,35.6,1022.0,9.1,13.5,21.0,78.1,37.9,0.02,0,1,0,0,0,2016-12-18


In [6]:
dfr = dfr.sort_values("Date", ignore_index=True)
dfr.head()

Unnamed: 0,year,month,day,temp,dewp,slp,visib,wdsp,mxpsd,max,min,prcp,fog,rain_drizzle,snow_ice_pellets,hail,thunder,Date
0,1941,7,1,84.3,75.0,1015.4,83.7,6.4,16.9,93.4,75.4,99.99,0,1,0,0,1,1941-07-01
1,1941,7,2,83.3,75.1,1016.6,75.8,7.0,15.0,92.3,76.3,0.0,0,0,0,0,1,1941-07-02
2,1941,7,3,85.6,73.8,1016.5,19.8,7.5,13.0,93.4,76.3,0.0,0,0,0,0,0,1941-07-03
3,1941,7,4,86.6,71.5,1014.4,87.2,9.0,21.0,92.3,75.4,0.0,0,0,0,0,0,1941-07-04
4,1941,7,5,84.6,69.7,1012.6,14.3,10.3,13.0,91.2,74.3,0.0,0,0,0,0,0,1941-07-05


In [7]:
df_stgrp = dfr.groupby(["Date"], as_index=False).mean()
df_stgrp.head()

Unnamed: 0,Date,year,month,day,temp,dewp,slp,visib,wdsp,mxpsd,max,min,prcp,fog,rain_drizzle,snow_ice_pellets,hail,thunder
0,1941-07-01,1941.0,7.0,1.0,84.3,75.0,1015.4,83.7,6.4,16.9,93.4,75.4,99.99,0.0,1.0,0.0,0.0,1.0
1,1941-07-02,1941.0,7.0,2.0,83.3,75.1,1016.6,75.8,7.0,15.0,92.3,76.3,0.0,0.0,0.0,0.0,0.0,1.0
2,1941-07-03,1941.0,7.0,3.0,85.6,73.8,1016.5,19.8,7.5,13.0,93.4,76.3,0.0,0.0,0.0,0.0,0.0,0.0
3,1941-07-04,1941.0,7.0,4.0,86.6,71.5,1014.4,87.2,9.0,21.0,92.3,75.4,0.0,0.0,0.0,0.0,0.0,0.0
4,1941-07-05,1941.0,7.0,5.0,84.6,69.7,1012.6,14.3,10.3,13.0,91.2,74.3,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
tomorrow = df_stgrp["rain_drizzle"]
tomorrow.drop(0, inplace=True)
tomorrow = tomorrow.reindex()
tomorrow

1        0.0
2        0.0
3        0.0
4        0.0
5        0.0
        ... 
28778    0.0
28779    0.0
28780    0.0
28781    0.0
28782    0.0
Name: rain_drizzle, Length: 28782, dtype: float64

In [17]:
tomorrow = []
for i in range(len(df_stgrp["rain_drizzle"])-1):
    tomorrow.append(df_stgrp["rain_drizzle"][i+1])
tomorrow

[0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 0.0,
 1.0,
 0.0,
 1.0,
 0.0,
 0.0,
 0.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 1.0,
 1.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 1.0,
 0.0,
 0.0,
 1.0,
 1.0,
 0.0,
 0.0,
 0.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 1.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 1.0,
 0.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 0.0,
 1.0,
 1.0,
 0.0,
 1.0,
 1.0,
 0.0,
 1.0,
 0.0,
 0.0,
 0.0,
 1.0,
 1.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 0.0,
 1.0,
 1.0,
 1.0,
 0.0,
 0.0,
 0.0,
 1.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 0.0,
 0.0,
 0.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 1.0,
 1.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0

In [18]:
dfs = df_stgrp.drop(df_stgrp.tail(1).index)
# dfs = df_stgrp
dfs.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 28782 entries, 0 to 28781
Data columns (total 18 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   Date              28782 non-null  datetime64[ns]
 1   year              28782 non-null  float64       
 2   month             28782 non-null  float64       
 3   day               28782 non-null  float64       
 4   temp              28782 non-null  float64       
 5   dewp              28782 non-null  float64       
 6   slp               28782 non-null  float64       
 7   visib             28782 non-null  float64       
 8   wdsp              28782 non-null  float64       
 9   mxpsd             28782 non-null  float64       
 10  max               28782 non-null  float64       
 11  min               28782 non-null  float64       
 12  prcp              28782 non-null  float64       
 13  fog               28782 non-null  float64       
 14  rain_drizzle      2878

In [19]:
dfs["tomorrow"]=tomorrow
dfs.head(15)

Unnamed: 0,Date,year,month,day,temp,dewp,slp,visib,wdsp,mxpsd,max,min,prcp,fog,rain_drizzle,snow_ice_pellets,hail,thunder,tomorrow
0,1941-07-01,1941.0,7.0,1.0,84.3,75.0,1015.4,83.7,6.4,16.9,93.4,75.4,99.99,0.0,1.0,0.0,0.0,1.0,0.0
1,1941-07-02,1941.0,7.0,2.0,83.3,75.1,1016.6,75.8,7.0,15.0,92.3,76.3,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,1941-07-03,1941.0,7.0,3.0,85.6,73.8,1016.5,19.8,7.5,13.0,93.4,76.3,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1941-07-04,1941.0,7.0,4.0,86.6,71.5,1014.4,87.2,9.0,21.0,92.3,75.4,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1941-07-05,1941.0,7.0,5.0,84.6,69.7,1012.6,14.3,10.3,13.0,91.2,74.3,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,1941-07-06,1941.0,7.0,6.0,85.6,66.6,1013.4,26.3,6.3,9.9,93.4,72.3,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,1941-07-07,1941.0,7.0,7.0,85.7,70.4,1014.7,25.1,6.1,15.9,92.3,74.3,0.0,0.0,0.0,0.0,0.0,0.0,1.0
7,1941-07-08,1941.0,7.0,8.0,84.5,73.3,1016.2,44.1,5.4,9.9,90.3,76.3,99.99,0.0,1.0,0.0,0.0,1.0,1.0
8,1941-07-09,1941.0,7.0,9.0,85.4,74.5,1015.7,80.7,9.3,29.9,94.3,74.3,99.99,0.0,1.0,0.0,0.0,1.0,1.0
9,1941-07-10,1941.0,7.0,10.0,82.7,76.5,1013.2,59.6,6.7,12.0,93.4,77.4,99.99,0.0,1.0,0.0,0.0,1.0,1.0


In [None]:
y = dfr["rain_drizzle"]


In [None]:
X = dfr.drop(["rain_drizzle", "Date"], axis=1)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [None]:
X_scaler = StandardScaler().fit(X_train)

In [None]:
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [None]:
train_scores = []
test_scores = []
for k in range(1, 20, 2):
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train_scaled, y_train)
    train_score = knn.score(X_train_scaled, y_train)
    test_score = knn.score(X_test_scaled, y_test)
    train_scores.append(train_score)
    test_scores.append(test_score)
    print(f"k: {k}, Train/Test Score: {train_score:.3f}/{test_score:.3f}")