In [48]:
import sqlite3
import pandas as pd 
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from math import sqrt

In [49]:
data = pd.read_csv('daily_weather.csv')

In [50]:
data.head(10)

Unnamed: 0,number,air_pressure_9am,air_temp_9am,avg_wind_direction_9am,avg_wind_speed_9am,max_wind_direction_9am,max_wind_speed_9am,rain_accumulation_9am,rain_duration_9am,relative_humidity_9am,relative_humidity_3pm
0,0,918.06,74.822,271.1,2.080354,295.4,2.863283,0.0,0.0,42.42,36.16
1,1,917.347688,71.403843,101.935179,2.443009,140.471548,3.533324,0.0,0.0,24.328697,19.426597
2,2,923.04,60.638,51.0,17.067852,63.7,22.100967,0.0,20.0,8.9,14.46
3,3,920.502751,70.138895,198.832133,4.337363,211.203341,5.190045,0.0,0.0,12.189102,12.742547
4,4,921.16,44.294,277.8,1.85666,136.5,2.863283,8.9,14730.0,92.41,76.74
5,5,915.3,78.404,182.8,9.932014,189.0,10.983375,0.02,170.0,35.13,33.93
6,6,915.598868,70.043304,177.875407,3.745587,186.606696,4.589632,0.0,0.0,10.657422,21.385657
7,7,918.07,51.71,242.4,2.527742,271.6,3.646212,0.0,0.0,80.47,74.92
8,8,920.08,80.582,40.7,4.518619,63.0,5.883152,0.0,0.0,29.58,24.03
9,9,915.01,47.498,163.1,4.943637,195.9,6.576604,0.0,0.0,88.6,68.05


In [51]:
data.shape

(1095, 11)

In [52]:
data.columns

Index(['number', 'air_pressure_9am', 'air_temp_9am', 'avg_wind_direction_9am',
       'avg_wind_speed_9am', 'max_wind_direction_9am', 'max_wind_speed_9am',
       'rain_accumulation_9am', 'rain_duration_9am', 'relative_humidity_9am',
       'relative_humidity_3pm'],
      dtype='object')

In [53]:
features = ['air_pressure_9am', 'air_temp_9am', 'avg_wind_direction_9am',
       'avg_wind_speed_9am', 'max_wind_direction_9am', 'rain_duration_9am',
       'rain_accumulation_9am']

target = ['max_wind_speed_9am']

In [54]:
data = data.dropna()
data.shape

(1064, 11)

In [55]:
X = data[features]
y = data[target]

** Let us look at a typical row from our features: **

In [56]:
X.iloc[334]

air_pressure_9am          920.519457
air_temp_9am               81.371746
avg_wind_direction_9am    159.722535
avg_wind_speed_9am          2.614439
max_wind_direction_9am    190.369374
rain_duration_9am           0.000000
rain_accumulation_9am       0.000000
Name: 340, dtype: float64

In [57]:
y

Unnamed: 0,max_wind_speed_9am
0,2.863283
1,3.533324
2,22.100967
3,5.190045
4,2.863283
...,...
1090,5.212070
1091,2.371156
1092,3.892276
1093,4.764682


In [58]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=324)

In [59]:
regressor = LinearRegression()
regressor.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [60]:
y_prediction = regressor.predict(X_test)
y_prediction

array([[ 2.44666438],
       [ 3.30818495],
       [ 3.36418206],
       [ 1.91613918],
       [ 2.32057236],
       [19.64129655],
       [ 4.76444817],
       [ 5.26713293],
       [ 1.79260855],
       [ 5.20888803],
       [10.90125885],
       [ 7.37228918],
       [ 9.96959184],
       [11.80449922],
       [15.27071029],
       [ 2.40186543],
       [ 1.95436533],
       [ 6.6273803 ],
       [11.18131495],
       [27.1769888 ],
       [ 7.78452297],
       [ 2.74112796],
       [10.8430872 ],
       [15.63713319],
       [ 5.18736461],
       [11.48771606],
       [ 8.95023577],
       [ 4.02955303],
       [ 1.81971448],
       [14.53228123],
       [11.63821003],
       [ 4.67188312],
       [ 3.95210257],
       [13.54333909],
       [ 5.39973904],
       [ 9.41116904],
       [ 4.18136908],
       [ 8.76634284],
       [ 4.08033559],
       [ 6.64583344],
       [23.34590471],
       [ 3.95718465],
       [10.5227379 ],
       [22.03517337],
       [ 9.55400303],
       [ 8

In [61]:
y_test.describe()

Unnamed: 0,max_wind_speed_9am
count,352.0
mean,6.817839
std,5.800063
min,1.275056
25%,2.903676
50%,4.775867
75%,8.321417
max,29.84078


In [62]:
RMSE = sqrt(mean_squared_error(y_true = y_test, y_pred = y_prediction))
print(RMSE)

0.4820521999227169


** Using decision tree model to see RMSE

In [63]:
regressor = DecisionTreeRegressor(max_depth=20)
regressor.fit(X_train, y_train)

DecisionTreeRegressor(criterion='mse', max_depth=20, max_features=None,
                      max_leaf_nodes=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      presort=False, random_state=None, splitter='best')

In [64]:
y_prediction = regressor.predict(X_test)
y_prediction

array([ 2.3040482 ,  2.89129467,  4.0041226 ,  2.0356154 ,  2.5948504 ,
       18.0521058 ,  4.585727  ,  5.4581336 ,  1.5211192 ,  5.5699806 ,
       10.9386366 ,  6.934514  ,  9.1043458 , 11.1623306 , 15.5914718 ,
        2.6882476 ,  2.20327371,  6.51634708, 10.7596814 , 25.3668996 ,
        7.0016222 ,  2.9527608 , 10.78090554, 14.9651286 ,  4.8317904 ,
       10.23926855,  8.500372  ,  3.7133204 ,  1.8890032 , 14.56586307,
       10.8475151 ,  4.362033  ,  3.8922756 , 13.4440094 ,  4.77104404,
        8.6793272 ,  4.8317904 ,  8.51419256,  3.8922756 ,  5.816044  ,
       23.5549782 ,  4.84472325,  9.4398868 , 22.1904448 ,  9.9096442 ,
        8.6345884 ,  2.6172198 ,  2.6619586 ,  3.4225182 ,  5.40260097,
       25.81563062,  1.77940322, 13.1308378 ,  5.8831522 ,  5.8607828 ,
        9.5741032 ,  5.0107456 ,  4.0488614 ,  3.31059535,  2.6172198 ,
        7.270055  , 12.7281886 ,  2.0803542 , 10.7596814 ,  5.8607828 ,
        1.9685072 ,  3.8251674 ,  7.45194855,  7.4490102 , 10.08

In [65]:
y_test.describe()

Unnamed: 0,max_wind_speed_9am
count,352.0
mean,6.817839
std,5.800063
min,1.275056
25%,2.903676
50%,4.775867
75%,8.321417
max,29.84078


In [66]:
RMSE = sqrt(mean_squared_error(y_true = y_test, y_pred = y_prediction))
print(RMSE)

0.6259760063431123
