In [66]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression

from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

Adding Dataset

In [45]:
df=pd.read_csv('mumbai-monthly-rains.csv')
df=df.round(decimals = 2)
print("Data heads:")
print(df.head())

Data heads:
   Year    Jan   Feb    Mar  April     May    June    July     Aug    Sept  \
0  1901  13.12  0.00   0.00   3.95   17.14  640.71  888.37  545.05   64.27   
1  1902   0.00  0.00   0.00   0.00    0.36  248.00  408.43  566.60  688.91   
2  1903   0.00  0.00   0.84   0.00  220.57  370.85  902.45  602.42  264.59   
3  1904   0.00  0.00  11.38   0.00    0.00  723.08  390.89  191.58   85.70   
4  1905   0.66  1.71   0.00   0.00    0.00  123.87  581.83  167.38  172.30   

      Oct    Nov    Dec    Total  
0    9.87   0.00   0.00  2182.48  
1   28.65   0.49  19.53  1960.97  
2  157.89   0.00   0.00  2519.61  
3   38.68   0.00   0.00  1441.32  
4    7.37  24.90   0.00  1080.02  


Modelling

In [46]:
df_M=pd.melt(df,id_vars=['Year'],var_name='Month',value_name='RainFall(mm)')
df_M.head()

Unnamed: 0,Year,Month,RainFall(mm)
0,1901,Jan,13.12
1,1902,Jan,0.0
2,1903,Jan,0.0
3,1904,Jan,0.0
4,1905,Jan,0.66


Month column Encoding

In [47]:
Month_map={'Jan':1,'Feb':2,'Mar' :3,'April':4,'May':5,'June':6,'July':7,'Aug':8,'Sept':9,
   'Oct':10,'Nov':11,'Dec':12}
df_M['Month']=df_M['Month'].map(Month_map)

In [48]:
X=np.asanyarray(df_M[['Year','Month']]).astype('int')
y=np.asanyarray(df_M['RainFall(mm)']).astype('int')
print(y.shape)
print(X.shape)

(1573,)
(1573, 2)


  X=np.asanyarray(df_M[['Year','Month']]).astype('int')


Splitting data set

In [49]:

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=10)

Polynomial regression

In [50]:
poly_reg = PolynomialFeatures(degree = 2)
X_poly = poly_reg.fit_transform(X_train)
p_regressor=LinearRegression()
p_regressor.fit(X_poly, y_train)

first decision tree

In [51]:
d_regressor=DecisionTreeRegressor(random_state=0)
d_regressor.fit(X_train,y_train)


second with randomforrest

In [52]:
r_regressor=RandomForestRegressor(n_estimators=10,random_state=0)
r_regressor.fit(X_train,y_train)


Voting Regressor

In [53]:
test_weather = [[2020, 7]]
y_d_pred=d_regressor.predict(test_weather)
y_rf_pred=r_regressor.predict(test_weather)
y_p_pred=p_regressor.predict(poly_reg.transform(test_weather))
print(y_d_pred)
print(y_rf_pred)
print(y_p_pred)

[1302.]
[1097.4]
[197.71988542]


In [54]:
# Predict for a 2020 rainfall july month

from sklearn.ensemble import VotingRegressor

ensemble_model = VotingRegressor([
    ("decision_tree", d_regressor), 
    ("randomforrest", r_regressor),
    ("polynomial", p_regressor)
])
ensemble_model.fit(X=X_train, y=y_train)
y_ensamble_predict=ensemble_model.predict(test_weather)

In [55]:
from sklearn.model_selection import cross_val_score

for model in [d_regressor, r_regressor, ensemble_model]:
    print(-cross_val_score(model, X=X_train, y=y_train, cv=5,
                           scoring="neg_mean_squared_error").mean())

72837.31000822707
56167.24297108187


54831.934819971466


 To obtain a more interpretable metric, we can take the square root to obtain the RMSE. However, for the purposes of determining which model is best, we just need to know which value is smallest. 

In [56]:
r2_score(y_test,ensemble_model.predict(X_test))

0.872066990566641

Stacking

In [62]:
from sklearn.ensemble import StackingRegressor
final_estimator=LinearRegression()
stacking_model = StackingRegressor(estimators=[
    ("decision_tree", d_regressor), 
    ("randomforrest", r_regressor),
    ("polynomial", p_regressor)
],final_estimator=final_estimator)

stacking_model.fit(X=X_train, y=y_train)
y_stack_predict=stacking_model.predict(X_test)
stacker = stacking_model.final_estimator_
stacker.intercept_, stacker.coef_

(-1.1665247594183938, array([-0.07063041,  0.8564411 ,  0.21314905]))

In [63]:
-cross_val_score(stacking_model, X=X_train, y=y_train, cv=5,
                 scoring="neg_mean_squared_error").mean()

52754.6523641865

Visualize it

In [67]:
# Evaluate 

print("Mean Squared Error: %.4f"
      % mean_squared_error(y_test,y_stack_predict, squared=False))

print('Variance Score: %.4f' % stacking_model.score(X_test, y_test))



Mean Squared Error: 240.0547
Variance Score: 0.8774


