In [36]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

Adding Dataset

In [26]:
df=pd.read_csv('mumbai-monthly-rains.csv')
df=df.round(decimals = 2)
print("Data heads:")
print(df.head())

Data heads:
   Year    Jan   Feb    Mar  April     May    June    July     Aug    Sept  \
0  1901  13.12  0.00   0.00   3.95   17.14  640.71  888.37  545.05   64.27   
1  1902   0.00  0.00   0.00   0.00    0.36  248.00  408.43  566.60  688.91   
2  1903   0.00  0.00   0.84   0.00  220.57  370.85  902.45  602.42  264.59   
3  1904   0.00  0.00  11.38   0.00    0.00  723.08  390.89  191.58   85.70   
4  1905   0.66  1.71   0.00   0.00    0.00  123.87  581.83  167.38  172.30   

      Oct    Nov    Dec    Total  
0    9.87   0.00   0.00  2182.48  
1   28.65   0.49  19.53  1960.97  
2  157.89   0.00   0.00  2519.61  
3   38.68   0.00   0.00  1441.32  
4    7.37  24.90   0.00  1080.02  


Modelling

In [27]:
df_M=pd.melt(df,id_vars=['Year'],var_name='Month',value_name='RainFall(mm)')
df_M.head()

Unnamed: 0,Year,Month,RainFall(mm)
0,1901,Jan,13.12
1,1902,Jan,0.0
2,1903,Jan,0.0
3,1904,Jan,0.0
4,1905,Jan,0.66


Month column Encoding

In [28]:
Month_map={'Jan':1,'Feb':2,'Mar' :3,'April':4,'May':5,'June':6,'July':7,'Aug':8,'Sept':9,
   'Oct':10,'Nov':11,'Dec':12}
df_M['Month']=df_M['Month'].map(Month_map)

In [29]:
X=np.asanyarray(df_M[['Year','Month']]).astype('int')
y=np.asanyarray(df_M['RainFall(mm)']).astype('int')
print(y.shape)
print(X.shape)

(1573,)
(1573, 2)


  X=np.asanyarray(df_M[['Year','Month']]).astype('int')


Splitting data set

In [30]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=10)

Medelling

In [31]:
regressor=RandomForestRegressor(n_estimators=10,random_state=0)
regressor.fit(X_train,y_train)

Predicting test results

In [32]:
y_pred=regressor.predict(X_test)
np.set_printoptions(precision=2)
print(np.concatenate((y_pred.reshape(len(y_pred),1),y_test.reshape(len(y_test),1)),1))

[[1.30e+00 0.00e+00]
 [2.50e+01 3.43e+02]
 [1.44e+03 2.44e+03]
 [0.00e+00 0.00e+00]
 [0.00e+00 0.00e+00]
 [2.00e-01 6.00e+00]
 [4.10e+02 1.15e+03]
 [0.00e+00 0.00e+00]
 [2.00e-01 1.00e+00]
 [2.00e-01 0.00e+00]
 [6.91e+02 2.64e+02]
 [1.96e+03 2.43e+03]
 [2.70e+00 0.00e+00]
 [5.58e+02 5.40e+02]
 [7.06e+02 7.42e+02]
 [1.09e+03 8.92e+02]
 [0.00e+00 0.00e+00]
 [9.30e+00 5.00e+00]
 [0.00e+00 0.00e+00]
 [1.99e+03 1.84e+03]
 [0.00e+00 0.00e+00]
 [2.39e+03 2.31e+03]
 [8.52e+02 2.85e+02]
 [0.00e+00 0.00e+00]
 [0.00e+00 2.30e+01]
 [4.48e+01 0.00e+00]
 [1.00e-01 0.00e+00]
 [0.00e+00 0.00e+00]
 [3.10e+02 1.28e+02]
 [1.44e+01 2.10e+01]
 [2.68e+01 5.00e+01]
 [7.33e+02 8.73e+02]
 [5.60e+02 4.28e+02]
 [0.00e+00 0.00e+00]
 [3.80e+00 0.00e+00]
 [2.28e+01 0.00e+00]
 [3.30e+02 4.93e+02]
 [0.00e+00 0.00e+00]
 [1.93e+03 2.47e+03]
 [1.00e-01 0.00e+00]
 [0.00e+00 0.00e+00]
 [0.00e+00 0.00e+00]
 [5.83e+02 7.30e+01]
 [0.00e+00 0.00e+00]
 [4.00e-01 0.00e+00]
 [7.00e-01 0.00e+00]
 [3.10e+00 7.90e+01]
 [4.46e+02 6.

Evaluating Model Performance

In [33]:
r2_score(y_test,y_pred)

0.866013163622908

In [39]:
mean_squared_error(y_test,y_pred, squared=False)
# Evaluate 

print("Mean Squared Error: %.4f"
      % mean_squared_error(y_test,y_pred, squared=False))

print('Variance Score: %.4f' % regressor.score(X_test, y_test))

Mean Squared Error: 250.9302
Variance Score: 0.8660




In [34]:
import pickle

with open('rainfall_model.pkl','wb') as file:
    pickle.dump(regressor,file)

