In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

from sklearn.metrics import r2_score

Adding Dataset

In [3]:
df=pd.read_csv('mumbai-monthly-rains.csv')
df=df.round(decimals = 2)
print("Data heads:")
print(df.head())

Data heads:
   Year    Jan   Feb    Mar  April     May    June    July     Aug    Sept  \
0  1901  13.12  0.00   0.00   3.95   17.14  640.71  888.37  545.05   64.27   
1  1902   0.00  0.00   0.00   0.00    0.36  248.00  408.43  566.60  688.91   
2  1903   0.00  0.00   0.84   0.00  220.57  370.85  902.45  602.42  264.59   
3  1904   0.00  0.00  11.38   0.00    0.00  723.08  390.89  191.58   85.70   
4  1905   0.66  1.71   0.00   0.00    0.00  123.87  581.83  167.38  172.30   

      Oct    Nov    Dec    Total  
0    9.87   0.00   0.00  2182.48  
1   28.65   0.49  19.53  1960.97  
2  157.89   0.00   0.00  2519.61  
3   38.68   0.00   0.00  1441.32  
4    7.37  24.90   0.00  1080.02  


Modelling

In [4]:
df_M=pd.melt(df,id_vars=['Year'],var_name='Month',value_name='RainFall(mm)')
df_M.head()

Unnamed: 0,Year,Month,RainFall(mm)
0,1901,Jan,13.12
1,1902,Jan,0.0
2,1903,Jan,0.0
3,1904,Jan,0.0
4,1905,Jan,0.66


Month column Encoding

In [5]:
Month_map={'Jan':1,'Feb':2,'Mar' :3,'April':4,'May':5,'June':6,'July':7,'Aug':8,'Sept':9,
   'Oct':10,'Nov':11,'Dec':12}
df_M['Month']=df_M['Month'].map(Month_map)

In [6]:
X=np.asanyarray(df_M[['Year','Month']]).astype('int')
y=np.asanyarray(df_M['RainFall(mm)']).astype('int')
print(y.shape)
print(X.shape)

(1573,)
(1573, 2)


  X=np.asanyarray(df_M[['Year','Month']]).astype('int')


Splitting data set

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=10)

Decision Tree

In [8]:

regressor=DecisionTreeRegressor(random_state=0)
regressor.fit(X_train,y_train)

Predicting the testset result

In [9]:
y_pred=regressor.predict(X_test)
p_y_pred_test=regressor.predict(X_test)
np.set_printoptions(precision=2)
print(np.concatenate((p_y_pred_test.reshape(len(p_y_pred_test),1),y_test.reshape(len(y_test),1)),1))

[[1.00e+00 0.00e+00]
 [0.00e+00 3.43e+02]
 [1.30e+03 2.44e+03]
 [0.00e+00 0.00e+00]
 [0.00e+00 0.00e+00]
 [0.00e+00 6.00e+00]
 [3.20e+02 1.15e+03]
 [0.00e+00 0.00e+00]
 [2.00e+00 1.00e+00]
 [0.00e+00 0.00e+00]
 [7.19e+02 2.64e+02]
 [1.80e+03 2.43e+03]
 [0.00e+00 0.00e+00]
 [6.81e+02 5.40e+02]
 [8.84e+02 7.42e+02]
 [1.35e+03 8.92e+02]
 [0.00e+00 0.00e+00]
 [0.00e+00 5.00e+00]
 [0.00e+00 0.00e+00]
 [1.54e+03 1.84e+03]
 [0.00e+00 0.00e+00]
 [2.29e+03 2.31e+03]
 [1.13e+03 2.85e+02]
 [0.00e+00 0.00e+00]
 [0.00e+00 2.30e+01]
 [3.90e+01 0.00e+00]
 [0.00e+00 0.00e+00]
 [0.00e+00 0.00e+00]
 [2.06e+02 1.28e+02]
 [2.40e+01 2.10e+01]
 [4.30e+01 5.00e+01]
 [7.68e+02 8.73e+02]
 [5.56e+02 4.28e+02]
 [0.00e+00 0.00e+00]
 [1.00e+00 0.00e+00]
 [0.00e+00 0.00e+00]
 [3.00e+02 4.93e+02]
 [0.00e+00 0.00e+00]
 [1.72e+03 2.47e+03]
 [0.00e+00 0.00e+00]
 [0.00e+00 0.00e+00]
 [0.00e+00 0.00e+00]
 [3.74e+02 7.30e+01]
 [0.00e+00 0.00e+00]
 [4.00e+00 0.00e+00]
 [1.00e+00 0.00e+00]
 [0.00e+00 7.90e+01]
 [4.83e+02 6.

Evaluating Model Performance

In [10]:
r2_score(y_test,p_y_pred_test)

0.830103773857245

In [11]:
# Evaluate 
from sklearn.metrics import mean_squared_error
print("Mean Squared Error: %.4f"
      % mean_squared_error(y_test,y_pred, squared=False))

print('Variance Score: %.4f' % regressor.score(X_test, y_test))

Mean Squared Error: 282.5619
Variance Score: 0.8301
