## Decision Tree Regressor

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib_inline

In [3]:
df=pd.read_csv("regression_cleaned_data.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,VISIBILITY,DRYBULBTEMPF,WETBULBTEMPF,DewPointTempF,RelativeHumidity,WindSpeed,WindDirection,StationPressure,SeaLevelPressure,Precip,MONTH
0,0,6.0,33,32,31,92,0,0,29.97,29.99,0.01,1
1,1,6.0,33,33,32,96,0,0,29.97,29.99,0.02,1
2,2,5.0,33,33,32,96,0,0,29.97,29.99,0.02,1
3,3,5.0,33,33,32,96,0,0,29.95,29.97,0.02,1
4,4,5.0,33,32,31,92,0,0,29.93,29.96,0.02,1


In [6]:
df=df.drop(['Unnamed: 0'],axis=1)

In [7]:
df.head()

Unnamed: 0,VISIBILITY,DRYBULBTEMPF,WETBULBTEMPF,DewPointTempF,RelativeHumidity,WindSpeed,WindDirection,StationPressure,SeaLevelPressure,Precip,MONTH
0,6.0,33,32,31,92,0,0,29.97,29.99,0.01,1
1,6.0,33,33,32,96,0,0,29.97,29.99,0.02,1
2,5.0,33,33,32,96,0,0,29.97,29.99,0.02,1
3,5.0,33,33,32,96,0,0,29.95,29.97,0.02,1
4,5.0,33,32,31,92,0,0,29.93,29.96,0.02,1


## Independent and dependent features

In [8]:
y= df.iloc[:,0] #independent feature
X = df.iloc[:, 1:] #dependent feature

## Train Test Split

In [9]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.33,random_state=42)


In [10]:
from sklearn.tree import DecisionTreeRegressor
regressor=DecisionTreeRegressor()

In [11]:
regressor.fit(X_train,y_train)

In [12]:
y_pred=regressor.predict(X_test)

In [13]:
y_pred

array([10. , 10. ,  0.5, ..., 10. , 10. , 10. ])

## Accuracy of the model

In [14]:
from sklearn.metrics import r2_score
score=r2_score(y_pred,y_test)

In [15]:
score

0.42238646672318914

## Hyperparameter Tuning

In [21]:
parameter = {
    'criterion': ['mse', 'friedman_mse', 'poisson', 'mae'],  # 'mse' is the correct criterion for squared error
    'splitter': ['best', 'random'],
    'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 10, 11, 12],
    'max_features': ['auto', 'sqrt', 'log2']
}

regressor = DecisionTreeRegressor()

import warnings
warnings.filterwarnings("ignore")
from sklearn.model_selection import GridSearchCV
regressorcv = GridSearchCV(regressor, param_grid=parameter, cv=2, scoring='neg_mean_squared_error')

regressorcv.fit(X_train, y_train)


In [22]:
regressorcv.best_params_

{'criterion': 'poisson',
 'max_depth': 7,
 'max_features': 'sqrt',
 'splitter': 'best'}

In [23]:
y_pred=regressorcv.predict(X_test)

In [24]:
r2_score(y_pred,y_test)

-0.25132475770812257