In [2]:
import pandas as pd
import numpy as np
dataset = pd.read_csv('petrol_consumption.csv')
dataset.head()


Unnamed: 0,Petrol_tax,Average_income,Paved_Highways,Population_Driver_licence(%),Petrol_Consumption
0,9.0,3571,1976,0.525,541
1,9.0,4092,1250,0.572,524
2,9.0,3865,1586,0.58,561
3,7.5,4870,2351,0.529,414
4,8.0,4399,431,0.544,410


In [3]:
X = dataset.iloc[:, :-1].values

# For demonstration, let's assume 'Petrol_Consumption' has been converted to a categorical target variable
y = np.where(dataset['Petrol_Consumption'] > dataset['Petrol_Consumption'].median(), 1, 0)


In [4]:
y

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0,
       0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 0, 1, 0])

In [5]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)


In [8]:
from sklearn.ensemble import RandomForestClassifier

classifier = RandomForestClassifier(n_estimators=4, random_state=0)
classifier.fit(X_train, y_train)

RandomForestClassifier(n_estimators=4, random_state=0)

In [9]:
from sklearn import metrics

y_pred = classifier.predict(X_test)
print('Test Accuracy:', metrics.accuracy_score(y_test, y_pred))
print('Test Precision:', metrics.precision_score(y_test, y_pred))
print('Test Recall:', metrics.recall_score(y_test, y_pred))

Test Accuracy: 0.6
Test Precision: 0.75
Test Recall: 0.5


In [11]:
params = {
    'max_depth': [1, 2, 5, 10, 20],
    'min_samples_leaf': [5, 10, 20, 50, 100],
    'max_features': [2,3,4],
    'n_estimators': [10, 30, 50, 100, 200]
}

In [14]:
from sklearn.model_selection import GridSearchCV


In [15]:
classifier = RandomForestClassifier()

In [21]:
# Instantiate the grid search model
grid_search = GridSearchCV(estimator=classifier, param_grid=params, 
                          n_jobs=-1, verbose=2, scoring = "accuracy")

In [26]:
%%time
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 375 candidates, totalling 1875 fits
Wall time: 1min 24s


GridSearchCV(estimator=RandomForestClassifier(), n_jobs=-1,
             param_grid={'max_depth': [1, 2, 5, 10, 20],
                         'max_features': [2, 3, 4],
                         'min_samples_leaf': [5, 10, 20, 50, 100],
                         'n_estimators': [10, 30, 50, 100, 200]},
             scoring='accuracy', verbose=2)

In [27]:
rf_best = grid_search.best_estimator_

In [28]:
rf_best

RandomForestClassifier(max_depth=2, max_features=2, min_samples_leaf=5,
                       n_estimators=50)

In [32]:
from sklearn.metrics import confusion_matrix, accuracy_score

In [33]:
def evaluate_model(dt_classifier):
    print("Train Accuracy :", accuracy_score(y_train, dt_classifier.predict(X_train)))
    print("Train Confusion Matrix:")
    print(confusion_matrix(y_train, dt_classifier.predict(X_train)))
    print("-"*50)
    print("Test Accuracy :", accuracy_score(y_test, dt_classifier.predict(X_test)))
    print("Test Confusion Matrix:")
    print(confusion_matrix(y_test, dt_classifier.predict(X_test)))

In [34]:
evaluate_model(rf_best)

Train Accuracy : 0.9473684210526315
Train Confusion Matrix:
[[18  2]
 [ 0 18]]
--------------------------------------------------
Test Accuracy : 0.8
Test Confusion Matrix:
[[4 0]
 [2 4]]
