# 1. Using Random Forest for Regression 

Problem Definition: The task is to predict gas consumption (in millions of gallons) in 48 US states based on petrol tax (in cents), per capita income (in dollars), paved highways (in miles), and the proportion of the population with a driving license.

In [10]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [11]:
# Read the petrol-consumption table and preview its first five rows.
dataset = pd.read_csv("petrol_consumption.csv")
dataset.head(5)

Unnamed: 0,Petrol_tax,Average_income,Paved_Highways,Population_Driver_licence(%),Petrol_Consumption
0,9.0,3571,1976,0.525,541
1,9.0,4092,1250,0.572,524
2,9.0,3865,1586,0.58,561
3,7.5,4870,2351,0.529,414
4,8.0,4399,431,0.544,410


In [12]:
# Accuracy goal: the model's RMSE should be below 10% of the mean of the
# target variable. Display that threshold (10% of the mean petrol consumption).
dataset["Petrol_Consumption"].mean() * 0.1

57.67708333333334

In [13]:
# Prepare the data: separate the label (target) column from the attributes.
y = dataset["Petrol_Consumption"]
x = dataset.drop(columns=["Petrol_Consumption"])


In [14]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

# Divide the data into training and testing sets: 20% of the rows are held
# out for testing, and the fixed random_state makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=0
)

# Train a 20-tree random forest on the training rows, then predict the
# held-out rows.
regressor = RandomForestRegressor(n_estimators=20, random_state=0)
y_pred = regressor.fit(x_train, y_train).predict(x_test)

In [15]:
# Evaluate the regressor's predictions on the held-out test rows.
from sklearn import metrics

mae = metrics.mean_absolute_error(y_test, y_pred)
mse = metrics.mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)  # RMSE is the square root of the MSE computed above

print("Mean Absolute Error:", mae)
print("Mean Squared Error:", mse)
print("Root Mean Squared Error:", rmse)

Mean Absolute Error: 51.76500000000001
Mean Squared Error: 4216.166749999999
Root Mean Squared Error: 64.93201637097064


In [16]:
from sklearn.ensemble import RandomForestRegressor

# Refit with 250 trees (same seed, same split) and report the test-set RMSE.
regressor = RandomForestRegressor(n_estimators=250, random_state=0)
y_pred = regressor.fit(x_train, y_train).predict(x_test)
rmse = np.sqrt(metrics.mean_squared_error(y_test, y_pred))
print("Root Mean Squared Error:", rmse)

Root Mean Squared Error: 59.02340544563658


# 2. Using Random Forest for Classification

Problem Definition: The task here is to predict whether a bank currency note is authentic or not, based on four attributes of the wavelet-transformed image of the note: variance, skewness, entropy, and kurtosis.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Read the banknote-authentication table and preview its first five rows.
dataset = pd.read_csv("bill_authentication.csv")
dataset.head(5)

In [None]:
# Dimensions of the loaded table as (rows, columns).
dataset.shape

In [None]:
# Divide the data into attributes and labels: "Class" is the label column,
# every other column is an attribute.
y = dataset["Class"]
x = dataset.drop(columns=["Class"])

In [None]:
from sklearn.model_selection import train_test_split

# Divide the data into training and testing sets: 20% of the rows are held
# out for testing, and the fixed random_state makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=0
)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Train a 20-tree random forest classifier on the training rows, then
# predict the class of each held-out row.
classifier = RandomForestClassifier(n_estimators=20, random_state=0)
y_pred = classifier.fit(x_train, y_train).predict(x_test)

In [None]:
# Evaluating the Algorithm

from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [None]:
# Confusion matrix of the true test labels against the predicted labels.
confusion_matrix(y_true=y_test, y_pred=y_pred)

In [None]:
# Print scikit-learn's text classification report for the test-set predictions.
report = classification_report(y_test, y_pred)
print(report)

In [None]:
# Test-set accuracy expressed as a percentage.
100 * accuracy_score(y_test, y_pred)

In [None]:
# Find a good number of estimators. More estimators means more trees and
# therefore more training time, so compare the error across forest sizes.
# The same approach works for RandomForestClassifier or RandomForestRegressor
# by varying n_estimators.
# NOTE(review): the error below is measured on the test split, so a value
# picked from this curve is tuned to that split.

# Candidate n_estimators values: 10, 20, ..., 200.
n_estimator_values = range(10, 201, 10)

# Fraction of misclassified test rows for each forest size.
error = []
for n_trees in n_estimator_values:
    classifier = RandomForestClassifier(n_estimators=n_trees, random_state=100)
    pred_i = classifier.fit(x_train, y_train).predict(x_test)
    error.append(np.mean(pred_i != y_test))

# Plot the error values against n_estimators.
plt.figure(figsize=(12, 6))
plt.plot(
    n_estimator_values,
    error,
    color='red',
    linestyle='dashed',
    marker='o',
    markerfacecolor='blue',
    markersize=10,
)
plt.xticks(np.arange(10, 201, step=10))
plt.title('Error Rate n_estimator Value')
plt.xlabel('n_estimator Value')
plt.ylabel('Mean Error');