## Random Forest

Import necessary libraries

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

Import data

In [None]:
# Load the dataset; the relative path is resolved against the kernel's working directory.
data = pd.read_csv('lowdata.csv')
# Preview the first five rows as a quick sanity check of the columns and values.
data.head(n=5)

Setting our independent and dependent variables

In [None]:
# Feature matrix: every column except the first one and the last two.
# NOTE(review): the second-to-last column ends up in neither x_data nor y_data —
# presumably on purpose; confirm against the CSV schema.
# scikit-learn accepts DataFrames as well as arrays, so the conversion is a convenience
# rather than a requirement; to_numpy() is the accessor pandas recommends over .values
# and always returns a NumPy ndarray.
x_data = data.iloc[:, 1:-2].to_numpy()
# Target vector: the last column.
y_data = data.iloc[:, -1].to_numpy()

In [None]:
# Display the feature matrix (independent variables) built above to inspect its values
x_data

In [None]:
# Display the target vector (dependent variable) built above to inspect its values
y_data

Splitting the data into training and test sets

In [None]:
# Hold out part of the data for evaluation; no test_size is given, so scikit-learn's
# default split proportion applies. random_state=0 fixes the shuffle so the same rows
# land in each split on every run.
x_train, x_test, y_train, y_test = train_test_split(
    x_data,
    y_data,
    random_state=0,
)

<b>Training the algorithm</b>

<i>Note</i>: We can change the number of decision trees by adjusting the value of the <code>n_estimators</code> parameter.

In [None]:
# Initialise a forest of 100 decision trees; random_state is pinned to 0 so
# repeated runs build the same forest.
rf = RandomForestClassifier(
    n_estimators=100,
    random_state=0,
)

# Fit the forest on the training split (the fitted estimator is shown as the cell output).
rf.fit(x_train, y_train)

Testing the algorithm

In [None]:
# Wrap the x_test array in a DataFrame so that each feature column carries a readable label.
# Assumes the five labels are listed in the same order as the columns of x_test — confirm.
test = pd.DataFrame(
    data=x_test,
    columns=['BIRTH', 'SMOKE', 'RACE', 'AGE', 'LWT'],
)

In [None]:
# Attach the true labels (LOW) and the forest's predictions (PRED) as two extra columns,
# in that order, so actual and predicted classes can be compared row by row.
test = test.assign(
    LOW=y_test,
    PRED=rf.predict(x_test),
)

In [None]:
# Display the test-set table: feature columns alongside the actual (LOW) and predicted (PRED) classes
test

Evaluating our algorithm

In [None]:
# Mean accuracy of the fitted forest on the held-out test split
rf.score(X=x_test, y=y_test)

<b>Feature Importance</b>

In [None]:
fig, ax = plt.subplots()
# feature_importances_ is an attribute exposed by the fitted RandomForestClassifier;
# it is drawn here as one horizontal bar per feature label.
# Assumes the labels are listed in the same order as the columns of x_train — confirm.
ax.barh(['BIRTH', 'SMOKE', 'RACE', 'AGE', 'LWT'], rf.feature_importances_)
ax.set_xlabel('Feature importance')
ax.set_ylabel('Feature name')
plt.show()

<b>Accuracy of Random Forest with different numbers of decision trees</b>

In [None]:
# Sweep the forest size and record the test accuracy for each setting.
# range(1, 5000, 100) yields 1, 101, 201, ..., 4901 trees, i.e. 50 forests are trained
# from scratch, so expect this cell to take noticeably longer than the others.
trees = list(range(1, 5000, 100))
Accuracy_Test = []
for n_trees in trees:
    # Use a dedicated name for the sweep models: rebinding `rf` here would silently replace
    # the 100-tree model that the testing, scoring and feature-importance cells rely on,
    # so re-running those cells after this one would give different results.
    sweep_rf = RandomForestClassifier(n_estimators=n_trees, random_state=0)
    sweep_rf.fit(x_train, y_train)
    Accuracy_Test.append(sweep_rf.score(x_test, y_test))

Plotting the accuracy vs n_estimators of Random Forest classifier

In [None]:
fig, ax = plt.subplots()

# Label both axes and give the figure a title in a single call
ax.set(
    xlabel='n_estimators',
    ylabel='Accuracy',
    title='n_estimators vs Accuracy on Data',
)

# One blue marker per forest size: test-set accuracy against the number of trees
ax.scatter(trees, Accuracy_Test, s=50, c="blue")

plt.show()