In [None]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_curve, auc, confusion_matrix, accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns

Enter parameters here.

In [None]:
# Column the model learns to predict.
objective = "gardener"

# Columns to leave out of the feature set when training the model.
ignored_features = []

# CSV source for the training rows.
train_data = pd.read_csv("clean_data1.csv")

# CSV source for the testing rows.
test_data = pd.read_csv("clean_data2.csv")

# Optional single source for both training and testing rows. Leave as None to
# use train_data and test_data instead.
combined_data = None

In [None]:
# Frames to preprocess: the single combined frame when one is supplied,
# otherwise the separate training and testing frames.
if combined_data is not None:
    chosen_data = (combined_data,)
else:
    chosen_data = (train_data, test_data)

In [None]:
# Label-encode every column (values are compared as strings) into integer codes.
#
# One encoder per column is fitted on the values from ALL chosen frames that
# contain that column, so a given raw value maps to the same code in the
# training and the testing frame. Fitting a separate encoder per frame would
# assign codes from each frame's own sorted value set, which can differ.
#
# NOTE: the frames are overwritten in place. Re-running this cell without
# reloading the data would encode the integer codes again (as strings).
all_columns = dict.fromkeys(
    column for data in chosen_data for column in data.columns)

for column in all_columns:
    # Only the frames that actually have this column take part.
    holders = [data for data in chosen_data if column in data.columns]

    encoder = LabelEncoder()
    encoder.fit(pd.concat(
        [data[column].astype("str") for data in holders], ignore_index=True))

    for data in holders:
        data[column] = encoder.transform(data[column].astype("str"))

In [None]:
# Columns present in every chosen frame.
common_columns = set.intersection(*(set(data.columns) for data in chosen_data))

# Columns that must never be used as model inputs.
excluded_columns = {objective} | set(ignored_features)

# Feature columns, listed in the order they appear in the first chosen frame.
# Building the list from a set would make the column order depend on set
# iteration order, which is not stable between kernel sessions for strings.
features = [
    column
    for column in dict.fromkeys(chosen_data[0].columns)
    if column in common_columns and column not in excluded_columns
]

In [None]:
# Fixed seed so the same rows land in each split on every run.
split_seed = 42

# Build the train/test sets: either straight from the two separate frames, or
# by holding out 20% of the combined frame as the test set.
if combined_data is None:
    x_train, y_train = train_data[features], train_data[objective]
    x_test, y_test = test_data[features], test_data[objective]
else:
    x_train, x_test, y_train, y_test = train_test_split(
        combined_data[features], combined_data[objective],
        test_size=0.2, random_state=split_seed)

# Carve 30% of the training rows out as a validation set.
x_train, x_validate, y_train, y_validate = train_test_split(
    x_train, y_train, test_size=0.3, random_state=split_seed)

In [None]:
# Fit a 1000-tree random forest on the training split. The seed is fixed so
# that the fitted model, and every metric derived from it, is reproducible.
forest = RandomForestClassifier(n_estimators=1000, random_state=42)
forest.fit(x_train, y_train)

In [None]:
# Class-probability estimates for the validation rows; the second column
# belongs to the second entry of forest.classes_.
status = forest.predict_proba(x_validate)

# roc_curve also returns the thresholds, which are not needed here.
false_positive_rate, true_positive_rate = roc_curve(
    y_validate, status[:, 1])[:2]

# Area under the ROC curve.
roc_auc = auc(false_positive_rate, true_positive_rate)

In [None]:
# Draw the ROC curve on its own figure, with the diagonal as a reference line.
roc_figure, roc_axes = plt.subplots()
roc_axes.plot(
    false_positive_rate,
    true_positive_rate,
    label="ROC Curve (Area = {:.2f})".format(roc_auc),
)
roc_axes.plot([0, 1], [0, 1], linestyle="--")
roc_axes.set_title("Receiver Operating Characteristic for '{}'".format(objective))
roc_axes.set_xlabel("False Positive Rate")
roc_axes.set_ylabel("True Positive Rate")
roc_axes.legend(loc="lower right")

In [None]:
# Disabled: horizontal bar chart of the ten largest feature importances.
# NOTE(review): `features` is built as a plain list, so `features[indices]`
# below would need e.g. `np.array(features)[indices]` before re-enabling.
# importances = forest.feature_importances_
# indices = np.argsort(importances)[-10:]
# plt.barh(range(len(indices)), importances[indices])
# plt.yticks(range(len(indices)), features[indices])

In [None]:
# Compare thresholded predictions against the validation labels.
y_true = y_validate

# Predict the class encoded as 1 when its estimated probability is at least
# 0.5. (Using `< 0.5` here would flip every prediction and report one minus
# the real accuracy.)
y_pred = (status[:, 1] >= 0.5).astype(int)

# Rows are the actual classes and columns the predicted classes, both in the
# order 0, 1. `labels` keeps the matrix 2x2 even if one class is missing from
# the validation split.
confusion = pd.DataFrame(
    confusion_matrix(y_true, y_pred, labels=[0, 1]),
    index=["False", "True"],
    columns=["False", "True"],
)

accuracy = accuracy_score(y_true, y_pred)
print("Accuracy Score: {:.2f}%".format(accuracy * 100))

In [None]:
# Heatmap of the confusion matrix. The frame's index (y-axis) holds the actual
# classes and its columns (x-axis) the predicted classes, so the axes are
# labelled accordingly.
sns.heatmap(confusion, annot=True, fmt="d")
plt.title("Confusion Matrix for '{}'".format(objective))
plt.xlabel("Predicted")
plt.ylabel("Actual")