In [None]:
import pandas as pd
import config
import pickle
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import numpy as np
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure

In [None]:
import warnings
# Suppress DeprecationWarning messages so they do not clutter the cell outputs below.
warnings.filterwarnings("ignore", category=DeprecationWarning) 

In [None]:
def readTestFiles():
    """Read the four train/test CSV files whose paths are defined in ``config``.

    The files are taken from ``config.X_train_data1``, ``config.X_test_data1``,
    ``config.Y_train_data1`` and ``config.Y_test_data1`` (in that order) and
    each one is parsed with ``index_col=False``.

    Returns:
        tuple: ``(X_train, X_test, Y_train, Y_test)`` as pandas DataFrames.
    """
    csv_paths = (
        config.X_train_data1,
        config.X_test_data1,
        config.Y_train_data1,
        config.Y_test_data1,
    )
    # The generator is consumed left to right, so files are read in the order listed above.
    X_train, X_test, Y_train, Y_test = (
        pd.read_csv(csv_path, index_col=False) for csv_path in csv_paths
    )
    return X_train, X_test, Y_train, Y_test

In [None]:
def importantFeatures(X_train, top_n=20):
    """Rank the columns of ``X_train`` by the importances of the pickled model.

    The model is loaded from ``config.rf_model_data1`` and must expose a
    ``feature_importances_`` array aligned with ``X_train.columns``.

    Args:
        X_train: DataFrame whose columns are the features the model was fit on.
        top_n: How many of the highest-ranked features to return in the
            shortened lists (default 20, the original hard-coded value).

    Returns:
        tuple: ``(features, indices, f1, i1)`` where
            * ``features`` - list of the ``top_n`` most important column names,
            * ``indices`` - list of the matching importance values (despite the
              name, these are importances, not positions),
            * ``f1`` - all column names sorted by descending importance,
            * ``i1`` - all importance values in the same order as ``f1``.
    """
    # NOTE(security): pickle.load can execute arbitrary code contained in the
    # file, so the model path must point at a trusted artifact.
    # The context manager guarantees the file handle is closed after loading.
    with open(config.rf_model_data1, 'rb') as model_file:
        rf = pickle.load(model_file)

    importances = rf.feature_importances_
    # argsort is ascending; reverse it to get most-important-first positions.
    order = np.argsort(importances)[::-1]

    f1 = X_train.columns[order]
    i1 = importances[order]

    features = list(f1[:top_n])
    indices = list(i1[:top_n])
    print(features)
    print(indices)
    return features, indices, f1, i1

In [None]:
def featurePlot(features, indices, X_train, X_test, Y_train, Y_test,
                max_features=54, random_state=None):
    """Measure test accuracy as a function of how many top features are used.

    For every ``i`` from 1 to ``max_features`` a fresh RandomForestClassifier is
    fit on the first ``i`` entries of ``features`` and scored on the test split.

    Args:
        features: Column names ordered from most to least important.
        indices: Unused; kept so existing callers keep working.
        X_train, X_test: Feature DataFrames containing the columns in ``features``.
        Y_train, Y_test: Targets for the train and test splits.
        max_features: Largest number of top features to evaluate (default 54,
            the original hard-coded limit). If it exceeds ``len(features)`` the
            slice simply saturates at the full feature set.
        random_state: Forwarded to RandomForestClassifier. The default ``None``
            keeps the original unseeded behaviour; pass an int for
            reproducible accuracies.

    Returns:
        list: Accuracy scores; element ``k`` corresponds to ``k + 1`` features.
    """
    acc = []
    for i in range(1, max_features + 1):
        top_columns = features[:i]
        R = RandomForestClassifier(random_state=random_state)
        R.fit(X_train[top_columns], Y_train)
        y_pred = R.predict(X_test[top_columns])
        # scikit-learn's signature is accuracy_score(y_true, y_pred).
        acc1 = accuracy_score(Y_test, y_pred)
        print(acc1)
        acc.append(acc1)
    return acc


In [None]:
def plotFeatureImportance(features, indices,
                          save_path='/home/ubuntu/asset_classification/plots/feature_importances_dataset3.png'):
    """Draw a horizontal bar chart of feature importances and save it to disk.

    Args:
        features: Feature names, most important first.
        indices: Importance value for each entry of ``features`` (same order).
        save_path: Where the PNG is written. Defaults to the path that was
            previously hard-coded so existing callers are unaffected.

    NOTE(review): the plot title says "Dataset 1" while the default file name
    ends in "dataset3" - confirm which one is intended before relying on the
    default output location.
    """
    figure(figsize=(8, 6), dpi=80)
    # barh draws its first entry at the bottom, so reverse both sequences to
    # put the most important feature at the top of the chart.
    names_bottom_up = list(features[::-1])
    values_bottom_up = list(indices[::-1])
    plt.barh(names_bottom_up, values_bottom_up, align='center')
    plt.xticks([0, 0.0025, 0.005, 0.0075, 0.01, 0.0125, 0.015, 0.0175, 0.02, 0.0225, 0.025])
    plt.xlabel('Relative Importance')
    plt.ylabel('Features')
    plt.title('Feature Importance plot on Dataset 1')
    plt.savefig(save_path)

In [None]:
def main():
    """Run the whole workflow: load data, rank features, plot, and score.

    Returns:
        tuple: ``(features, indices, acc)`` - the shortened feature-name list
        and matching importance list returned by ``importantFeatures``, plus
        the accuracy list returned by ``featurePlot``.
    """
    splits = readTestFiles()  # (X_train, X_test, Y_train, Y_test)
    top_names, top_values, ranked_names, ranked_values = importantFeatures(splits[0])
    plotFeatureImportance(top_names, top_values)
    # featurePlot receives the full ranking, not just the shortened lists.
    accuracies = featurePlot(ranked_names, ranked_values, *splits)
    return top_names, top_values, accuracies


In [None]:
# Execute the pipeline once; `acc` is reused by the accuracy plot in the next cell.
features, indices, acc = main()

In [None]:
# Plot accuracy against the number of top features used.
# acc[k] was obtained with k + 1 features, so the x values must start at 1
# (not 0) and are derived from len(acc) to stay in sync with the data.
feature_counts = list(range(1, len(acc) + 1))
plt.plot(feature_counts, acc)
plt.xticks([1, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55])
plt.yticks([0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 0.55, 0.6])
plt.xlabel('Number of Top Features Selected')
plt.ylabel('Accuracy Obtained')
plt.savefig('1.png')