In [None]:
import pandas as pd
import config
import pickle
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import numpy as np
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
import random

In [None]:
import warnings
# Suppress DeprecationWarning messages for the rest of the session so they do
# not clutter cell output; every other warning category is still shown.
warnings.filterwarnings("ignore", category=DeprecationWarning) 

In [None]:
def read_files():
    """Load the optimized dataset and add integer codes for the asset class.

    The CSV is read from ``config.datasets_dir + config.optimized_dataset``.
    ``ASSET_CLASS`` is converted to a pandas categorical and its category codes
    are stored in a new ``ASSET_CLASS_CODES`` column.

    Returns:
        pandas.DataFrame: the loaded frame with ``ASSET_CLASS`` as a
        categorical column and the extra ``ASSET_CLASS_CODES`` column.
    """
    dataset_path = config.datasets_dir + config.optimized_dataset
    frame = pd.read_csv(dataset_path)

    # Categorical conversion first: the codes are derived from the categories.
    asset_class = pd.Categorical(frame['ASSET_CLASS'])
    frame['ASSET_CLASS'] = asset_class
    frame['ASSET_CLASS_CODES'] = frame['ASSET_CLASS'].cat.codes
    return frame

In [None]:
def trainTestSplit(df,n):
    """Filter out rare asset classes and make a stratified 80/20 split.

    Rows whose ``ASSET_CLASS`` occurs fewer than ``n`` times in ``df`` are
    discarded. The identifier, text and label columns listed below are dropped
    to form the feature matrix; ``ASSET_CLASS_CODES`` is the target.

    Args:
        df: DataFrame containing ``ASSET_CLASS``, ``ASSET_CLASS_CODES`` and
            the columns removed below.
        n: Minimum number of rows an asset class needs in order to be kept.

    Returns:
        tuple: ``(X_train, X_test, Y_train, Y_test)`` as produced by
        ``train_test_split`` with ``test_size=0.20`` and stratification on
        the target.
    """
    seed = 123
    # Kept so the global `random` state is still seeded as before. Note that
    # this call alone does NOT influence scikit-learn's splitter.
    random.seed(seed)

    # Keep only the asset classes that have at least `n` rows.
    class_counts = df['ASSET_CLASS'].value_counts()
    assets = list(class_counts[class_counts >= n].index)
    dffiltered = df[df['ASSET_CLASS'].isin(assets)]

    non_feature_columns = [
        'ASSET_CLASS', 'important_words', 'BUSINESS_UNIT', 'PSC_CODE',
        'FUND_SUBOBJCLASS', 'ORDER_DATE', 'ORDER_TITLE', 'ASSET_CLASS_CODES',
        'LINE_DESCRIPTION', 'VENDOR_NAME', 'VENDOR_COUNTRY',
        'ASSET_CLASS_DESCRIPTION', 'text_fields', 'ASSET_CLASS_OLD',
        'SUB_OBJ_DESCR', 'OBJ_CODE',
    ]
    x = dffiltered.drop(columns=non_feature_columns)
    y = dffiltered['ASSET_CLASS_CODES']

    # 80% train / 20% test. random_state makes the split itself reproducible,
    # which seeding the `random` module above does not achieve.
    X_train, X_test, Y_train, Y_test = train_test_split(
        x, y, test_size=0.20, stratify=y, random_state=seed
    )
    print(' Number of Assets ' + str(dffiltered['ASSET_CLASS'].nunique()))
    print(X_train.shape)
    return X_train, X_test, Y_train, Y_test

In [None]:
def importantFeatures(X_train):
    """Rank the columns of ``X_train`` by the stored model's feature importance.

    The model is unpickled from ``config.rf_model_data3`` and its
    ``feature_importances_`` are sorted in descending order. The 20 most
    important feature names and their importances are printed.

    Args:
        X_train: DataFrame whose columns are in the same order as the features
            the pickled model was fitted on.

    Returns:
        tuple: ``(features, indices, f1, i1)`` where ``features`` is a list of
        the top-20 column names, ``indices`` is a list of their importances
        (the name is historical; these are importance values, not positions),
        ``f1`` is the full ranked column Index and ``i1`` is the full ranked
        importance array.

    Raises:
        ValueError: if the model's number of importances differs from the
            number of columns in ``X_train``.
    """
    top_n = 20

    # NOTE(review): unpickling can execute arbitrary code; only load model
    # files that come from a trusted source.
    with open(config.rf_model_data3, 'rb') as model_file:
        rf = pickle.load(model_file)

    # Read the attribute once and reuse the array for sorting and lookup.
    importances = rf.feature_importances_
    if len(importances) != len(X_train.columns):
        # A mismatch would otherwise attach importances to the wrong columns.
        raise ValueError(
            'Model has %d feature importances but X_train has %d columns'
            % (len(importances), len(X_train.columns))
        )

    order = np.argsort(importances)[::-1]  # positions, most important first
    f1 = X_train.columns[order]
    i1 = importances[order]

    features = list(f1[:top_n])
    indices = list(i1[:top_n])
    print(features)
    print(indices)
    return features, indices, f1, i1

In [None]:
def featurePlot(features, indices, X_train, X_test, Y_train, Y_test,
                max_features=54, random_state=123):
    """Measure test accuracy as the top-ranked features are added one by one.

    For ``k = 1 .. max_features`` a fresh ``RandomForestClassifier`` is fitted
    on the first ``k`` entries of ``features`` and scored on the test set. Each
    accuracy is printed as it is computed.

    Args:
        features: Column names ordered from most to least important. If it
            holds fewer than ``k`` names, the slice simply returns all of them.
        indices: Unused; kept so existing callers keep working.
        X_train: Training feature DataFrame.
        X_test: Test feature DataFrame.
        Y_train: Training labels.
        Y_test: Test labels.
        max_features: Largest number of top features to evaluate. Defaults to
            54, the value that used to be hard-coded.
        random_state: Seed passed to every forest so the accuracy curve is
            reproducible. Pass ``None`` for unseeded forests.

    Returns:
        list: ``max_features`` accuracy scores; element ``k - 1`` is the
        accuracy obtained with the top ``k`` features.
    """
    acc = []
    for k in range(1, max_features + 1):
        top_k = features[:k]
        model = RandomForestClassifier(random_state=random_state)
        model.fit(X_train[top_k], Y_train)
        y_pred = model.predict(X_test[top_k])
        # scikit-learn's documented argument order is (y_true, y_pred).
        acc1 = accuracy_score(Y_test, y_pred)
        print(acc1)
        acc.append(acc1)
    return acc


In [None]:
def plotFeatureImportance(features, indices,
                          save_path='/home/ubuntu/asset_classification/plots/feature_importances_dataset1.png'):
    """Draw a horizontal bar chart of feature importances and save it.

    Args:
        features: Feature names ordered from most to least important.
        indices: Importance value for each entry of ``features`` (same order).
        save_path: Where the figure is written. Defaults to the location that
            used to be hard-coded.
            NOTE(review): the default filename says "dataset1" while the plot
            title says "Dataset 3" - confirm which one is intended.
    """
    figure(figsize=(8, 6), dpi=80)

    # barh draws the first entry at the bottom, so reverse both sequences to
    # put the most important feature at the top of the chart.
    labels = list(features[::-1])
    values = list(indices[::-1])
    plt.barh(labels, values, align='center')

    plt.xticks([0, 0.025, 0.05, 0.075, 0.1, 0.125, 0.15, 0.175, 0.2, 0.225, 0.25, 0.275])
    plt.xlabel('Relative Importance')
    plt.ylabel('Features')
    plt.title('Feature Importance plot on Dataset 3')
    plt.savefig(save_path)

In [None]:
def main():
    """Run the full pipeline: load, split, rank features, plot and evaluate.

    Asset classes with fewer than 100 rows are excluded from the split. The
    top-20 feature-importance chart is drawn from the stored model, then the
    accuracy curve is computed over the complete feature ranking.

    Returns:
        tuple: ``(features, indices, acc)`` - the top-20 feature names, their
        importances, and the accuracy list returned by ``featurePlot``.
    """
    dataset = read_files()
    X_train, X_test, Y_train, Y_test = trainTestSplit(dataset, 100)

    ranking = importantFeatures(X_train)
    top_features, top_importances = ranking[0], ranking[1]
    ranked_features, ranked_importances = ranking[2], ranking[3]

    plotFeatureImportance(top_features, top_importances)
    accuracies = featurePlot(
        ranked_features, ranked_importances, X_train, X_test, Y_train, Y_test
    )
    return top_features, top_importances, accuracies


In [None]:
# Run the pipeline in this cell so that `features`, `indices` and `acc` exist on
# a fresh kernel (Restart & Run All). Previously they were only available as
# leftovers from an earlier session. main() already draws and saves the
# feature-importance chart, so no separate plotFeatureImportance call is needed.
features, indices, acc = main()

In [None]:
#features, indices, acc = main()


In [None]:
# acc[k - 1] is the accuracy obtained with the top k features, so the x axis
# starts at 1 and its length follows `acc` instead of a hard-coded 54.
plt.plot(list(range(1, len(acc) + 1)), acc)
plt.xticks([1, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55])
plt.xlabel('Number of Top Features Selected')
plt.ylabel('Accuracy Obtained')
plt.savefig('1.png')