# Classification Equivalent Features
Analysis of the features that can be substituted in high accuracy, minimal classifiers. A classifier is **minimal** if the removal of any single feature makes it no longer high accuracy. Two features are **equivalent** if one can be substituted for the other in a minimal, high accuracy classifier and the result is still a high accuracy classifier.

In [1]:
import init
import common.constants as cn
import common_python.constants as ccn
from common.trinary_data import TrinaryData
from common.data_provider import DataProvider
import classifier.main_multi_classifier_feature_optimizer as main
from common.data_provider import DataProvider
from common_python.plots import util_plots
from plots import util_plots as xutil_plots

import datetime
import os
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn

%matplotlib inline

In [2]:
# These are the "long" data that have individual replications, not averages
PROVIDER = DataProvider()
PROVIDER.do()  # NOTE(review): presumably populates the provider's data -- confirm in DataProvider
TRINARY = TrinaryData(is_averaged=False, is_dropT1=False)  # Trinary data
DF_X = TRINARY.df_X
SER_Y = TRINARY.ser_y
STATES = SER_Y.unique()
# Fit results are read from spreadsheets located in cn.DATA_DIR.
DF_FIT = pd.read_excel(os.path.join(cn.DATA_DIR, "fit_result.xlsx"))
DF_FIT_TF = pd.read_excel(os.path.join(cn.DATA_DIR, "fit_result_tf.xlsx"))
# Distinct values of the "feature" column of the TF fit results
TFs = list(DF_FIT_TF["feature"].unique())

In [14]:
# Relative incremental accuracy (RIA) of a feature
#   CCN.CLS_FEATURE - feature in the classifier that is removed
#   CCN.CMP_FEATURE - other feature that is inserted
#   CCN.SCORE - RIA of the new classifier compared to the old one. A score of 1 means that the
#               two features are equivalent. A score > 1 means that the new feature creates
#               an improved classifier, and < 1 a degraded classifier.
# One CSV file per state is looked up in DATA_DIR; states without a file are skipped.
DATA_DIR = os.path.join(cn.PROJECT_DIR, "xstate", "python", "classifier")
dfs = []
for state in SER_Y.unique():
    path = os.path.join(DATA_DIR, "main_feature_equivalence_calculator_%d.csv" % state)
    if not os.path.isfile(path):
        continue
    df = pd.read_csv(path)
    # Eliminate bogus first column "Unnamed" without mutating the frame just read
    df = df.drop(columns=df.columns[0])
    dfs.append(df)
if len(dfs) == 0:
    # pd.concat([]) would fail with an opaque message; say what was searched instead.
    raise ValueError(
        "No main_feature_equivalence_calculator_<state>.csv files found in %s" % DATA_DIR)
DF_RIA = pd.concat(dfs)
DF_RIA.head()

Unnamed: 0,cls_feature,cmp_feature,score,state
0,Rv0158,Rv0158,1.0,1
1,Rv0158,Rv1460,0.0,1
2,Rv0158,Rv0054,-4.0,1
3,Rv0158,Rv3246c,0.0,1
4,Rv0158,Rv2034,0.0,1
