# Feature Analysis Statistics
Provides various statistics about features in the context of a binary classifier. The study is limited to transcription factors (regulators).

In [1]:
import init
import common.constants as cn
import common_python.constants as ccn
from common.trinary_data import TrinaryData
from common.data_provider import DataProvider
import classifier.main_multi_classifier_feature_optimizer as main
from common.data_provider import DataProvider
from common_python.plots import util_plots
from plots import util_plots as xutil_plots
from common_python.classifier import feature_analyzer
from common_python.util import util

import datetime
import os
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn
from sklearn import svm

%matplotlib inline

In [2]:
# Replicate-level ("long") trinary data: individual replications are kept
# rather than averaged, and only regulators (TFs) are included.
TRINARY = TrinaryData(is_averaged=False, is_dropT1=False, is_regulator=True)
DF_X = TRINARY.df_X
SER_Y = TRINARY.ser_y
# Distinct values of SER_Y, in order of first appearance
STATES = SER_Y.unique()
# The columns of DF_X are the regulators used as features
REGULATORS = DF_X.columns.tolist()
# <project dir>/data/feature_analysis
DATA_PATH = os.path.join(cn.PROJECT_DIR, "data", "feature_analysis")
# File-name template, filled in later with (metric, state)
BASE_PATH = os.path.join(DATA_PATH, "main_feature_analyzer_%s_%d.csv")
CLF = svm.LinearSVC()

In [3]:
# Build one FeatureAnalyzer per state; ANALYZERS follows the order of STATES
ANALYZERS = []
for state in STATES:
    # Map each metric name to the CSV path for this (metric, state) pair
    metric_paths = {}
    for metric in feature_analyzer.METRICS:
        metric_paths[metric] = BASE_PATH % (metric, state)
    ANALYZERS.append(
        feature_analyzer.FeatureAnalyzer(
            CLF, DF_X, SER_Y, data_path_dct=metric_paths))

In [4]:
# Prune dataframe


In [14]:
# Non-zero regulators by state
def getNZRegulators():
    """
    Lists, for each state, the columns left after zero-pruning the CPC metric.

    Returns
    -------
    dict
        key: state (an element of STATES)
        value: list of column names of the pruned CPC dataframe
    """
    dct = {}
    # ANALYZERS is built by iterating over STATES, so the two are paired
    # by position. Using the state label itself as a list index is only
    # correct when the labels happen to be 0..n-1 in that exact order.
    for state, analyzer in zip(STATES, ANALYZERS):
        # NOTE(review): util.pruneZeros presumably drops the all-zero
        # rows/columns of the CPC dataframe - confirm against its definition.
        df = util.pruneZeros(analyzer.getMetric(feature_analyzer.CPC))
        dct[state] = df.columns.tolist()
    return dct

In [15]:
# Compute the state -> non-zero-regulator-list mapping once for the cells below
NZ_REGULATOR_DCT = getNZRegulators()

In [None]:
# Display the whole mapping (state -> list of non-zero regulators)
NZ_REGULATOR_DCT

In [None]:
# Bar chart: count of non-zero regulators for every state
num_regulators = [len(NZ_REGULATOR_DCT[state_key]) for state_key in STATES]
bar_ax = plt.gca()
bar_ax.bar(STATES, num_regulators)
_ = bar_ax.set_xlabel("state")
_ = bar_ax.set_ylabel("# non-zero regulators")

In [None]:
# Display the non-zero regulators stored under key 0 (state 0)
NZ_REGULATOR_DCT[0]

## Single Feature Accuracy (SFA)

In [None]:
def plotSFA(state, num_gene=10, ncol=1, nrow=1):
    """
    Draws a bar plot of Single Feature Accuracy (SFA) for one state.

    Parameters
    ----------
    state: int
        State passed to getSFA; also used as the plot title.
    num_gene: int
        Number of leading entries of the SFA series to plot.
    ncol: int
        Number of columns in the subplot grid.
    nrow: int
        Number of rows in the subplot grid.

    Notes
    -----
    The plot is drawn on the top-left axes of the grid. getSFA is defined
    outside the visible part of this file; it is assumed to return a pandas
    Series indexed by gene name - TODO confirm.
    """
    # plt.subplots expects (nrows, ncols) in that order. squeeze=False keeps
    # the returned axes 2-D so that indexing works for every grid size.
    _, axes = plt.subplots(nrow, ncol, figsize=(18, 6), squeeze=False)
    this_ax = axes[0, 0]
    ser = getSFA(state)
    xv = ser.index.tolist()[:num_gene]
    yv = ser.values[:num_gene]
    this_ax.bar(xv, yv)
    this_ax.set_title("%d" % state)
    this_ax.set_ylabel("Single Feature Accuracy")
    this_ax.set_ylim([0, 1])
    # A single call sets both the font size and the rotation of the labels.
    this_ax.set_xticklabels(xv, fontsize=14, rotation='vertical')

# Plot the first 70 entries of the SFA series for state 2
plotSFA(2, num_gene=70)

In [None]:
def plotAllSFA(num_gene=10, nrow=1, ncol=6):
    """
    Draws one Single Feature Accuracy (SFA) bar plot per state in a grid.

    Parameters
    ----------
    num_gene: int
        Number of leading entries of each SFA series to plot.
    nrow: int
        Number of rows in the subplot grid.
    ncol: int
        Number of columns in the subplot grid.

    Notes
    -----
    Panels are filled row by row in the order of STATES, so nrow * ncol
    must be at least len(STATES). All panels share the y-range [0.48, 1];
    only the first panel shows the y label and y tick labels.
    getSFA is defined outside the visible part of this file; it is assumed
    to return a pandas Series indexed by gene name - TODO confirm.
    """
    # squeeze=False keeps the axes 2-D, so [row, col] indexing is valid for
    # 1 x 1, 1 x N and N x 1 grids as well.
    _, axes = plt.subplots(nrow, ncol, figsize=(18, 6), squeeze=False)
    # Place panels by position in STATES rather than by the state label, so
    # labels that are not exactly 0..n-1 still land on a valid axes.
    for idx, state in enumerate(STATES):
        row, col = divmod(idx, ncol)
        this_ax = axes[row, col]
        ser = getSFA(state)
        xv = ser.index.tolist()[:num_gene]
        yv = ser.values[:num_gene]
        this_ax.bar(xv, yv)
        this_ax.set_title("%d" % state)
        # A single call sets both the font size and the rotation of the labels.
        this_ax.set_xticklabels(xv, fontsize=14, rotation='vertical')
        if idx == 0:
            this_ax.set_ylabel("Single Feature Accuracy")
        else:
            this_ax.set_yticklabels([])
        # Common y-range for all panels, which is what makes the panels
        # without y tick labels readable against the first one.
        this_ax.set_ylim([0.48, 1])
        this_ax.yaxis.set_ticks_position('both')

# Plot the first 10 entries of the SFA series for every state
plotAllSFA(num_gene=10)

# Classifier Prediction Correlation (CPC)
Delete rows and columns with 0 values.
Heat map for the remainder.

In [None]:
# Clustered heat map of the Classifier Prediction Correlation (CPC) for a state
def plotCPC(state):
    """
    Plots the pruned CPC dataframe of a state as a clustered heat map.

    Parameters
    ----------
    state: int
        State passed to getCPC; also used in the printed header and title.

    Notes
    -----
    If the pruned dataframe has at most one row it is printed instead of
    plotted. getCPC and pruneDF are defined outside the visible part of
    this file.
    """
    print("\n*** State %d" % state)
    df = pruneDF(getCPC(state))
    if len(df) > 1:
        cg = seaborn.clustermap(df, col_cluster=True,
              row_cluster=True,
              vmin=-1, vmax=1,
              cbar_kws={"ticks":[-1, 0, 1]}, cmap="seismic")
        # clustermap builds its own figure with several axes (dendrograms,
        # heat map, colorbar). plt.title would title whichever of them is
        # current, so the title is put on the figure instead.
        cg.fig.suptitle("State %d" % state)
    else:
        print(df)

In [None]:
# seaborn.clustermap creates its own figure, so opening one with plt.figure()
# beforehand only leaves an empty extra figure for every state.
for state in STATES:
    plotCPC(state)

In state 2, Rv0602c has the largest SFA. However, this gene does not appear in the CPC matrix.

In [None]:
# Look at the Rv0602c column of the unpruned CPC result for state 2
# (getCPC is defined outside the visible part of this file)
df = getCPC(2)
df["Rv0602c"]

In [None]:
# Apply the same pruning that plotCPC uses, to see what it leaves of the
# state-2 CPC dataframe (pruneDF is defined outside the visible part of this file)
dff = pruneDF(df)
dff