# Classification Groups
A classification group is a set of genes that, when used as features, results in a high accuracy classifier for a state. The genes in a classification group may well be surrogates for transcription modules or programs. This can be exposed by looking for transcription factors common across groups for a single state.

# Analysis of Features by Classification Accuracy

In [1]:
import init
import common.constants as cn
from common.trinary_data import TrinaryData
from common.data_provider import DataProvider
import classifier.main_multi_classifier_feature_optimizer as main
from common.data_provider import DataProvider
from common_python.plots import util_plots
from plots import util_plots as xutil_plots

import datetime
import os
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

%matplotlib inline

## Create Base Data

In [18]:
PROVIDER = DataProvider()
PROVIDER.do()
TRINARY = TrinaryData(is_averaged=False, is_dropT1=False)  # Trinary data
path = os.path.join(cn.DATA_DIR, "fit_result.xlsx")
DF_FIT = pd.read_excel(path)
DF_FIT.head()

Unnamed: 0,state,index,GENE_ID,score,count
0,0,0,Rv2626c,1.0,1
1,0,1,Rv1813c,1.0,1
2,0,2,Rv0022c,1.0,117
3,0,2,Rv2007c,1.0,117
4,0,2,Rv3854c,1.0,117


In [3]:
PROVIDER.df_trn_signed.head()

Unnamed: 0,tf,GENE_ID,sign
0,Rv0022c,Rv0029,1
1,Rv0022c,Rv0030,1
2,Rv0022c,Rv0031,1
3,Rv0022c,Rv0069c,1
4,Rv0022c,Rv0070c,1


## Monitor Progress on Constructing Classification Groups

In [20]:
FIT_RESULT_DCT = main.getFitResultFromPersister()
def run():
    for cl in FIT_RESULT_DCT.keys():
        length_stg = str([len(fr.sels) for fr in FIT_RESULT_DCT[cl]])
        score_stg = str([fr.sels_score for fr in FIT_RESULT_DCT[cl]])
        n_eval_stg = str([fr.n_eval for fr in FIT_RESULT_DCT[cl]])
        print("%d: %s\n   %s\n   %s" % (cl, length_stg, score_stg, n_eval_stg))
if False:
    run()

In [6]:
# Check status of run
if False:
    df_g = pd.DataFrame(DF_CSV.groupby([cn.STATE, cn.INDEX]).count())
    dff = df_g.reset_index()
    for state in DF_CSV[cn.STATE].unique():
        dff_state = dff[dff[cn.STATE] == state]
        print("%d: %d" % (state, dff_state[cn.INDEX].max()))

## Data Construction for Classification Groups

In [7]:
# Group by state and index
dfg = DF_CSV.groupby([cn.STATE, cn.INDEX])
df_g = pd.DataFrame(dfg.count())
df_g[cn.COUNT] = dfg.count()
for col in df_g.columns:
    if col != cn.COUNT:
        del df_g[col]              
df_g = df_g.reset_index()
df_g[cn.SCORE] = dfg.mean().reset_index()[cn.SCORE]
state = 0
df = df_g[df_g[cn.STATE] == state]
#plt.scatter(df[cn.INDEX], df[cn.SCORE])
df_g.head()

Unnamed: 0,state,index,count,score
0,0,0,1,1.0
1,0,1,1,1.0
2,0,2,3,1.0
3,0,3,2,0.993333
4,0,4,2,1.0


In [8]:
DF_CSV.head()

Unnamed: 0,state,index,GENE_ID,score,count
0,0,0,Rv2626c,1.0,1
1,0,1,Rv1813c,1.0,1
2,0,2,Rv0022c,1.0,117
3,0,2,Rv2007c,1.0,117
4,0,2,Rv3854c,1.0,117


In [9]:
PROVIDER.df_trn_unsigned

Unnamed: 0,tf,GENE_ID,sign
0,Rv0022c,Rv0029,1
1,Rv0022c,Rv0030,1
2,Rv0022c,Rv0031,1
3,Rv0022c,Rv0069c,1
4,Rv0022c,Rv0070c,1
...,...,...,...
12183,Rv3862c,Rv3881c,1
12184,Rv3911,Rv2514c,1
12185,Rv3911,Rv2515c,1
12186,Rv3911,Rv3906c,1


In [10]:
df_merge = DF_CSV.merge(PROVIDER.df_trn_unsigned)

In [11]:
df_merge.head()

Unnamed: 0,state,index,GENE_ID,score,count,tf,sign
0,0,0,Rv2626c,1.0,1,Rv1985c,1
1,0,0,Rv2626c,1.0,1,Rv3133c,1
2,0,1,Rv1813c,1.0,1,Rv1985c,1
3,0,2,Rv0022c,1.0,117,Rv1033c,1
4,0,2,Rv0022c,1.0,117,Rv3597c,1


In [22]:
# Construct a dataframe that merges classification group with transcription factors
df_merge = DF_CSV.merge(PROVIDER.df_trn_unsigned)
del df_merge[cn.SIGN]
del df_merge[cn.COUNT]
df_merge = df_merge.drop_duplicates()
df_merge = df_merge.sort_values([cn.STATE, cn.INDEX, cn.TF])
df_merge.head()

Unnamed: 0,state,index,GENE_ID,score,tf
0,0,0,Rv2626c,1.0,Rv1985c
1,0,0,Rv2626c,1.0,Rv3133c
2,0,1,Rv1813c,1.0,Rv1985c
8,0,2,Rv3854c,1.0,Rv0324
3,0,2,Rv0022c,1.0,Rv1033c


## Analysis Transcription Factor by Class
This analysis examines commonalities in the transcription factors of classification groups.

In [83]:
df_tf = df_merge.copy()
del df_tf[cn.GENE_ID]
df_tf[cn.GROUP] = df_tf[cn.INDEX]
del df_tf[cn.INDEX]
df_tf = df_tf.drop_duplicates()
df_tf.head()

Unnamed: 0,state,score,tf,group
0,0,1.0,Rv1985c,0
1,0,1.0,Rv3133c,0
2,0,1.0,Rv1985c,1
8,0,1.0,Rv0324,2
3,0,1.0,Rv1033c,2


57

In [51]:
STATE = 0
df_plot = df_tf[df_tf[cn.STATE] == STATE]
df_plot = df_plot.copy()
df_plot.index = [str(i) for i in df_plot.index]
del df_plot[cn.STATE]
df_plot = df_plot.pivot(index=cn.GROUP, columns=cn.TF, values=cn.SCORE)
df_plot.columns = [str(c) for c in df_plot.columns]
df_plot.head()

Unnamed: 0_level_0,Rv0022c,Rv0023,Rv0042c,Rv0047c,Rv0081,Rv0135c,Rv0302,Rv0324,Rv0353,Rv0465c,...,Rv3249c,Rv3416,Rv3488,Rv3574,Rv3597c,Rv3681c,Rv3736,Rv3830c,Rv3849,Rv3855
group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,,,,,,,,,,,...,,,,,,,,,,
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,1.0,,,...,1.0,,,,1.0,,,,,1.0
3,,,,0.993333,,,,,,,...,,,,,,,,,,
4,,,,,1.0,,,,,,...,,,,,,,,,,


In [94]:
df_plot.head()

Unnamed: 0_level_0,Rv0022c,Rv0023,Rv0042c,Rv0047c,Rv0081,Rv0135c,Rv0302,Rv0324,Rv0353,Rv0465c,...,Rv3249c,Rv3416,Rv3488,Rv3574,Rv3597c,Rv3681c,Rv3736,Rv3830c,Rv3849,Rv3855
group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,,,,,,,,,,,...,,,,,,,,,,
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,1.0,,,...,1.0,,,,1.0,,,,,1.0
3,,,,0.993333,,,,,,,...,,,,,,,,,,
4,,,,,1.0,,,,,,...,,,,,,,,,,


In [93]:
import seaborn
def plotTFByGroup(state, min_score=0.9):
    # Construct the data
    df_plot = df_tf[df_tf[cn.STATE] == state]
    df_plot = df_plot.copy()
    df_plot = df_plot.applymap(lambda v: float(v))
    df_plot = df_plot.applymap(lambda v: 0 if np.isnan(v) else v)
    df_plot.index = [str(i) for i in df_plot.index]
    del df_plot[cn.STATE]
    df_plot = df_plot.pivot(index=cn.GROUP, columns=cn.TF, values=cn.SCORE)
    df_plot.columns = [str(c) for c in df_plot.columns]
    # Do the plot
    plt.figure(figsize=(18, 10))
    ax = plt.gca()
    ax.set_xticks(np.arange(len(df_plot.index))+0.5)
    ax.set_xticklabels(df_plot.index, rotation=0)
    ax.set_yticks(np.arange(len(df_plot.columns))+0.5)
    ax.set_yticklabels(df_plot.columns, rotation=0)
    ax.set_xlabel(cn.GROUP)
    ax.set_ylabel("transcription factor")
    #heatmap = plt.pcolor(df_plot.T)
    #_ = plt.colorbar(heatmap)
    cg = seaborn.clustermap(df_plot, row_cluster=True)
    ax.set_title("State: %d" % state)
    
plotTFByGroup(0)

ValueError: could not convert string to float: 'Rv1985c'

In [None]:
cg = seaborn.clustermap(df_corr, col_cluster=True, vmin=    -1, vmax=1,
       cbar_kws={"ticks":[-1, 0, 1]}, cmap=cmap)

In [88]:
for state in range(6):
    plotTFByGroup(state)

ValueError: Index contains duplicate entries, cannot reshape