# CLASSIFICATION OF LAB SAMPLES

This notebook analyzes the AM_MDM and AW_plus_v_AW_neg samples.

# Preliminaries

In [1]:
import init
from common import constants as cn
from common.trinary_data import TrinaryData
from common.data_provider import DataProvider
from common_python.plots import util_plots
from common_python.classifier import classifier_ensemble
from common_python.classifier import classifier_collection
from common_python.classifier.classifier_ensemble_random_forest import ClassifierEnsembleRandomForest
from common import transform_data

import collections
import os
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn
from sklearn.decomposition import PCA
from sklearn import svm
from sklearn.model_selection import cross_val_score

%matplotlib inline

# Data
Data used in the analysis.

In [57]:
# Load the trinary training data. DATA.df_X (features) and DATA.ser_y (class
# labels) are what crossValidate uses by default further down.
DATA = TrinaryData()
# Preview the first rows of the feature matrix.
DATA.df_X.head()

Unnamed: 0,Rv1927,Rv1129c,Rv3085,Rv3083,Rv3086,Rv2226,Rv0260c,Rv1365c,Rv3084,Rv0975c,...,Rv3267,Rv2748c,Rv3260c,Rv0350,Rv2737c,Rv0440--Rv3417c,Rv2734,Rv0519c,Rv3418c,Rv3269
T2,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,...,1,1,1,1,1,1,1,1,1,1
T3,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,...,1,1,1,1,1,1,1,1,1,1
T4,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,...,1,1,1,1,1,1,1,1,1,1
T5,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,...,1,1,1,1,1,1,1,1,1,1
T6,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,...,1,1,1,1,1,1,1,1,1,1


In [3]:
# Read the two lab-sample files and convert them to trinary values.
# Both reads use the same options: no time columns, errors not displayed.
df_sampleAM = transform_data.trinaryReadsDF(
    csv_file="AM_MDM_Mtb_transcripts_DEseq.csv",
    is_time_columns=False,
    is_display_errors=False,
)
df_sampleAW = transform_data.trinaryReadsDF(
    "AW_plus_v_AW_neg_Mtb_transcripts_DEseq.csv",
    is_time_columns=False,
    is_display_errors=False,
)

In [4]:
# Flip the AM/MDM frame so that each row is a sample and each column a gene.
# NOTE: the name is rebound to its own transpose, so running this cell a
# second time flips the frame back.
df_sampleAM = df_sampleAM.transpose()
df_sampleAM.head()

GENE_ID,Rv0001,Rv0005,Rv0006,Rv0009,Rv0010c,Rv0011c,Rv0013,Rv0014c,Rv0016c,Rv0020c,...,Rvnt36,Rvnt37,Rvnt38,Rvnt39,Rvnt40,Rvnt41,Rvnt42,Rvnt43,Rvnt44,Rvnt45
AM_D20_1,0,1,0,0,-1,1,0,0,0,0,...,-1,1,-1,0,-1,-1,-1,-1,-1,-1
AM_D20_3,0,1,0,0,-1,1,0,0,0,0,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
AM_D20_4,0,1,0,0,-1,1,0,0,0,0,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
AM_D20_5,0,1,0,0,-1,1,0,0,0,0,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
MDM_D20_1,1,0,0,0,-1,1,0,0,0,0,...,-1,0,-1,-1,-1,-1,-1,-1,-1,-1


In [5]:
# Flip the AW frame so that each row is a sample and each column a gene.
# NOTE: the name is rebound to its own transpose, so running this cell a
# second time flips the frame back.
df_sampleAW = df_sampleAW.transpose()
df_sampleAW.head()

GENE_ID,Rv0001,Rv0005,Rv0006,Rv0009,Rv0010c,Rv0011c,Rv0013,Rv0014c,Rv0016c,Rv0020c,...,Rvnt36,Rvnt37,Rvnt38,Rvnt39,Rvnt40,Rvnt41,Rvnt42,Rvnt43,Rvnt44,Rvnt45
AW_plus_1,0,0,0,0,-1,0,-1,0,0,0,...,0,0,-1,0,-1,0,-1,0,0,-1
AW_neg_1,1,1,0,0,-1,1,-1,0,0,0,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
AW_plus_3,0,0,0,0,-1,1,0,0,0,0,...,0,0,-1,0,-1,-1,-1,-1,-1,-1
AW_neg_3,1,0,0,0,-1,1,0,0,0,0,...,0,-1,-1,-1,-1,-1,-1,-1,-1,-1
AW_plus_4,0,0,0,0,-1,1,0,0,0,0,...,0,0,-1,-1,-1,-1,-1,-1,-1,1


## Classification Validations
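Classify the samples T2-T25 and check whether each predicted class matches the original class. Validation uses repeated random holdouts: in each iteration a random set of samples (5 by default) is held out, the classifier is fit on the remaining samples, and the held-out samples are predicted.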


In [6]:
# Data provider created without normalization with respect to T0.
# getTimeSample reads provider.dfs_centered_adjusted_read_count, so this cell
# must run before that function is called.
provider = DataProvider(is_normalized_wrtT0=False)
# NOTE(review): do() presumably loads/computes the provider's data frames -- confirm.
provider.do()

In [7]:
def getTimeSample(time_index):
    """
    Retrieves the trinary values for a particular time column.

    The column at position time_index is taken from every data frame in
    provider.dfs_centered_adjusted_read_count, the columns are averaged
    across those data frames, and the average is converted with
    transform_data.trinaryReadsDF.

    Parameters
    ----------
    time_index: int
        positional index into the columns (times) of each data frame

    Returns
    -------
    pd.DataFrame
        Transpose of the frame returned by trinaryReadsDF

    Raises
    ------
    IndexError
        time_index is outside the range of columns
    """
    dfs = provider.dfs_centered_adjusted_read_count
    num = len(dfs)
    # Accumulate on the row index of the first data frame.
    df0 = dfs[0]
    ser = pd.Series(np.repeat(0, len(df0.index)), index=df0.index)
    for df in dfs:
        # The column is looked up by position in each data frame separately.
        column_name = list(df.columns)[time_index]
        ser += df[column_name]
    df_mean = pd.DataFrame(ser/num)
    df_result = transform_data.trinaryReadsDF(df_sample=df_mean, is_time_columns=False)
    return df_result.T

In [85]:
def dropIndices(df, indices):
    """
    Drops the indices from the dataframe or series.

    Parameters
    ----------
    df: pd.DataFrame or pd.Series
        object whose rows are removed; it is not modified
    indices: iterable
        row labels to remove; order does not matter and a label
        may be repeated

    Returns
    -------
    pd.DataFrame or pd.Series
        new object without the rows labelled by indices

    Raises
    ------
    KeyError
        a label in indices is not present in the index of df
    """
    # A single drop call removes all labels at once and returns a new object,
    # so no explicit copy, sorting, or per-label loop is needed.
    return df.drop(list(indices), axis=0)
# Repeated random holdout validation across all classes
def crossValidate(num_holdout=5, data=DATA, num_iter=10, **kwargs):
    """
    Estimates classification accuracy with repeated random holdouts.

    In each iteration, num_holdout samples are chosen at random, an SVM
    classifier ensemble is fit on the remaining samples, and the held out
    samples are predicted. For each held out sample, the prediction entry
    for its true class is added to the score.

    Parameters
    ----------
    num_holdout: int
        number of samples held out in each iteration; must be at least 1
        and less than the number of samples so that training data remain
    data: object with df_X (pd.DataFrame) and ser_y (pd.Series)
        features and class labels, sharing the same index
    num_iter: int
        number of holdout iterations; must be at least 1
    kwargs: dict
        keyword arguments passed to the ClassifierEnsemble constructor
        (e.g., filter_high_rank=15, size=30)

    Returns
    -------
    float: fraction correct

    Raises
    ------
    ValueError
        num_iter or num_holdout is out of range
    """
    indices = list(data.df_X.index)
    length = len(indices)
    # Validate up front: otherwise the denominator below does not match the
    # number of samples that were actually evaluated (or is zero).
    if num_iter < 1:
        raise ValueError("num_iter must be at least 1; got %s" % str(num_iter))
    if not 1 <= num_holdout < length:
        raise ValueError(
            "num_holdout must be between 1 and %d; got %s"
            % (length - 1, str(num_holdout)))
    svm_ensemble = classifier_ensemble.ClassifierEnsemble(
        classifier_ensemble.ClassifierDescriptorSVM(), **kwargs)
    total_correct = 0
    for _ in range(num_iter):
        # Find the holdouts
        random_positions = np.random.permutation(range(length))
        holdout_idxs = [indices[n] for n in random_positions[:num_holdout]]
        # Fit on everything except the holdouts
        df_X = dropIndices(data.df_X, holdout_idxs)
        ser_y = dropIndices(data.ser_y, holdout_idxs)
        svm_ensemble.fit(df_X, ser_y)
        # Evaluate on the holdouts. The prediction frame is indexed by sample
        # with one column per class.
        df_pred = svm_ensemble.predict(data.df_X.loc[holdout_idxs, :])
        for idx in holdout_idxs:
            true_cls = data.ser_y.loc[idx]
            total_correct += df_pred.loc[idx, true_cls]
    return total_correct/(num_iter*num_holdout)

# Tests
# crossValidate draws its holdouts with np.random.permutation, so seed the
# global NumPy generator to make the holdout selection repeatable.
# NOTE(review): any randomness inside the classifier ensemble that does not
# use the global NumPy generator is not controlled by this seed.
np.random.seed(0)
result = crossValidate(num_iter=20, filter_high_rank=15)
result

0.97

In [86]:
# Sanity check: show the class of the training-data object.
type(DATA)

common.trinary_data.TrinaryData

In [29]:
# Scratch cell: builds an SVM classifier ensemble and resets a counter.
# NOTE(review): the cell ends with an assignment, so it displays nothing; any
# output recorded under it comes from an earlier version of the cell.
# svm_ensemble is created again in the "Classification of Lab Samples"
# section, and total_correct is not read by any later visible cell.
svm_ensemble = classifier_ensemble.ClassifierEnsemble(
        classifier_ensemble.ClassifierDescriptorSVM(), filter_high_rank=15, size=30)
total_correct = 0

1.0

In [None]:
if False:
    # Disabled scratch check: builds a 0/1 frame of expected classes from
    # DATA.ser_y (one column per class) and compares it with a prediction
    # frame by mean absolute difference per row.
    df_expected = pd.DataFrame(DATA.ser_y)
    df_expected.columns = ["columns"]
    # The value column must have the same name that pivot() is given below.
    df_expected["values"] = 1.0
    dff_expected = df_expected.pivot(columns="columns", values="values")
    dff_expected = dff_expected.fillna(0)
    # Index labels are converted to integers by dropping their first character.
    dff_expected.index = [int(v[1:]) for v in dff_expected.index]
    # NOTE(review): dfff is not defined in any visible cell; it must be
    # supplied (presumably a prediction frame with the same shape) before
    # this block is enabled.
    df_diff = (dff_expected - dfff).abs()
    # NOTE: inside an if block this expression is evaluated but not displayed.
    df_diff.sum().sum()/len(df_diff)

## Classification of Lab Samples

In [None]:
# Fit an SVM classifier ensemble on all of the training data.
# The training object is the module-level DATA; there is no module-level
# name "data" (that is only a parameter of crossValidate).
svm_ensemble = classifier_ensemble.ClassifierEnsemble(
        classifier_ensemble.ClassifierDescriptorSVM(), filter_high_rank=15, size=30)
df_X = DATA.df_X.copy()
# Relabel the copied feature matrix with DATA.features before fitting.
df_X.columns = DATA.features
svm_ensemble.fit(df_X, DATA.ser_y)

In [None]:
# Classify the AM/MDM lab samples with the SVM ensemble fit in the previous cell.
svm_ensemble.predict(df_sampleAM)

In [None]:
# Classify the AW lab samples with the same SVM ensemble.
svm_ensemble.predict(df_sampleAW)

## Comparisons With Random Forest

In [None]:
# Keep only the part of each feature name before the first "-"
# (names without a "-" are kept whole), e.g. "Rv0440--Rv3417c" -> "Rv0440".
truncated_columns = [name.split("-", 1)[0] for name in DATA.features]
# Training features relabelled with the truncated names.
df_X = DATA.df_X.copy()
df_X.columns = truncated_columns

In [None]:
# Fit a random forest ensemble on the relabelled training features.
# The class labels come from the module-level DATA object.
clf = ClassifierEnsembleRandomForest(size=150, filter_high_rank=30)
clf.fit(df_X, DATA.ser_y)

In [None]:
# Preview the AM/MDM samples restricted to the truncated training columns;
# this raises KeyError if any of those columns is absent from the sample frame.
df_sampleAM[truncated_columns].head()

In [None]:
# Classify the AM/MDM lab samples with the random forest ensemble, passing
# only the columns listed in clf.columns.
clf.predict(df_sampleAM[clf.columns])

In [None]:
# Classify the AW lab samples with the random forest ensemble, passing
# only the columns listed in clf.columns.
clf.predict(df_sampleAW[clf.columns])