In [1]:
import pandas as pd
import os
import numpy as np

In [None]:
# for dirname, _, filenames in os.walk('./lish-moa'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

In [9]:
# Data directory: use TRAINML_DATA_PATH when it is set to a non-empty value,
# otherwise fall back to the local './lish-moa' folder.
BASE_PATH = os.environ.get('TRAINML_DATA_PATH') or './lish-moa'
BASE_PATH

'/opt/data'

In [10]:
# Load the four competition CSVs from BASE_PATH, in this order:
# training features, scored targets, non-scored targets, test features.
train, train_target, train_target_nonscored, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        f"{BASE_PATH}/train_features.csv",
        f"{BASE_PATH}/train_targets_scored.csv",
        f"{BASE_PATH}/train_targets_nonscored.csv",
        f"{BASE_PATH}/test_features.csv",
    )
)

### `train_features.csv`
Columns:
* `sig_id` = sample
* `cp_type` = treated with a compound (`trt_cp`) or control (`ctl_vehicle`)
  * if control, then sample has no MoA.
* `cp_time` = treatment duration, hours
* `cp_dose` = low (D1), high (D2)

In [11]:
# Preview the first five rows of the training features.
train.head(n=5)

Unnamed: 0,sig_id,cp_type,cp_time,cp_dose,g-0,g-1,g-2,g-3,g-4,g-5,...,c-90,c-91,c-92,c-93,c-94,c-95,c-96,c-97,c-98,c-99
0,id_000644bb2,trt_cp,24,D1,1.062,0.5577,-0.2479,-0.6208,-0.1944,-1.012,...,0.2862,0.2584,0.8076,0.5523,-0.1912,0.6584,-0.3981,0.2139,0.3801,0.4176
1,id_000779bfc,trt_cp,72,D1,0.0743,0.4087,0.2991,0.0604,1.019,0.5207,...,-0.4265,0.7543,0.4708,0.023,0.2957,0.4899,0.1522,0.1241,0.6077,0.7371
2,id_000a6266a,trt_cp,48,D1,0.628,0.5817,1.554,-0.0764,-0.0323,1.239,...,-0.725,-0.6297,0.6103,0.0223,-1.324,-0.3174,-0.6417,-0.2187,-1.408,0.6931
3,id_0015fd391,trt_cp,48,D1,-0.5138,-0.2491,-0.2656,0.5288,4.062,-0.8095,...,-2.099,-0.6441,-5.63,-1.378,-0.8632,-1.288,-1.621,-0.8784,-0.3876,-0.8154
4,id_001626bd3,trt_cp,72,D2,-0.3254,-0.4009,0.97,0.6919,1.418,-0.8244,...,0.0042,0.0048,0.667,1.069,0.5523,-0.3031,0.1094,0.2885,-0.3786,0.7125


### `train_targets_scored.csv`, `train_targets_nonscored.csv`

* Rows are the same samples as `train_features.csv`
* Columns are different MoA's. Cell values are 0 or 1
* The scored file has all the columns (MoA's) we need in our `submission.csv` submission file.
* The nonscored file has additional MoA responses; using this file is optional. Not sure yet how it would help us.

In [12]:
# Report each target table's dimensions as "rows x columns", then preview the scored targets.
print(f"train_targets_scored: {train_target.shape[0]} x {train_target.shape[1]}")
print(f"train_targets_nonscored: {train_target_nonscored.shape[0]} x {train_target_nonscored.shape[1]}")
train_target.head()

train_targets_scored: 23814 x 207
train_targets_nonscored: 23814 x 403


Unnamed: 0,sig_id,5-alpha_reductase_inhibitor,11-beta-hsd1_inhibitor,acat_inhibitor,acetylcholine_receptor_agonist,acetylcholine_receptor_antagonist,acetylcholinesterase_inhibitor,adenosine_receptor_agonist,adenosine_receptor_antagonist,adenylyl_cyclase_activator,...,tropomyosin_receptor_kinase_inhibitor,trpv_agonist,trpv_antagonist,tubulin_inhibitor,tyrosine_kinase_inhibitor,ubiquitin_specific_protease_inhibitor,vegfr_inhibitor,vitamin_b,vitamin_d_receptor_agonist,wnt_inhibitor
0,id_000644bb2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,id_000779bfc,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,id_000a6266a,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,id_0015fd391,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,id_001626bd3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


* Control samples have no MoA's. ==> All MoA targets are 0 for samples where `cp_type` == `ctl_vehicle`
* Still possible for treated samples to have 0 for all MoA targets.

In [13]:
# Show the first control-vehicle sample (ID and treatment type only).
train.loc[train['cp_type'].eq('ctl_vehicle'), ['sig_id', 'cp_type']].head(1)

Unnamed: 0,sig_id,cp_type
25,id_0054388ec,ctl_vehicle


In [130]:
# Raw target row (as a NumPy array) for the sample with sig_id 'id_0054388ec'.
train_target[train_target['sig_id'].eq('id_0054388ec')].to_numpy()

array([['id_0054388ec', 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], dtype=object)

## Relevant notebooks:

### 1. Good data overview: https://www.kaggle.com/isaienkov/mechanisms-of-action-moa-prediction-eda
  * Data visualizations
  * Found some correlations between features (feature aggregation)
  * Found most correlated features for every MoA target.
  
  
 <br>
 
2. Someone hypothesizes that we can use t-tests for feature selection, to find feature-label correlations
  * https://www.kaggle.com/isaienkov/moa-prediction-interesting-findings
  * https://www.quora.com/How-can-I-use-students-T-test-for-feature-selection
  * [Feature selection for classification of gene expression data](https://core.ac.uk/download/pdf/82376463.pdf)

  TODO: Is it correct to use t-test feature selection for non-binary (multilabel) classification?

  TODO: If so, has someone written code for such a feature selection algorithm? Can we use it?


In [41]:
# Keep every column of train_target except the 'sig_id' key, i.e. the target columns only.
targets = train_target.loc[:, train_target.columns != 'sig_id']

In [42]:
# One row per target column with its 'sum' (number of 1s) and 'count' (non-null rows),
# ordered by ascending 'sum' so the rarest targets come first.
train_target_summary = (
    targets
    .agg(['sum', 'count'])
    .T
    .sort_values(by=['sum'])
)
train_target_summary

Unnamed: 0,sum,count
atp-sensitive_potassium_channel_antagonist,1,23814
erbb2_inhibitor,1,23814
diuretic,6,23814
autotaxin_inhibitor,6,23814
protein_phosphatase_inhibitor,6,23814
...,...,...
serotonin_receptor_antagonist,404,23814
dopamine_receptor_antagonist,424,23814
cyclooxygenase_inhibitor,435,23814
proteasome_inhibitor,726,23814


In [43]:
# N = number of rows (samples), K = number of target columns.
N = targets.shape[0]
K = targets.shape[1]

In [44]:
# Baseline prediction: 0 for every (sample, target) pair.
predictions = np.zeros(shape=(N, K))

In [45]:
# Sanity check: the baseline array should have one row per sample and one column per target.
np.shape(predictions)

(23814, 206)

In [47]:
# Element-wise hit mask: True where the all-zero baseline equals the true label.
# NOTE(review): the name `eval` shadows Python's built-in eval(); it is kept here
# because the following cell reads it.
eval = targets.eq(predictions)

In [51]:
# Per-target accuracy of the all-zero baseline: the fraction of rows where the
# prediction matches the label, one value per target column.
# Use DataFrame.mean(axis=0) instead of np.mean(): on pandas >= 2.0, np.mean on a
# DataFrame reduces over both axes and returns a single scalar rather than this
# per-column Series.
eval.mean(axis=0)

5-alpha_reductase_inhibitor              0.999286
11-beta-hsd1_inhibitor                   0.999244
acat_inhibitor                           0.998992
acetylcholine_receptor_agonist           0.992021
acetylcholine_receptor_antagonist        0.987360
                                           ...   
ubiquitin_specific_protease_inhibitor    0.999748
vegfr_inhibitor                          0.992861
vitamin_b                                0.998908
vitamin_d_receptor_agonist               0.998362
wnt_inhibitor                            0.998740
Length: 206, dtype: float64

# Need to deal with the class imbalance problem

### Resources
* [Deep Learning Methods for class imbalanced data](https://link.springer.com/article/10.1186/s40537-019-0192-5#Sec14)
* [Summary of PyTorch Loss Functions](https://neptune.ai/blog/pytorch-loss-functions)
* [PyTorch loss function for Multi-Class, Multi-Label](https://stackoverflow.com/a/52859411)
* [Imbalance dataset sampler for images](https://github.com/ufoym/imbalanced-dataset-sampler)
* [Different techniques](https://en.wikipedia.org/wiki/Oversampling_and_undersampling_in_data_analysis)
* [Imbalance Learn Library](https://imbalanced-learn.org/stable/over_sampling.html#a-practical-guide)

### Solutions

1. MLSMOTE algo (oversampling)
  * [Medium article](https://medium.com/thecyphy/handling-data-imbalance-in-multi-label-classification-mlsmote-531155416b87)
  * [Kaggle notebook](https://www.kaggle.com/c/lish-moa/discussion/187419) 
  * imbalanced-learn does not directly offer algos compatible with multilabel classification (see below)
 
  
2. Transform multilabel classification problem into single-label classification, then use imbalanced-learn
    * [label power set transformation](https://www.analyticsvidhya.com/blog/2017/08/introduction-to-multi-label-classification/)
    * [Using imbalanced-learn and skmultilearn](https://github.com/scikit-learn-contrib/imbalanced-learn/issues/340#issuecomment-343114570)
    
Easiest approach is to try 1) and see if overfitting with MLSMOTE works. If not, try 2) and use anothe