In [1]:
import os
import numpy as np
import pandas as pd
import random
from sklearn.datasets import make_classification
from sklearn.neighbors import NearestNeighbors

import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

In [2]:
# Dataset directory: taken from TRAINML_DATA_PATH when that variable is set
# to a non-empty value, otherwise the local ./lish-moa folder.
BASE_PATH = os.environ.get('TRAINML_DATA_PATH') or './lish-moa'

# Raw feature and scored-target tables.
X_train = pd.read_csv(f"{BASE_PATH}/train_features.csv")
y_train = pd.read_csv(f"{BASE_PATH}/train_targets_scored.csv")

In [3]:
# Column bookkeeping on the raw frames: skip the first two columns of the
# feature table and the first column of the target table.
feature_cols = X_train.columns[2:]
target_cols = y_train.columns[1:]
n_targets = target_cols.size

In [4]:
def preprocess_df(df):
    """
    Make all feature columns numerical.

    The input frame is never modified; a new frame is returned.

    args
    df: pandas.DataFrame with 'cp_dose' and 'cp_time' columns and,
        optionally, a 'cp_type' column

    return
    pandas.DataFrame in which 'cp_type' is removed (if it was present),
    'cp_dose' is 1 for 'D2' and 0 otherwise, and 'cp_time' is mapped
    24 -> 1, 48 -> 2, 72 -> 3 (any other value becomes NaN).
    """
    # Always work on a new frame. Previously, when 'cp_type' was absent, the
    # column assignments below wrote straight into the caller's DataFrame.
    if 'cp_type' in df.columns:
        df = df.drop('cp_type', axis=1)
    else:
        df = df.copy()
    df['cp_dose'] = (df['cp_dose'] == 'D2').astype(int)
    df['cp_time'] = df['cp_time'].map({24: 1, 48: 2, 72: 3})
    return df

In [5]:
# Encode the categorical columns, then strip the sample identifier from both
# tables so that only numeric columns remain.
X_train = preprocess_df(X_train).drop(columns='sig_id')
y_train = y_train.drop(columns='sig_id')

In [6]:
# Preview the first rows of the preprocessed feature frame.
X_train.head()

Unnamed: 0,cp_time,cp_dose,g-0,g-1,g-2,g-3,g-4,g-5,g-6,g-7,...,c-90,c-91,c-92,c-93,c-94,c-95,c-96,c-97,c-98,c-99
0,1,0,1.062,0.5577,-0.2479,-0.6208,-0.1944,-1.012,-1.022,-0.0326,...,0.2862,0.2584,0.8076,0.5523,-0.1912,0.6584,-0.3981,0.2139,0.3801,0.4176
1,3,0,0.0743,0.4087,0.2991,0.0604,1.019,0.5207,0.2341,0.3372,...,-0.4265,0.7543,0.4708,0.023,0.2957,0.4899,0.1522,0.1241,0.6077,0.7371
2,2,0,0.628,0.5817,1.554,-0.0764,-0.0323,1.239,0.1715,0.2155,...,-0.725,-0.6297,0.6103,0.0223,-1.324,-0.3174,-0.6417,-0.2187,-1.408,0.6931
3,2,0,-0.5138,-0.2491,-0.2656,0.5288,4.062,-0.8095,-1.959,0.1792,...,-2.099,-0.6441,-5.63,-1.378,-0.8632,-1.288,-1.621,-0.8784,-0.3876,-0.8154
4,3,1,-0.3254,-0.4009,0.97,0.6919,1.418,-0.8244,-0.28,-0.1498,...,0.0042,0.0048,0.667,1.069,0.5523,-0.3031,0.1094,0.2885,-0.3786,0.7125


In [7]:
# Preview the first rows of the target frame (sig_id already dropped).
y_train.head()

Unnamed: 0,5-alpha_reductase_inhibitor,11-beta-hsd1_inhibitor,acat_inhibitor,acetylcholine_receptor_agonist,acetylcholine_receptor_antagonist,acetylcholinesterase_inhibitor,adenosine_receptor_agonist,adenosine_receptor_antagonist,adenylyl_cyclase_activator,adrenergic_receptor_agonist,...,tropomyosin_receptor_kinase_inhibitor,trpv_agonist,trpv_antagonist,tubulin_inhibitor,tyrosine_kinase_inhibitor,ubiquitin_specific_protease_inhibitor,vegfr_inhibitor,vitamin_b,vitamin_d_receptor_agonist,wnt_inhibitor
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Using the MLSMOTE algorithm from [this Kaggle discussion](https://www.kaggle.com/c/lish-moa/discussion/187419).

In [8]:
def get_tail_label(df: pd.DataFrame, ql=[0.05, 1.]) -> list:
    """
    Find the underrepresented ("tail") targets.

    Per-target counts are the column sums of ``df``. Only targets whose count
    lies strictly between the ``ql[0]`` and ``ql[1]`` quantiles of all counts
    are considered. For those, the imbalance ratio is the largest remaining
    count divided by the target's own count; a target is a tail label when
    its ratio is strictly above the median ratio.

    args
    df: pandas.DataFrame, target matrix (one column per target)
    ql: two-element sequence, lower and upper quantile limits

    return
    list of column names of the tail labels
    """
    label_counts = df.sum(axis=0)
    lower_cut = label_counts.quantile(ql[0])
    upper_cut = label_counts.quantile(ql[1])

    # Strict inequalities on both sides: counts equal to a cut are dropped.
    in_range = (label_counts > lower_cut) & (label_counts < upper_cut)
    kept_counts = label_counts[in_range]

    imbalance_ratio = kept_counts.max() / kept_counts
    is_tail = imbalance_ratio > imbalance_ratio.median()
    return imbalance_ratio.loc[is_tail].index.tolist()

def get_minority_samples(X: pd.DataFrame, y: pd.DataFrame, ql=[0.05, 1.]):
    """
    Select the samples that carry at least one tail (underrepresented) label.

    args
    X: pandas.DataFrame, feature matrix; rows are matched to ``y`` by index label
    y: pandas.DataFrame, target matrix
    ql: two-element sequence of quantile limits, forwarded to ``get_tail_label``

    return
    X_sub: pandas.DataFrame, the feature vector minority dataframe
    y_sub: pandas.DataFrame, the target vector minority dataframe
    Both are re-indexed from 0.
    """
    tail_labels = get_tail_label(y, ql=ql)

    # Vectorised row test instead of a per-row Python lambda via apply():
    # same boolean result, without one Python call per sample.
    has_tail_label = (y[tail_labels] == 1).any(axis=1)
    index = y.index[has_tail_label.to_numpy()]

    X_sub = X[X.index.isin(index)].reset_index(drop=True)
    y_sub = y[y.index.isin(index)].reset_index(drop=True)
    return X_sub, y_sub

def nearest_neighbour(X: pd.DataFrame, neigh) -> list:
    """
    Give the indices of the ``neigh`` nearest neighbours of every instance.

    The model is fitted on ``X`` and then queried with the same ``X``, using
    Euclidean distance and a KD-tree.

    args
    X: pandas.DataFrame (or array-like), points whose neighbours are searched
    neigh: int, number of neighbours returned per instance

    return
    indices: positional indices of the ``neigh`` neighbours of each row of X,
             one row per instance, as returned by ``kneighbors``
    """
    knn_model = NearestNeighbors(
        n_neighbors=neigh,
        metric='euclidean',
        algorithm='kd_tree',
    )
    knn_model.fit(X)
    # Distances are not needed by the callers; only the indices are returned.
    _distances, neighbour_indices = knn_model.kneighbors(X)
    return neighbour_indices

def MLSMOTE(X, y, n_sample, neigh=5):
    """
    Give the augmented data using MLSMOTE algorithm

    Each synthetic sample is built from a randomly chosen reference row and
    one of its nearest neighbours: the features are a random interpolation
    between the two rows, and a label is set when any row in the reference's
    neighbourhood (reference included) has it.

    args
    X: pandas.DataFrame, input (feature) vector DataFrame
    y: pandas.DataFrame, target vector DataFrame, row-aligned with X
    n_sample: int, number of newly generated sample
    neigh: int, neighbourhood size passed to ``nearest_neighbour``; the
           neighbour is drawn from columns 1.. of the result, so at least 2
           is needed

    return
    new_X: pandas.DataFrame, augmented feature vector data
    target: pandas.DataFrame, augmented target vector data
    """
    # Honour the caller's ``neigh`` (it used to be ignored in favour of 5).
    indices2 = nearest_neighbour(X, neigh=neigh)
    n = len(indices2)
    new_X = np.zeros((n_sample, X.shape[1]))
    target = np.zeros((n_sample, y.shape[1]))
    for i in range(n_sample):
        reference = random.randint(0, n - 1)
        # Column 0 is skipped; presumably it is the reference row itself
        # because the model is queried with its own training data.
        neighbor = random.choice(indices2[reference, 1:])
        all_point = indices2[reference]

        # kneighbors returns positions, so index positionally (.iloc); this
        # no longer requires X and y to carry a 0..n-1 index.
        label_counts = y.iloc[all_point].sum(axis=0, skipna=True)
        target[i] = (label_counts > 0).to_numpy()

        ratio = random.random()
        reference_row = X.iloc[reference]
        # SMOTE interpolation: step from the reference TOWARDS the neighbour.
        # The previous (reference - neighbour) gap pushed the synthetic point
        # away from the neighbour, outside the segment joining the two rows.
        gap = X.iloc[neighbor] - reference_row
        new_X[i] = np.array(reference_row + ratio * gap)
    new_X = pd.DataFrame(new_X, columns=X.columns)
    target = pd.DataFrame(target, columns=y.columns)
    return new_X, target


# TODO: Use imbalanced-learn and adapt it to MLSMOTE?
# from imblearn.over_sampling import SMOTENC
# smote_nc = SMOTENC(categorical_features=[0, 2], random_state=0)
# X_resampled, y_resampled = smote_nc.fit_resample(X, y)

In [9]:
# Augmentation settings.
n_samples = 1000   # number of synthetic samples to generate
n_neighbors = 5    # neighbourhood size handed to MLSMOTE

# MLSMOTE draws from the global `random` module; seed it so that a
# Restart & Run All reproduces the same synthetic samples.
SEED = 42
random.seed(SEED)

In [10]:
# Find samples with "tail"/"minority" labels
X_sub, y_sub = get_minority_samples(X_train, y_train)
# Augment dataframe
X_res, y_res = MLSMOTE(X_sub, y_sub, n_samples, n_neighbors)

In [11]:
# Shape of the synthetic feature frame returned by MLSMOTE.
X_res.shape

(1000, 874)

In [12]:
# Preview the first synthetic target rows returned by MLSMOTE.
y_res.head()

Unnamed: 0,5-alpha_reductase_inhibitor,11-beta-hsd1_inhibitor,acat_inhibitor,acetylcholine_receptor_agonist,acetylcholine_receptor_antagonist,acetylcholinesterase_inhibitor,adenosine_receptor_agonist,adenosine_receptor_antagonist,adenylyl_cyclase_activator,adrenergic_receptor_agonist,...,tropomyosin_receptor_kinase_inhibitor,trpv_agonist,trpv_antagonist,tubulin_inhibitor,tyrosine_kinase_inhibitor,ubiquitin_specific_protease_inhibitor,vegfr_inhibitor,vitamin_b,vitamin_d_receptor_agonist,wnt_inhibitor
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
# Column-wise sum of the synthetic targets.
y_res.sum()

5-alpha_reductase_inhibitor              81.0
11-beta-hsd1_inhibitor                   40.0
acat_inhibitor                           26.0
acetylcholine_receptor_agonist            0.0
acetylcholine_receptor_antagonist         0.0
                                         ... 
ubiquitin_specific_protease_inhibitor     0.0
vegfr_inhibitor                           0.0
vitamin_b                                23.0
vitamin_d_receptor_agonist               37.0
wnt_inhibitor                            25.0
Length: 206, dtype: float64

In [14]:
# Column-wise sum of the original targets, for comparison.
y_train.sum()

5-alpha_reductase_inhibitor               17
11-beta-hsd1_inhibitor                    18
acat_inhibitor                            24
acetylcholine_receptor_agonist           190
acetylcholine_receptor_antagonist        301
                                        ... 
ubiquitin_specific_protease_inhibitor      6
vegfr_inhibitor                          170
vitamin_b                                 26
vitamin_d_receptor_agonist                39
wnt_inhibitor                             30
Length: 206, dtype: int64

In [15]:
# Append the synthetic rows to the original data.
# ignore_index=True gives the result a fresh 0..n-1 index; without it the
# synthetic rows reuse labels 0..999 and the index contains duplicates.
X_augmented = pd.concat([X_train, X_res], ignore_index=True)
# MLSMOTE returns its 0/1 targets as floats; cast them back to integers so
# the label columns are not upcast to float64 by the concatenation.
y_augmented = pd.concat([y_train, y_res.astype('int64')], ignore_index=True)

In [16]:
# X_augmented.to_csv(f"{BASE_PATH}/train_features_augmented.csv", index=False)
# y_augmented.to_csv(f"{BASE_PATH}/train_targets_augmented.csv", index=False)

### Visualizing data before/after

In [21]:
# Per-label totals of the original targets, smallest first.
counts_before = y_train.sum(axis=0).sort_values()
y_before = counts_before.reset_index()
y_before.columns = ['column', 'nonzero_records']

# Horizontal bar chart, one bar per label.
fig = px.bar(
    y_before,
    x='nonzero_records',
    y='column',
    orientation='h',
    title='Label counts BEFORE',
    width=800,
    height=1000,
)

fig.show()

In [23]:
# Per-label totals of the augmented targets, smallest first.
counts_after = y_augmented.sum(axis=0).sort_values()
y_after = counts_after.reset_index()
y_after.columns = ['column', 'nonzero_records']

# Horizontal bar chart, one bar per label.
fig = px.bar(
    y_after,
    x='nonzero_records',
    y='column',
    orientation='h',
    title='Label counts AFTER',
    width=800,
    height=1000,
)

fig.show()

In [24]:
# Median of the per-label totals before and after augmentation.
median_before = y_before['nonzero_records'].median()
median_after = y_after['nonzero_records'].median()
print("Median label counts before augmentation: ", median_before)
print("Median label counts after augmentation: ", median_after)

Median label counts before augmentation:  38.5
Median label counts after augmentation:  70.5


### Assessment

Right now I'm just oversampling by 1000 samples (roughly 23,000 -> 24,000 data points). The median label count jumps from 38.5 to 70.5.

This seems to improve the imbalance between the labels themselves; however, it does not significantly affect the overall labeled/non-labeled data imbalance (i.e., # zero-label samples vs. # nonzero-label samples).

### Improvements

* Can copy more visualizations from [this notebook](https://www.kaggle.com/isaienkov/mechanisms-of-action-moa-prediction-eda) (section 5)
* Should we oversample by more than 1000 data points?
* Is there anything we can (or should) do to assess the overall labeled/non-labeled data imbalance?