# Project: DrugResponse_Lapatinib (Team 1)
R08946014 陳俊達

In [1]:
import pandas as pd
from sklearn.utils import shuffle
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.svm import LinearSVC
from sklearn.metrics import recall_score, precision_score, f1_score
import numpy as np
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import ExtraTreesClassifier
import smote_variants as sv

## 1. Data Overview

In [2]:
!ls Lapatinib

CCLE_GED_Lapatinib.csv	GDSC_GED_Lapatinib.csv


In [3]:
# Load both gene-expression tables; the paths are relative to the notebook's
# working directory (the Lapatinib/ folder listed in the previous cell).
df_GDSC = pd.read_csv('Lapatinib/GDSC_GED_Lapatinib.csv')
df_CCLE = pd.read_csv('Lapatinib/CCLE_GED_Lapatinib.csv')

### 1.1 GDSC (Genomics of Drug Sensitivity in Cancer)

In [4]:
df_GDSC.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 396 entries, 0 to 395
Columns: 17489 entries, CELL_LINE_NAME to TBC1D3P5
dtypes: float64(17488), object(1)
memory usage: 52.8+ MB


In [5]:
df_GDSC.head()

Unnamed: 0,CELL_LINE_NAME,IC50,TSPAN6,TNMD,DPM1,SCYL3,C1orf112,FGR,CFH,FUCA2,...,ZNF234,J3KSW9,MYH4,LINC00526,PPY2,KRT18P55,MIR5195,POLRMTP1,UBL5P2,TBC1D3P5
0,NCI-H1648,0.031113,2.217581,-0.735253,2.669516,-0.058065,-0.465657,-0.551981,0.73455,1.783264,...,-0.045624,-0.747319,-0.902292,0.542939,-0.610554,-0.312383,-0.657732,-0.471618,2.229009,-0.587149
1,HCC2218,0.066119,1.13086,-1.152389,2.227567,-0.103168,-0.801887,-0.953301,-1.105763,0.918863,...,-0.307984,-0.866525,-1.342557,-0.592605,-0.911555,-1.023212,-0.909322,-1.093605,1.917536,-0.834582
2,BB30-HNC,0.20356,1.028961,-0.494848,3.043575,0.071932,-0.321628,-0.358059,-0.044099,1.130233,...,0.330618,-0.477184,-0.540995,0.719914,-0.273641,-0.279611,-0.038461,-0.347925,2.278153,-0.192392
3,DSH1,0.093657,2.123831,-0.796891,2.557647,0.000816,-0.320608,-0.494476,0.290516,1.444494,...,-0.036115,-0.643546,-0.885688,-0.306266,-0.508864,-0.746488,-0.979115,-0.655797,2.355807,-0.648386
4,LB2241-RCC,0.753576,1.540483,-1.070618,2.38117,-0.531222,-0.847162,-0.86371,-0.637168,1.617217,...,-0.138542,-1.002873,-1.140608,-0.293279,-0.85063,-0.696104,-0.82701,-0.785605,1.978891,-0.824081


### 1.2 CCLE (Cancer Cell Line Encyclopedia)

In [6]:
df_CCLE.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 470 entries, 0 to 469
Columns: 17183 entries, CCLE.Cell.Line.Name to ZZZ3
dtypes: float64(17182), object(1)
memory usage: 61.6+ MB


In [7]:
df_CCLE.head()

Unnamed: 0,CCLE.Cell.Line.Name,IC50,A1BG,A1CF,A2M,A2ML1,A4GALT,A4GNT,AAAS,AACS,...,ZUFSP,ZWILCH,ZWINT,ZXDB,ZXDC,ZYG11A,ZYG11B,ZYX,ZZEF1,ZZZ3
0,697_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE,0.849476,1.169179,-1.028292,0.465891,-0.163033,0.008992,-0.076229,1.611807,0.730032,...,1.112574,1.497387,1.825714,1.01805,1.343089,-0.42443,1.212832,1.426546,1.300771,1.255317
1,AMO1_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE,0.256447,0.973119,-1.080065,0.080114,-1.080065,0.450694,-1.080065,1.583691,1.084361,...,1.139076,1.349479,1.706555,0.946974,1.130707,1.15903,0.931128,1.200484,1.271493,1.093488
2,AU565_BREAST,0.18321,0.706415,-0.313506,0.839385,1.037266,0.597347,-1.096185,1.54657,1.253366,...,1.126273,1.377786,1.754884,1.029821,1.168004,-0.362577,1.155447,1.796215,1.221892,1.151136
3,BT474_BREAST,0.116183,0.415596,0.058635,0.026602,0.850557,0.418443,0.068478,1.614142,1.232776,...,1.423967,1.541735,1.877341,1.334396,1.438847,0.906038,1.28814,1.581832,1.370155,1.354113
4,CAL27_UPPER_AERODIGESTIVE_TRACT,0.688771,-0.218206,0.198325,0.060658,0.802534,1.290542,-0.96346,1.554015,1.2295,...,1.353865,1.499827,1.710708,1.039307,1.352269,0.47756,1.235963,1.652735,1.327339,1.345483


### 1.3 Compare Gene Expression between GDSC & CCLE 

#### 1.3.1 Verify whether all of CCLE's gene-expression columns are also in GDSC

In [8]:
# Subset check via set sizes: |GDSC - CCLE| equals |GDSC| - |CCLE| exactly when
# every CCLE feature column also appears in GDSC (assuming column names are
# unique within each frame). The two leading non-gene columns cancel out on the
# right-hand side.
len(set(df_GDSC.columns[2:]) - set(df_CCLE.columns[2:]))==len(df_GDSC.columns)-len(df_CCLE.columns)

True

## 2. Preprocess

### 2.1 Reduce GDSC's columns to the same set as CCLE's

In [9]:
# Feature columns used for both datasets: every CCLE column after the first
# two (the cell-line name and the IC50 label).
selFeatCols = df_CCLE.columns[2:]

def chooseCommonFeatures(datatype):
    """Build the feature matrix and IC50 label vector for one dataset.

    Reads the notebook-level frames ``df_GDSC`` / ``df_CCLE`` and restricts
    them to the shared column list ``selFeatCols``.

    Parameters
    ----------
    datatype : str
        ``'train'`` selects ``df_GDSC``; ``'test'`` selects ``df_CCLE``.

    Returns
    -------
    X : numpy.ndarray
        Values of the ``selFeatCols`` columns, one row per cell line.
    y : numpy.ndarray
        Flattened values of the ``IC50`` column.

    Raises
    ------
    ValueError
        If ``datatype`` is neither ``'train'`` nor ``'test'``.
    """
    labelCol = ['IC50']

    if datatype == 'train':
        df = df_GDSC
    elif datatype == 'test':
        df = df_CCLE
    else:
        # Without this guard an unknown value fell through and failed later
        # with an unhelpful UnboundLocalError on `df`.
        raise ValueError(
            "datatype must be 'train' or 'test', got {!r}".format(datatype)
        )

    X = df[selFeatCols].values
    y = df[labelCol].values.flatten()

    if datatype == 'train':
        # Only the training rows are shuffled; the fixed seed keeps the order
        # reproducible across runs.
        X, y = shuffle(X, y, random_state=0)

    print('X_{}.shape'.format(datatype), X.shape)
    print('y_{}.shape'.format(datatype), y.shape)

    return X, y

### Standardize features

In [10]:
def standardized(X_train, X_test):
    """Scale both matrices using statistics learned from ``X_train`` only.

    The scaler is fitted on the training matrix and then applied unchanged to
    the test matrix, so no test-set statistics enter the transformation.

    Returns
    -------
    tuple
        ``(X_train_scaled, X_test_scaled)``.
    """
    fitted_scaler = StandardScaler().fit(X_train)
    return fitted_scaler.transform(X_train), fitted_scaler.transform(X_test)

### Turn IC50 labels into zero and one (1 when IC50 is below the cutoff)

In [11]:
def value2zeroOne(array, cutoff=2.0):
    """Binarize continuous values against a cutoff.

    Parameters
    ----------
    array : array-like
        Values to threshold (e.g. IC50 measurements).
    cutoff : float, default 2.0
        Values strictly below ``cutoff`` map to 1; all others (including NaN)
        map to 0.

    Returns
    -------
    numpy.ndarray
        Integer array of 0/1 flags with the same shape as the input.
    """
    # Vectorized comparison instead of a per-element Python lambda; this also
    # keeps an integer dtype for empty input (the list-based version returned
    # an empty float64 array).
    return (np.asarray(array) < cutoff).astype(int)

### Confusion Matrix

In [12]:
def outputConfusionMatrix(y_Actual, y_Predicted):
    """Cross-tabulate predicted labels against actual labels.

    Returns a ``pandas.DataFrame`` whose rows are the predicted labels
    (index named ``'Predicted'``) and whose columns are the actual labels
    (columns named ``'Actual'``); each cell holds the number of samples with
    that combination.
    """
    pairs = pd.DataFrame(
        {'y_Actual': y_Actual, 'y_Predicted': y_Predicted},
        columns=['y_Actual', 'y_Predicted'],
    )
    return pd.crosstab(
        pairs['y_Predicted'],
        pairs['y_Actual'],
        rownames=['Predicted'],
        colnames=['Actual'],
    )

In [13]:
def getResultConfusionMatrix(clf, X, y_zo):
    """Print recall, precision and F1 for ``clf`` on ``X`` and return the confusion matrix.

    ``clf`` must already be fitted; ``y_zo`` holds the true binary labels for
    the rows of ``X``. The returned table comes from ``outputConfusionMatrix``.
    """
    predictions = clf.predict(X)
    reported_metrics = (
        ('recall:', recall_score),
        ('precision:', precision_score),
        ('f1:', f1_score),
    )
    for label, metric in reported_metrics:
        print(label, metric(y_zo, predictions))
    return outputConfusionMatrix(y_zo, predictions)

## get X, y

In [14]:
# 'train' pulls (and shuffles) the GDSC rows; 'test' pulls the CCLE rows.
# Both are restricted to the same feature columns (selFeatCols).
X_train, y_train = chooseCommonFeatures(datatype='train')
X_test, y_test = chooseCommonFeatures(datatype='test')

X_train.shape (396, 17181)
y_train.shape (396,)
X_test.shape (470, 17181)
y_test.shape (470,)


## get binary label of y

In [15]:
# Label is 1 when IC50 < 2.0 and 0 otherwise; the same cutoff is applied to
# both datasets.
y_train_zo = value2zeroOne(y_train, cutoff=2.0)
y_test_zo = value2zeroOne(y_test, cutoff=2.0)

## Over-sampling

In [16]:
# Oversample the training set with SMOTE (seeded for reproducibility).
oversampler = sv.SMOTE(random_state=0)

X_train_os, y_train_os = oversampler.sample(X_train, y_train_zo)
# Seed the shuffle as well: the original call had no random_state, so the row
# order (which feeds into the cross-validation folds built later) changed on
# every run even though every other random step uses a fixed seed.
X_train_os, y_train_os = shuffle(X_train_os, y_train_os, random_state=0)

2020-06-05 14:45:11,464:INFO:SMOTE: Running sampling via ('SMOTE', "{'proportion': 1.0, 'n_neighbors': 5, 'n_jobs': 1, 'random_state': 0}")


In [17]:
X_train_os.shape

(742, 17181)

## Best Method: ExtraTrees + LinearSVC

In [18]:
# Short aliases for the oversampled training data; later cells refer to `y`.
X = X_train_os
y = y_train_os

# The scaler is fitted on the oversampled training matrix and then applied to
# the test matrix.
X_train_std, X_test_std = standardized(X, X_test)

In [19]:
# Fit an extra-trees ensemble on the standardized, oversampled training data
# and use it as an already-fitted estimator for feature selection.
etc = ExtraTreesClassifier(n_estimators=50, random_state=0).fit(X_train_std, y) 
treeFeatureSelector = SelectFromModel(etc, prefit=True)

# Apply the same selected-feature mask to both matrices.
# NOTE: the test matrix is stored as lowercase `x_test_tree`; later cells use
# that exact name.
X_train_tree = treeFeatureSelector.transform(X_train_std)
x_test_tree = treeFeatureSelector.transform(X_test_std)

print('X_train_tree.shape', X_train_tree.shape)

X_train_tree.shape (742, 1628)


In [20]:
def linearSVM(X_train, y_train, X_test, y_test, isLabelBinary=False, parameters=None):
    """Train a LinearSVC (optionally grid-searched) and report train/test metrics.

    Parameters
    ----------
    X_train, X_test : array-like
        Feature matrices for fitting and for evaluation.
    y_train, y_test : array-like
        Labels. Continuous values are binarized with ``value2zeroOne`` at a
        cutoff of 2.0 unless ``isLabelBinary`` is True.
    isLabelBinary : bool, default False
        Set to True when the labels are already 0/1.
    parameters : dict or None, default None
        Parameter grid for ``GridSearchCV`` (5-fold). When empty or None, a
        plain ``LinearSVC(random_state=0)`` is fitted instead.

    Returns
    -------
    clf : fitted estimator
        The fitted ``GridSearchCV`` when a grid was given, otherwise the
        fitted ``LinearSVC``.
    cm_train, cm_test : pandas.DataFrame
        Confusion matrices from ``getResultConfusionMatrix``.
    """
    if isLabelBinary:
        y_train_zo, y_test_zo = y_train, y_test
    else:
        y_train_zo = value2zeroOne(y_train, cutoff=2.0)
        y_test_zo = value2zeroOne(y_test, cutoff=2.0)

    clf = LinearSVC(random_state=0)
    if parameters:
        clf = GridSearchCV(clf, parameters, cv=5)
    # Fit on both paths. Previously the estimator was only fitted when a
    # parameter grid was supplied, so the default call failed at predict()
    # with an unfitted model.
    clf.fit(X_train, y_train_zo)

    print('[train]')
    cm_train = getResultConfusionMatrix(clf, X_train, y_train_zo)

    print()

    print('[test]')
    cm_test = getResultConfusionMatrix(clf, X_test, y_test_zo)

    return clf, cm_train, cm_test



## Fine-tune the regularization parameter with 5-fold cross-validation

In [21]:
# Grid over the regularization strength C only; max_iter is a single-value
# entry, so it is fixed at 10000 for every candidate.
parameters = {'C':[0.01, 0.1, 1, 10, 100], 'max_iter':[10000]}

# Labels are already 0/1 here (oversampled `y` and `y_test_zo`), hence
# isLabelBinary=True.
clf, cm_train, cm_test = linearSVM(X_train_tree, y, 
                                         x_test_tree, y_test_zo, 
                                         isLabelBinary=True, parameters=parameters)

[train]
recall: 1.0
precision: 1.0
f1: 1.0

[test]
recall: 0.38636363636363635
precision: 0.3541666666666667
f1: 0.3695652173913044


### selected parameters

In [22]:
clf.best_params_

{'C': 0.1, 'max_iter': 10000}

## Confusion matrix

In [23]:
cm_train

Actual,0,1
Predicted,Unnamed: 1_level_1,Unnamed: 2_level_1
0,371,0
1,0,371


In [24]:
cm_test

Actual,0,1
Predicted,Unnamed: 1_level_1,Unnamed: 2_level_1
0,395,27
1,31,17
