In [53]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

In [54]:
# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

from subprocess import check_output

In [55]:
#Read Data
train_variant = pd.read_csv("training_variants")
test_variant = pd.read_csv("test_variants")
train_text = pd.read_csv("training_text", sep="\|\|", engine='python', header=None, skiprows=1, names=["ID","Text"])
test_text = pd.read_csv("test_text", sep="\|\|", engine='python', header=None, skiprows=1, names=["ID","Text"])

In [56]:
train_variant.head()

Unnamed: 0,ID,Gene,Variation,Class
0,0,FAM58A,Truncating Mutations,1
1,1,CBL,W802*,2
2,2,CBL,Q249E,2
3,3,CBL,N454D,3
4,4,CBL,L399V,4


In [57]:
test_variant.head()

Unnamed: 0,ID,Gene,Variation
0,0,ACSL4,R570S
1,1,NAGLU,P521L
2,2,PAH,L333F
3,3,ING1,A148D
4,4,TMEM216,G77A


In [58]:
train_text.head()

Unnamed: 0,ID,Text
0,0,Cyclin-dependent kinases (CDKs) regulate a var...
1,1,Abstract Background Non-small cell lung canc...
2,2,Abstract Background Non-small cell lung canc...
3,3,Recent evidence has demonstrated that acquired...
4,4,Oncogenic mutations in the monomeric Casitas B...


In [59]:
test_text.head()

Unnamed: 0,ID,Text
0,0,2. This mutation resulted in a myeloproliferat...
1,1,Abstract The Large Tumor Suppressor 1 (LATS1)...
2,2,Vascular endothelial growth factor receptor (V...
3,3,Inflammatory myofibroblastic tumor (IMT) is a ...
4,4,Abstract Retinoblastoma is a pediatric retina...


In [60]:
#Train data number of observations in each class
##Class1:568#Class2:452#Class3:89#Class4:686
##Class6:275#Class7:953#Class8:19#Class9:37
##Class5:242
train = pd.merge(train_variant, train_text, how='left', on='ID')
x_train = train.drop('Class', axis=1)

In [61]:
x_train.head()

Unnamed: 0,ID,Gene,Variation,Text
0,0,FAM58A,Truncating Mutations,Cyclin-dependent kinases (CDKs) regulate a var...
1,1,CBL,W802*,Abstract Background Non-small cell lung canc...
2,2,CBL,Q249E,Abstract Background Non-small cell lung canc...
3,3,CBL,N454D,Recent evidence has demonstrated that acquired...
4,4,CBL,L399V,Oncogenic mutations in the monomeric Casitas B...


In [62]:
x_test = pd.merge(test_variant, test_text, how='left', on='ID')
test_index = x_test['ID'].values
x_test.head()

Unnamed: 0,ID,Gene,Variation,Text
0,0,ACSL4,R570S,2. This mutation resulted in a myeloproliferat...
1,1,NAGLU,P521L,Abstract The Large Tumor Suppressor 1 (LATS1)...
2,2,PAH,L333F,Vascular endothelial growth factor receptor (V...
3,3,ING1,A148D,Inflammatory myofibroblastic tumor (IMT) is a ...
4,4,TMEM216,G77A,Abstract Retinoblastoma is a pediatric retina...


In [63]:
data = np.concatenate((x_train, x_test), axis=0)
data=pd.DataFrame(data)
data.columns = ["ID", "Gene", "Variation", "Text"]
data.head()

Unnamed: 0,ID,Gene,Variation,Text
0,0,FAM58A,Truncating Mutations,Cyclin-dependent kinases (CDKs) regulate a var...
1,1,CBL,W802*,Abstract Background Non-small cell lung canc...
2,2,CBL,Q249E,Abstract Background Non-small cell lung canc...
3,3,CBL,N454D,Recent evidence has demonstrated that acquired...
4,4,CBL,L399V,Oncogenic mutations in the monomeric Casitas B...


In [64]:
data.shape

(8989, 4)

In [65]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [66]:
#TFIDF
mod=TfidfVectorizer(min_df=5, max_features=500, stop_words='english')
mod_TD=mod.fit_transform(data.Text)

In [67]:
from sklearn.decomposition import TruncatedSVD

In [68]:
#SVD features
SVD=TruncatedSVD(200,random_state=41)
SVD_FIT=SVD.fit_transform(mod_TD)
yet_to_complete=pd.DataFrame(SVD_FIT)

In [69]:
from sklearn import preprocessing

In [70]:
#data.drop(data.columns[[0,3]],inplace=True, axis=1)
#as Gene and Variation data values are just scattered like IDS, i dont think these give u great info about the prediction
encoder = preprocessing.LabelEncoder()
y_train = train['Class'].values
encoder.fit(y_train)
encoded_y = encoder.transform(y_train)

In [71]:
y_train

array([1, 2, 2, ..., 1, 4, 4], dtype=int64)

In [72]:
encoded_y

array([0, 1, 1, ..., 0, 3, 3], dtype=int64)

In [73]:
X_Train=yet_to_complete[:3321]
Y_Train=encoded_y[:3321]
X_Test=yet_to_complete[2700:3321]
Y_Test=encoded_y[2700:3321]

Tune XGboost

1 Tune max_depth and min_child_weight

In [22]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn import cross_validation, metrics   #Additional scklearn functions
from xgboost.sklearn import XGBClassifier
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.cross_validation import ShuffleSplit
from sklearn.metrics import log_loss
import time



In [23]:
start = time.time()

param_test1 = {
 'max_depth':range(3,10,2),
 'min_child_weight':range(1,6,2)
}
grid = GridSearchCV(estimator= XGBClassifier( learning_rate =0.1, n_estimators=50, max_depth=5,
 min_child_weight=1, gamma=0, subsample=0.8, colsample_bytree=0.8,
 objective= 'multi:softprob', nthread=4, scale_pos_weight=1, seed=27), 
 param_grid = param_test1, scoring='log_loss',n_jobs=4,iid=False, cv=5)

grid.fit(X_Train,Y_Train)

print(grid.grid_scores_)
print(grid.best_params_)
print(grid.best_score_)

#grid = grid.fit(X_train, Y_train)
model=grid.best_estimator_

y_pred = model.predict_proba(X_Test)
print(log_loss( Y_Test,y_pred))

print(time.time() - start) 

[mean: -1.57556, std: 0.18843, params: {'min_child_weight': 1, 'max_depth': 3}, mean: -1.57686, std: 0.18649, params: {'min_child_weight': 3, 'max_depth': 3}, mean: -1.56619, std: 0.18482, params: {'min_child_weight': 5, 'max_depth': 3}, mean: -1.64840, std: 0.21005, params: {'min_child_weight': 1, 'max_depth': 5}, mean: -1.63757, std: 0.20063, params: {'min_child_weight': 3, 'max_depth': 5}, mean: -1.62841, std: 0.19755, params: {'min_child_weight': 5, 'max_depth': 5}, mean: -1.73250, std: 0.23972, params: {'min_child_weight': 1, 'max_depth': 7}, mean: -1.69277, std: 0.21979, params: {'min_child_weight': 3, 'max_depth': 7}, mean: -1.67109, std: 0.20889, params: {'min_child_weight': 5, 'max_depth': 7}, mean: -1.77400, std: 0.24558, params: {'min_child_weight': 1, 'max_depth': 9}, mean: -1.72515, std: 0.22087, params: {'min_child_weight': 3, 'max_depth': 9}, mean: -1.69335, std: 0.22073, params: {'min_child_weight': 5, 'max_depth': 9}]
{'min_child_weight': 5, 'max_depth': 3}
-1.56618678



In [24]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn import cross_validation, metrics   #Additional scklearn functions
from xgboost.sklearn import XGBClassifier
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.cross_validation import ShuffleSplit
from sklearn.metrics import log_loss
import time
start = time.time()

param_test1 = {
 'max_depth':[4,5,6],
 'min_child_weight':[2,3,4]
}
grid = GridSearchCV(estimator= XGBClassifier( learning_rate =0.1, n_estimators=50, max_depth=5,
 min_child_weight=1, gamma=0, subsample=0.8, colsample_bytree=0.8,
 objective= 'multi:softprob', nthread=4, scale_pos_weight=1, seed=27), 
 param_grid = param_test1, scoring='log_loss',n_jobs=4,iid=False, cv=5)

grid.fit(X_Train,Y_Train)

print(grid.grid_scores_)
print(grid.best_params_)
print(grid.best_score_)

#grid = grid.fit(X_train, Y_train)
model=grid.best_estimator_

y_pred = model.predict_proba(X_Test)
print(log_loss( Y_Test,y_pred))

print(time.time() - start) 

[mean: -1.60835, std: 0.19498, params: {'min_child_weight': 2, 'max_depth': 4}, mean: -1.60754, std: 0.19458, params: {'min_child_weight': 3, 'max_depth': 4}, mean: -1.60414, std: 0.19250, params: {'min_child_weight': 4, 'max_depth': 4}, mean: -1.64968, std: 0.20975, params: {'min_child_weight': 2, 'max_depth': 5}, mean: -1.63757, std: 0.20063, params: {'min_child_weight': 3, 'max_depth': 5}, mean: -1.63973, std: 0.20084, params: {'min_child_weight': 4, 'max_depth': 5}, mean: -1.68687, std: 0.22266, params: {'min_child_weight': 2, 'max_depth': 6}, mean: -1.67588, std: 0.21912, params: {'min_child_weight': 3, 'max_depth': 6}, mean: -1.66881, std: 0.20857, params: {'min_child_weight': 4, 'max_depth': 6}]
{'min_child_weight': 4, 'max_depth': 4}
-1.6041399268
0.663933974905
207.43786454200745




In [25]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn import cross_validation, metrics   #Additional scklearn functions
from xgboost.sklearn import XGBClassifier
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.cross_validation import ShuffleSplit
from sklearn.metrics import log_loss
import time
start = time.time()

param_test1 = {
 'max_depth':[2,3,4,5],
 'min_child_weight':[3,4,5,6,7,8]
}
grid = GridSearchCV(estimator= XGBClassifier( learning_rate =0.1, n_estimators=50, max_depth=5,
 min_child_weight=1, gamma=0, subsample=0.8, colsample_bytree=0.8,
 objective= 'multi:softprob', nthread=4, scale_pos_weight=1, seed=27), 
 param_grid = param_test1, scoring='log_loss',n_jobs=4,iid=False, cv=5)

grid.fit(X_Train,Y_Train)

print(grid.grid_scores_)
print(grid.best_params_)
print(grid.best_score_)

#grid = grid.fit(X_train, Y_train)
model=grid.best_estimator_

y_pred = model.predict_proba(X_Test)
print(log_loss( Y_Test,y_pred))

print(time.time() - start) 

[mean: -1.56513, std: 0.16753, params: {'min_child_weight': 3, 'max_depth': 2}, mean: -1.56631, std: 0.16717, params: {'min_child_weight': 4, 'max_depth': 2}, mean: -1.56285, std: 0.16679, params: {'min_child_weight': 5, 'max_depth': 2}, mean: -1.56560, std: 0.16776, params: {'min_child_weight': 6, 'max_depth': 2}, mean: -1.56798, std: 0.16580, params: {'min_child_weight': 7, 'max_depth': 2}, mean: -1.56297, std: 0.16543, params: {'min_child_weight': 8, 'max_depth': 2}, mean: -1.57686, std: 0.18649, params: {'min_child_weight': 3, 'max_depth': 3}, mean: -1.56787, std: 0.18238, params: {'min_child_weight': 4, 'max_depth': 3}, mean: -1.56619, std: 0.18482, params: {'min_child_weight': 5, 'max_depth': 3}, mean: -1.57334, std: 0.18351, params: {'min_child_weight': 6, 'max_depth': 3}, mean: -1.57035, std: 0.18482, params: {'min_child_weight': 7, 'max_depth': 3}, mean: -1.56037, std: 0.18352, params: {'min_child_weight': 8, 'max_depth': 3}, mean: -1.60754, std: 0.19458, params: {'min_child_w



In [26]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn import cross_validation, metrics   #Additional scklearn functions
from xgboost.sklearn import XGBClassifier
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.cross_validation import ShuffleSplit
from sklearn.metrics import log_loss
import time
start = time.time()

param_test1 = {
 'max_depth':[2,3,4],
 'min_child_weight':[7,8,9,10,11,12,13,14,15]
}
grid = GridSearchCV(estimator= XGBClassifier( learning_rate =0.1, n_estimators=50, max_depth=5,
 min_child_weight=1, gamma=0, subsample=0.8, colsample_bytree=0.8,
 objective= 'multi:softprob', nthread=4, scale_pos_weight=1, seed=27), 
 param_grid = param_test1, scoring='log_loss',n_jobs=4,iid=False, cv=5)

grid.fit(X_Train,Y_Train)

print(grid.grid_scores_)
print(grid.best_params_)
print(grid.best_score_)

#grid = grid.fit(X_train, Y_train)
model=grid.best_estimator_

y_pred = model.predict_proba(X_Test)
print(log_loss( Y_Test,y_pred))

print(time.time() - start) 

[mean: -1.56798, std: 0.16580, params: {'min_child_weight': 7, 'max_depth': 2}, mean: -1.56297, std: 0.16543, params: {'min_child_weight': 8, 'max_depth': 2}, mean: -1.56209, std: 0.16734, params: {'min_child_weight': 9, 'max_depth': 2}, mean: -1.56438, std: 0.16380, params: {'min_child_weight': 10, 'max_depth': 2}, mean: -1.56181, std: 0.16702, params: {'min_child_weight': 11, 'max_depth': 2}, mean: -1.56203, std: 0.16195, params: {'min_child_weight': 12, 'max_depth': 2}, mean: -1.56353, std: 0.16579, params: {'min_child_weight': 13, 'max_depth': 2}, mean: -1.55688, std: 0.16647, params: {'min_child_weight': 14, 'max_depth': 2}, mean: -1.55283, std: 0.16290, params: {'min_child_weight': 15, 'max_depth': 2}, mean: -1.57035, std: 0.18482, params: {'min_child_weight': 7, 'max_depth': 3}, mean: -1.56037, std: 0.18352, params: {'min_child_weight': 8, 'max_depth': 3}, mean: -1.56566, std: 0.18218, params: {'min_child_weight': 9, 'max_depth': 3}, mean: -1.56700, std: 0.18019, params: {'min_c



In [27]:
import winsound
winsound.Beep(300,5000)

2.3 Tune gamma

In [28]:
start = time.time()

param_test1 = {
 'gamma':[0.0,0.1,0.2,0.3,0.4,0.5]
}
grid = GridSearchCV(estimator= XGBClassifier( learning_rate =0.1, n_estimators=50, max_depth=3,
 min_child_weight=14, gamma=0, subsample=0.8, colsample_bytree=0.8,
 objective= 'multi:softprob', nthread=4, scale_pos_weight=1, seed=27), 
 param_grid = param_test1, scoring='log_loss',n_jobs=4,iid=False, cv=5)

grid.fit(X_Train,Y_Train)

print(grid.grid_scores_)
print(grid.best_params_)
print(grid.best_score_)

#grid = grid.fit(X_train, Y_train)
model=grid.best_estimator_

y_pred = model.predict_proba(X_Test)
print(log_loss( Y_Test,y_pred))

print(time.time() - start) 

[mean: -1.54998, std: 0.17843, params: {'gamma': 0.0}, mean: -1.55173, std: 0.17601, params: {'gamma': 0.1}, mean: -1.55198, std: 0.17988, params: {'gamma': 0.2}, mean: -1.55709, std: 0.17529, params: {'gamma': 0.3}, mean: -1.55738, std: 0.17637, params: {'gamma': 0.4}, mean: -1.55628, std: 0.17969, params: {'gamma': 0.5}]
{'gamma': 0.0}
-1.54997893521
0.877828109116
85.25587630271912




In [29]:
import winsound
winsound.Beep(300,5000)

2.4 Tune subsample and colsample_bytree

In [32]:
start = time.time()

param_test1 = {
 'subsample':[0.6,0.7,0.8,0.9,1.0], 'colsample_bytree':[0.6,0.7,0.8,0.9,1.0]
}
grid = GridSearchCV(estimator= XGBClassifier( learning_rate =0.1, n_estimators=50, max_depth=3,
 min_child_weight=14, gamma=0.0, subsample=0.8, colsample_bytree=0.8,
 objective= 'multi:softprob', nthread=4, scale_pos_weight=1, seed=27), 
 param_grid = param_test1, scoring='log_loss',n_jobs=4,iid=False, cv=5)

grid.fit(X_Train,Y_Train)

print(grid.grid_scores_)
print(grid.best_params_)
print(grid.best_score_)

#grid = grid.fit(X_train, Y_train)
model=grid.best_estimator_

y_pred = model.predict_proba(X_Test)
print(log_loss( Y_Test,y_pred))

print(time.time() - start) 

[mean: -1.56506, std: 0.17087, params: {'colsample_bytree': 0.6, 'subsample': 0.6}, mean: -1.56446, std: 0.17799, params: {'colsample_bytree': 0.6, 'subsample': 0.7}, mean: -1.56691, std: 0.18173, params: {'colsample_bytree': 0.6, 'subsample': 0.8}, mean: -1.56716, std: 0.18138, params: {'colsample_bytree': 0.6, 'subsample': 0.9}, mean: -1.57539, std: 0.18398, params: {'colsample_bytree': 0.6, 'subsample': 1.0}, mean: -1.56644, std: 0.17096, params: {'colsample_bytree': 0.7, 'subsample': 0.6}, mean: -1.56004, std: 0.17985, params: {'colsample_bytree': 0.7, 'subsample': 0.7}, mean: -1.56032, std: 0.17208, params: {'colsample_bytree': 0.7, 'subsample': 0.8}, mean: -1.57260, std: 0.17951, params: {'colsample_bytree': 0.7, 'subsample': 0.9}, mean: -1.56968, std: 0.18299, params: {'colsample_bytree': 0.7, 'subsample': 1.0}, mean: -1.55961, std: 0.17391, params: {'colsample_bytree': 0.8, 'subsample': 0.6}, mean: -1.56006, std: 0.17896, params: {'colsample_bytree': 0.8, 'subsample': 0.7}, mea



In [33]:
import winsound
winsound.Beep(300,5000)

2.5 Tune Tuning Regularization Parameters

In [34]:
start = time.time()

param_test1 = {
 'reg_alpha':[0,0.0001,0.001, 0.01, 0.1, 1,10, 100]
}
grid = GridSearchCV(estimator= XGBClassifier( learning_rate =0.1, n_estimators=50, max_depth=3,
 min_child_weight=14, gamma=0.0, subsample=0.8, colsample_bytree=0.8,
 objective= 'multi:softprob', nthread=4, scale_pos_weight=1, seed=27), 
 param_grid = param_test1, scoring='log_loss',n_jobs=4,iid=False, cv=5)

grid.fit(X_Train,Y_Train)

print(grid.grid_scores_)
print(grid.best_params_)
print(grid.best_score_)

#grid = grid.fit(X_train, Y_train)
model=grid.best_estimator_

y_pred = model.predict_proba(X_Test)
print(log_loss( Y_Test,y_pred))

print(time.time() - start) 

[mean: -1.54998, std: 0.17843, params: {'reg_alpha': 0}, mean: -1.54998, std: 0.17843, params: {'reg_alpha': 0.0001}, mean: -1.55125, std: 0.17932, params: {'reg_alpha': 0.001}, mean: -1.55558, std: 0.17934, params: {'reg_alpha': 0.01}, mean: -1.56037, std: 0.17944, params: {'reg_alpha': 0.1}, mean: -1.55814, std: 0.17832, params: {'reg_alpha': 1}, mean: -1.55670, std: 0.14790, params: {'reg_alpha': 10}, mean: -1.79601, std: 0.04045, params: {'reg_alpha': 100}]
{'reg_alpha': 0}
-1.54997893521
0.877828109116
131.49052095413208




2.6 Reducing Learning Rate re-calibrate the number of boosting rounds

In [82]:
start = time.time()

param_test1 = {
 'n_estimators':[50,100,300,500,700,1000,2000,3000,4000,5000]
}
grid = GridSearchCV(estimator= XGBClassifier( learning_rate =0.01, n_estimators=50, max_depth=2,
 min_child_weight=8, gamma=0.1, subsample=0.9, colsample_bytree=0.9,reg_alpha=0.001,
 objective= 'multi:softprob', nthread=4, scale_pos_weight=1, seed=27), 
 param_grid = param_test1, scoring='log_loss',n_jobs=4,iid=False, cv=5)

grid.fit(X_Train,Y_Train)

print(grid.grid_scores_)
print(grid.best_params_)
print(grid.best_score_)

#grid = grid.fit(X_train, Y_train)
model=grid.best_estimator_

y_pred = model.predict_proba(X_Test)
print(log_loss( Y_Test,y_pred))

print(time.time() - start) 

[mean: -1.93679, std: 0.03969, params: {'n_estimators': 50}, mean: -1.79973, std: 0.06082, params: {'n_estimators': 100}, mean: -1.61490, std: 0.09063, params: {'n_estimators': 300}, mean: -1.58443, std: 0.10415, params: {'n_estimators': 500}, mean: -1.58770, std: 0.11590, params: {'n_estimators': 700}, mean: -1.61258, std: 0.12722, params: {'n_estimators': 1000}, mean: -1.70785, std: 0.15774, params: {'n_estimators': 2000}, mean: -1.80898, std: 0.18192, params: {'n_estimators': 3000}, mean: -1.90401, std: 0.20212, params: {'n_estimators': 4000}, mean: -1.99316, std: 0.22057, params: {'n_estimators': 5000}]
{'n_estimators': 500}
-1.58443174497
1.6187840573
7373.850759983063




In [74]:
model=XGBClassifier(base_score=0.5, learning_rate =0.1, n_estimators=100, max_depth=3,
 min_child_weight=14, gamma=0.0, subsample=0.8, colsample_bytree=0.8,
 objective= 'multi:softprob', nthread=4, scale_pos_weight=1, seed=27, reg_alpha=0)

In [45]:
model=XGBClassifier(base_score=0.5,colsample_bylevel=1,colsample_bytree=1,gamma=0.2,learning_rate=0.1,max_delta_step=0,max_depth=6,
                    min_child_weight=1, missing=None, n_estimators=100, nthread=-1,objective='multi:softprob', reg_alpha=0, reg_lambda=1,
                    scale_pos_weight=1, seed=0, silent=True, subsample=1)

In [75]:
model.fit(X_Train,Y_Train)

XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.8,
       gamma=0.0, learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=14, missing=None, n_estimators=100, nthread=4,
       objective='multi:softprob', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=27, silent=True, subsample=0.8)

In [76]:
y_pred=model.predict_proba(yet_to_complete[3321:])

In [77]:
#tweaking the submission file as required
subm_file = pd.DataFrame(y_pred)
subm_file['id'] = test_index
subm_file.columns = ['class1', 'class2', 'class3', 'class4', 'class5', 'class6', 'class7', 'class8', 'class9', 'id']
subm_file.to_csv("submission.csv",index=False)

scored 0.87016