In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

In [2]:
# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

from subprocess import check_output

In [3]:
#Read Data
train_variant = pd.read_csv("training_variants")
test_variant = pd.read_csv("test_variants")
train_text = pd.read_csv("training_text", sep="\|\|", engine='python', header=None, skiprows=1, names=["ID","Text"])
test_text = pd.read_csv("test_text", sep="\|\|", engine='python', header=None, skiprows=1, names=["ID","Text"])

In [4]:
train_variant.head()

Unnamed: 0,ID,Gene,Variation,Class
0,0,FAM58A,Truncating Mutations,1
1,1,CBL,W802*,2
2,2,CBL,Q249E,2
3,3,CBL,N454D,3
4,4,CBL,L399V,4


In [5]:
test_variant.head()

Unnamed: 0,ID,Gene,Variation
0,0,ACSL4,R570S
1,1,NAGLU,P521L
2,2,PAH,L333F
3,3,ING1,A148D
4,4,TMEM216,G77A


In [6]:
train_text.head()

Unnamed: 0,ID,Text
0,0,Cyclin-dependent kinases (CDKs) regulate a var...
1,1,Abstract Background Non-small cell lung canc...
2,2,Abstract Background Non-small cell lung canc...
3,3,Recent evidence has demonstrated that acquired...
4,4,Oncogenic mutations in the monomeric Casitas B...


In [7]:
test_text.head()

Unnamed: 0,ID,Text
0,0,2. This mutation resulted in a myeloproliferat...
1,1,Abstract The Large Tumor Suppressor 1 (LATS1)...
2,2,Vascular endothelial growth factor receptor (V...
3,3,Inflammatory myofibroblastic tumor (IMT) is a ...
4,4,Abstract Retinoblastoma is a pediatric retina...


In [8]:
#Train data number of observations in each class
##Class1:568#Class2:452#Class3:89#Class4:686
##Class6:275#Class7:953#Class8:19#Class9:37
##Class5:242
train = pd.merge(train_variant, train_text, how='left', on='ID')
x_train = train.drop('Class', axis=1)

In [9]:
x_train.head()

Unnamed: 0,ID,Gene,Variation,Text
0,0,FAM58A,Truncating Mutations,Cyclin-dependent kinases (CDKs) regulate a var...
1,1,CBL,W802*,Abstract Background Non-small cell lung canc...
2,2,CBL,Q249E,Abstract Background Non-small cell lung canc...
3,3,CBL,N454D,Recent evidence has demonstrated that acquired...
4,4,CBL,L399V,Oncogenic mutations in the monomeric Casitas B...


In [10]:
x_test = pd.merge(test_variant, test_text, how='left', on='ID')
test_index = x_test['ID'].values
x_test.head()

Unnamed: 0,ID,Gene,Variation,Text
0,0,ACSL4,R570S,2. This mutation resulted in a myeloproliferat...
1,1,NAGLU,P521L,Abstract The Large Tumor Suppressor 1 (LATS1)...
2,2,PAH,L333F,Vascular endothelial growth factor receptor (V...
3,3,ING1,A148D,Inflammatory myofibroblastic tumor (IMT) is a ...
4,4,TMEM216,G77A,Abstract Retinoblastoma is a pediatric retina...


In [11]:
data = np.concatenate((x_train, x_test), axis=0)
data=pd.DataFrame(data)
data.columns = ["ID", "Gene", "Variation", "Text"]
data.head()

Unnamed: 0,ID,Gene,Variation,Text
0,0,FAM58A,Truncating Mutations,Cyclin-dependent kinases (CDKs) regulate a var...
1,1,CBL,W802*,Abstract Background Non-small cell lung canc...
2,2,CBL,Q249E,Abstract Background Non-small cell lung canc...
3,3,CBL,N454D,Recent evidence has demonstrated that acquired...
4,4,CBL,L399V,Oncogenic mutations in the monomeric Casitas B...


In [12]:
data.shape

(8989, 4)

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [14]:
#TFIDF
mod=TfidfVectorizer(min_df=5, max_features=500, stop_words='english')
mod_TD=mod.fit_transform(data.Text)

In [15]:
from sklearn.decomposition import TruncatedSVD

In [16]:
#SVD features
SVD=TruncatedSVD(200,random_state=41)
SVD_FIT=SVD.fit_transform(mod_TD)
yet_to_complete=pd.DataFrame(SVD_FIT)

In [17]:
from sklearn import preprocessing

In [18]:
#data.drop(data.columns[[0,3]],inplace=True, axis=1)
#as Gene and Variation data values are just scattered like IDS, i dont think these give u great info about the prediction
encoder = preprocessing.LabelEncoder()
y_train = train['Class'].values
encoder.fit(y_train)
encoded_y = encoder.transform(y_train)

In [19]:
y_train

array([1, 2, 2, ..., 1, 4, 4], dtype=int64)

In [20]:
encoded_y

array([0, 1, 1, ..., 0, 3, 3], dtype=int64)

In [21]:
X_Train=yet_to_complete[:3321]
Y_Train=encoded_y[:3321]
X_Test=yet_to_complete[2700:3321]
Y_Test=encoded_y[2700:3321]

In [31]:
X_Train.shape

(3321, 200)

5 GradientBoostingClassifier

In [22]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import log_loss
from sklearn.model_selection import GridSearchCV
from sklearn.cross_validation import ShuffleSplit



5.1 benchmark

In [49]:
model=GradientBoostingClassifier()
model.fit(X_Train,Y_Train)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_split=1e-07, min_samples_leaf=1,
              min_samples_split=2, min_weight_fraction_leaf=0.0,
              n_estimators=100, presort='auto', random_state=None,
              subsample=1.0, verbose=0, warm_start=False)

In [46]:
y_pred = model.predict_proba(X_Test)
print(log_loss( Y_Test,y_pred))

0.479791900278


5.2 Fix learning rate and number of estimators for tuning tree-based parameters

In [23]:
param_test1 = {'n_estimators':range(20,151,10)}

grid= GridSearchCV(estimator = GradientBoostingClassifier(learning_rate=0.1,
min_samples_split=500,min_samples_leaf=50,max_depth=8,max_features='sqrt',subsample=0.8,random_state=10), 
param_grid = param_test1, scoring='log_loss',n_jobs=4,iid=False, cv=5)

grid.fit(X_Train,Y_Train)

print(grid.grid_scores_)
print(grid.best_params_)
print(grid.best_score_)

model=grid.best_estimator_

y_pred = model.predict_proba(X_Test)
print(log_loss( Y_Test,y_pred))

[mean: -1.59712, std: 0.14869, params: {'n_estimators': 20}, mean: -1.57449, std: 0.16752, params: {'n_estimators': 30}, mean: -1.58593, std: 0.17979, params: {'n_estimators': 40}, mean: -1.60740, std: 0.19125, params: {'n_estimators': 50}, mean: -1.63347, std: 0.20295, params: {'n_estimators': 60}, mean: -1.66060, std: 0.21028, params: {'n_estimators': 70}, mean: -1.69419, std: 0.22221, params: {'n_estimators': 80}, mean: -1.71913, std: 0.23130, params: {'n_estimators': 90}, mean: -1.74833, std: 0.23347, params: {'n_estimators': 100}, mean: -1.77577, std: 0.24139, params: {'n_estimators': 110}, mean: -1.80508, std: 0.24855, params: {'n_estimators': 120}, mean: -1.84075, std: 0.25526, params: {'n_estimators': 130}, mean: -1.86429, std: 0.26051, params: {'n_estimators': 140}, mean: -1.88967, std: 0.26487, params: {'n_estimators': 150}]
{'n_estimators': 30}
-1.57448719507
0.823942390973




5.3 Tune max_depth and num_samples_split

In [24]:
param_test1 = {'max_depth':range(5,16,2), 'min_samples_split':range(200,1001,200)}

grid= GridSearchCV(estimator = GradientBoostingClassifier(learning_rate=0.1,n_estimators=30,
min_samples_split=500,min_samples_leaf=50,max_depth=8,max_features='sqrt',subsample=0.8,random_state=10), 
param_grid = param_test1, scoring='log_loss',n_jobs=4,iid=False, cv=5)

grid.fit(X_Train,Y_Train)

print(grid.grid_scores_)
print(grid.best_params_)
print(grid.best_score_)

model=grid.best_estimator_

y_pred = model.predict_proba(X_Test)
print(log_loss( Y_Test,y_pred))

[mean: -1.57904, std: 0.16463, params: {'min_samples_split': 200, 'max_depth': 5}, mean: -1.56999, std: 0.15477, params: {'min_samples_split': 400, 'max_depth': 5}, mean: -1.58097, std: 0.15608, params: {'min_samples_split': 600, 'max_depth': 5}, mean: -1.56508, std: 0.14659, params: {'min_samples_split': 800, 'max_depth': 5}, mean: -1.57890, std: 0.13886, params: {'min_samples_split': 1000, 'max_depth': 5}, mean: -1.58355, std: 0.18053, params: {'min_samples_split': 200, 'max_depth': 7}, mean: -1.57188, std: 0.16840, params: {'min_samples_split': 400, 'max_depth': 7}, mean: -1.57513, std: 0.15705, params: {'min_samples_split': 600, 'max_depth': 7}, mean: -1.56715, std: 0.15101, params: {'min_samples_split': 800, 'max_depth': 7}, mean: -1.57549, std: 0.14370, params: {'min_samples_split': 1000, 'max_depth': 7}, mean: -1.58912, std: 0.17711, params: {'min_samples_split': 200, 'max_depth': 9}, mean: -1.57012, std: 0.17181, params: {'min_samples_split': 400, 'max_depth': 9}, mean: -1.5753



In [25]:
param_test1 = {'max_depth':range(9,14,1), 'min_samples_split':range(300,500,20)}

grid= GridSearchCV(estimator = GradientBoostingClassifier(learning_rate=0.1,n_estimators=30, min_samples_split=500,
min_samples_leaf=50,max_depth=8,max_features='sqrt',subsample=0.8,random_state=10), 
param_grid = param_test1, scoring='log_loss',n_jobs=4,iid=False, cv=5)

grid.fit(X_Train,Y_Train)

print(grid.grid_scores_)
print(grid.best_params_)
print(grid.best_score_)

model=grid.best_estimator_

y_pred = model.predict_proba(X_Test)
print(log_loss( Y_Test,y_pred))

[mean: -1.57773, std: 0.17504, params: {'min_samples_split': 300, 'max_depth': 9}, mean: -1.58036, std: 0.16979, params: {'min_samples_split': 320, 'max_depth': 9}, mean: -1.57592, std: 0.17035, params: {'min_samples_split': 340, 'max_depth': 9}, mean: -1.56383, std: 0.17065, params: {'min_samples_split': 360, 'max_depth': 9}, mean: -1.56197, std: 0.16237, params: {'min_samples_split': 380, 'max_depth': 9}, mean: -1.57012, std: 0.17181, params: {'min_samples_split': 400, 'max_depth': 9}, mean: -1.56880, std: 0.16761, params: {'min_samples_split': 420, 'max_depth': 9}, mean: -1.57153, std: 0.16734, params: {'min_samples_split': 440, 'max_depth': 9}, mean: -1.57624, std: 0.16106, params: {'min_samples_split': 460, 'max_depth': 9}, mean: -1.57621, std: 0.16302, params: {'min_samples_split': 480, 'max_depth': 9}, mean: -1.57326, std: 0.16111, params: {'min_samples_split': 300, 'max_depth': 10}, mean: -1.58141, std: 0.18223, params: {'min_samples_split': 320, 'max_depth': 10}, mean: -1.5757



5.4 Tune min_samples_leaf

In [26]:
param_test1 = {'min_samples_split':range(370,430,10), 'min_samples_leaf':range(20,71,10)}

grid= GridSearchCV(estimator = GradientBoostingClassifier(learning_rate=0.1,n_estimators=30, min_samples_split=400,
min_samples_leaf=50,max_depth=11,max_features='sqrt',subsample=0.8,random_state=10), 
param_grid = param_test1, scoring='log_loss',n_jobs=4,iid=False, cv=5)

grid.fit(X_Train,Y_Train)

print(grid.grid_scores_)
print(grid.best_params_)
print(grid.best_score_)

model=grid.best_estimator_

y_pred = model.predict_proba(X_Test)
print(log_loss( Y_Test,y_pred))

[mean: -1.60863, std: 0.18706, params: {'min_samples_split': 370, 'min_samples_leaf': 20}, mean: -1.61081, std: 0.18171, params: {'min_samples_split': 380, 'min_samples_leaf': 20}, mean: -1.60757, std: 0.18194, params: {'min_samples_split': 390, 'min_samples_leaf': 20}, mean: -1.59739, std: 0.17598, params: {'min_samples_split': 400, 'min_samples_leaf': 20}, mean: -1.58273, std: 0.17991, params: {'min_samples_split': 410, 'min_samples_leaf': 20}, mean: -1.60681, std: 0.17256, params: {'min_samples_split': 420, 'min_samples_leaf': 20}, mean: -1.58104, std: 0.17802, params: {'min_samples_split': 370, 'min_samples_leaf': 30}, mean: -1.57981, std: 0.17934, params: {'min_samples_split': 380, 'min_samples_leaf': 30}, mean: -1.58392, std: 0.17707, params: {'min_samples_split': 390, 'min_samples_leaf': 30}, mean: -1.58447, std: 0.17100, params: {'min_samples_split': 400, 'min_samples_leaf': 30}, mean: -1.56739, std: 0.17968, params: {'min_samples_split': 410, 'min_samples_leaf': 30}, mean: -1.



In [28]:
param_test1 = {'min_samples_split':range(400,420,1), 'min_samples_leaf':range(70,151,10)}

grid= GridSearchCV(estimator = GradientBoostingClassifier(learning_rate=0.1,n_estimators=30, min_samples_split=410,
min_samples_leaf=70,max_depth=11,max_features='sqrt',subsample=0.8,random_state=10), 
param_grid = param_test1, scoring='log_loss',n_jobs=4,iid=False, cv=5)

grid.fit(X_Train,Y_Train)

print(grid.grid_scores_)
print(grid.best_params_)
print(grid.best_score_)

model=grid.best_estimator_

y_pred = model.predict_proba(X_Test)
print(log_loss( Y_Test,y_pred))

[mean: -1.56832, std: 0.16213, params: {'min_samples_split': 400, 'min_samples_leaf': 70}, mean: -1.56809, std: 0.16565, params: {'min_samples_split': 401, 'min_samples_leaf': 70}, mean: -1.56811, std: 0.17184, params: {'min_samples_split': 402, 'min_samples_leaf': 70}, mean: -1.55594, std: 0.16000, params: {'min_samples_split': 403, 'min_samples_leaf': 70}, mean: -1.55624, std: 0.16145, params: {'min_samples_split': 404, 'min_samples_leaf': 70}, mean: -1.56203, std: 0.16310, params: {'min_samples_split': 405, 'min_samples_leaf': 70}, mean: -1.56020, std: 0.16266, params: {'min_samples_split': 406, 'min_samples_leaf': 70}, mean: -1.56067, std: 0.16548, params: {'min_samples_split': 407, 'min_samples_leaf': 70}, mean: -1.55106, std: 0.16855, params: {'min_samples_split': 408, 'min_samples_leaf': 70}, mean: -1.55476, std: 0.17121, params: {'min_samples_split': 409, 'min_samples_leaf': 70}, mean: -1.55202, std: 0.16974, params: {'min_samples_split': 410, 'min_samples_leaf': 70}, mean: -1.



In [29]:
param_test1 = {'min_samples_split':range(400,410,1), 'min_samples_leaf':range(80,100,1)}

grid= GridSearchCV(estimator = GradientBoostingClassifier(learning_rate=0.1,n_estimators=30, min_samples_split=410,
min_samples_leaf=70,max_depth=11,max_features='sqrt',subsample=0.8,random_state=10), 
param_grid = param_test1, scoring='log_loss',n_jobs=4,iid=False, cv=5)

grid.fit(X_Train,Y_Train)

print(grid.grid_scores_)
print(grid.best_params_)
print(grid.best_score_)

model=grid.best_estimator_

y_pred = model.predict_proba(X_Test)
print(log_loss( Y_Test,y_pred))

[mean: -1.55772, std: 0.15262, params: {'min_samples_split': 400, 'min_samples_leaf': 80}, mean: -1.55595, std: 0.15454, params: {'min_samples_split': 401, 'min_samples_leaf': 80}, mean: -1.55566, std: 0.14983, params: {'min_samples_split': 402, 'min_samples_leaf': 80}, mean: -1.54676, std: 0.15049, params: {'min_samples_split': 403, 'min_samples_leaf': 80}, mean: -1.55605, std: 0.15484, params: {'min_samples_split': 404, 'min_samples_leaf': 80}, mean: -1.54502, std: 0.15151, params: {'min_samples_split': 405, 'min_samples_leaf': 80}, mean: -1.54895, std: 0.15090, params: {'min_samples_split': 406, 'min_samples_leaf': 80}, mean: -1.55447, std: 0.15175, params: {'min_samples_split': 407, 'min_samples_leaf': 80}, mean: -1.55122, std: 0.14979, params: {'min_samples_split': 408, 'min_samples_leaf': 80}, mean: -1.55794, std: 0.15428, params: {'min_samples_split': 409, 'min_samples_leaf': 80}, mean: -1.55682, std: 0.15141, params: {'min_samples_split': 400, 'min_samples_leaf': 81}, mean: -1.



In [None]:
5.5 Tune max_features

In [32]:
param_test1 = {'max_features':range(45,65,2)}

grid= GridSearchCV(estimator = GradientBoostingClassifier(learning_rate=0.1,n_estimators=30, min_samples_split=400,
min_samples_leaf=95,max_depth=11,max_features='sqrt',subsample=0.8,random_state=10), 
param_grid = param_test1, scoring='log_loss',n_jobs=4,iid=False, cv=5)

grid.fit(X_Train,Y_Train)

print(grid.grid_scores_)
print(grid.best_params_)
print(grid.best_score_)

model=grid.best_estimator_

y_pred = model.predict_proba(X_Test)
print(log_loss( Y_Test,y_pred))

[mean: -1.57279, std: 0.18203, params: {'max_features': 45}, mean: -1.57437, std: 0.17828, params: {'max_features': 47}, mean: -1.58613, std: 0.17701, params: {'max_features': 49}, mean: -1.58152, std: 0.16953, params: {'max_features': 51}, mean: -1.57417, std: 0.18126, params: {'max_features': 53}, mean: -1.57027, std: 0.17977, params: {'max_features': 55}, mean: -1.56766, std: 0.16818, params: {'max_features': 57}, mean: -1.57789, std: 0.16945, params: {'max_features': 59}, mean: -1.58232, std: 0.18992, params: {'max_features': 61}, mean: -1.57946, std: 0.18375, params: {'max_features': 63}]
{'max_features': 57}
-1.56765738862
0.75607678148




In [None]:
5.6 Tuning subsample

In [33]:
param_test1 = {'subsample':[0.6,0.7,0.75,0.8,0.85,0.9]}

grid= GridSearchCV(estimator = GradientBoostingClassifier(learning_rate=0.1,n_estimators=30, min_samples_split=400,
min_samples_leaf=95,max_depth=11,max_features='sqrt',subsample=0.8,random_state=10), 
param_grid = param_test1, scoring='log_loss',n_jobs=4,iid=False, cv=5)

grid.fit(X_Train,Y_Train)

print(grid.grid_scores_)
print(grid.best_params_)
print(grid.best_score_)

model=grid.best_estimator_

y_pred = model.predict_proba(X_Test)
print(log_loss( Y_Test,y_pred))

[mean: -1.58084, std: 0.15496, params: {'subsample': 0.6}, mean: -1.55607, std: 0.16175, params: {'subsample': 0.7}, mean: -1.55037, std: 0.14999, params: {'subsample': 0.75}, mean: -1.54085, std: 0.14806, params: {'subsample': 0.8}, mean: -1.56443, std: 0.15428, params: {'subsample': 0.85}, mean: -1.57190, std: 0.16556, params: {'subsample': 0.9}]
{'subsample': 0.8}
-1.54085232125
0.808693284005




In [None]:
5.7 lower learning rate

In [34]:
param_test1 = {'n_estimators':range(10,200,20)}
grid= GridSearchCV(estimator = GradientBoostingClassifier(learning_rate=0.01,n_estimators=30, min_samples_split=400,
min_samples_leaf=95,max_depth=11,max_features='sqrt',subsample=0.8,random_state=10), 
param_grid = param_test1, scoring='log_loss',n_jobs=4,iid=False, cv=5)

grid.fit(X_Train,Y_Train)

print(grid.grid_scores_)
print(grid.best_params_)
print(grid.best_score_)

model=grid.best_estimator_

y_pred = model.predict_proba(X_Test)
print(log_loss( Y_Test,y_pred))

[mean: -2.04397, std: 0.01702, params: {'n_estimators': 10}, mean: -1.91995, std: 0.04009, params: {'n_estimators': 30}, mean: -1.83184, std: 0.05949, params: {'n_estimators': 50}, mean: -1.76853, std: 0.07825, params: {'n_estimators': 70}, mean: -1.72066, std: 0.09153, params: {'n_estimators': 90}, mean: -1.68101, std: 0.10287, params: {'n_estimators': 110}, mean: -1.64945, std: 0.11248, params: {'n_estimators': 130}, mean: -1.62601, std: 0.12153, params: {'n_estimators': 150}, mean: -1.60640, std: 0.12933, params: {'n_estimators': 170}, mean: -1.58974, std: 0.13546, params: {'n_estimators': 190}]
{'n_estimators': 190}
-1.58973793204
0.993815061324




In [35]:
param_test1 = {'n_estimators':range(1000,3000,200)}
grid= GridSearchCV(estimator = GradientBoostingClassifier(learning_rate=0.01,n_estimators=30, min_samples_split=400,
min_samples_leaf=95,max_depth=11,max_features='sqrt',subsample=0.8,random_state=10), 
param_grid = param_test1, scoring='log_loss',n_jobs=4,iid=False, cv=5)

grid.fit(X_Train,Y_Train)

print(grid.grid_scores_)
print(grid.best_params_)
print(grid.best_score_)

model=grid.best_estimator_

y_pred = model.predict_proba(X_Test)
print(log_loss( Y_Test,y_pred))

[mean: -1.68937, std: 0.22603, params: {'n_estimators': 1000}, mean: -1.74536, std: 0.23871, params: {'n_estimators': 1200}, mean: -1.80004, std: 0.25189, params: {'n_estimators': 1400}, mean: -1.85113, std: 0.26526, params: {'n_estimators': 1600}, mean: -1.90299, std: 0.27687, params: {'n_estimators': 1800}, mean: -1.94938, std: 0.28793, params: {'n_estimators': 2000}, mean: -1.99401, std: 0.30050, params: {'n_estimators': 2200}, mean: -2.03430, std: 0.30943, params: {'n_estimators': 2400}, mean: -2.07193, std: 0.31763, params: {'n_estimators': 2600}, mean: -2.10861, std: 0.32508, params: {'n_estimators': 2800}]
{'n_estimators': 1000}
-1.68936899243
0.327964048753




In [44]:
param_test1 = {'n_estimators':range(200,1200,100)}
grid= GridSearchCV(estimator = GradientBoostingClassifier(learning_rate=0.01,n_estimators=30, min_samples_split=400,
min_samples_leaf=95,max_depth=11,max_features='sqrt',subsample=0.8,random_state=10), 
param_grid = param_test1, scoring='log_loss',n_jobs=4,iid=False, cv=5)

grid.fit(X_Train,Y_Train)

print(grid.grid_scores_)
print(grid.best_params_)
print(grid.best_score_)

model=grid.best_estimator_

y_pred = model.predict_proba(X_Test)
print(log_loss( Y_Test,y_pred))

[mean: -1.58439, std: 0.13966, params: {'n_estimators': 200}, mean: -1.55501, std: 0.16165, params: {'n_estimators': 300}, mean: -1.55394, std: 0.17792, params: {'n_estimators': 400}, mean: -1.56545, std: 0.18839, params: {'n_estimators': 500}, mean: -1.58541, std: 0.19644, params: {'n_estimators': 600}, mean: -1.60851, std: 0.20333, params: {'n_estimators': 700}, mean: -1.63527, std: 0.21134, params: {'n_estimators': 800}, mean: -1.66067, std: 0.21936, params: {'n_estimators': 900}, mean: -1.68937, std: 0.22603, params: {'n_estimators': 1000}, mean: -1.71672, std: 0.23230, params: {'n_estimators': 1100}]
{'n_estimators': 400}
-1.55394460174
0.679090946848




In [59]:
model=GradientBoostingClassifier(learning_rate=0.01,n_estimators=2000, min_samples_split=400,
min_samples_leaf=95,max_depth=11,max_features='sqrt',subsample=0.8,random_state=10)

In [60]:
model.fit(X_Train,Y_Train)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.01, loss='deviance', max_depth=11,
              max_features='sqrt', max_leaf_nodes=None,
              min_impurity_split=1e-07, min_samples_leaf=95,
              min_samples_split=400, min_weight_fraction_leaf=0.0,
              n_estimators=3000, presort='auto', random_state=10,
              subsample=0.8, verbose=0, warm_start=False)

In [61]:
y_pred=model.predict_proba(yet_to_complete[3321:])

In [62]:
#tweaking the submission file as required
subm_file = pd.DataFrame(y_pred)
subm_file['id'] = test_index
subm_file.columns = ['class1', 'class2', 'class3', 'class4', 'class5', 'class6', 'class7', 'class8', 'class9', 'id']
subm_file.to_csv("submission.csv",index=False)

final results on kaggle: 
learning_rate=0.1,n_estimators=30: Scored 0.96711 on Kaggle;
learning_rate=0.01,n_estimators=1000: Scored 0.72116 on Kaggle;
learning_rate=0.01,n_estimators=400: Scored 0.84983 on Kaggle;
learning_rate=0.01,n_estimators=2000: Scored 0.70223 on Kaggle;
learning_rate=0.01,n_estimators=5000: Scored 0.78279 on Kaggle;
the other parametes remain the same,only differ "learning_rate" and "n_estimators"

Observation: the result on kaggle is not matching the result from model tuning,probably because the over_fitting on kaggle leaderboard as it is only calculated with approximately 12% of the test data.