In [1]:
from glob2 import glob
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator,TransformerMixin
from gensim import models,corpora
import numpy as np
import time
from sklearn.model_selection import StratifiedShuffleSplit, cross_val_score,RandomizedSearchCV
import xgboost as xgb
from sklearn.metrics import recall_score

In [2]:
# Collect every path directly under the processed "10sets" directory.
# NOTE(review): the order of the returned list is presumably filesystem-dependent,
# and a later cell reads files[0] -- confirm the order is stable enough for that.
files = glob('../data/processed/10sets/*')

In [9]:
# Display the matched paths as the cell output.
files

['../data/processed/10sets/is_java.csv',
 '../data/processed/10sets/is_css.csv',
 '../data/processed/10sets/is_html.csv',
 '../data/processed/10sets/is_ios.csv',
 '../data/processed/10sets/is_c#.csv',
 '../data/processed/10sets/is_android.csv',
 '../data/processed/10sets/is_jquery.csv',
 '../data/processed/10sets/is_php.csv',
 '../data/processed/10sets/is_javascript.csv',
 '../data/processed/10sets/is_python.csv']

In [3]:
# Load the first matched CSV (is_java.csv in the listing recorded in this notebook)
# and preview its first rows as the cell output.
df = pd.read_csv(files[0])
df.head()

Unnamed: 0,content,is_java
0,click event get triggered due blur reproduce i...,0
1,command python setuppy egginfo fail error code...,0
2,endofcentraldirectory signature find download ...,0
3,initializationondemand holder idiom lazy load ...,1
4,valueerror io operation closed file try print ...,0


In [4]:
class LDATransformer(BaseEstimator,TransformerMixin):
    """Scikit-learn style transformer that replaces text with LDA topic weights.

    The input is a DataFrame with a whitespace-tokenisable ``content`` column.
    ``fit`` builds a gensim dictionary and trains an LDA model on it;
    ``transform`` returns one column per topic (named ``0 .. num_topics-1``)
    followed by the second column of the input frame.

    Parameters
    ----------
    num_topics : int
        Number of LDA topics, and therefore of topic-weight columns produced.
    passes : int
        Number of training passes over the corpus, forwarded to ``LdaModel``.
    random_state : int or None, default None
        Forwarded to ``LdaModel`` so that topic assignment can be made
        reproducible. ``None`` keeps the library's default (unseeded) behaviour.
    """

    def __init__(self,num_topics,passes,random_state=None):
        self.num_topics = num_topics
        self.passes = passes
        self.random_state = random_state

    @staticmethod
    def _tokenize(X):
        """Split each document in ``X.content`` on whitespace into a token list."""
        return X.content.apply(lambda x: x.split())

    def fit(self,X,y=None):
        """Build the dictionary and train the LDA model on ``X.content``.

        Training happens here rather than in ``transform`` so that data passed
        to ``transform`` later is described with the *same* topics instead of a
        freshly trained, unrelated model.

        Returns
        -------
        self
        """
        tic = time.time()
        texts = self._tokenize(X)
        self.dictionary_ = corpora.Dictionary(texts)
        corpus = [self.dictionary_.doc2bow(text) for text in texts]
        print('training LDA...')
        self.ldamodel_ = models.ldamodel.LdaModel(
            corpus, num_topics=self.num_topics, id2word=self.dictionary_,
            passes=self.passes, random_state=self.random_state)
        print('{} seconds used'.format(str(int(time.time()-tic) )) )
        return self

    def transform(self,X,y=None):
        """Return the topic weights of every document in ``X.content``.

        Returns
        -------
        pandas.DataFrame
            ``num_topics`` float columns of topic weights (0.0 where the model
            reports no weight for a topic), plus ``X.iloc[:, 1]`` appended as
            the last column. Rows keep the index of ``X``.
        """
        # Backward compatibility: ``fit`` used to be a no-op, so callers could
        # call ``transform`` on a fresh instance. Train lazily in that case.
        if not hasattr(self, 'ldamodel_'):
            self.fit(X)

        texts = self._tokenize(X)
        corpus = [self.dictionary_.doc2bow(text) for text in texts]

        # Sized from num_topics (not a hard-coded 10) so any topic count works.
        topic_weights = np.zeros((len(corpus), self.num_topics))
        for row_idx, sample in enumerate(self.ldamodel_.get_document_topics(corpus)):
            # `sample` only lists the topics the model reports; the rest stay 0.
            for topic_id,value in sample:
                topic_weights[row_idx, topic_id] = value

        # Reuse X's index so the axis=1 concat lines rows up even when X is
        # not indexed 0..n-1 (e.g. after filtering or a train/test split).
        topic_df = pd.DataFrame(topic_weights, index=X.index)
        # NOTE(review): X.iloc[:, 1] is assumed to be the label column -- confirm.
        return pd.concat((topic_df,X.iloc[:,1]),axis=1)

In [5]:
# Single-step pipeline: the LDA transformer configured for 10 topics and 2 passes.
ppl = Pipeline([
    ('LDA',LDATransformer(num_topics=10,passes=2)),
])

# Run the pipeline on the loaded frame; the result is left as the cell output.
ppl.fit_transform(df)

training LDA...
11 seconds used


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,is_java
0,0.000000,0.000000,0.957463,0.000000,0.000000,0.000000,0.031868,0.000000,0.000000,0.000000,0
1,0.000000,0.322864,0.252503,0.000000,0.060758,0.047804,0.000000,0.166413,0.139399,0.000000,0
2,0.000000,0.000000,0.984480,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0
3,0.000000,0.000000,0.000000,0.979997,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1
4,0.000000,0.000000,0.657454,0.000000,0.000000,0.000000,0.048087,0.000000,0.281493,0.000000,0
5,0.000000,0.000000,0.215929,0.000000,0.739618,0.000000,0.000000,0.000000,0.000000,0.000000,0
6,0.000000,0.000000,0.984207,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0
7,0.000000,0.453357,0.513303,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0
8,0.000000,0.000000,0.784104,0.195893,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0
9,0.000000,0.000000,0.414423,0.563953,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0


In [6]:
# `_` is IPython's "last cell output" variable, so this rebinds df to whatever the
# most recently executed cell displayed.
# NOTE(review): this only yields the pipeline result if the fit_transform cell was
# the last one to produce output, and it overwrites the raw frame -- consider
# assigning the pipeline result to an explicit name instead.
df = _

# only use the training split (x_train, y_train) to do the randomized search; the test split is held out

In [7]:
# Every column except the last is a feature; the last column is the label.
X, Y = df.iloc[:, :-1].values, df.iloc[:, -1].values

# Only one stratified shuffle split is requested, so take its single
# (train indices, test indices) pair directly from the generator.
skf = StratifiedShuffleSplit(n_splits=1, random_state=123)
train_i, test_i = next(skf.split(X, Y))
x_train, x_test = X[train_i], X[test_i]
y_train, y_test = Y[train_i], Y[test_i]

In [8]:
%%time
# Candidate values the randomized search samples from for each XGBoost parameter.
param_distribs = {
    'max_depth':[1,2,3,4,5],
    'learning_rate':[0.01,0.02,0.03,0.1,0.2,0.3,0.5,1],
    'n_estimators':[3000,4000],
    'subsample':[0.6,0.7,0.8,0.9],
    'colsample_bytree':[0.6,0.7,0.8,0.9],
    'seed':[123]
}
gbm = xgb.XGBClassifier()
# Evaluate 10 sampled parameter sets with 5-fold CV, ranked by recall.
# random_state pins WHICH 10 candidates are drawn; without it every run of this
# cell explores a different subset and the reported "best" is not reproducible.
rnd_search = RandomizedSearchCV(
    gbm, param_distribs, n_iter=10, cv=5, scoring='recall', random_state=123)
rnd_search.fit(x_train,y_train)

CPU times: user 3min 7s, sys: 772 ms, total: 3min 8s
Wall time: 49.1 s


In [92]:
# Pair each candidate's mean CV score with the parameters that produced it.
cvres = rnd_search.cv_results_
scores_list = [
    (mean_score, params)
    for mean_score, params in zip(cvres["mean_test_score"], cvres["params"])
]

# Rank candidates from best to worst mean score and keep the top one.
ranked = sorted(scores_list, key=lambda entry: entry[0], reverse=True)
highest_score, highest_paras = ranked[0]
print('best score: {}, best paras:{}'.format(highest_score, highest_paras))

In [95]:
# Display the parameter set with the highest mean CV score.
highest_paras

{'colsample_bytree': 0.6,
 'learning_rate': 1,
 'max_depth': 2,
 'n_estimators': 3000,
 'seed': 123,
 'subsample': 0.7}

In [96]:
# `scores` was not defined anywhere in the notebook as saved, so this cell raised
# NameError on a fresh kernel.
# NOTE(review): reconstructed as the 5-fold recall of a classifier built from the
# selected parameters -- confirm this matches the original intent.
scores = cross_val_score(
    xgb.XGBClassifier(**highest_paras), x_train, y_train, scoring='recall', cv=5)
print("Scores:", scores)
print("Mean:", scores.mean())
print("Standard deviation:", scores.std())

Scores: [ 0.16901408  0.14084507  0.15492958  0.14084507  0.07142857]
Mean: 0.135412474849
Standard deviation: 0.033653979118
