In [None]:
%run app_dataCleaning.ipynb

In [None]:
import lightgbm as lgb

In [None]:
import pyforest
from nltk.stem import WordNetLemmatizer 
def lemma(text):
    """Tokenize ``text`` and lemmatize every token as a verb.

    Tokens come from ``nltk.word_tokenize``; each one is passed to
    ``WordNetLemmatizer.lemmatize`` with part-of-speech ``'v'``.
    Returns the lemmatized tokens as a list, in token order.
    """
    verb_lemmatizer = WordNetLemmatizer()
    return [verb_lemmatizer.lemmatize(token, 'v') for token in nltk.word_tokenize(text)]

# TF-IDF over lemmatized unigrams and bigrams; terms seen in fewer than
# 3 documents are dropped (min_df=3).
tfidf = TfidfVectorizer(
    tokenizer=lemma,
    lowercase=True,
    stop_words='english',
    ngram_range=(1, 2),
    min_df=3,
)
# .toarray() turns the vectorizer output into a dense array.
features = tfidf.fit_transform(df.comment).toarray()
labels = df.result
features.shape

In [None]:
labels.unique()

In [None]:
# Remap the original class codes 1/2/3 to 0/1/2, the 0-based ids used with
# num_class=3 below.
# The remap is derived from df.result instead of mutating `labels` in place:
#   * replace(..., inplace=True) returns None, so the old assignment only
#     bound None to a misspelled name;
#   * an in-place remap is not idempotent: re-running the cell would map the
#     already-remapped 1 -> 0 and 2 -> 1 again.
labels = df.result.replace({1: 0, 2: 1, 3: 2})

In [None]:
# Split only to learn which df rows end up in train vs. test; the feature and
# label parts of the result are discarded. Uses the same test_size and
# random_state as the feature/label split made in the following cell.
_, _, _, _, indices_train, indices_test = train_test_split(
    features,
    labels,
    df.index,
    test_size=0.20,
    random_state=0,
)

In [None]:
indices_test

In [None]:
# 80/20 train/validation split with a fixed seed.
X_train, X_valid, y_train, y_valid = train_test_split(
    features,
    labels,
    test_size=0.20,
    random_state=0,
)

In [None]:
# Sanity check: the distinct class ids in each split and overall, then the
# number of validation and training rows.
for label_values in (y_valid, y_train, labels):
    print(np.unique(np.asarray(label_values)))
print(len(np.asarray(y_valid)))
print(len(np.asarray(X_train)))

In [None]:
# Wrap both splits as LightGBM Datasets; the validation set references the
# training set.
train_data = lgb.Dataset(data=X_train, label=y_train)
valid_data = lgb.Dataset(data=X_valid, label=y_valid, reference=train_data)

In [None]:
#SEARCH_PARAMS = {'learning_rate': 0.4,
#                'max_depth': 15,
#                'num_leaves': 32,
#                'feature_fraction': 0.8,
#                'subsample': 0.2
#                }
# Hand-picked starting values for the hyper-parameters tuned later on.
SEARCH_PARAMS = dict(
    learning_rate=0.4,
    max_depth=15,
    num_leaves=20,
    feature_fraction=0.8,
    subsample=0.2,
)

# Settings held constant for the baseline run.
FIXED_PARAMS = dict(
    objective='multiclass',
    metric='auc_mu',
    num_class=3,
    is_unbalance=True,
    bagging_freq=5,
    boosting='dart',
    num_boost_round=300,
    early_stopping_rounds=30,
)

# Only metric / num_class / objective are copied from FIXED_PARAMS into the
# training params; is_unbalance, bagging_freq and boosting are declared above
# but never forwarded.
# NOTE(review): LightGBM documents that bagging (subsample) only takes effect
# when bagging_freq is non-zero -- confirm whether omitting it is intended.
params = {key: FIXED_PARAMS[key] for key in ('metric', 'num_class', 'objective')}
params.update(SEARCH_PARAMS)


In [None]:
# Baseline booster: up to FIXED_PARAMS['num_boost_round'] rounds, early
# stopping on the validation metric, per-iteration metrics recorded into
# evals_result for plotting.
# Early stopping and metric recording are passed as callbacks rather than the
# `early_stopping_rounds=` / `evals_result=` keyword arguments: the keyword
# forms were removed from lgb.train in LightGBM 4, while the callbacks exist
# in both older and newer releases.
evals_result = {}
model = lgb.train(
    params,
    train_data,
    num_boost_round=FIXED_PARAMS['num_boost_round'],
    valid_sets=[valid_data, train_data],
    valid_names=['valid', 'train'],
    callbacks=[
        lgb.early_stopping(FIXED_PARAMS['early_stopping_rounds']),
        lgb.record_evaluation(evals_result),
    ],
)

In [None]:
# Show the whole best_score mapping of the trained booster; the commented-out
# indexers would drill down to a single dataset / metric entry.
score = model.best_score#['valid']#['auc_mu']
score

In [None]:
lgb.plot_metric(evals_result, metric='auc_mu') 

In [None]:
len(train_data.get_label())

In [None]:
from sklearn.metrics import f1_score

def lgb_f1_score(y_hat, data):
    """Custom LightGBM eval function: macro-averaged F1 for the multiclass model.

    Parameters
    ----------
    y_hat : numpy.ndarray
        Class scores handed over by LightGBM. Two layouts are accepted:
        a flat 1-D array grouped by class (all rows of class 0, then class 1,
        ...), or a 2-D array of shape (n_rows, n_classes).
    data : object exposing ``get_label()``
        Dataset being evaluated; supplies the true class ids.

    Returns
    -------
    tuple
        ``('f1_macro', score, True)`` -- metric name, value, and the
        higher-is-better flag.
    """
    y_true = data.get_label()
    scores = y_hat
    if scores.ndim == 1:
        # Flat class-major layout: one block of len(y_true) scores per class.
        # Reshaping by row count instead of a hard-coded 3 keeps this valid
        # for any number of classes.
        scores = scores.reshape(-1, len(y_true)).T
    # A 2-D input is already (n_rows, n_classes); reshaping it as if it were
    # flat would silently scramble rows and classes.
    y_pred = scores.argmax(axis=1)
    return 'f1_macro', f1_score(y_true, y_pred, average='macro'), True

# Retrain with the custom macro-F1 eval function and plot its history.
# Metrics are recorded through the record_evaluation callback: the
# `evals_result=` keyword was removed from lgb.train in LightGBM 4, whereas
# the callback is available in both older and newer releases.
evals_result = {}

clf = lgb.train(
    params,
    train_data,
    valid_sets=[valid_data, train_data],
    valid_names=['valid', 'train'],
    feval=lgb_f1_score,
    callbacks=[lgb.record_evaluation(evals_result)],
)

lgb.plot_metric(evals_result, metric='f1_macro')

In [None]:
# Hard class predictions = arg-max over the per-class outputs; then the
# macro-F1 on the validation split.
lgb_pred = clf.predict(X_valid).argmax(axis=1)
lgb_F1 = f1_score(y_valid, lgb_pred, average='macro')
lgb_F1

In [None]:
from sklearn.metrics import confusion_matrix
# Confusion matrix of validation predictions, drawn as an annotated heatmap.
# NOTE(review): target_names is applied to class ids 0, 1, 2 in that order --
# confirm the order matches the label encoding.
conf_mat = confusion_matrix(y_valid, lgb_pred)
target_names = ['Negative', 'Positive', 'Intermediate']
fig, ax = plt.subplots(figsize=(5, 5))
sns.heatmap(
    conf_mat,
    annot=True,
    fmt='d',
    cmap=plt.cm.Blues,
    xticklabels=target_names,
    yticklabels=target_names,
    ax=ax,
)
ax.set_ylabel('Actual')
ax.set_xlabel('Predicted')
plt.show()

In [None]:
np.array(y_valid)

In [None]:
lgb_pred

## Try out skopt: patch `BayesSearchCV`, then tune with `forest_minimize` over `lgb.cv`

In [None]:
import skopt
from sklearn.metrics import f1_score
from skopt.space import Real, Categorical, Integer
import lightgbm as lgb
#patch below to solve TypeError: __init__() got an unexpected keyword argument 'iid'
def bayes_search_CV_init(self, estimator, search_spaces, optimizer_kwargs=None,
                         n_iter=50, scoring=None, fit_params=None, n_jobs=1,
                         n_points=1, iid=True, refit=True, cv=None, verbose=0,
                         pre_dispatch='2*n_jobs', random_state=None,
                         error_score='raise', return_train_score=False):
        """Replacement ``__init__`` for ``skopt.BayesSearchCV``.

        Stores the search configuration on the instance, validates the search
        space via ``self._check_search_space`` and then calls the parent
        class initializer. ``iid`` is still accepted in the signature but is
        deliberately NOT forwarded to the parent ``__init__`` -- forwarding it
        is what raised the "unexpected keyword argument 'iid'" TypeError this
        patch works around.
        """

        # Search configuration kept on the instance.
        self.search_spaces = search_spaces
        self.n_iter = n_iter
        self.n_points = n_points
        self.random_state = random_state
        self.optimizer_kwargs = optimizer_kwargs
        self._check_search_space(self.search_spaces)
        self.fit_params = fit_params

        # Parent initializer receives everything except `iid`.
        super(skopt.BayesSearchCV, self).__init__(
             estimator=estimator, scoring=scoring,
             n_jobs=n_jobs, refit=refit, cv=cv, verbose=verbose,
             pre_dispatch=pre_dispatch, error_score=error_score,
             return_train_score=return_train_score)
        
# Monkey-patch: install the replacement initializer on the class.
# NOTE(review): no visible cell instantiates BayesSearchCV afterwards --
# confirm the patch is still needed.
skopt.BayesSearchCV.__init__ = bayes_search_CV_init

In [None]:
# Hand-picked starting point, used to smoke-test the CV objective before the
# search is run.
SEARCH_PARAMS = dict(
    learning_rate=0.4,
    max_depth=15,
    num_iterations=100,
    early_stopping_round=30,
    num_leaves=20,
    min_data_in_leaf=5,
    min_sum_hessian_in_leaf=0.001,
    scale_pos_weight=0.1,
    feature_fraction=0.8,
    subsample=0.2,
)

In [None]:

# Search space for the CV-based tuning. The ORDER matters: the optimizer
# returns the best point as a positional list in this same order.
# NOTE(review): LightGBM documents scale_pos_weight as used only by the
# binary / multiclassova objectives, and bagging (subsample) as active only
# when bagging_freq is non-zero -- confirm these two dimensions have any
# effect with objective='multiclass'.
SPACE = [
    skopt.space.Real(low=0.01, high=0.5, prior='log-uniform', name='learning_rate'),
    skopt.space.Integer(low=1, high=30, name='max_depth'),
    skopt.space.Integer(low=50, high=200, name='num_iterations'),
    skopt.space.Integer(low=20, high=100, name='early_stopping_round'),
    skopt.space.Integer(low=15, high=1000, name='num_leaves'),
    skopt.space.Integer(low=20, high=200, name='min_data_in_leaf'),
    skopt.space.Real(low=0.0001, high=0.005, prior='uniform', name='min_sum_hessian_in_leaf'),
    skopt.space.Real(low=0.1, high=10, prior='uniform', name='scale_pos_weight'),
    skopt.space.Real(low=0.1, high=1.0, prior='uniform', name='feature_fraction'),
    skopt.space.Real(low=0.1, high=1.0, prior='uniform', name='subsample'),
]

In [None]:
def lgb_f1_score(y_hat, data):
    """Custom LightGBM eval function: macro-averaged F1 for the multiclass model.

    Parameters
    ----------
    y_hat : numpy.ndarray
        Class scores handed over by LightGBM. Two layouts are accepted:
        a flat 1-D array grouped by class (all rows of class 0, then class 1,
        ...), or a 2-D array of shape (n_rows, n_classes).
    data : object exposing ``get_label()``
        Dataset being evaluated; supplies the true class ids.

    Returns
    -------
    tuple
        ``('f1_macro', score, True)`` -- metric name, value, and the
        higher-is-better flag.
    """
    y_true = data.get_label()
    scores = y_hat
    if scores.ndim == 1:
        # Flat class-major layout: one block of len(y_true) scores per class.
        scores = scores.reshape(-1, len(y_true)).T
    # A 2-D input is already (n_rows, n_classes); reshaping it as if it were
    # flat would silently scramble rows and classes.
    y_pred = scores.argmax(axis=1)
    return 'f1_macro', f1_score(y_true, y_pred, average='macro'), True

# Fresh LightGBM Datasets for this section; the validation set references the
# training set.
train_data = lgb.Dataset(data=X_train, label=y_train)
valid_data = lgb.Dataset(data=X_valid, label=y_valid, reference=train_data)
def cv_evaluate(search_params):
    """Score one hyper-parameter setting with 5-fold ``lgb.cv``.

    Parameters
    ----------
    search_params : dict
        LightGBM parameters to evaluate; merged over (and able to override)
        the fixed multiclass settings below.

    Returns
    -------
    float
        The highest mean macro-F1 reached over the boosting iterations.
    """
    params = {
        'metric': 'None',            # built-in metrics off; only the custom F1 is evaluated
        'num_class': 3,
        'objective': 'multiclass',
        'force_col_wise': True,
        **search_params,
    }
    cv_dict = lgb.cv(params, train_data, nfold=5, feval=lgb_f1_score)
    # LightGBM 4+ prefixes cv result keys with the dataset name
    # ('valid f1_macro-mean'); older releases use the bare 'f1_macro-mean'.
    # Accept either so the lookup does not raise KeyError after an upgrade.
    prefixed_key = 'valid f1_macro-mean'
    mean_key = prefixed_key if prefixed_key in cv_dict else 'f1_macro-mean'
    return max(cv_dict[mean_key])
 
# Smoke-test the CV objective once with the hand-picked SEARCH_PARAMS and
# display the resulting score.
score = cv_evaluate(SEARCH_PARAMS)
score


In [None]:
@skopt.utils.use_named_args(SPACE)
def objective(**params):
    """Objective for the skopt minimizer: the negated ``cv_evaluate`` score.

    The decorator turns the optimizer's positional point into keyword
    arguments named after the SPACE dimensions. The score is negated because
    the optimizer minimizes while a higher F1 is better.
    """
    cv_score = cv_evaluate(params)
    return -1.0 * cv_score

In [None]:
# Seeded forest-based search: 30 objective evaluations, the first 10 random.
results = skopt.forest_minimize(
    objective,
    SPACE,
    n_calls=30,
    n_random_starts=10,
    random_state=0,
)
best_params_cv = results.x           # positional list, in SPACE order
best_cv_f1 = -1.0 * results.fun      # undo the sign flip applied by objective()

print('best result: ', best_cv_f1)
print('best parameters: ', best_params_cv)

In [None]:
#final_params_cv['num_iterations'] = best_params_cv[2]
#final_params_cv['early_stopping_round'] = best_params_cv[3]
#final_params_cv['min_data_in_leaf'] = best_params_cv[5]

# Fixed multiclass settings, the same ones cv_evaluate uses.
final_params_cv = {
    'metric': 'None',
    'num_class': 3,
    'objective': 'multiclass',
    'force_col_wise': True,
}
# best_params_cv is positional; the names below list, in order, the dimension
# each position is taken to hold.
final_params_cv.update(
    (param_name, best_params_cv[position])
    for position, param_name in enumerate((
        'learning_rate',
        'max_depth',
        'num_iterations',
        'early_stopping_round',
        'num_leaves',
        'min_data_in_leaf',
        'min_sum_hessian_in_leaf',
        'scale_pos_weight',
        'feature_fraction',
        'subsample',
    ))
)

In [None]:
final_params_cv

In [None]:
# Train the final model with the CV-tuned parameters and plot its macro-F1
# history.
# Metrics are recorded through the record_evaluation callback: the
# `evals_result=` keyword was removed from lgb.train in LightGBM 4, whereas
# the callback is available in both older and newer releases.
evals_result = {}

clf_cv = lgb.train(
    final_params_cv,
    train_data,
    valid_sets=[valid_data, train_data],
    valid_names=['valid', 'train'],
    feval=lgb_f1_score,
    callbacks=[lgb.record_evaluation(evals_result)],
)

lgb.plot_metric(evals_result, metric='f1_macro')

In [None]:
# The eval function used to train clf_cv reports its metric as 'f1_macro', so
# that is the only custom key present in evals_result here; asking for 'f1'
# would fail on a clean top-to-bottom run.
lgb.plot_metric(evals_result, metric='f1_macro')

In [None]:
# Sanity check: the distinct class ids in each split and overall, then the
# number of validation and training rows.
for label_values in (y_valid, y_train, labels):
    print(np.unique(np.asarray(label_values)))
for label_values in (y_valid, y_train):
    print(len(np.asarray(label_values)))

In [None]:
# Hard class predictions of the CV-tuned model (arg-max over the per-class
# outputs), then the macro-F1 on the validation split.
lgb_pred_cv = clf_cv.predict(X_valid).argmax(axis=1)
lgb_F1_cv = f1_score(y_valid, list(lgb_pred_cv), average='macro')
lgb_F1_cv

In [None]:
from sklearn.metrics import confusion_matrix
# Confusion matrix of the CV-tuned model's validation predictions, drawn as
# an annotated heatmap.
# NOTE(review): target_names is applied to class ids 0, 1, 2 in that order --
# confirm the order matches the label encoding.
conf_mat = confusion_matrix(np.array(y_valid), lgb_pred_cv)
target_names = ['Negative', 'Positive', 'Intermediate']
fig, ax = plt.subplots(figsize=(5, 5))
sns.heatmap(
    conf_mat,
    annot=True,
    fmt='d',
    cmap=plt.cm.Blues,
    xticklabels=target_names,
    yticklabels=target_names,
    ax=ax,
)
ax.set_ylabel('Actual')
ax.set_xlabel('Predicted')
plt.show()


## Tuning parameters with skopt using lgb.train

In [None]:
import skopt

In [None]:
# Hand-picked starting point, used to smoke-test train_evaluate before the
# search is run.
SEARCH_PARAMS = dict(
    learning_rate=0.4,
    max_depth=15,
    max_bin=300,
    num_leaves=300,
    min_sum_hessian_in_leaf=0.001,
    scale_pos_weight=0.1,
    feature_fraction=0.8,
    subsample=0.2,
)

In [None]:
'''SEARCH_PARAMS = {'learning_rate': 0.4,
                 'max_depth': 15,
                 'num_iterations': 20,
                 'min_data_in_leaf':5,
                 'min_sum_hessian_in_leaf': 0.001,
                 'scale_pos_weight': 0.1
                }'''

In [None]:
from sklearn.metrics import f1_score

def lgb_f1_score(y_hat, data):
    """Custom LightGBM eval function: macro-averaged F1, reported as ``'f1'``.

    Parameters
    ----------
    y_hat : numpy.ndarray
        Class scores handed over by LightGBM. Two layouts are accepted:
        a flat 1-D array grouped by class (all rows of class 0, then class 1,
        ...), or a 2-D array of shape (n_rows, n_classes).
    data : object exposing ``get_label()``
        Dataset being evaluated; supplies the true class ids.

    Returns
    -------
    tuple
        ``('f1', score, True)`` -- metric name, value, and the
        higher-is-better flag.
    """
    y_true = data.get_label()
    scores = y_hat
    if scores.ndim == 1:
        # Flat class-major layout: one block of len(y_true) scores per class.
        scores = scores.reshape(-1, len(y_true)).T
    # A 2-D input is already (n_rows, n_classes); reshaping it as if it were
    # flat would silently scramble rows and classes.
    y_pred = scores.argmax(axis=1)
    return 'f1', f1_score(y_true, y_pred, average='macro'), True
    
def train_evaluate(search_params, num_boost_round=300, early_stopping_rounds=30):
    """Train one booster on the fixed train/validation split and score it.

    Parameters
    ----------
    search_params : dict
        LightGBM parameters to evaluate; merged over (and able to override)
        the fixed multiclass settings below.
    num_boost_round : int, default 300
        Maximum number of boosting rounds.
    early_stopping_rounds : int, default 30
        Patience passed to the early-stopping callback.

    Returns
    -------
    float
        The validation ``'f1'`` value stored in the trained booster's
        ``best_score``.
    """
    # Datasets are rebuilt on every call instead of reusing shared ones.
    train_data = lgb.Dataset(X_train, label=y_train)
    valid_data = lgb.Dataset(X_valid, label=y_valid, reference=train_data)

    params = {
        'metric': 'None',            # built-in metrics off; only the custom F1 is evaluated
        'num_class': 3,
        'objective': 'multiclass',
        'force_col_wise': True,
        **search_params,
    }

    # Early stopping goes through a callback: the `early_stopping_rounds=` and
    # `evals_result=` keyword arguments were removed from lgb.train in
    # LightGBM 4, while the callback exists in older and newer releases.
    # The per-iteration history was never used here, so it is not recorded.
    model = lgb.train(
        params,
        train_data,
        num_boost_round=num_boost_round,
        valid_sets=[valid_data, train_data],
        valid_names=['valid', 'train'],
        feval=lgb_f1_score,
        callbacks=[lgb.early_stopping(early_stopping_rounds)],
    )
    return model.best_score['valid']['f1']

In [None]:
# Smoke-test train_evaluate once with the hand-picked SEARCH_PARAMS.
score = train_evaluate(SEARCH_PARAMS)


In [None]:
'''SPACE = [
    skopt.space.Real(0.01, 0.5, name='learning_rate', prior='log-uniform'),
    skopt.space.Integer(1, 45, name='max_depth'),
    skopt.space.Integer(25, 1500, name='num_iterations'),
    skopt.space.Integer(5, 30, name='min_data_in_leaf'),
    skopt.space.Real(0.0001, 0.005, name='min_sum_hessian_in_leaf', prior='uniform'),
    skopt.space.Real(0.1, 10, name='scale_pos_weight', prior='uniform')]
 '''

In [None]:
# Search space for tuning with train_evaluate. The ORDER matters: the
# optimizer returns the best point as a positional list in this same order.
# NOTE(review): LightGBM documents scale_pos_weight as used only by the
# binary / multiclassova objectives, and bagging (subsample) as active only
# when bagging_freq is non-zero -- confirm these two dimensions have any
# effect with objective='multiclass'.
SPACE = [
    skopt.space.Real(low=0.01, high=0.5, prior='log-uniform', name='learning_rate'),
    skopt.space.Integer(low=1, high=30, name='max_depth'),
    skopt.space.Integer(low=300, high=1000, name='max_bin'),
    skopt.space.Integer(low=500, high=1000, name='num_leaves'),
    skopt.space.Real(low=0.0001, high=0.005, prior='uniform', name='min_sum_hessian_in_leaf'),
    skopt.space.Real(low=0.1, high=10, prior='uniform', name='scale_pos_weight'),
    skopt.space.Real(low=0.1, high=1.0, prior='uniform', name='feature_fraction'),
    skopt.space.Real(low=0.1, high=1.0, prior='uniform', name='subsample'),
]

In [None]:
@skopt.utils.use_named_args(SPACE)
def objective(**params):
    """Objective for the skopt minimizer: the negated ``train_evaluate`` score.

    The decorator turns the optimizer's positional point into keyword
    arguments named after the SPACE dimensions. The score is negated because
    the optimizer minimizes while a higher F1 is better.
    """
    validation_f1 = train_evaluate(params)
    return -1.0 * validation_f1

In [None]:
# Forest-based search: 30 objective evaluations, the first 10 random.
# random_state is set so the search is reproducible across runs, matching the
# seeded CV-based search earlier in the notebook; without it every run yields
# different "best" parameters.
results = skopt.forest_minimize(objective, SPACE, n_calls=30, n_random_starts=10, random_state=0)
# Despite its name, best_auc holds the best validation macro-F1: the
# objective is the negated train_evaluate score, so the sign is flipped back.
best_auc = -1.0 * results.fun
best_params = results.x  # positional list, in SPACE order

print('best result: ', best_auc)
print('best parameters: ', best_params)

In [None]:
# Fixed multiclass settings for the final model, the same ones
# train_evaluate uses.
final_params = dict(
    metric='None',
    num_class=3,
    objective='multiclass',
    force_col_wise=True,
)

In [None]:
# Copy the tuned values into final_params by their position in best_params.
# Position 2 (max_bin) is intentionally left out.
# NOTE(review): the search varied max_bin but the final model does not
# receive it -- confirm this is intended.
final_params.update(
    (param_name, best_params[position])
    for param_name, position in (
        ('learning_rate', 0),
        ('max_depth', 1),
        ('num_leaves', 3),
        ('min_sum_hessian_in_leaf', 4),
        ('scale_pos_weight', 5),
        ('feature_fraction', 6),
        ('subsample', 7),
    )
)

In [None]:
'''final_params['learning_rate'] = best_params[0]
final_params['max_depth'] = best_params[1]
final_params['num_iterations'] = best_params[2]
final_params['min_data_in_leaf'] = best_params[3]
final_params['min_sum_hessian_in_leaf'] = best_params[4]
final_params['scale_pos_weight'] = best_params[5]'''

In [None]:

final_params

In [None]:
# Train the final model with the tuned parameters and plot its F1 history.
# Metrics are recorded through the record_evaluation callback: the
# `evals_result=` keyword was removed from lgb.train in LightGBM 4, whereas
# the callback is available in both older and newer releases.
evals_result = {}

clf = lgb.train(
    final_params,
    train_data,
    valid_sets=[valid_data, train_data],
    valid_names=['valid', 'train'],
    feval=lgb_f1_score,
    callbacks=[lgb.record_evaluation(evals_result)],
)

lgb.plot_metric(evals_result, metric='f1')

In [None]:
# Hard class predictions of the tuned model (arg-max over the per-class
# outputs), then the macro-F1 on the validation split.
lgb_pred = clf.predict(X_valid).argmax(axis=1)
lgb_F1 = f1_score(y_valid, lgb_pred, average='macro')
lgb_F1

In [None]:
from sklearn.metrics import confusion_matrix
# Confusion matrix of the tuned model's validation predictions, drawn as an
# annotated heatmap.
# NOTE(review): target_names is applied to class ids 0, 1, 2 in that order --
# confirm the order matches the label encoding.
conf_mat = confusion_matrix(y_valid, lgb_pred)
target_names = ['Negative', 'Positive', 'Intermediate']
fig, ax = plt.subplots(figsize=(5, 5))
sns.heatmap(
    conf_mat,
    annot=True,
    fmt='d',
    cmap=plt.cm.Blues,
    xticklabels=target_names,
    yticklabels=target_names,
    ax=ax,
)
ax.set_ylabel('Actual')
ax.set_xlabel('Predicted')
plt.show()