In [1]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import StratifiedKFold, StratifiedShuffleSplit
import numpy as np
import pandas as pd
import re
from sklearn.metrics import accuracy_score
import lightgbm as lgb
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.linear_model import LogisticRegression
from textblob import Word, TextBlob
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

In [2]:
fixed_skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

In [3]:
train = fetch_20newsgroups()
test = fetch_20newsgroups(subset="test")
X_train, y_train = train['data'], train['target']
X_test, y_test = test['data'], test['target']

## Preprocessing and regex features:

In [4]:
clean_sentences = lambda text: ''.join(filter(lambda x: x.isalpha() or x in [' ', '\n', '.', ',', ':', ',', '!', '?'] or x.isdigit(), text))
get_sentences = lambda text: re.findall(r'[A-Z][^\.]*[\.|\?|\!][$|\s]', text)
get_weird_sentences = lambda text: re.findall(r'[\s|^|\.][A-Z][^\.]*[\.|\?|\!]', text)

In [5]:
def remove_digits(text):
    text = text.lower()
    text = re.sub(r'\d+', '', text)
    return text

In [6]:
def split_specific(data, topics):
    split_txts = []
    for txt in data:
        split_txt = {}
        for topic in topics:
            
            if topic+':' not in txt:
                split_txt[topic] = ''
            else:
                s = txt.split(topic + ':')[1]
                next_topic = re.findall('[a-z\-A-Z]+:',s)
                if next_topic:
#                     s = s.split(re.findall('[a-z\-A-Z]+:',s)[0])[0]
                    s = s.split(re.findall('[a-z\-A-Z]+:|$',s)[0])[0]
                split_txt[topic] = s
                txt = txt.replace(s, ' ').replace(topic+':', ' ')         
        split_txt['the_rest'] = txt
        split_txts += [split_txt]
        
    return split_txts

In [7]:
letters_only = lambda txt: ''.join(filter(lambda c: c.isalpha(), txt)).lower()

In [8]:
uppercase_words = lambda text: len(re.findall('[A-Z]{2,}', text))

In [9]:
df = pd.DataFrame(X_train, columns=['text'])
df['text'] = df['text'].apply(clean_sentences)
df['question_marks_cnt'] = df['text'].str.count('\?')
df['exclamation_marks_cnt'] = df['text'].str.count('\!')
df['uppercase_words_count'] = df['text'].apply(uppercase_words)

In [10]:
df_ = pd.DataFrame(split_specific(df['text'], ['From']))
df_['From'] = df_['From'].apply(lambda f: ' '.join(f.strip().split(' ')[0].split('.')))
df_['sentences'] = df_.the_rest.apply(get_sentences)
df_['sentences_count'] = df_['sentences'].apply(len)
df_.loc[df_.sentences_count == 0, 'the_rest'] = df_.loc[df_.sentences_count == 0, 'the_rest'].apply(get_weird_sentences)
df_['sentences_count'] = df_['sentences'].apply(len)
df_.loc[df_.sentences_count == 0, 'sentences'] = [['']]*df_.loc[df_.sentences_count == 0, 'sentences'].shape[0]
df_['median_sentence_len'] = df_['sentences'].apply(lambda arr: np.median(np.array([len(item) for item in arr]), axis=0))

  return _methods._mean(a, axis=axis, dtype=dtype,


In [11]:
df_['polarity_1_sntc'] = df_['sentences'].apply(lambda s: TextBlob(s[0]).sentiment.polarity if len(s) > 0 else np.nan)
df_['subjectivity_1_sntc'] = df_['sentences'].apply(lambda s: TextBlob(s[0]).sentiment.subjectivity  if len(s) > 0 else np.nan)
df_['polarity_last_sntc'] = df_['sentences'].apply(lambda s: TextBlob(s[-1]).sentiment.polarity  if len(s) > 0 else np.nan)
df_['subjectivity_last_sntc'] = df_['sentences'].apply(lambda s: TextBlob(s[-1]).sentiment.subjectivity  if len(s) > 0 else np.nan)
df_.drop(['sentences', 'the_rest'], axis=1, inplace=True)

In [12]:
df = pd.concat([df, df_], axis=1)

In [13]:
df.head(2)

Unnamed: 0,text,question_marks_cnt,exclamation_marks_cnt,uppercase_words_count,From,sentences_count,median_sentence_len,polarity_1_sntc,subjectivity_1_sntc,polarity_last_sntc,subjectivity_last_sntc
0,From: lerxstwam.umd.edu wheres my thing\nSubje...,1,1,2,lerxstwam umd edu,8,49.0,-0.208333,0.333333,0.0,0.0
1,From: guykuocarson.u.washington.edu Guy Kuo\nS...,0,0,7,guykuocarson u washington edu,4,94.0,0.75,0.95,0.2,0.2


## CountVectorizer:

In [14]:
vectorizer = CountVectorizer(
                    analyzer='word', #{‘word’, ‘char’, ‘char_wb’} 
                    ngram_range=(1,2),
                    stop_words='english',
                    vocabulary=None,
                    max_df=0.4, #0.4
                    min_df=30, #30
#                     max_features=6,
                    preprocessor=remove_digits
                    )
words = vectorizer.fit_transform(df['text'])
words = pd.DataFrame(words.astype(np.byte).todense(), columns=vectorizer.get_feature_names_out())

In [15]:
words.head(2)

Unnamed: 0,aa,aaa,aaron,ab,abandoned,abc,ability,able,able use,abortion,...,za,zealand,zero,zeus,zip,zone,zoology,zoology kipling,zoology lines,zx
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [16]:
vectorizer = CountVectorizer(
                    analyzer='char_wb', #{‘word’, ‘char’, ‘char_wb’} 
                    ngram_range=(4,9),
                    vocabulary=None,
                    max_df=400,
                    min_df=200,
                    lowercase=True,
#                     max_features=6,
                    preprocessor=letters_only
                    )
chars = vectorizer.fit_transform(df['text'])
chars = pd.DataFrame(chars.astype(np.byte).todense(), columns=vectorizer.get_feature_names_out())

In [17]:
chars.head(2)

Unnamed: 0,frome,fromf,froml,fromma,fromn,fromw,sub,subj,subje,subjec,...,zationin,zationl,zationne,zationo,zationr,zationst,zationw,zens,zethe,zine
0,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [18]:
final = pd.concat([words, chars, df.drop(['text', 'From'], axis=1)], axis=1)
final.drop(final.columns[final.columns.duplicated()], axis=1, inplace=True)

In [19]:
final.head(2)

Unnamed: 0,aa,aaa,aaron,ab,abandoned,abc,ability,able,able use,abortion,...,zine,question_marks_cnt,exclamation_marks_cnt,uppercase_words_count,sentences_count,median_sentence_len,polarity_1_sntc,subjectivity_1_sntc,polarity_last_sntc,subjectivity_last_sntc
0,0,0,0,0,0,0,0,0,0,0,...,0,1,1,2,8,49.0,-0.208333,0.333333,0.0,0.0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,7,4,94.0,0.75,0.95,0.2,0.2


**Leaving out only the most important features. It greatly improves score on Logistic Regression.**

In [20]:
imputer = SimpleImputer(strategy='mean')
final_ = imputer.fit_transform(final)
final_ = StandardScaler().fit_transform(final_)
final_ = pd.DataFrame(final_, columns=final.columns)

In [68]:
reg = LogisticRegression(penalty='l1',
                         C=0.1,
                         multi_class='ovr', #multinomial, ovr
                         class_weight='balanced',
                         n_jobs=4,
                         random_state=42,
                         solver='liblinear')
logistic_reg_results = cross_validate(reg, final_, y_train, cv=fixed_skf, scoring='accuracy',verbose=3, return_train_score=True)

In [None]:
reg.fit(final_, y_train)
weights = list(zip(np.abs(reg.coef_).sum(axis=0), final_.columns))
weights.sort(key = lambda x: x[0], reverse=True)

In [None]:
selected_columns = [x[1] for x in weights[:5_000]]

**LightGBM with all of the features**

In [130]:
lgb_params = {
    #default
    "objective": "multiclass",
    'num_classes': 20, 
    "learning_rate": 0.07,
    "num_threads": 10,
    "metric": "multi_error",
    "seed": 42,
    "num_threads":4,
    
# #     #regularization
    "colsample_bytree": 0.7,
    "subsample": 0.7, 
    "subsample_freq": 1,
    "min_data_in_leaf": 40,
#     "num_leaves":100,
    "verbose":-1
}
lgb_train = lgb.Dataset(final, label=y_train, free_raw_data=False)
lgb_result = lgb.cv(lgb_params, lgb_train, 10_000, folds=fixed_skf, callbacks=[lgb.early_stopping(10), lgb.log_evaluation(10)], eval_train_metric=True, return_cvbooster=True)
lgb_params['n_estimators'] = lgb_result["cvbooster"].best_iteration

Training until validation scores don't improve for 10 rounds
[10]	cv_agg's train multi_error: 0.141506 + 0.00191815	cv_agg's valid multi_error: 0.238731 + 0.0052255
[20]	cv_agg's train multi_error: 0.0689411 + 0.00122781	cv_agg's valid multi_error: 0.202051 + 0.00492652
[30]	cv_agg's train multi_error: 0.0299188 + 0.0016269	cv_agg's valid multi_error: 0.177745 + 0.00761353
[40]	cv_agg's train multi_error: 0.0117554 + 0.00159633	cv_agg's valid multi_error: 0.161659 + 0.00641955
[50]	cv_agg's train multi_error: 0.00256322 + 0.000488285	cv_agg's valid multi_error: 0.151494 + 0.00627843
[60]	cv_agg's train multi_error: 0.000574519 + 0.000165395	cv_agg's valid multi_error: 0.147694 + 0.00767489
[70]	cv_agg's train multi_error: 0.000132579 + 8.28635e-09	cv_agg's valid multi_error: 0.142833 + 0.007058
[80]	cv_agg's train multi_error: 8.8388e-05 + 6.24998e-05	cv_agg's valid multi_error: 0.140358 + 0.00722785
[90]	cv_agg's train multi_error: 8.8388e-05 + 6.24998e-05	cv_agg's valid multi_error: 

In [74]:
print('accuracy on test: ' + str(1-0.122946))
print('accuracy on train: ' + str(1-8.8388e-05))

accuracy on test: 0.877054
accuracy on train: 0.999911612


**LightGBM with 5000 most important features**

In [66]:
lgb_params = {
    #default
    "objective": "multiclass",
    'num_classes': 20, 
    "learning_rate": 0.07,
    "num_threads": 10,
    "metric": "multi_error",
    "seed": 42,
    "num_threads":4,
    
# #     #regularization
    "colsample_bytree": 0.7,
    "subsample": 0.7, 
    "subsample_freq": 1,
    "min_data_in_leaf": 40,
#     "num_leaves":100,
    "verbose":-1
}
lgb_train = lgb.Dataset(final[selected_columns], label=y_train, free_raw_data=False)
lgb_result = lgb.cv(lgb_params, lgb_train, 10_000, folds=fixed_skf, callbacks=[lgb.early_stopping(10), lgb.log_evaluation(10)], eval_train_metric=True, return_cvbooster=True)
lgb_params['n_estimators'] = lgb_result["cvbooster"].best_iteration

Training until validation scores don't improve for 10 rounds
[10]	cv_agg's train multi_error: 0.152731 + 0.0020129	cv_agg's valid multi_error: 0.251017 + 0.00197581
[20]	cv_agg's train multi_error: 0.0813593 + 0.000794545	cv_agg's valid multi_error: 0.203819 + 0.00760376
[30]	cv_agg's train multi_error: 0.039597 + 0.00098422	cv_agg's valid multi_error: 0.182253 + 0.00566177
[40]	cv_agg's train multi_error: 0.0161305 + 0.00102973	cv_agg's valid multi_error: 0.163338 + 0.00596816
[50]	cv_agg's train multi_error: 0.00565671 + 0.000380176	cv_agg's valid multi_error: 0.154676 + 0.00706001
[60]	cv_agg's train multi_error: 0.00150258 + 0.000250099	cv_agg's valid multi_error: 0.147075 + 0.00490728
[70]	cv_agg's train multi_error: 0.00035354 + 6.24791e-05	cv_agg's valid multi_error: 0.140888 + 0.00689844
[80]	cv_agg's train multi_error: 8.8388e-05 + 6.24998e-05	cv_agg's valid multi_error: 0.139916 + 0.00602862
[90]	cv_agg's train multi_error: 8.8388e-05 + 6.24998e-05	cv_agg's valid multi_error:

In [67]:
df_lgb_result = pd.DataFrame()
df_lgb_result['train multi_error-mean'] = lgb_result['train multi_error-mean']
df_lgb_result['valid multi_error-mean'] = lgb_result['valid multi_error-mean']
1-df_lgb_result.iloc[-1]

train multi_error-mean    0.999912
valid multi_error-mean    0.874315
Name: 171, dtype: float64

**Logistic Regression with all of the features**

In [88]:
reg = LogisticRegression(penalty='l2',
                         C=0.01, #0.01 ---> 0.7966224069213816
                         multi_class='multinomial', #multinomial, ovr
                         class_weight='balanced',
                         n_jobs=4,
                         random_state=42)
logistic_reg_results_full = cross_validate(reg, final_, y_train, cv=fixed_skf, scoring='accuracy',verbose=3, return_train_score=True)

In [76]:
pd.DataFrame(logistic_reg_results_full).mean(axis=0)

fit_time       66.131167
score_time      0.284277
test_score      0.841435
train_score     0.999867
dtype: float64

**Logistic Regression with 5000 most important features**

In [70]:
reg = LogisticRegression(penalty='l2',
                         C=0.01, #0.01 ---> 0.7966224069213816
                         multi_class='multinomial', #multinomial, ovr
                         class_weight='balanced',
                         n_jobs=4,
                         random_state=42)
logistic_reg_results = cross_validate(reg, final_[selected_columns], y_train, cv=fixed_skf, scoring='accuracy',verbose=3, return_train_score=True)

In [65]:
pd.DataFrame(logistic_reg_results).mean(axis=0)

fit_time       11.329509
score_time      0.051689
test_score      0.882092
train_score     0.999779
dtype: float64

## TF-IDF Vectorizer

In [77]:
vectorizer = TfidfVectorizer(
                    analyzer='word',
                    ngram_range=(1,2),
                    stop_words='english',
                    vocabulary=None,
                    max_df=0.4, #0.8 
                    min_df = 30, #50
                    max_features=None,
                    smooth_idf=False,
                    norm='l2',
                    preprocessor=remove_digits)
bag = vectorizer.fit_transform(df['text'])
words = pd.DataFrame(bag.astype(np.float32).todense(), columns = vectorizer.get_feature_names_out())

In [78]:
words.head(2)

Unnamed: 0,aa,aaa,aaron,ab,abandoned,abc,ability,able,able use,abortion,...,za,zealand,zero,zeus,zip,zone,zoology,zoology kipling,zoology lines,zx
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [79]:
vectorizer = TfidfVectorizer(
                    analyzer='char_wb',
                    ngram_range=(4,9),
                    vocabulary=None,
                    max_df=400, #0.8 
                    min_df = 200, #50
                    max_features=None,
                    smooth_idf=False,
                    norm='l2',
                    preprocessor=letters_only)
bag = vectorizer.fit_transform(df['text'])
chars = pd.DataFrame(bag.astype(np.float32).todense(), columns = vectorizer.get_feature_names_out())

In [80]:
chars.head(2)

Unnamed: 0,frome,fromf,froml,fromma,fromn,fromw,sub,subj,subje,subjec,...,zationin,zationl,zationne,zationo,zationr,zationst,zationw,zens,zethe,zine
0,0.0,0.0,0.059901,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [81]:
final = pd.concat([words, chars, df.drop(['text', 'From'], axis=1)], axis=1)
final.drop(final.columns[final.columns.duplicated()], axis=1, inplace=True)

In [83]:
imputer = SimpleImputer(strategy='mean')
final_ = imputer.fit_transform(final)
final_ = StandardScaler().fit_transform(final_)
final_ = pd.DataFrame(final_, columns=final.columns)

In [84]:
final_.head(2)

Unnamed: 0,aa,aaa,aaron,ab,abandoned,abc,ability,able,able use,abortion,...,zine,question_marks_cnt,exclamation_marks_cnt,uppercase_words_count,sentences_count,median_sentence_len,polarity_1_sntc,subjectivity_1_sntc,polarity_last_sntc,subjectivity_last_sntc
0,-0.0613,-0.051056,-0.071476,-0.078557,-0.041258,-0.050063,-0.114927,-0.239743,-0.050097,-0.052426,...,-0.120898,-0.093352,-0.046756,-0.086356,-0.184711,-0.649253,-0.931508,0.160459,-0.281963,-0.875187
1,-0.0613,-0.051056,-0.071476,-0.078557,-0.041258,-0.050063,-0.114927,-0.239743,-0.050097,-0.052426,...,-0.120898,-0.136909,-0.086395,-0.057868,-0.331613,0.085328,3.92211,2.855572,0.561385,-0.216227


**Selecting features**

In [86]:
reg = LogisticRegression(penalty='l1',
                         C=0.1,
                         multi_class='ovr', #multinomial, ovr
                         class_weight='balanced',
                         n_jobs=4,
                         random_state=42,
                         solver='liblinear')
logistic_reg_results = cross_validate(reg, final_, y_train, cv=fixed_skf, scoring='accuracy',verbose=3, return_train_score=True)

In [94]:
reg = LogisticRegression(penalty='l1',
                         C=0.1,
                         multi_class='ovr', #multinomial, ovr
                         class_weight='balanced',
                         n_jobs=4,
                         random_state=42,
                         solver='liblinear')
reg.fit(final_, y_train)
weights = list(zip(np.abs(reg.coef_).sum(axis=0), final_.columns))
weights.sort(key = lambda x: x[0], reverse=True)



**Logistic Regression with all of the features**

In [91]:
reg = LogisticRegression(penalty='l2',
                         C=0.01, #0.01 ---> 0.7966224069213816
                         multi_class='multinomial', #multinomial, ovr
                         class_weight='balanced',
                         n_jobs=4,
                         random_state=42)
logistic_reg_results_tfidf = cross_validate(reg, final_, y_train, cv=fixed_skf, scoring='accuracy',verbose=3, return_train_score=True)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] END ..................., score=(train=1.000, test=0.882) total time= 1.1min


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  1.1min remaining:    0.0s


[CV] END ..................., score=(train=1.000, test=0.875) total time= 1.1min


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  2.3min remaining:    0.0s


[CV] END ..................., score=(train=1.000, test=0.876) total time= 1.1min


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  3.4min finished


In [93]:
pd.DataFrame(logistic_reg_results_tfidf).mean(axis=0)

fit_time       67.260816
score_time      0.308254
test_score      0.877850
train_score     0.999867
dtype: float64

**Logistic Regression with 5000 most important features**

In [108]:
selected_columns = [x[1] for x in weights[:5000]]

In [114]:
reg = LogisticRegression(penalty='l2',
                         C=0.01,
                         multi_class='multinomial', #multinomial, ovr
                         class_weight='balanced',
                         n_jobs=4,
                         random_state=42)
logistic_reg_results_tfidf = cross_validate(reg, final_[selected_columns], y_train, cv=fixed_skf, scoring='accuracy',verbose=3, return_train_score=True)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] END ..................., score=(train=1.000, test=0.931) total time=   8.3s


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    8.4s remaining:    0.0s


[CV] END ..................., score=(train=1.000, test=0.934) total time=  10.2s


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   18.8s remaining:    0.0s


[CV] END ..................., score=(train=1.000, test=0.926) total time=  10.3s


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   29.4s finished


In [116]:
pd.DataFrame(logistic_reg_results_tfidf).mean(axis=0)

fit_time       9.636649
score_time     0.055589
test_score     0.930440
train_score    0.999779
dtype: float64

**LightGBM with all of the features**

In [49]:
lgb_params = {
    #default
    "objective": "multiclass",
    'num_classes': 20, 
    "learning_rate": 0.06,
    "num_threads": 10,
    "metric": "multi_error",
    "seed": 42,
    "num_threads":4,
    
# #     #regularization
    "colsample_bytree": 0.7,
    "subsample": 0.8, 
    "subsample_freq": 1,
    "min_data_in_leaf": 45,
#     "num_leaves":100,
    "verbose":-1
}
lgb_train = lgb.Dataset(final, label=y_train, free_raw_data=False)
lgb_result = lgb.cv(lgb_params, lgb_train, 10_000, folds=fixed_skf, callbacks=[lgb.early_stopping(10), lgb.log_evaluation(10)], eval_train_metric=True, return_cvbooster=True)
lgb_params['n_estimators'] = lgb_result["cvbooster"].best_iteration

Training until validation scores don't improve for 10 rounds
[10]	cv_agg's train multi_error: 0.113179 + 0.00267901	cv_agg's valid multi_error: 0.23882 + 0.00654345
[20]	cv_agg's train multi_error: 0.0514408 + 0.00114882	cv_agg's valid multi_error: 0.207796 + 0.00706486
[30]	cv_agg's train multi_error: 0.0197986 + 0.00165478	cv_agg's valid multi_error: 0.187379 + 0.00799176
[40]	cv_agg's train multi_error: 0.00654061 + 0.000982491	cv_agg's valid multi_error: 0.174917 + 0.00773413
[50]	cv_agg's train multi_error: 0.00154676 + 0.000250036	cv_agg's valid multi_error: 0.165283 + 0.00860688
[60]	cv_agg's train multi_error: 0.000220967 + 6.25039e-05	cv_agg's valid multi_error: 0.158477 + 0.00785107
[70]	cv_agg's train multi_error: 8.8388e-05 + 6.24998e-05	cv_agg's valid multi_error: 0.152909 + 0.00770467
[80]	cv_agg's train multi_error: 8.8388e-05 + 6.24998e-05	cv_agg's valid multi_error: 0.149196 + 0.00797545
[90]	cv_agg's train multi_error: 8.8388e-05 + 6.24998e-05	cv_agg's valid multi_err

In [92]:
print('accuracy on test: ' + str(1-0.12984))
print('accuracy on train: ' + str(1-8.8388e-05))

accuracy on test: 0.87016
accuracy on train: 0.999911612


**LightGBM with 5000 most important features**

In [117]:
lgb_params = {
    #default
    "objective": "multiclass",
    'num_classes': 20, 
    "learning_rate": 0.06,
    "num_threads": 10,
    "metric": "multi_error",
    "seed": 42,
    "num_threads":4,
    
# #     #regularization
    "colsample_bytree": 0.7,
    "subsample": 0.8, 
    "subsample_freq": 1,
    "min_data_in_leaf": 45,
#     "num_leaves":100,
    "verbose":-1
}
lgb_train = lgb.Dataset(final[selected_columns], label=y_train, free_raw_data=False)
lgb_result = lgb.cv(lgb_params, lgb_train, 10_000, folds=fixed_skf, callbacks=[lgb.early_stopping(10), lgb.log_evaluation(10)], eval_train_metric=True, return_cvbooster=True)
lgb_params['n_estimators'] = lgb_result["cvbooster"].best_iteration

Training until validation scores don't improve for 10 rounds
[10]	cv_agg's train multi_error: 0.137175 + 0.00281209	cv_agg's valid multi_error: 0.241737 + 0.0117046
[20]	cv_agg's train multi_error: 0.0673061 + 0.00100991	cv_agg's valid multi_error: 0.20709 + 0.00943918
[30]	cv_agg's train multi_error: 0.0301397 + 0.00157894	cv_agg's valid multi_error: 0.184728 + 0.0081641
[40]	cv_agg's train multi_error: 0.0105622 + 0.00133616	cv_agg's valid multi_error: 0.170232 + 0.00746583
[50]	cv_agg's train multi_error: 0.00318189 + 0.000572722	cv_agg's valid multi_error: 0.15777 + 0.00640606
[60]	cv_agg's train multi_error: 0.000353546 + 6.25081e-05	cv_agg's valid multi_error: 0.150699 + 0.00611481
[70]	cv_agg's train multi_error: 8.8388e-05 + 6.24998e-05	cv_agg's valid multi_error: 0.145396 + 0.00645257
[80]	cv_agg's train multi_error: 8.8388e-05 + 6.24998e-05	cv_agg's valid multi_error: 0.142302 + 0.00571751
[90]	cv_agg's train multi_error: 8.8388e-05 + 6.24998e-05	cv_agg's valid multi_error: 0

In [119]:
df_lgb_result = pd.DataFrame()
df_lgb_result['train multi_error-mean'] = lgb_result['train multi_error-mean']
df_lgb_result['valid multi_error-mean'] = lgb_result['valid multi_error-mean']
1-df_lgb_result.iloc[-1]

train multi_error-mean    0.999912
valid multi_error-mean    0.875640
Name: 155, dtype: float64

In [142]:
easy_to_look_at_table = []
easy_to_look_at_table+=[{'name':'TF-IDF, LightGBM, 25000 features', 'test': 0.87016, 'train': 0.999911612, 'cv_time': 'too much'}]
easy_to_look_at_table+=[{'name':'TF-IDF, LightGBM, 5000 features', 'test': 0.875640, 'train': 0.999912, 'cv_time': 'less than 3 minutes'}]

easy_to_look_at_table+=[{'name':'TF-IDF, LogisticRegression, 5000 features', 'test': 0.930440, 'train': 0.999912, 'cv_time': 9.636649+0.055589}]
easy_to_look_at_table+=[{'name':'TF-IDF, LogisticRegression, 25000 features', 'train': 0.999867, 'test': 0.877850, 'cv_time':  67.260816+ 0.308254}]

easy_to_look_at_table+=[{'name':'CountVectorizer, LogisticRegression, 5000 features', 'train': 0.999779, 'test': 0.882092, 'cv_time':  11.329509+ 0.05}]
easy_to_look_at_table+=[{'name':'CountVectorizer, LogisticRegression, 25000 features', 'train': 0.999867, 'test': 0.841435, 'cv_time':  66.131167+ 0.284277}]

easy_to_look_at_table+=[{'name':'CountVectorizer, LightGBM, 5000 features', 'train': 0.999912, 'test': 0.874315, 'cv_time':  'less than 3 minutes'}]
easy_to_look_at_table+=[{'name':'CountVectorizer, LightGBM, 25000 features', 'train': 0.999911612, 'test': 0.877054, 'cv_time':  'too much'}]


In [143]:
pd.DataFrame(easy_to_look_at_table).sort_values('test')

Unnamed: 0,name,test,train,cv_time
5,"CountVectorizer, LogisticRegression, 25000 fea...",0.841435,0.999867,66.415444
0,"TF-IDF, LightGBM, 25000 features",0.87016,0.999912,too much
6,"CountVectorizer, LightGBM, 5000 features",0.874315,0.999912,less than 3 minutes
1,"TF-IDF, LightGBM, 5000 features",0.87564,0.999912,less than 3 minutes
7,"CountVectorizer, LightGBM, 25000 features",0.877054,0.999912,too much
3,"TF-IDF, LogisticRegression, 25000 features",0.87785,0.999867,67.56907
4,"CountVectorizer, LogisticRegression, 5000 feat...",0.882092,0.999779,11.379509
2,"TF-IDF, LogisticRegression, 5000 features",0.93044,0.999912,9.692238


The TF-IDF + Logistic Regression approach is the fastest and, clearly, the most successful one. I hope I will never have to train boosting model on homogeneous data ever again.