In [46]:
from sklearn.datasets import fetch_20newsgroups

In [43]:
from sklearn.feature_extraction.text import CountVectorizer

# create CountVectorizer object
cv = CountVectorizer(
                    analyzer='word', # token = word
                    ngram_range=(1,1), # only unigrams are used, (1,2) - unigrams/bigrams, ..., etc.
                    stop_words=['my', 'stop', 'word', 'list'], # or stop_words='english'
                    vocabulary=None, # or vocabulary=your_own_dictionary
                    max_df=1.0, # don't filter words by their frequency
                    max_features=6 # only top-6 words will be used as columns
                    )

In [45]:
from sklearn.feature_extraction.text import TfidfVectorizer

# create TfidfVectorizer object
tv = TfidfVectorizer(
                    analyzer='word', # token = word
                    ngram_range=(1,1), # only unigrams are used, (1,2) - unigrams/bigrams, ..., etc.
                    stop_words=['my', 'stop', 'word', 'list'], # or stop_words='english'
                    vocabulary=None, # or vocabulary=your_own_dictionary
                    max_df=1.0, # don't filter words by their frequency
                    max_features=6, # only top-6 words will be used as columns,
                    smooth_idf=True,
                    norm='l2' # euclidean norm используется по дефолту
                    )

In [48]:
train = fetch_20newsgroups()
test = fetch_20newsgroups(subset="test")

In [49]:
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline

In [50]:
skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=1)

# LogisticRegression + CountVectorizer

In [52]:
pipeline = Pipeline([
    ('bow', CountVectorizer()),
    ('clf', LogisticRegression()),
])

In [53]:
params = dict(clf__C=[10, 1, 0.1, 0.01])
grid_search = GridSearchCV(pipeline, params, scoring="accuracy", cv=skf, n_jobs=-1)

In [54]:
grid_search.fit(train["data"], train["target"], )

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


GridSearchCV(cv=StratifiedKFold(n_splits=3, random_state=1, shuffle=True),
             estimator=Pipeline(steps=[('bow', CountVectorizer()),
                                       ('clf', LogisticRegression())]),
             n_jobs=-1, param_grid={'clf__C': [10, 1, 0.1, 0.01]},
             scoring='accuracy')

In [56]:
grid_search.best_score_, grid_search.best_estimator_

(0.863266098677382,
 Pipeline(steps=[('bow', CountVectorizer()), ('clf', LogisticRegression(C=1))]))

In [57]:
from sklearn.metrics import accuracy_score, classification_report

In [58]:
pipeline = Pipeline([
    ('bow', CountVectorizer()),
    ('clf', LogisticRegression(C=1)),
])
pipeline.fit(train["data"], train["target"])

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Pipeline(steps=[('bow', CountVectorizer()), ('clf', LogisticRegression(C=1))])

In [59]:
predictions = pipeline.predict(test["data"])
accuracy_score(test["target"], predictions)

0.7858470525756771

In [60]:
print(classification_report(test["target"], predictions, target_names=test["target_names"]))

                          precision    recall  f1-score   support

             alt.atheism       0.78      0.67      0.72       319
           comp.graphics       0.66      0.76      0.70       389
 comp.os.ms-windows.misc       0.72      0.67      0.69       394
comp.sys.ibm.pc.hardware       0.66      0.68      0.67       392
   comp.sys.mac.hardware       0.77      0.80      0.79       385
          comp.windows.x       0.79      0.71      0.74       395
            misc.forsale       0.81      0.88      0.85       390
               rec.autos       0.84      0.84      0.84       396
         rec.motorcycles       0.90      0.91      0.91       398
      rec.sport.baseball       0.85      0.87      0.86       397
        rec.sport.hockey       0.93      0.91      0.92       399
               sci.crypt       0.87      0.88      0.87       396
         sci.electronics       0.69      0.72      0.70       393
                 sci.med       0.77      0.74      0.76       396
         

# LogisticRegression + TfidfVectorizer

In [93]:
pipeline = Pipeline([
    ('bow', TfidfVectorizer()),
    ('clf', LogisticRegression(C=1)),
])
pipeline.fit(train["data"], train["target"])

Pipeline(steps=[('bow', TfidfVectorizer()), ('clf', LogisticRegression(C=1))])

In [94]:
predictions = pipeline.predict(test["data"])
accuracy_score(test["target"], predictions)

0.8274030801911842

In [95]:
print(classification_report(test["target"], predictions, target_names=test["target_names"]))

                          precision    recall  f1-score   support

             alt.atheism       0.80      0.74      0.77       319
           comp.graphics       0.69      0.79      0.74       389
 comp.os.ms-windows.misc       0.75      0.73      0.74       394
comp.sys.ibm.pc.hardware       0.72      0.72      0.72       392
   comp.sys.mac.hardware       0.81      0.83      0.82       385
          comp.windows.x       0.83      0.74      0.78       395
            misc.forsale       0.76      0.90      0.82       390
               rec.autos       0.90      0.89      0.90       396
         rec.motorcycles       0.95      0.95      0.95       398
      rec.sport.baseball       0.88      0.92      0.90       397
        rec.sport.hockey       0.94      0.95      0.95       399
               sci.crypt       0.94      0.88      0.91       396
         sci.electronics       0.76      0.80      0.78       393
                 sci.med       0.89      0.83      0.85       396
         

# XGBoost + CountVectorizer

In [97]:
from xgboost import XGBClassifier
xgb = XGBClassifier()

In [99]:
pipeline = Pipeline([
    ('bow', CountVectorizer()),
    ('clf', XGBClassifier()),
])

In [100]:
params = dict(clf__max_depth=[3, 4, 5, 6, 7, 8])
grid_search = GridSearchCV(pipeline, params, scoring="accuracy", cv=skf, n_jobs=-1, verbose=1)

In [101]:
grid_search.fit(train["data"], train["target"], )

Fitting 3 folds for each of 6 candidates, totalling 18 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 out of  18 | elapsed: 55.5min finished


GridSearchCV(cv=StratifiedKFold(n_splits=3, random_state=1, shuffle=True),
             estimator=Pipeline(steps=[('bow', CountVectorizer()),
                                       ('clf',
                                        XGBClassifier(base_score=None,
                                                      booster=None,
                                                      colsample_bylevel=None,
                                                      colsample_bynode=None,
                                                      colsample_bytree=None,
                                                      gamma=None, gpu_id=None,
                                                      importance_type='gain',
                                                      interaction_constraints=None,
                                                      learning_rate=None,
                                                      max_delta_step=...
                                                    

In [104]:
grid_search.best_score_, grid_search.best_estimator_

(0.8394905578366426,
 Pipeline(steps=[('bow', CountVectorizer()),
                 ('clf',
                  XGBClassifier(base_score=0.5, booster='gbtree',
                                colsample_bylevel=1, colsample_bynode=1,
                                colsample_bytree=1, gamma=0, gpu_id=-1,
                                importance_type='gain',
                                interaction_constraints='',
                                learning_rate=0.300000012, max_delta_step=0,
                                max_depth=3, min_child_weight=1, missing=nan,
                                monotone_constraints='()', n_estimators=100,
                                n_jobs=0, num_parallel_tree=1,
                                objective='multi:softprob', random_state=0,
                                reg_alpha=0, reg_lambda=1, scale_pos_weight=None,
                                subsample=1, tree_method='exact',
                                validate_parameters=1, verbosit

In [105]:
predictions = pipeline.predict(test["data"])
accuracy_score(test["target"], predictions)

NotFittedError: Vocabulary not fitted or provided

# LightGBM

In [None]:
train["data"]

In [86]:
import lightgbm as lgb
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold

gkf = KFold(n_splits=5, shuffle=True, random_state=42).split(X=train["data"], y=train["target"])

param_grid = {
    'num_leaves': [31, 127],
    'reg_alpha': [0.1, 0.5],
    'min_data_in_leaf': [30, 50, 100, 300, 400],
    'lambda_l1': [0, 1, 1.5],
    'lambda_l2': [0, 1]
    }

lgb_estimator = lgb.LGBMClassifier(boosting_type='gbdt',  objective='binary', num_boost_round=2000, learning_rate=0.01, metric='auc')

gsearch = GridSearchCV(estimator=lgb_estimator, param_grid=param_grid, cv=gkf)
lgb_model = gsearch.fit(X=train["data"], y=train["target"])

print(lgb_model.best_params_, lgb_model.best_score_)

AttributeError: 'list' object has no attribute 'reshape'

# CatBoost

In [92]:
pipeline = Pipeline([
    ('bow', CountVectorizer()),
    ('clf', CatBoost()),
])

params = dict(clf__learning_rate=[0.03, 0.1], clf__depth=[4, 6, 10], clf__l2_leaf_reg=[1, 3, 5, 7, 9])
grid_search = GridSearchCV(pipeline, params, scoring="accuracy", cv=skf, n_jobs=1, verbose=1)

grid_search.fit(train["data"], train["target"], )

Fitting 3 folds for each of 30 candidates, totalling 90 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


0:	learn: 5.5297126	total: 442ms	remaining: 7m 21s
1:	learn: 5.5034984	total: 859ms	remaining: 7m 8s
2:	learn: 5.4762546	total: 1.32s	remaining: 7m 17s
3:	learn: 5.4464302	total: 1.71s	remaining: 7m 6s
4:	learn: 5.4175812	total: 2.1s	remaining: 6m 58s
5:	learn: 5.3892279	total: 2.56s	remaining: 7m 4s
6:	learn: 5.3674929	total: 3.02s	remaining: 7m 8s
7:	learn: 5.3412439	total: 3.45s	remaining: 7m 7s
8:	learn: 5.3175222	total: 3.89s	remaining: 7m 8s
9:	learn: 5.2950295	total: 4.31s	remaining: 7m 7s
10:	learn: 5.2711856	total: 4.8s	remaining: 7m 11s
11:	learn: 5.2528775	total: 5.25s	remaining: 7m 12s
12:	learn: 5.2326384	total: 5.74s	remaining: 7m 15s
13:	learn: 5.2123785	total: 6.2s	remaining: 7m 16s
14:	learn: 5.1951791	total: 6.61s	remaining: 7m 14s
15:	learn: 5.1757774	total: 7.05s	remaining: 7m 13s
16:	learn: 5.1585947	total: 7.53s	remaining: 7m 15s
17:	learn: 5.1416211	total: 7.96s	remaining: 7m 14s
18:	learn: 5.1232928	total: 8.4s	remaining: 7m 13s
19:	learn: 5.1064587	total: 8.83s

159:	learn: 4.1620467	total: 1m 13s	remaining: 6m 23s
160:	learn: 4.1596208	total: 1m 13s	remaining: 6m 22s
161:	learn: 4.1562343	total: 1m 13s	remaining: 6m 22s
162:	learn: 4.1531739	total: 1m 14s	remaining: 6m 21s
163:	learn: 4.1509635	total: 1m 14s	remaining: 6m 21s
164:	learn: 4.1477106	total: 1m 15s	remaining: 6m 20s
165:	learn: 4.1447244	total: 1m 15s	remaining: 6m 20s
166:	learn: 4.1415980	total: 1m 16s	remaining: 6m 19s
167:	learn: 4.1395307	total: 1m 16s	remaining: 6m 19s
168:	learn: 4.1360620	total: 1m 17s	remaining: 6m 19s
169:	learn: 4.1323731	total: 1m 17s	remaining: 6m 18s
170:	learn: 4.1288465	total: 1m 17s	remaining: 6m 18s
171:	learn: 4.1258743	total: 1m 18s	remaining: 6m 17s
172:	learn: 4.1223380	total: 1m 18s	remaining: 6m 17s
173:	learn: 4.1199311	total: 1m 19s	remaining: 6m 16s
174:	learn: 4.1170314	total: 1m 19s	remaining: 6m 16s
175:	learn: 4.1143867	total: 1m 20s	remaining: 6m 15s
176:	learn: 4.1106640	total: 1m 20s	remaining: 6m 15s
177:	learn: 4.1070918	total:

312:	learn: 3.7849644	total: 2m 21s	remaining: 5m 11s
313:	learn: 3.7830347	total: 2m 22s	remaining: 5m 10s
314:	learn: 3.7811828	total: 2m 22s	remaining: 5m 10s
315:	learn: 3.7789364	total: 2m 23s	remaining: 5m 9s
316:	learn: 3.7770453	total: 2m 23s	remaining: 5m 9s
317:	learn: 3.7749425	total: 2m 23s	remaining: 5m 8s
318:	learn: 3.7730697	total: 2m 24s	remaining: 5m 8s
319:	learn: 3.7708315	total: 2m 24s	remaining: 5m 7s
320:	learn: 3.7682868	total: 2m 25s	remaining: 5m 7s
321:	learn: 3.7661778	total: 2m 25s	remaining: 5m 6s
322:	learn: 3.7640187	total: 2m 26s	remaining: 5m 6s
323:	learn: 3.7621192	total: 2m 26s	remaining: 5m 5s
324:	learn: 3.7600579	total: 2m 27s	remaining: 5m 5s
325:	learn: 3.7580104	total: 2m 27s	remaining: 5m 4s
326:	learn: 3.7561471	total: 2m 27s	remaining: 5m 4s
327:	learn: 3.7534631	total: 2m 28s	remaining: 5m 4s
328:	learn: 3.7515453	total: 2m 28s	remaining: 5m 3s
329:	learn: 3.7496896	total: 2m 29s	remaining: 5m 3s
330:	learn: 3.7477178	total: 2m 29s	remaini

466:	learn: 3.5111914	total: 3m 31s	remaining: 4m
467:	learn: 3.5099803	total: 3m 31s	remaining: 4m
468:	learn: 3.5087702	total: 3m 32s	remaining: 4m
469:	learn: 3.5073913	total: 3m 32s	remaining: 3m 59s
470:	learn: 3.5061161	total: 3m 32s	remaining: 3m 59s
471:	learn: 3.5044858	total: 3m 33s	remaining: 3m 58s
472:	learn: 3.5033157	total: 3m 33s	remaining: 3m 58s
473:	learn: 3.5020723	total: 3m 34s	remaining: 3m 57s
474:	learn: 3.5008122	total: 3m 34s	remaining: 3m 57s
475:	learn: 3.4995247	total: 3m 35s	remaining: 3m 56s
476:	learn: 3.4982924	total: 3m 35s	remaining: 3m 56s
477:	learn: 3.4970776	total: 3m 35s	remaining: 3m 55s
478:	learn: 3.4957116	total: 3m 36s	remaining: 3m 55s
479:	learn: 3.4942447	total: 3m 36s	remaining: 3m 54s
480:	learn: 3.4929460	total: 3m 37s	remaining: 3m 54s
481:	learn: 3.4914707	total: 3m 37s	remaining: 3m 53s
482:	learn: 3.4902823	total: 3m 38s	remaining: 3m 53s
483:	learn: 3.4885450	total: 3m 38s	remaining: 3m 53s
484:	learn: 3.4868902	total: 3m 39s	rema

620:	learn: 3.3196507	total: 4m 40s	remaining: 2m 51s
621:	learn: 3.3187357	total: 4m 40s	remaining: 2m 50s
622:	learn: 3.3175786	total: 4m 41s	remaining: 2m 50s
623:	learn: 3.3166377	total: 4m 41s	remaining: 2m 49s
624:	learn: 3.3155910	total: 4m 42s	remaining: 2m 49s
625:	learn: 3.3141836	total: 4m 42s	remaining: 2m 48s
626:	learn: 3.3130926	total: 4m 43s	remaining: 2m 48s
627:	learn: 3.3123048	total: 4m 43s	remaining: 2m 48s
628:	learn: 3.3114778	total: 4m 44s	remaining: 2m 47s
629:	learn: 3.3102130	total: 4m 44s	remaining: 2m 47s
630:	learn: 3.3090699	total: 4m 44s	remaining: 2m 46s
631:	learn: 3.3080843	total: 4m 45s	remaining: 2m 46s
632:	learn: 3.3070964	total: 4m 45s	remaining: 2m 45s
633:	learn: 3.3061738	total: 4m 46s	remaining: 2m 45s
634:	learn: 3.3048220	total: 4m 46s	remaining: 2m 44s
635:	learn: 3.3038810	total: 4m 47s	remaining: 2m 44s
636:	learn: 3.3028751	total: 4m 47s	remaining: 2m 43s
637:	learn: 3.3017759	total: 4m 48s	remaining: 2m 43s
638:	learn: 3.3008516	total:

773:	learn: 3.1716076	total: 5m 49s	remaining: 1m 41s
774:	learn: 3.1704926	total: 5m 49s	remaining: 1m 41s
775:	learn: 3.1697373	total: 5m 49s	remaining: 1m 41s
776:	learn: 3.1688818	total: 5m 50s	remaining: 1m 40s
777:	learn: 3.1678581	total: 5m 50s	remaining: 1m 40s
778:	learn: 3.1670806	total: 5m 51s	remaining: 1m 39s
779:	learn: 3.1664942	total: 5m 51s	remaining: 1m 39s
780:	learn: 3.1657195	total: 5m 52s	remaining: 1m 38s
781:	learn: 3.1648218	total: 5m 52s	remaining: 1m 38s
782:	learn: 3.1639437	total: 5m 53s	remaining: 1m 37s
783:	learn: 3.1630849	total: 5m 53s	remaining: 1m 37s
784:	learn: 3.1623727	total: 5m 53s	remaining: 1m 36s
785:	learn: 3.1617085	total: 5m 54s	remaining: 1m 36s
786:	learn: 3.1608680	total: 5m 54s	remaining: 1m 36s
787:	learn: 3.1600884	total: 5m 55s	remaining: 1m 35s
788:	learn: 3.1592633	total: 5m 55s	remaining: 1m 35s
789:	learn: 3.1583797	total: 5m 56s	remaining: 1m 34s
790:	learn: 3.1577004	total: 5m 56s	remaining: 1m 34s
791:	learn: 3.1568158	total:

928:	learn: 3.0489151	total: 6m 57s	remaining: 31.9s
929:	learn: 3.0482421	total: 6m 58s	remaining: 31.5s
930:	learn: 3.0475462	total: 6m 58s	remaining: 31s
931:	learn: 3.0470616	total: 6m 59s	remaining: 30.6s
932:	learn: 3.0464352	total: 6m 59s	remaining: 30.1s
933:	learn: 3.0457324	total: 6m 59s	remaining: 29.7s
934:	learn: 3.0451539	total: 7m	remaining: 29.2s
935:	learn: 3.0445153	total: 7m	remaining: 28.8s
936:	learn: 3.0435825	total: 7m 1s	remaining: 28.3s
937:	learn: 3.0425962	total: 7m 1s	remaining: 27.9s
938:	learn: 3.0420144	total: 7m 2s	remaining: 27.4s
939:	learn: 3.0414834	total: 7m 2s	remaining: 27s
940:	learn: 3.0407416	total: 7m 2s	remaining: 26.5s
941:	learn: 3.0401347	total: 7m 3s	remaining: 26.1s
942:	learn: 3.0393132	total: 7m 3s	remaining: 25.6s
943:	learn: 3.0385305	total: 7m 4s	remaining: 25.2s
944:	learn: 3.0379066	total: 7m 4s	remaining: 24.7s
945:	learn: 3.0373856	total: 7m 5s	remaining: 24.3s
946:	learn: 3.0367993	total: 7m 5s	remaining: 23.8s
947:	learn: 3.03

ValueError: Classification metrics can't handle a mix of multiclass and continuous targets

In [None]:
grid_search.best_score_, grid_search.best_estimator_