In [2]:
from sklearn.datasets import fetch_20newsgroups

In [3]:
from sklearn.feature_extraction.text import CountVectorizer

# create CountVectorizer object
cv = CountVectorizer(
                    analyzer='word', # token = word
                    ngram_range=(1,1), # only unigrams are used, (1,2) - unigrams/bigrams, ..., etc.
                    stop_words=['my', 'stop', 'word', 'list'], # or stop_words='english'
                    vocabulary=None, # or vocabulary=your_own_dictionary
                    max_df=1.0, # don't filter words by their frequency
                    max_features=6 # only top-6 words will be used as columns
                    )

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer

# create TfidfVectorizer object
tv = TfidfVectorizer(
                    analyzer='word', # token = word
                    ngram_range=(1,1), # only unigrams are used, (1,2) - unigrams/bigrams, ..., etc.
                    stop_words=['my', 'stop', 'word', 'list'], # or stop_words='english'
                    vocabulary=None, # or vocabulary=your_own_dictionary
                    max_df=1.0, # don't filter words by their frequency
                    max_features=6, # only top-6 words will be used as columns,
                    smooth_idf=True,
                    norm='l2' # euclidean norm используется по дефолту
                    )

In [5]:
train = fetch_20newsgroups()
test = fetch_20newsgroups(subset="test")

In [6]:
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline

In [7]:
skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=1)

# LogisticRegression + CountVectorizer

In [7]:
# pipeline = Pipeline([
#     ('bow', CountVectorizer()),
#     ('clf', LogisticRegression()),
# ])

Сделаем перебор лучших параметров C для логистической регресии

In [8]:
# params = dict(clf__C=[10, 1, 0.1, 0.01])
# grid_search = GridSearchCV(pipeline, params, scoring="accuracy", cv=skf, n_jobs=-1)

In [9]:
# grid_search.fit(train["data"], train["target"], )

In [10]:
# grid_search.best_score_, grid_search.best_estimator_

Лучший параметр C = 1. Используем это в модели

In [11]:
from sklearn.metrics import accuracy_score, classification_report

In [12]:
pipeline = Pipeline([
    ('bow', CountVectorizer()),
    ('clf', LogisticRegression(C=1)),
])
pipeline.fit(train["data"], train["target"])

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Pipeline(steps=[('bow', CountVectorizer()), ('clf', LogisticRegression(C=1))])

In [13]:
predictions = pipeline.predict(test["data"])
accuracy_score(test["target"], predictions)

0.7915560276155071

In [14]:
print(classification_report(test["target"], predictions, target_names=test["target_names"]))

                          precision    recall  f1-score   support

             alt.atheism       0.76      0.71      0.74       319
           comp.graphics       0.67      0.75      0.71       389
 comp.os.ms-windows.misc       0.71      0.68      0.69       394
comp.sys.ibm.pc.hardware       0.65      0.68      0.67       392
   comp.sys.mac.hardware       0.78      0.79      0.79       385
          comp.windows.x       0.81      0.70      0.75       395
            misc.forsale       0.82      0.88      0.85       390
               rec.autos       0.83      0.85      0.84       396
         rec.motorcycles       0.91      0.90      0.90       398
      rec.sport.baseball       0.84      0.88      0.86       397
        rec.sport.hockey       0.93      0.91      0.92       399
               sci.crypt       0.87      0.87      0.87       396
         sci.electronics       0.69      0.71      0.70       393
                 sci.med       0.82      0.74      0.77       396
         

# LogisticRegression + TfidfVectorizer

In [15]:
# pipeline = Pipeline([
#     ('bow', TfidfVectorizer()),
#     ('clf', LogisticRegression()),
# ])

Сделаем такой же перебор лучших параметров C для логистической регресии

In [16]:
# params = {'clf__penalty': ['l1','l2'], 'clf__C': [0.001,0.01,0.1,1,10,100,1000]}
# grid_search = GridSearchCV(pipeline, params, scoring="accuracy", cv=skf, n_jobs=-1)

In [17]:
# grid_search.fit(train["data"], train["target"], )

In [18]:
# grid_search.best_score_, grid_search.best_estimator_

Лучший параметр C = 1000. Используем это в модели

In [19]:
pipeline = Pipeline([
    ('bow', TfidfVectorizer()),
    ('clf', LogisticRegression(C=1000)),
])
pipeline.fit(train["data"], train["target"])

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Pipeline(steps=[('bow', TfidfVectorizer()),
                ('clf', LogisticRegression(C=1000))])

In [20]:
predictions = pipeline.predict(test["data"])
accuracy_score(test["target"], predictions)

0.8441317047265002

In [21]:
print(classification_report(test["target"], predictions, target_names=test["target_names"]))

                          precision    recall  f1-score   support

             alt.atheism       0.80      0.78      0.79       319
           comp.graphics       0.75      0.79      0.77       389
 comp.os.ms-windows.misc       0.75      0.71      0.73       394
comp.sys.ibm.pc.hardware       0.69      0.71      0.70       392
   comp.sys.mac.hardware       0.82      0.85      0.84       385
          comp.windows.x       0.85      0.76      0.81       395
            misc.forsale       0.83      0.90      0.86       390
               rec.autos       0.92      0.90      0.91       396
         rec.motorcycles       0.96      0.96      0.96       398
      rec.sport.baseball       0.93      0.94      0.94       397
        rec.sport.hockey       0.97      0.98      0.97       399
               sci.crypt       0.94      0.93      0.93       396
         sci.electronics       0.78      0.79      0.79       393
                 sci.med       0.91      0.84      0.88       396
         

# XGBoost + CountVectorizer

In [22]:
from xgboost import XGBClassifier
xgb = XGBClassifier()

In [23]:
# pipeline = Pipeline([
#     ('bow', CountVectorizer()),
#     ('clf', XGBClassifier()),
# ])

In [24]:
# params = dict(clf__max_depth=[3, 4, 5, 6, 7, 8])
# grid_search = GridSearchCV(pipeline, params, scoring="accuracy", cv=skf, n_jobs=-1, verbose=1)

In [25]:
# grid_search.fit(train["data"], train["target"], )

In [26]:
# grid_search.best_score_, grid_search.best_estimator_

In [27]:
pipeline = Pipeline([
    ('bow', CountVectorizer()),
    ('clf', XGBClassifier(max_depth=3)),
])
pipeline.fit(train["data"], train["target"])

Pipeline(steps=[('bow', CountVectorizer()),
                ('clf',
                 XGBClassifier(base_score=0.5, booster='gbtree',
                               colsample_bylevel=1, colsample_bynode=1,
                               colsample_bytree=1, gamma=0, gpu_id=-1,
                               importance_type='gain',
                               interaction_constraints='',
                               learning_rate=0.300000012, max_delta_step=0,
                               max_depth=3, min_child_weight=1, missing=nan,
                               monotone_constraints='()', n_estimators=100,
                               n_jobs=0, num_parallel_tree=1,
                               objective='multi:softprob', random_state=0,
                               reg_alpha=0, reg_lambda=1, scale_pos_weight=None,
                               subsample=1, tree_method='exact',
                               validate_parameters=1, verbosity=None))])

In [28]:
predictions = pipeline.predict(test["data"])
accuracy_score(test["target"], predictions)

0.7841210833775889

In [29]:
print(classification_report(test["target"], predictions, target_names=test["target_names"]))

                          precision    recall  f1-score   support

             alt.atheism       0.77      0.67      0.72       319
           comp.graphics       0.67      0.73      0.70       389
 comp.os.ms-windows.misc       0.76      0.74      0.75       394
comp.sys.ibm.pc.hardware       0.65      0.74      0.69       392
   comp.sys.mac.hardware       0.75      0.79      0.77       385
          comp.windows.x       0.84      0.72      0.77       395
            misc.forsale       0.80      0.88      0.84       390
               rec.autos       0.85      0.80      0.83       396
         rec.motorcycles       0.90      0.87      0.88       398
      rec.sport.baseball       0.86      0.89      0.87       397
        rec.sport.hockey       0.92      0.89      0.91       399
               sci.crypt       0.90      0.86      0.88       396
         sci.electronics       0.54      0.65      0.59       393
                 sci.med       0.83      0.80      0.81       396
         

# LightGBM

In [30]:
!pip install lightgbm



In [8]:
from lightgbm import LGBMClassifier

In [32]:
# pipeline = Pipeline([
#     ('bow', CountVectorizer()),
#     ('clf', LGBMClassifier(min_data_in_leaf=100)),
# ])

Снова перебираем лучшие параметры

In [33]:
# params = {
#     'clf__min_data_in_leaf': [30, 50, 100, 300, 400],
#     }

In [34]:
# grid_search = GridSearchCV(pipeline, params, scoring="accuracy", cv=skf, verbose=1)

In [35]:
# grid_search.fit(train["data"], train["target"], )

In [36]:
# grid_search.best_score_, grid_search.best_estimator_

Получаем результат

In [None]:
pipeline = Pipeline([
    ('bow', CountVectorizer()),
    ('clf', LGBMClassifier(min_data_in_leaf=100)),
])
pipeline.fit(train["data"], train["target"])

In [None]:
predictions = pipeline.predict(test["data"])
accuracy_score(test["target"], predictions)

In [1]:
print(classification_report(test["target"], predictions, target_names=test["target_names"]))

NameError: name 'classification_report' is not defined

# CatBoost

In [None]:
!pip install --upgrade pip

In [None]:
!pip install catboost

In [None]:
from catboost import CatBoostClassifier

In [None]:
# pipeline = Pipeline([
#     ('bow', CountVectorizer()),
#     ('clf', CatBoostClassifier()),
# ])

# params = {
#     'clf__depth': [4, 6, 10],
#     }

# grid_search = GridSearchCV(pipeline, params, scoring="accuracy", cv=skf, n_jobs=1, verbose=1)

# grid_search.fit(train["data"], train["target"], )

In [None]:
pipeline = Pipeline([
    ('bow', CountVectorizer()),
    ('clf', CatBoostClassifier(depth=6)),
])
pipeline.fit(train["data"], train["target"])

In [None]:
predictions = pipeline.predict(test["data"])
accuracy_score(test["target"], predictions)

In [None]:
print(classification_report(test["target"], predictions, target_names=test["target_names"]))