In [1]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from textvec import vectorizers

**Fetching 20newsgroups dataset with sklearn:**

In [2]:
# Load the predefined "train" and "test" subsets of the 20 newsgroups corpus
# (train is fetched first, then test).
newsgroups_train, newsgroups_test = [
    fetch_20newsgroups(subset=subset_name)
    for subset_name in ("train", "test")
]

**Frequency counting:**

In [3]:
# Fit the vectorizer on the training documents only, then reuse that fitted
# vectorizer to turn the test documents into a count matrix.
cv = CountVectorizer()
cv_train = cv.fit_transform(raw_documents=newsgroups_train.data)
cv_test = cv.transform(raw_documents=newsgroups_test.data)

# Basic usage

### TfIdf

In [4]:
# Fit the TF-IDF re-weighting on the training counts, then apply that same
# fitted transformer to both the training and the test count matrices.
tfidf = TfidfTransformer().fit(cv_train)
tfidf_train = tfidf.transform(cv_train)
tfidf_test = tfidf.transform(cv_test)

In [5]:
# Train a logistic-regression classifier on the TF-IDF features and score it
# on the test split. The final expression is left bare so the notebook
# displays the accuracy as the cell output.
clf = LogisticRegression(multi_class="auto", solver="liblinear").fit(
    tfidf_train, newsgroups_train.target
)
preds = clf.predict(tfidf_test)
accuracy_score(y_true=newsgroups_test.target, y_pred=preds)

0.8279341476367499

### TfIcf

In [6]:
# Re-weight the raw term counts with textvec's TfIcf vectorizer.
# Unlike the TF-IDF cell, fitting here also receives the training labels
# (`newsgroups_train.target`); the test counts are transformed without labels.
tficf = vectorizers.TfIcfVectorizer()
tficf_train = tficf.fit_transform(cv_train, newsgroups_train.target)
tficf_test = tficf.transform(cv_test)

In [7]:
# Same classifier configuration as in the TfIdf section, now trained on the
# TfIcf features so the two accuracies can be compared directly.
clf = LogisticRegression(multi_class="auto", solver="liblinear").fit(
    tficf_train, newsgroups_train.target
)
preds = clf.predict(tficf_test)
accuracy_score(y_true=newsgroups_test.target, y_pred=preds)

0.8416091343600637

# Sklearn Pipeline
Textvec vectorizers can also be used as a step inside a scikit-learn `Pipeline`.

### TfIdf

In [8]:
# Raw text -> term counts -> TF-IDF weights -> logistic regression,
# chained into a single estimator.
pipeline = Pipeline(
    steps=[
        ("vectorizer", CountVectorizer()),
        ("transformer", TfidfTransformer()),
        ("clf", LogisticRegression(multi_class="auto", solver="liblinear")),
    ]
)

In [9]:
# Fit the whole pipeline on the raw training documents, predict labels for
# the raw test documents, and display the resulting accuracy.
preds = pipeline.fit(
    newsgroups_train.data, newsgroups_train.target
).predict(newsgroups_test.data)
accuracy_score(y_true=newsgroups_test.target, y_pred=preds)

0.8279341476367499

### TfIcf

In [10]:
# Same three-stage layout as the TfIdf pipeline, with textvec's
# TfIcfVectorizer as the "transformer" step.
pipeline = Pipeline(
    steps=[
        ("vectorizer", CountVectorizer()),
        ("transformer", vectorizers.TfIcfVectorizer()),
        ("clf", LogisticRegression(multi_class="auto", solver="liblinear")),
    ]
)

In [11]:
# Fit the TfIcf pipeline on the raw training documents and their labels,
# predict on the raw test documents, and display the accuracy.
preds = pipeline.fit(
    newsgroups_train.data, newsgroups_train.target
).predict(newsgroups_test.data)
accuracy_score(y_true=newsgroups_test.target, y_pred=preds)

0.8420074349442379