# Text classification with sklearn

Based on the scikit-learn tutorial "Working With Text Data": https://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.datasets import fetch_20newsgroups
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report,confusion_matrix

In [None]:
# Load the training split of the 20 newsgroups corpus; no category filter
# and no `remove` option are passed here.
all_train = fetch_20newsgroups(
    subset="train",
)

In [None]:
# Display the newsgroup (class) names available in the loaded split.
all_train.target_names

In [None]:
 print("\n".join(all_train.data[0].split("\n")[:10]))

In [None]:
# Newsgroups kept for the three-class classification experiments below.
categories = [
    "rec.autos",
    "rec.sport.baseball",
    "sci.space",
]

In [None]:
# Reload the train and test splits restricted to `categories`, asking the
# loader to remove headers, footers and quotes from each post.
newsgroups_train = fetch_20newsgroups(
    subset='train',
    categories=categories,
    remove=('headers', 'footers', 'quotes'),
)
newsgroups_test = fetch_20newsgroups(
    subset='test',
    categories=categories,
    remove=('headers', 'footers', 'quotes'),
)

In [None]:
# Display the first 500 characters of the first filtered training post.
newsgroups_train.data[0][:500]

## Vectorization with CountVectorizer

In [None]:
# Example 1: word counts with a frequency-pruned vocabulary.

# Upper bound on the number of vocabulary terms the vectorizer keeps.
n_features = 1000

# Float max_df / min_df are document-frequency proportions: terms in more
# than 95% or fewer than 5% of the training posts are dropped.
count_vectorizer = CountVectorizer(
    stop_words='english',
    max_features=n_features,
    min_df=0.05,
    max_df=0.95,
)

# Learn the vocabulary from the training posts only, then reuse it unchanged
# for the test posts.
train_count_vectorizer = count_vectorizer.fit_transform(newsgroups_train.data)
test_count_vectorizer = count_vectorizer.transform(newsgroups_test.data)

# Fit the classifier on the count matrix and score it on the held-out split.
clf = LogisticRegression(random_state=0)
clf.fit(train_count_vectorizer, newsgroups_train.target)
predicted = clf.predict(test_count_vectorizer)

print(classification_report(newsgroups_test.target, predicted))

In [None]:
# Example 2: unigram + bigram counts with no frequency pruning.

count_vectorizer = CountVectorizer(
    ngram_range=(1, 2),
    stop_words='english',
)

# Learn the vocabulary from the training posts only, then reuse it unchanged
# for the test posts.
train_count_vectorizer = count_vectorizer.fit_transform(newsgroups_train.data)
test_count_vectorizer = count_vectorizer.transform(newsgroups_test.data)

# Fit the classifier on the count matrix and score it on the held-out split.
clf = LogisticRegression(random_state=0)
clf.fit(train_count_vectorizer, newsgroups_train.target)
predicted = clf.predict(test_count_vectorizer)

print(classification_report(newsgroups_test.target, predicted))

## Vectorization with TF-IDF

In [None]:
# TF-IDF features. Integer max_df / min_df are absolute document counts:
# terms appearing in more than 500 or fewer than 10 training posts are dropped.
tfidf_vectorizer = TfidfVectorizer(max_df=500, min_df=10)

# Learn vocabulary and IDF weights from the training posts only, then apply
# the fitted vectorizer to the test posts.
tfidf_train = tfidf_vectorizer.fit_transform(newsgroups_train.data)
tfidf_test = tfidf_vectorizer.transform(newsgroups_test.data)

# random_state is pinned to match the CountVectorizer examples in this
# notebook, so the run stays reproducible if the solver is later switched to
# one that uses randomness.
clf = LogisticRegression(random_state=0).fit(tfidf_train, newsgroups_train.target)

predicted = clf.predict(tfidf_test)
print(classification_report(newsgroups_test.target, predicted))