In [None]:
import pandas as pd

## The 20 newsgroups dataset

The [20 Newsgroups data set](http://qwone.com/~jason/20Newsgroups/) is a collection of approximately 20,000 newsgroup documents, partitioned (nearly) evenly across 20 different newsgroups.
The 20 newsgroups collection has become a popular data set for experiments in text applications of machine learning techniques, such as text classification and text clustering.

The data is organized into 20 different newsgroups, each corresponding to a different topic:

- 'alt.atheism',
- 'comp.graphics',
- 'comp.os.ms-windows.misc',
- 'comp.sys.ibm.pc.hardware',
- 'comp.sys.mac.hardware',
- 'comp.windows.x',
- 'misc.forsale',
- 'rec.autos',
- 'rec.motorcycles',
- 'rec.sport.baseball',
- 'rec.sport.hockey',
- 'sci.crypt',
- 'sci.electronics',
- 'sci.med',
- 'sci.space',
- 'soc.religion.christian',
- 'talk.politics.guns',
- 'talk.politics.mideast',
- 'talk.politics.misc',
- 'talk.religion.misc'

We will work on a subset of the data, using only 6 of the 20 available categories.

In [None]:
from sklearn.datasets import fetch_20newsgroups

In [None]:
# Work with six of the twenty newsgroups (any other selection would do).
categories = [
    'alt.atheism',
    'comp.windows.x',
    'rec.autos',
    'rec.sport.baseball',
    'sci.electronics',
    'sci.space',
]

# Fetch the train split, then the test split, with identical settings:
# the same category filter, and headers, footers and quoted replies removed
# from every post.
train, test = (
    fetch_20newsgroups(
        subset=split,
        categories=categories,
        remove=('headers', 'footers', 'quotes'),
    )
    for split in ('train', 'test')
)

In [None]:
# Training split as a table: one row per post, holding its text and its label code.
train_data = pd.DataFrame(dict(text=train['data'], category=train['target']))
train_data.head()

In [None]:
# Test split as a table, with the same two columns as the training table.
test_data = pd.DataFrame(dict(text=test['data'], category=test['target']))
test_data.head()

In [None]:
# Names of the selected newsgroups. Presumably the position of a name in this
# list is the integer code stored in the `category` column — verify against the output.
train['target_names']

In [None]:
# Number of training posts per category label (checks how balanced the classes are).
train_data['category'].value_counts()

In [None]:
# Number of test posts per category label.
test_data['category'].value_counts()

In [None]:
# Print one randomly drawn training post from the 'sci.space' newsgroup.
# The label code is looked up by name instead of being hard-coded, so this cell
# stays correct if the `categories` selection changes.
space_label = train['target_names'].index('sci.space')
# print() is used deliberately: it renders the post's line breaks, which the
# plain repr of a string would show as '\n'.
print(train_data.loc[train_data['category'] == space_label, 'text'].sample().iloc[0])

**Goal**: classify documents from the dataset by their topic.

## Training a Naive Bayes model

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import GridSearchCV

In [None]:
# Model inputs are the raw post texts; targets are the category labels.
X_train, y_train = train_data['text'], train_data['category']

In [None]:
# Two-step text classifier: the 'vect' step turns raw text into TF-IDF features,
# the 'clf' step is a multinomial Naive Bayes model trained on those features.
nb_steps = [
    ('vect', TfidfVectorizer()),
    ('clf', MultinomialNB()),
]
pipe = Pipeline(nb_steps)

In [None]:
# Hyper-parameter grid for the search. Every key has the form
# '<step>__<parameter>' and targets the 'vect' (vectorizer) step of the pipeline.
# 4 * 2 * 4 * 3 * 2 = 192 combinations in total.
params_dic = dict(
    vect__max_features=[1000, 2000, 5000, 10000],
    vect__stop_words=['english', None],
    vect__min_df=[5, 10, 20, 50],
    vect__ngram_range=[(1, 1), (1, 2), (1, 3)],
    vect__use_idf=[True, False],
)

In [None]:
# Exhaustive search over every combination in `params_dic`, scored by accuracy
# with 5-fold cross-validation. n_jobs=-1 asks for the fits to run in parallel.
grid = GridSearchCV(
    estimator=pipe,
    param_grid=params_dic,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=True,
)
grid.fit(X_train, y_train)

In [None]:
# Cross-validated accuracy achieved by the best parameter combination
# (the search above was scored with scoring='accuracy').
grid.best_score_

In [None]:
# The parameter combination from `params_dic` that obtained the best score.
grid.best_params_

In [None]:
# Keep the winning pipeline (vectorizer + classifier) for evaluation below.
# NOTE(review): this relies on GridSearchCV refitting the best configuration on
# the full training data, which is its documented default — confirm if `refit` is ever changed.
best_clf = grid.best_estimator_

In [None]:
# Evaluate the tuned pipeline on the held-out test split.
X_test = test_data.text
y_test = test_data.category
y_test_pred = best_clf.predict(X_test)

# Display the test accuracy as the cell output, so it can be compared with the
# cross-validated score reported by the grid search.
accuracy_score(y_test, y_test_pred)

In [None]:
# Confusion matrix on the test split, labelled with the newsgroup names so it
# can be read without cross-referencing the integer label codes.
# Rows are the true categories, columns are the predicted ones.
newsgroup_names = train['target_names']
pd.DataFrame(
    # Pass `labels` explicitly so the row/column order is fixed to 0..n-1 and
    # always lines up with `newsgroup_names`, even if a class were never predicted.
    confusion_matrix(y_test, y_test_pred, labels=list(range(len(newsgroup_names)))),
    index=newsgroup_names,
    columns=newsgroup_names,
)

In [None]:
# Shown again as a legend for decoding the integer labels in the nearby outputs
# (presumably list position == label code — verify).
train['target_names']

In [None]:
# Sanity check on two hand-written sentences; the output is the predicted
# integer label for each, in the same order.
example_posts = [
    'I always wanted to be an astronaut',
    'I hate Windows 10',
]
best_clf.predict(example_posts)

## Logistic regression

In [None]:
# Same TF-IDF front end as before, now feeding a logistic regression classifier
# with default settings. Note that this rebinds the name `pipe` used earlier.
logreg_steps = [
    ('vect', TfidfVectorizer()),
    ('clf', LogisticRegression()),
]
pipe = Pipeline(logreg_steps)

In [None]:
# Fit the vectorizer and the logistic regression on the training split
# (no hyper-parameter search here; default settings are used).
pipe.fit(X_train,y_train)

In [None]:
# Shape of the fitted coefficient matrix of the 'clf' step.
# NOTE(review): expected to be (n_classes, n_features), i.e. 6 rows and one
# column per vocabulary term — confirm against the output.
pipe['clf'].coef_.shape