In [2]:
# `sklearn.datasets.twenty_newsgroups` was a private module path (deprecated in
# scikit-learn 0.22, removed in 0.24). Bind the public `sklearn.datasets` package
# to the same name instead, so `twenty_newsgroups.fetch_20newsgroups(...)` keeps
# working on both old and current scikit-learn releases.
from sklearn import datasets as twenty_newsgroups

In [3]:
# Load the 20 Newsgroups dataset with default arguments; in the cells shown,
# only its `target_names` attribute is read (to list the available categories).
data = twenty_newsgroups.fetch_20newsgroups()

In [4]:
# Show every category name available in the dataset.
data.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [5]:
# Subset of newsgroups used for the classifier (order here is the request order;
# the loaded dataset reports its own ordering via `target_names`).
categories = [
    'alt.atheism',
    'sci.med',
    'soc.religion.christian',
    'misc.forsale',
]

In [6]:
# Training split restricted to the selected categories; the fixed seed keeps
# the shuffle order reproducible across runs.
twenty_train = twenty_newsgroups.fetch_20newsgroups(
    subset='train',
    categories=categories,
    shuffle=True,
    random_state=42,
)

In [7]:
# Test split for the same categories, loaded with the same shuffle seed as the
# training split.
twenty_test = twenty_newsgroups.fetch_20newsgroups(
    subset='test',
    categories=categories,
    shuffle=True,
    random_state=42,
)

In [8]:
# Number of entries in the training split.
len(twenty_train.data)

2258

In [9]:
# Number of entries in the test split.
len(twenty_test.data)

1503

In [10]:
# Category names of the loaded subset; a predicted integer label is a position
# in this list.
twenty_train.target_names

['alt.atheism', 'misc.forsale', 'sci.med', 'soc.religion.christian']

### Building Pipeline

In [11]:
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer

In [12]:
# NOTE(review): `cv` is not referenced by any later cell shown here — the
# pipeline cell constructs its own CountVectorizer() without stop-word
# filtering. Either pass `cv` into the pipeline or remove this cell; confirm
# the intent before changing, since it would alter the reported score.
cv = CountVectorizer(stop_words='english')

In [13]:
# Text-classification pipeline:
#   'cv'    -> token counts from raw text
#   'tfidf' -> TF-IDF re-weighting of those counts
#   'mnb'   -> multinomial Naive Bayes classifier
pipe = Pipeline(steps=[
    ('cv', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('mnb', MultinomialNB()),
])

In [14]:
# Fit every pipeline step on the training texts and their integer labels.
pipe.fit(twenty_train.data,twenty_train.target)

Pipeline(memory=None,
     steps=[('cv', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_a...inear_tf=False, use_idf=True)), ('mnb', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [15]:
# Shown again as a lookup table for the predicted label indices below.
twenty_train.target_names

['alt.atheism', 'misc.forsale', 'sci.med', 'soc.religion.christian']

In [16]:
# Predict the label index for one short sentence (input must be a list of texts).
pipe.predict(['work hard'])

array([1], dtype=int64)

#### `array([1])` is an index into `twenty_train.target_names`, so the sentence 'work hard' is classified as `misc.forsale`

In [18]:
# Same single-sentence prediction; double quotes avoid escaping the apostrophe.
pipe.predict(["i'm looking for a job"])

array([1], dtype=int64)

#### `array([1])` means the sentence 'i'm looking for a job' is also classified as `misc.forsale`

In [19]:
# Predict the label index for a health-related sentence.
pipe.predict(['i have cold '])

array([2], dtype=int64)

#### `array([2])` means the sentence 'i have cold' is classified as `sci.med`

In [20]:
# Predict the label index for a religion-related sentence.
pipe.predict(['God Must Be Crazy'])

array([3], dtype=int64)

#### `array([3])` means the sentence 'God Must Be Crazy' is classified as `soc.religion.christian`

In [21]:
# Evaluate the fitted pipeline on the held-out test split (for a classifier,
# `score` reports mean accuracy).
pipe.score(twenty_test.data,twenty_test.target)

0.8469727212242182