In [90]:
import pandas as pd
from sklearn.datasets import fetch_20newsgroups

In [91]:
newsgroups = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))

In [92]:
newsgroups.keys()

dict_keys(['data', 'filenames', 'target_names', 'target', 'DESCR'])

In [93]:
print(newsgroups['DESCR'])

.. _20newsgroups_dataset:

The 20 newsgroups text dataset
------------------------------

The 20 newsgroups dataset comprises around 18000 newsgroups posts on
20 topics split in two subsets: one for training (or development)
and the other one for testing (or for performance evaluation). The split
between the train and test set is based upon a messages posted before
and after a specific date.

This module contains two loaders. The first one,
:func:`sklearn.datasets.fetch_20newsgroups`,
returns a list of the raw texts that can be fed to text feature
extractors such as :class:`~sklearn.feature_extraction.text.CountVectorizer`
with custom parameters so as to extract feature vectors.
The second one, :func:`sklearn.datasets.fetch_20newsgroups_vectorized`,
returns ready-to-use features, i.e., it is not necessary to use a feature
extractor.

**Data Set Characteristics:**

Classes                     20
Samples total            18846
Dimensionality               1
Features                  text

In [94]:
df = pd.DataFrame({'text': newsgroups['data'], 'target': newsgroups['target']})
df.head()

Unnamed: 0,text,target
0,\n\nI am sure some bashers of Pens fans are pr...,10
1,My brother is in the market for a high-perform...,3
2,\n\n\n\n\tFinally you said what you dream abou...,17
3,\nThink!\n\nIt's the SCSI card doing the DMA t...,3
4,1) I have an old Jasmine drive which I cann...,4


In [95]:
df['target'].value_counts()

target
10    999
15    997
8     996
9     994
11    991
7     990
13    990
5     988
14    987
2     985
12    984
3     982
6     975
1     973
4     963
17    940
16    910
0     799
18    775
19    628
Name: count, dtype: int64

In [96]:
print(df['text'][0])



I am sure some bashers of Pens fans are pretty confused about the lack
of any kind of posts about the recent Pens massacre of the Devils. Actually,
I am  bit puzzled too and a bit relieved. However, I am going to put an end
to non-PIttsburghers' relief with a bit of praise for the Pens. Man, they
are killing those Devils worse than I thought. Jagr just showed you why
he is much better than his regular season stats. He is also a lot
fo fun to watch in the playoffs. Bowman should let JAgr have a lot of
fun in the next couple of games since the Pens are going to beat the pulp out of Jersey anyway. I was very disappointed not to see the Islanders lose the final
regular season game.          PENS RULE!!!




In [97]:
import re
import string

In [98]:

# %pip install nltk
import nltk

In [99]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [100]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /Users/zeal.v/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/zeal.v/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/zeal.v/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/zeal.v/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [101]:
df['text_processed'] = df['text'].apply(lambda doc: doc.lower())
df['text_processed'].head() 

0    \n\ni am sure some bashers of pens fans are pr...
1    my brother is in the market for a high-perform...
2    \n\n\n\n\tfinally you said what you dream abou...
3    \nthink!\n\nit's the scsi card doing the dma t...
4    1)    i have an old jasmine drive which i cann...
Name: text_processed, dtype: object

In [102]:
df['text_processed'] = df['text_processed'].apply(lambda doc: re.sub(r'\d+', '', doc)) # remove numbers
df['text_processed'].head()

0    \n\ni am sure some bashers of pens fans are pr...
1    my brother is in the market for a high-perform...
2    \n\n\n\n\tfinally you said what you dream abou...
3    \nthink!\n\nit's the scsi card doing the dma t...
4    )    i have an old jasmine drive which i canno...
Name: text_processed, dtype: object

In [103]:
df['text_processed'] = df['text_processed'].apply(lambda doc: doc.split())
df['text_processed'].head()

0    [i, am, sure, some, bashers, of, pens, fans, a...
1    [my, brother, is, in, the, market, for, a, hig...
2    [finally, you, said, what, you, dream, about.,...
3    [think!, it's, the, scsi, card, doing, the, dm...
4    [), i, have, an, old, jasmine, drive, which, i...
Name: text_processed, dtype: object

In [104]:
df['text_processed'] = df['text_processed'].apply(
    lambda tokens: [w.translate(str.maketrans('','', string.punctuation)) for w in tokens]
)
df['text_processed'].head()

0    [i, am, sure, some, bashers, of, pens, fans, a...
1    [my, brother, is, in, the, market, for, a, hig...
2    [finally, you, said, what, you, dream, about, ...
3    [think, its, the, scsi, card, doing, the, dma,...
4    [, i, have, an, old, jasmine, drive, which, i,...
Name: text_processed, dtype: object

In [105]:
df['text_processed'] = df['text_processed'].apply(
    lambda tokens: [word for word in tokens if word.isalpha() and len(word) > 1]
)
df['text_processed'].head()

0    [am, sure, some, bashers, of, pens, fans, are,...
1    [my, brother, is, in, the, market, for, highpe...
2    [finally, you, said, what, you, dream, about, ...
3    [think, its, the, scsi, card, doing, the, dma,...
4    [have, an, old, jasmine, drive, which, cannot,...
Name: text_processed, dtype: object

In [106]:
stop_words = set(stopwords.words('english'))
df['text_processed'] = df['text_processed'].apply(
    lambda tokens: [word for word in tokens if word not in stop_words]
)
df['text_processed'].head()

0    [sure, bashers, pens, fans, pretty, confused, ...
1    [brother, market, highperformance, video, card...
2    [finally, said, dream, mediterranean, new, are...
3    [think, scsi, card, dma, transfers, disks, scs...
4    [old, jasmine, drive, cannot, use, new, system...
Name: text_processed, dtype: object

In [107]:
from nltk.corpus import wordnet

In [108]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /Users/zeal.v/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [109]:
def get_wordnet_pos(treebank_tag):
    """
    Convert an NLTK POS tag to the corresponding WordNet POS tag.
    :param treebank_tag: The POS tag in the Treebank format from NLTK
    :return: The corresponding WordNet part of speech
    """
    if treebank_tag.startswith('J'):  # Check if the tag indicates an adjective
        return wordnet.ADJ  # Return the WordNet constant for adjectives
    elif treebank_tag.startswith('N'):  # Check if the tag indicates a noun
        return wordnet.NOUN  # Return the WordNet constant for nouns
    elif treebank_tag.startswith('V'):  # Check if the tag indicates a verb
        return wordnet.VERB  # Return the WordNet constant for verbs
    elif treebank_tag.startswith('R'):  # Check if the tag indicates an adverb
        return wordnet.ADV  # Return the WordNet constant for adverbs
    else:
        return wordnet.NOUN  # Default to returning the noun constant if no match is found

In [110]:
lemmatizer = WordNetLemmatizer() # Create a WordNet lemmatizer object to use in the lemmatization process

In [111]:
nltk.download('averaged_perceptron_tagger_eng') # download the model for pos tagging 

[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /Users/zeal.v/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


True

In [112]:
df['text_processed'] = df['text_processed'].apply(
    lambda tokens: [lemmatizer.lemmatize(w,
                                         pos = get_wordnet_pos(nltk.pos_tag([w])[0][1])) 
    for w in tokens]
)

df['text_processed'].head()

0    [sure, bashers, pen, fan, pretty, confuse, lac...
1    [brother, market, highperformance, video, card...
2    [finally, say, dream, mediterranean, new, area...
3    [think, scsi, card, dma, transfer, disk, scsi,...
4    [old, jasmine, drive, cannot, use, new, system...
Name: text_processed, dtype: object

In [113]:
df['text_processed'] = df['text_processed'].apply(lambda tokens: ' '.join(tokens))
df['text_processed'].head()

0    sure bashers pen fan pretty confuse lack kind ...
1    brother market highperformance video card supp...
2    finally say dream mediterranean new area great...
3    think scsi card dma transfer disk scsi card dm...
4    old jasmine drive cannot use new system unders...
Name: text_processed, dtype: object

In [114]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

In [115]:
X_train, X_test, y_train, y_test = train_test_split(df['text_processed'], df['target'], 
                                                    test_size=0.2, random_state=42)

In [116]:
X_train.shape, X_test.shape

((15076,), (3770,))

In [117]:
vectorizer = CountVectorizer()
X_train_counts = vectorizer.fit_transform(X_train)
X_test_counts = vectorizer.transform(X_test)

X_train_counts.shape, X_test_counts.shape

((15076, 91200), (3770, 91200))

In [118]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [119]:
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [120]:
X_train_tfidf.shape, X_test_tfidf.shape

((15076, 91200), (3770, 91200))

In [121]:
X_test_tfidf.toarray()

array([[0.        , 0.13957863, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [122]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

In [123]:
model = MultinomialNB()
model.fit(X_train_counts, y_train)

y_pred = model.predict(X_test_counts)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.66      0.39      0.49       151
           1       0.53      0.74      0.61       202
           2       0.85      0.27      0.41       195
           3       0.54      0.74      0.62       183
           4       0.82      0.63      0.71       205
           5       0.72      0.79      0.75       215
           6       0.87      0.58      0.69       193
           7       0.88      0.69      0.77       196
           8       0.49      0.68      0.57       168
           9       0.94      0.77      0.85       211
          10       0.90      0.89      0.90       198
          11       0.64      0.79      0.71       201
          12       0.79      0.56      0.66       202
          13       0.84      0.81      0.83       194
          14       0.74      0.78      0.76       189
          15       0.46      0.91      0.61       202
          16       0.73      0.66      0.70       188
          17       0.62    

In [None]:
model = MultinomialNB() # Create a new Multinomial Naive Bayes model object 
model.fit(X_train_tfidf, y_train)

y_pred = model.predict(X_test_tfidf)
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.77      0.24      0.36       151
           1       0.68      0.66      0.67       202
           2       0.69      0.58      0.63       195
           3       0.52      0.76      0.62       183
           4       0.83      0.65      0.73       205
           5       0.88      0.81      0.85       215
           6       0.84      0.64      0.73       193
           7       0.84      0.72      0.78       196
           8       0.49      0.77      0.60       168
           9       0.91      0.83      0.87       211
          10       0.87      0.91      0.89       198
          11       0.62      0.83      0.71       201
          12       0.82      0.62      0.71       202
          13       0.89      0.82      0.85       194
          14       0.79      0.80      0.80       189
          15       0.39      0.95      0.55       202
          16       0.70      0.80      0.74       188
          17       0.76    

In [125]:
# lematization is better than stemming because it keeps the meaning of the word and 
# it is more accurate than stemming 0.82 vs 0.77   

In [126]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity