In [30]:
import spacy
from sklearn.feature_extraction.text import CountVectorizer
cv =  CountVectorizer(ngram_range=(1,3))
cv.fit(['hey i am here to learn n_grams'])
cv.vocabulary_

{'hey': 6,
 'am': 0,
 'here': 3,
 'to': 12,
 'learn': 9,
 'n_grams': 11,
 'hey am': 7,
 'am here': 1,
 'here to': 4,
 'to learn': 13,
 'learn n_grams': 10,
 'hey am here': 8,
 'am here to': 2,
 'here to learn': 5,
 'to learn n_grams': 14}

In [37]:
corpus = [
    'This sentence is often used as an example of a pangram,',
    'which is a sentence that contains every letter of the alphabet at least once.',
    'Its a short and simple sentence, but it contains all 26 letters of the English alphabet,',
    'making it a useful tool for testing typewriters, keyboards, and fonts.'
]

In [275]:
nlp = spacy.load('en_core_web_sm')
def preprocess(text):
    doc = nlp(text)
    filtered_token = []
    for token in doc:
        if token.is_stop or token.is_punct:
            continue
        filtered_token.append(token.lemma_)
    return " ".join(filtered_token)
preprocess('thor ate pizza')

'thor eat pizza'

In [39]:
corpus_processed = [preprocess(token) for token in corpus]
corpus_processed

['sentence example pangram',
 'sentence contain letter alphabet',
 'short simple sentence contain 26 letter English alphabet',
 'make useful tool testing typewriter keyboard font']

In [40]:
v = CountVectorizer(ngram_range=(1,1))
v.fit(corpus_processed)
v.vocabulary_

{'sentence': 10,
 'example': 4,
 'pangram': 9,
 'contain': 2,
 'letter': 7,
 'alphabet': 1,
 'short': 11,
 'simple': 12,
 '26': 0,
 'english': 3,
 'make': 8,
 'useful': 16,
 'tool': 14,
 'testing': 13,
 'typewriter': 15,
 'keyboard': 6,
 'font': 5}

In [41]:
v.transform(corpus).toarray()

array([[0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0],
       [0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0],
       [1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1]], dtype=int64)

## News Category Classification Problem
### Okay now that we know basics of BAG of n grams vectorizer 😎 It is the time to work on a real problem. Here we want to do a news category classification. We will use bag of n-grams and traing a machine learning model that can categorize any news into one of the following categories,

1. BUSINESS
2. SPORTS
3. CRIME
4. SCIENCE

In [244]:
import json
import pandas as pd

with open('News_Category_Dataset_v3.json', 'r') as f:
    data = [json.loads(line) for line in f]


In [245]:
df = pd.DataFrame(data)
df.head()

Unnamed: 0,link,headline,category,short_description,authors,date
0,https://www.huffpost.com/entry/covid-boosters-...,Over 4 Million Americans Roll Up Sleeves For O...,U.S. NEWS,Health experts said it is too early to predict...,"Carla K. Johnson, AP",2022-09-23
1,https://www.huffpost.com/entry/american-airlin...,"American Airlines Flyer Charged, Banned For Li...",U.S. NEWS,He was subdued by passengers and crew when he ...,Mary Papenfuss,2022-09-23
2,https://www.huffpost.com/entry/funniest-tweets...,23 Of The Funniest Tweets About Cats And Dogs ...,COMEDY,"""Until you have a dog you don't understand wha...",Elyse Wanshel,2022-09-23
3,https://www.huffpost.com/entry/funniest-parent...,The Funniest Tweets From Parents This Week (Se...,PARENTING,"""Accidentally put grown-up toothpaste on my to...",Caroline Bologna,2022-09-23
4,https://www.huffpost.com/entry/amy-cooper-lose...,Woman Who Called Cops On Black Bird-Watcher Lo...,U.S. NEWS,Amy Cooper accused investment firm Franklin Te...,Nina Golgowski,2022-09-22


In [246]:
df.shape

(209527, 6)

In [247]:
df = df.sample(200000)

In [248]:
df = df.reset_index(drop=True)

In [249]:
# df.drop(df[['link', 'headline','authors', 'date']], axis=1, inplace=True)

In [250]:
df.shape

(200000, 6)

In [251]:
df['category'].value_counts()

POLITICS          33966
WELLNESS          17129
ENTERTAINMENT     16605
TRAVEL             9419
STYLE & BEAUTY     9371
PARENTING          8403
HEALTHY LIVING     6369
QUEER VOICES       6083
FOOD & DRINK       6039
BUSINESS           5742
COMEDY             5143
SPORTS             4867
BLACK VOICES       4389
HOME & LIVING      4121
PARENTS            3764
THE WORLDPOST      3511
WEDDINGS           3484
CRIME              3437
WOMEN              3402
IMPACT             3315
DIVORCE            3253
WORLD NEWS         3162
MEDIA              2809
WEIRD NEWS         2623
GREEN              2501
RELIGION           2469
WORLDPOST          2452
STYLE              2151
SCIENCE            2106
TECH               2007
TASTE              1994
MONEY              1687
ARTS               1430
ENVIRONMENT        1373
FIFTY              1333
GOOD NEWS          1331
U.S. NEWS          1301
ARTS & CULTURE     1289
COLLEGE            1091
LATINO VOICES      1080
CULTURE & ARTS     1021
EDUCATION       

In [252]:
min_sample = 9371
df_travel =  df[df.category=='TRAVEL'].sample(min_sample, random_state=100)
df_style_and_beauty =  df[df.category=='STYLE & BEAUTY'].sample(min_sample, random_state=100)
df_wellness =  df[df.category=='WELLNESS'].sample(min_sample, random_state=100)
df_entertainment =  df[df.category=='ENTERTAINMENT'].sample(min_sample, random_state=100)
df_politics =  df[df.category=='POLITICS'].sample(min_sample, random_state=100)

In [253]:
df_balanced = pd.concat([df_travel, df_style_and_beauty, df_wellness, df_entertainment, df_politics])

In [254]:
df_balanced.category.value_counts()

TRAVEL            9371
STYLE & BEAUTY    9371
WELLNESS          9371
ENTERTAINMENT     9371
POLITICS          9371
Name: category, dtype: int64

In [255]:
df_balanced.drop(columns=['link', 'headline', 'authors', 'date'], inplace=True)


In [256]:
df_balanced = df_balanced.reset_index(drop=True)

In [257]:
df_balanced['category_num'] = df_balanced.category.map({'TRAVEL':0, 'STYLE & BEAUTY':1, 'WELLNESS':2, 'ENTERTAINMENT':3, 'POLITICS':4})

In [258]:
df_balanced.tail()

Unnamed: 0,category,short_description,category_num
46850,POLITICS,,4
46851,POLITICS,"Today, that oath I took over two decades ago, ...",4
46852,POLITICS,The president admits the combativeness of his ...,4
46853,POLITICS,The New Jersey congressman said abandoning the...,4
46854,POLITICS,"""A group of losers,"" according to Trump, appea...",4


In [259]:
df_balanced.shape

(46855, 3)

In [260]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df_balanced.short_description, df_balanced.category_num, test_size=0.2, random_state=100,
                                                   stratify=df_balanced.category_num)

In [261]:
print(X_train.shape)
print(X_test.shape)

(37484,)
(9371,)


In [262]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

In [263]:
clf = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('mnb', MultinomialNB())
])

In [264]:
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

In [265]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.78      0.75      0.76      1875
           1       0.62      0.76      0.68      1874
           2       0.66      0.88      0.75      1874
           3       0.74      0.46      0.57      1874
           4       0.82      0.69      0.75      1874

    accuracy                           0.71      9371
   macro avg       0.72      0.71      0.70      9371
weighted avg       0.72      0.71      0.70      9371



In [266]:
from sklearn.ensemble import RandomForestClassifier
clr = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('rfc', RandomForestClassifier())
])
clr.fit(X_train, y_train)
y_pred = clr.predict(X_test)

In [267]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.72      0.60      0.65      1875
           1       0.79      0.62      0.69      1874
           2       0.71      0.76      0.74      1874
           3       0.51      0.75      0.61      1874
           4       0.69      0.59      0.64      1874

    accuracy                           0.66      9371
   macro avg       0.68      0.66      0.67      9371
weighted avg       0.68      0.66      0.67      9371



In [270]:
clf = Pipeline([
    ('vectorizer', CountVectorizer(ngram_range=(1,2))),
    ('mnb', MultinomialNB())
])
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.71      0.76      0.74      1875
           1       0.60      0.72      0.66      1874
           2       0.57      0.93      0.71      1874
           3       0.86      0.33      0.48      1874
           4       0.86      0.61      0.72      1874

    accuracy                           0.67      9371
   macro avg       0.72      0.67      0.66      9371
weighted avg       0.72      0.67      0.66      9371



In [272]:
X_test[:5]

1179     Your baby might be too young for PB&J sandwich...
22777    Plus, researchers found that "the volunteers w...
29932    I interviewed model/actress Lauren Hutton in l...
19199    Nike's new Olympics ad campaign celebrates the...
1353     6. Salt flats If lake or pond water evaporates...
Name: short_description, dtype: object

In [273]:
y_test[:5]

1179     0
22777    2
29932    3
19199    2
1353     0
Name: category_num, dtype: int64

In [274]:
y_pred[:5]

array([0, 2, 1, 1, 0], dtype=int64)

In [277]:
#here we are doing preprocessing of text by the help of above function it basically stopwords from text
df_balanced['preprocessed_text'] = df_balanced.short_description.apply(preprocess)

In [279]:
df_balanced.head()

Unnamed: 0,category,short_description,category_num,preprocessed_text
0,TRAVEL,*Cue wanderlust*,0,cue wanderlust
1,TRAVEL,And it's not just snow ghosts -- the towering ...,0,snow ghost tower pine tree drape snow encrust ...
2,TRAVEL,Best For WiFi Users Alaska Airlines This is a ...,0,good WiFi Users Alaska Airlines tough airline ...
3,TRAVEL,"There are times, after riding for 10 hours on ...",0,time ride 10 hour bike find unusually deep tho...
4,TRAVEL,These electric and eclectic world cities are r...,0,electric eclectic world city ripe visit year


In [280]:
# here we split data with preprocessed_text
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df_balanced.preprocessed_text, df_balanced.category_num, test_size=0.2, random_state=100,
                                                   stratify=df_balanced.category_num)

In [281]:
# finally predicting with preprocessed text 
clf = Pipeline([
    ('vectorizer', CountVectorizer(ngram_range=(1,2))),
    ('mnb', MultinomialNB())
])
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

In [282]:
# with preprocessed_text it increases precision little more
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.75      0.80      0.77      1875
           1       0.61      0.79      0.69      1874
           2       0.65      0.90      0.75      1874
           3       0.86      0.37      0.52      1874
           4       0.84      0.68      0.75      1874

    accuracy                           0.71      9371
   macro avg       0.74      0.71      0.70      9371
weighted avg       0.74      0.71      0.70      9371

