In [32]:
import pandas as pd

# Load the news dataset from a JSON file in the working directory into a DataFrame.
data = pd.read_json("news_dataset.json")

# Show the (rows, columns) shape, then preview the first rows as the cell's output.
print(data.shape)
data.head()

(12695, 2)


Unnamed: 0,text,category
0,Watching Schrödinger's Cat Die University of C...,SCIENCE
1,WATCH: Freaky Vortex Opens Up In Flooded Lake,SCIENCE
2,Entrepreneurs Today Don't Need a Big Budget to...,BUSINESS
3,These Roads Could Recharge Your Electric Car A...,BUSINESS
4,Civilian 'Guard' Fires Gun While 'Protecting' ...,CRIME


In [33]:
# Count rows per category to see how (im)balanced the classes are.
data.category.value_counts()

category
BUSINESS    4254
SPORTS      4167
CRIME       2893
SCIENCE     1381
Name: count, dtype: int64

In [34]:
# Balance the classes by downsampling every category to the size of the
# smallest one. The size is derived from the data rather than hard-coded, so
# the cell stays correct if the dataset changes.
min_count = int(data.category.value_counts().min())


def _sample_category(label):
    """Return `min_count` rows of `data` whose category equals `label`.

    Each category is sampled with the same fixed seed so the selection is
    repeatable across runs.
    """
    return data[data.category == label].sample(min_count, random_state=2022)


data_bs = _sample_category("BUSINESS")
data_sp = _sample_category("SPORTS")
data_cr = _sample_category("CRIME")
data_sc = _sample_category("SCIENCE")


In [35]:
# Stack the four per-category samples row-wise into a single frame; the
# original row labels are kept (no index reset).
balanced_parts = [data_bs, data_sp, data_cr, data_sc]
data_balanced = pd.concat(balanced_parts, axis=0)

# Show the per-category row counts of the combined frame.
data_balanced['category'].value_counts()

category
BUSINESS    1381
SPORTS      1381
CRIME       1381
SCIENCE     1381
Name: count, dtype: int64

In [36]:
# Integer class id assigned to each category label.
target = dict(BUSINESS=0, SPORTS=1, CRIME=2, SCIENCE=3)

# Add the numeric label as a new column alongside the text label.
data_balanced['category_num'] = data_balanced['category'].map(target)
data_balanced.head()

Unnamed: 0,text,category,category_num
11967,GCC Business Leaders Remain Confident in the F...,BUSINESS,0
2912,From the Other Side; an Honest Review from Emp...,BUSINESS,0
3408,"Mike McDerment, CEO of FreshBooks, Talks About...",BUSINESS,0
502,How to Market Your Business While Traveling th...,BUSINESS,0
5279,How to Leverage Intuition in Decision-making I...,BUSINESS,0


In [37]:
from sklearn.model_selection import train_test_split

# Split the raw text into train and test sets. Stratifying on the label keeps
# the class proportions equal in both parts; the fixed seed makes the split
# repeatable. No test_size is passed, so the library default applies.
labels = data_balanced['category_num']
X_train, X_test, y_train, y_test = train_test_split(
    data_balanced['text'],
    labels,
    random_state=2022,
    stratify=labels,
)

In [38]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import CountVectorizer

# Baseline: bag-of-words counts (default CountVectorizer settings) fed into a
# multinomial Naive Bayes classifier.
bow_steps = [
    ('Vectorizer', CountVectorizer()),
    ('Classifier', MultinomialNB()),
]
clf = Pipeline(bow_steps).fit(X_train, y_train)  # fit() returns the fitted pipeline itself

# Score the fitted pipeline on the held-out split.
clf.score(X_test, y_test)


0.8544532947139754

In [39]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import CountVectorizer

# Same model as the baseline, but the vectorizer also counts bigrams
# (ngram_range=(1, 2)) in addition to single words.
ngram_steps = [
    ('Vectorizer', CountVectorizer(ngram_range=(1, 2))),
    ('Classifier', MultinomialNB()),
]
clf = Pipeline(ngram_steps).fit(X_train, y_train)  # fit() returns the fitted pipeline itself

# Score the fitted pipeline on the held-out split.
clf.score(X_test, y_test)


0.8262128892107169

In [40]:
import spacy
nlp = spacy.load("en_core_web_sm")
def preprocessing(text: str) -> str:
    """Drop stop words and punctuation from `text` and lemmatize the rest.

    The text is run through the module-level spaCy pipeline `nlp`. Tokens that
    are flagged as stop words or as punctuation are discarded; every remaining
    token is replaced by its lemma.

    Args:
        text: Raw text to clean.

    Returns:
        The lemmas of the kept tokens, in their original order, joined by
        single spaces. An empty string if no token is kept.
    """
    doc = nlp(text)
    return ' '.join(
        token.lemma_
        for token in doc
        if not (token.is_stop or token.is_punct)
    )

In [41]:
# Run `preprocessing` on every document and store the cleaned text in a new
# column next to the raw text.
processed_text = data_balanced['text'].apply(preprocessing)
data_balanced['processed_text'] = processed_text
data_balanced.head()

Unnamed: 0,text,category,category_num,processed_text
11967,GCC Business Leaders Remain Confident in the F...,BUSINESS,0,GCC Business Leaders remain Confident Face Reg...
2912,From the Other Side; an Honest Review from Emp...,BUSINESS,0,Honest Review employee wake morning love impor...
3408,"Mike McDerment, CEO of FreshBooks, Talks About...",BUSINESS,0,Mike McDerment ceo FreshBooks Talks give build...
502,How to Market Your Business While Traveling th...,BUSINESS,0,market business travel World recently amazing ...
5279,How to Leverage Intuition in Decision-making I...,BUSINESS,0,Leverage intuition decision making feel safe r...


In [42]:
from sklearn.model_selection import train_test_split

# Redo the split on the preprocessed text, with the same seed and
# stratification as the split on the raw text. This rebinds X_train, X_test,
# y_train and y_test for the cells that follow.
labels = data_balanced['category_num']
X_train, X_test, y_train, y_test = train_test_split(
    data_balanced['processed_text'],
    labels,
    random_state=2022,
    stratify=labels,
)

In [43]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import CountVectorizer

# Unigram + bigram counts fed into multinomial Naive Bayes, trained on
# whatever X_train currently holds (the preprocessed text when the notebook
# is run top to bottom).
ngram_steps = [
    ('Vectorizer', CountVectorizer(ngram_range=(1, 2))),
    ('Classifier', MultinomialNB()),
]
clf = Pipeline(ngram_steps).fit(X_train, y_train)  # fit() returns the fitted pipeline itself

# Score the fitted pipeline on the held-out split.
clf.score(X_test, y_test)

0.8638667632150615