## Import spacy and do some preprocessing steps

In [56]:
import spacy
from spacy.lang.en.stop_words import STOP_WORDS

In [57]:
nlp = spacy.load("en_core_web_sm")

doc = nlp("We just opened our wings, the flying part is coming soon")
for token in doc:
    if token.is_stop:
        print(token)

We
just
our
the
part
is


In [59]:
def preprocess(text):
    doc = nlp(text)
    no_stop_words = [token.text for token in doc if not token.is_stop]
    return " ".join(no_stop_words)

## working on news classifiction dataset

In [1]:
import pandas as pd

df = pd.read_json('news_dataset.json')

In [2]:
df.head(5)

Unnamed: 0,text,category
0,Watching Schrödinger's Cat Die University of C...,SCIENCE
1,WATCH: Freaky Vortex Opens Up In Flooded Lake,SCIENCE
2,Entrepreneurs Today Don't Need a Big Budget to...,BUSINESS
3,These Roads Could Recharge Your Electric Car A...,BUSINESS
4,Civilian 'Guard' Fires Gun While 'Protecting' ...,CRIME


In [3]:
df.shape

(12695, 2)

In [5]:
df['category'].value_counts()

category
BUSINESS    4254
SPORTS      4167
CRIME       2893
SCIENCE     1381
Name: count, dtype: int64

In [8]:
min_samples = 1381
df_bussiness = df[df.category=='BUSINESS'].sample(min_samples , random_state=2022)
df_sport = df[df.category=='SPORTS'].sample(min_samples , random_state=2022)
df_crime = df[df.category=='CRIME'].sample(min_samples , random_state=2022)
df_science = df[df.category=='SCIENCE'].sample(min_samples , random_state=2022)

In [51]:
print(df_bussiness.shape,
df_sport.shape,
df_crime.shape,
df_science.shape)

(1381, 2) (1381, 2) (1381, 2) (1381, 2)


In [12]:
df_balanced = pd.concat([df_bussiness,df_sport,df_crime,df_science], axis=0)

In [14]:
df_balanced['category'].value_counts()

category
BUSINESS    1381
SPORTS      1381
CRIME       1381
SCIENCE     1381
Name: count, dtype: int64

In [68]:
target = {'BUSINESS':0,'SPORTS':1,'CRIME':2,'SCIENCE':3}

In [15]:
target = {'BUSINESS':0,'SPORTS':1,'CRIME':2,'SCIENCE':3}
df_balanced['category_num'] = df_balanced.category.map(target)

In [16]:
df_balanced.head()

Unnamed: 0,text,category,category_num
11967,GCC Business Leaders Remain Confident in the F...,BUSINESS,0
2912,From the Other Side; an Honest Review from Emp...,BUSINESS,0
3408,"Mike McDerment, CEO of FreshBooks, Talks About...",BUSINESS,0
502,How to Market Your Business While Traveling th...,BUSINESS,0
5279,How to Leverage Intuition in Decision-making I...,BUSINESS,0


## Train and test dataset without Preprocessing the data

In [46]:
from sklearn.model_selection import train_test_split
X_test,X_train,y_test,y_train = train_test_split(
    df_balanced.text,
    df_balanced.category_num,
    test_size = 0.5,
    random_state = 2022,
    stratify=df_balanced.category_num
)

In [47]:
print(X_train.shape)

(2762,)


In [48]:
y_test.value_counts()

category_num
2    691
3    691
0    690
1    690
Name: count, dtype: int64

In [49]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report

In [50]:
clf = Pipeline([
        ('Vectorizer_bow',CountVectorizer(ngram_range=(1,2))),
        ('Multi NB', MultinomialNB())
])   

clf.fit(X_train,y_train)
y_pred = clf.predict(X_test)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.66      0.90      0.76       690
           1       0.91      0.79      0.84       690
           2       0.87      0.88      0.88       691
           3       0.91      0.70      0.80       691

    accuracy                           0.82      2762
   macro avg       0.84      0.82      0.82      2762
weighted avg       0.84      0.82      0.82      2762



In [53]:
X_test[:5]

3616    Police Find Body Of Missing Indiana University...
3600    NASA's Commercial Crew Vehicles: Practical But...
5626    Here's How To Watch The Geminid Meteor Shower ...
7436    Yes, Positive Emotions Really Can Transform Yo...
5582    Getting Off the Linear Career Track It can be ...
Name: text, dtype: object

In [54]:
y_test[:5]

3616    2
3600    3
5626    3
7436    3
5582    0
Name: category_num, dtype: int64

In [55]:
y_pred[:5]

array([2, 3, 3, 0, 0], dtype=int64)

## Train and test the dataset after Some preprocessing text

In [60]:
df_balanced['preprocessed_text'] = df_balanced.text.apply(preprocess)

In [62]:
from sklearn.model_selection import train_test_split
X_test,X_train,y_test,y_train = train_test_split(
    df_balanced.preprocessed_text,
    df_balanced.category_num,
    test_size = 0.5,
    random_state = 2022,
    stratify=df_balanced.category_num
)

In [63]:
clf = Pipeline([
        ('Vectorizer_bow',CountVectorizer(ngram_range=(1,2))),
        ('Multi NB', MultinomialNB())
])   

clf.fit(X_train,y_train)
y_pred = clf.predict(X_test)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.82      0.88      0.85       690
           1       0.90      0.87      0.88       690
           2       0.85      0.92      0.88       691
           3       0.92      0.79      0.85       691

    accuracy                           0.87      2762
   macro avg       0.87      0.87      0.87      2762
weighted avg       0.87      0.87      0.87      2762



In [64]:
X_test[:5]

3616    Police Find Body Missing Indiana University St...
3600       NASA Commercial Crew Vehicles : Practical Sexy
5626            Watch Geminid Meteor Shower long cloudy .
7436             Yes , Positive Emotions Transform Health
5582    Getting Linear Career Track nerve - wracking w...
Name: preprocessed_text, dtype: object

In [66]:
y_test[:5]

3616    2
3600    3
5626    3
7436    3
5582    0
Name: category_num, dtype: int64

In [65]:
y_pred[:5]

array([2, 3, 3, 0, 0], dtype=int64)