Bag of words

In [1]:
from sklearn.feature_extraction.text import CountVectorizer

In [7]:
v = CountVectorizer(ngram_range=(1,2))
v.fit(['I am good boy.'])
v.vocabulary_

{'am': 0, 'good': 3, 'boy': 2, 'am good': 1, 'good boy': 4}

In [8]:
corpus = [
    'Thor is a good boy.',
    'Loki is a god.',
    'Spiderman is the greatest avenger of all time.'
]

In [9]:
import spacy
nlp = spacy.load('en_core_web_sm')

In [11]:
from spacy.lang.en.stop_words import STOP_WORDS

Preprocessing of data by removing stopwords and punct and lemmatizing them.

In [36]:
def preprocessing(text):
    doc = nlp(text)
    list1 = []
    for tokens in doc:
        if tokens.is_stop or tokens.is_punct:
            continue    
        list1.append(str(tokens))
    return " ".join(list1)

In [37]:
preprocessing('I am a good boy.')

'good boy'

In [38]:
post_processing  =[preprocessing(text) for text in corpus]
post_processing

['Thor good boy', 'Loki god', 'Spiderman greatest avenger time']

In [39]:
import pandas as pd

## Excercise to classify the data according to the category.

In [44]:
df = pd.read_json("news_dataset.json")
df.head()

Unnamed: 0,text,category
0,Watching Schrödinger's Cat Die University of C...,SCIENCE
1,WATCH: Freaky Vortex Opens Up In Flooded Lake,SCIENCE
2,Entrepreneurs Today Don't Need a Big Budget to...,BUSINESS
3,These Roads Could Recharge Your Electric Car A...,BUSINESS
4,Civilian 'Guard' Fires Gun While 'Protecting' ...,CRIME


In [45]:
df["text"] = df['text'].apply(preprocessing) #applying the preprocessing

Having equal number of samples.

In [48]:
df_business = df[df.category == 'BUSINESS'].sample(1381,random_state=10)
df_science = df[df.category == 'SCIENCE'].sample(1381,random_state=10)
df_crime = df[df.category == 'CRIME'].sample(1381,random_state=10)
df_sports = df[df.category == 'SPORTS'].sample(1381,random_state=10)

In [49]:
df_balanced = pd.concat([df_business,df_science,df_crime,df_sports],axis = 0)

In [53]:
df.head()

Unnamed: 0,text,category
0,Watching Schrödinger Cat Die University Califo...,SCIENCE
1,WATCH Freaky Vortex Opens Flooded Lake,SCIENCE
2,Entrepreneurs Today Need Big Budget Start year...,BUSINESS
3,Roads Recharge Electric Car Drive high tech hi...,BUSINESS
4,Civilian Guard Fires Gun Protecting Recruiting...,CRIME


Changing the words to numbers

In [59]:
df_balanced['category_num'] = df_balanced.category.map({
 "BUSINESS" :0,
    "SCIENCE":1,
    'CRIME':2,
    'SPORTS':3,   
})

In [62]:
df_balanced.head()

Unnamed: 0,text,category,category_num
4528,Impossible Goals Quest Lose 175 Pounds people ...,BUSINESS,0
3265,Way Chipotle Completely Revolutionized Eat,BUSINESS,0
727,Sexual Abuse Old Normal let Bill O’Reilly domi...,BUSINESS,0
10317,Secret Building Successful Business Wo Destroy...,BUSINESS,0
8127,Nike Latest Company Ramp Parental Leave world ...,BUSINESS,0


Using train_test_split to give input to the model.

In [63]:
from sklearn.model_selection import train_test_split as tst
X_train,X_test,y_train,y_test = tst(
    df_balanced['text'],
    df_balanced.category_num,
    random_state = 20,
    test_size = 0.2
)

Creating a pipeline using count vectorizer and naivebayes

In [77]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
model = Pipeline([
    ("vectorizer",CountVectorizer(ngram_range=(1,2))),
    ("multi nb",MultinomialNB())]
)

In [78]:
model.fit(X_train,y_train)

In [79]:
y_pred = model.predict(X_test)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.85      0.91      0.88       285
           1       0.91      0.82      0.87       274
           2       0.86      0.92      0.89       251
           3       0.91      0.88      0.89       295

    accuracy                           0.88      1105
   macro avg       0.88      0.88      0.88      1105
weighted avg       0.88      0.88      0.88      1105

