In [6]:
# Codebasics - Bag of N grams tutorial
# Let's generate n-grams using CountVectorizer; the fitted vocabulary_ maps each n-gram to its column index

from sklearn.feature_extraction.text import CountVectorizer
# ngram_range=(1, 3) collects unigrams, bigrams and trigrams into one vocabulary.
# (Use ngram_range=(1, 2) instead to stop at unigrams + bigrams.)
v = CountVectorizer(ngram_range=(1, 3))
v.fit(["Thor Hathodawala is looking for a job"])
v.vocabulary_

{'thor': 12,
 'hathodawala': 2,
 'is': 5,
 'looking': 9,
 'for': 0,
 'job': 8,
 'thor hathodawala': 13,
 'hathodawala is': 3,
 'is looking': 6,
 'looking for': 10,
 'for job': 1,
 'thor hathodawala is': 14,
 'hathodawala is looking': 4,
 'is looking for': 7,
 'looking for job': 11}

In [90]:
import spacy

# load english language model and create nlp object from it
nlp = spacy.load("en_core_web_sm") 

def preprocess(text):
    """Return `text` with stop words and punctuation dropped and the rest lemmatized.

    The text is run through the module-level spaCy pipeline `nlp`. Every
    token flagged as a stop word or as punctuation is discarded, and the
    lemma (base form, e.g. "ate" -> "eat") of each remaining token is kept.
    The lemmas are returned joined by single spaces.
    """
    kept_lemmas = [
        tok.lemma_
        for tok in nlp(text)
        if not (tok.is_stop or tok.is_punct)
    ]
    return " ".join(kept_lemmas)
# dir(nlp)

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [72]:
preprocess("Thor ate pizza")

'Thor eat pizza'

In [73]:
preprocess("Loki is eating pizza")
# removed stop words and included lemma

'Loki eat pizza'

In [9]:
# Three toy sentences, each cleaned with preprocess() before vectorizing.
corpus = ["Thor ate pizza", "Loki is tall", "Loki is eating pizza"]
corpus_processed = list(map(preprocess, corpus))
corpus_processed

['Thor eat pizza', 'Loki tall', 'Loki eat pizza']

In [12]:
# Fit a unigram + bigram vocabulary on the cleaned toy corpus.
v = CountVectorizer(ngram_range=(1, 2)).fit(corpus_processed)
v.vocabulary_

{'thor': 7,
 'eat': 0,
 'pizza': 5,
 'thor eat': 8,
 'eat pizza': 1,
 'loki': 2,
 'tall': 6,
 'loki tall': 4,
 'loki eat': 3}

In [11]:
# Encode an unseen sentence with the fitted vocabulary. Only n-grams that
# are already in v.vocabulary_ get a count: here 'eat' (column 0),
# 'eat pizza' (column 1) and 'pizza' (column 5). "Hulk" and "hulk eat" are
# not in the vocabulary, so they contribute nothing to the vector.
v.transform(["Hulk eat pizza"]).toarray()

array([[1, 1, 0, 0, 0, 1, 0, 0, 0]], dtype=int64)

In [1]:
# News Category Classification Problem

# That covers the basics of the bag-of-n-grams vectorizer 😎
# It is time to work on a real problem.
# We will use bag of n-grams to train a machine learning
# model that can categorize any news text into one of the following categories.


# So: news text == Input ,   Category == Output

# BUSINESS     SPORTS      CRIME       SCIENCE     

import pandas as pd

df = pd.read_json('NLP_news_dataset.json')
print(df.shape)

df.head()



(12695, 2)


Unnamed: 0,text,category
0,Watching Schrödinger's Cat Die University of C...,SCIENCE
1,WATCH: Freaky Vortex Opens Up In Flooded Lake,SCIENCE
2,Entrepreneurs Today Don't Need a Big Budget to...,BUSINESS
3,These Roads Could Recharge Your Electric Car A...,BUSINESS
4,Civilian 'Guard' Fires Gun While 'Protecting' ...,CRIME


In [4]:
df.category.value_counts()

BUSINESS    4254
SPORTS      4167
CRIME       2893
SCIENCE     1381
Name: category, dtype: int64

In [44]:
# a simple technique of undersampling.

# Undersample every class down to the size of the smallest one so that all
# categories contribute the same number of articles. The size is read from
# the data (1381 SCIENCE articles in this dataset) instead of being typed in.
min_samples = int(df.category.value_counts().min())

# A fixed category order keeps the row order of df_balanced (and therefore
# any seeded split made from it) stable between runs.
categories = ["BUSINESS", "SPORTS", "CRIME", "SCIENCE"]

df_balanced = pd.concat(
    [
        df[df.category == name].sample(min_samples, random_state=2022)
        for name in categories
    ],
    axis=0,
)
df_balanced.category.value_counts()
# df_balanced.head()

BUSINESS    1381
SPORTS      1381
CRIME       1381
SCIENCE     1381
Name: category, dtype: int64

In [61]:
# Let's encode each category as a number, since the model cannot work with text labels directly

# target = {'BUSINESS': 0, 'SPORTS': 1, 'CRIME': 2, 'SCIENCE': 3}

# df_balanced['category_number'] = df_balanced['category'].map({
#     'BUSINESS': 0,
#     'SPORTS': 1, 
#     'CRIME': 2, 
#     'SCIENCE': 3
# })
# df_balanced

from sklearn.preprocessing import LabelEncoder

# Encode the category names as integers and keep the fitted encoder around
# so the numbers can be mapped back to names later.
le_bussiness_category = LabelEncoder().fit(df_balanced['category'])
df_balanced['category_number'] = le_bussiness_category.transform(df_balanced['category'])
df_balanced.head()

#   'BUSINESS': 0,
#     'CRIME': 1, 
#     'SCIENCE': 2
#     'SPORTS': 3, 

Unnamed: 0,text,category,category_number
11967,GCC Business Leaders Remain Confident in the F...,BUSINESS,0
2912,From the Other Side; an Honest Review from Emp...,BUSINESS,0
3408,"Mike McDerment, CEO of FreshBooks, Talks About...",BUSINESS,0
502,How to Market Your Business While Traveling th...,BUSINESS,0
5279,How to Leverage Intuition in Decision-making I...,BUSINESS,0


In [62]:
from sklearn.model_selection import train_test_split

# 80/20 split of the raw article text, stratified on the encoded category so
# every class keeps the same share in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    df_balanced["text"],
    df_balanced["category"],
    test_size=0.2,
    random_state=2022,
    stratify=df_balanced["category_number"],
)

y_train.value_counts()

SPORTS      1105
SCIENCE     1105
BUSINESS    1105
CRIME       1104
Name: category, dtype: int64

In [69]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report


# Chain the 1-to-3-gram CountVectorizer and the Naive Bayes classifier so
# that raw text goes in and predicted categories come out.
pipe_line = Pipeline(steps=[
    ('vectorizer_bow', CountVectorizer(ngram_range=(1, 3))),
    ('Multi NB', MultinomialNB()),
])

# Train on the training split, then label the held-out articles.
y_pred = pipe_line.fit(X_train, y_train).predict(X_test)

# Per-class precision / recall / F1 on the test split.
print(classification_report(y_test, y_pred))

# CHECKED with bi gram and trigram| ngram_range = (1, 2)| ngram_range = (1, 3) | , BOW wins overall here 

              precision    recall  f1-score   support

    BUSINESS       0.65      0.91      0.76       276
       CRIME       0.87      0.86      0.86       277
     SCIENCE       0.92      0.70      0.79       276
      SPORTS       0.91      0.76      0.83       276

    accuracy                           0.81      1105
   macro avg       0.84      0.81      0.81      1105
weighted avg       0.84      0.81      0.81      1105



In [74]:
y_pred[:5]

array(['BUSINESS', 'SPORTS', 'SPORTS', 'BUSINESS', 'SCIENCE'], dtype='<U8')

In [78]:
# Build the cleaned-text column that the next model trains on: stop words and
# punctuation removed, remaining tokens lemmatized (see preprocess()).
# This line must run: a later cell reads df_balanced.preprocessed_txt.
# Only the rows of df_balanced are processed, not the full dataset.
df_balanced['preprocessed_txt'] = df_balanced.text.apply(preprocess)
df_balanced.head()

Unnamed: 0,text,category,category_number,preprocessed_txt
11967,GCC Business Leaders Remain Confident in the F...,BUSINESS,0,gcc Business leader remain Confident face Regi...
2912,From the Other Side; an Honest Review from Emp...,BUSINESS,0,Honest Review employee wake morning love impor...
3408,"Mike McDerment, CEO of FreshBooks, Talks About...",BUSINESS,0,Mike McDerment ceo FreshBooks talk give build ...
502,How to Market Your Business While Traveling th...,BUSINESS,0,market business travel World recently amazing ...
5279,How to Leverage Intuition in Decision-making I...,BUSINESS,0,Leverage intuition decision making feel safe r...


In [87]:
#                                 BUILDING MODEL WITH PREPROCESSED TEXT 

from sklearn.model_selection import train_test_split

# Same seeded, stratified 80/20 split as for the raw text, but taken from the
# cleaned text column.
X_train, X_test, y_train, y_test = train_test_split(
    df_balanced["preprocessed_txt"],
    df_balanced["category"],
    test_size=0.2,
    random_state=2022,
    stratify=df_balanced["category_number"],
)

# Pipeline on the preprocessed data. CountVectorizer() with no arguments
# is the unigram case, i.e. CountVectorizer(ngram_range=(1, 1)).
clf = Pipeline(steps=[
    ('countvectorizer', CountVectorizer()),
    ('Multi', MultinomialNB()),
])
y_pred = clf.fit(X_train, y_train).predict(X_test)

# Classification report
print(classification_report(y_test, y_pred))


# If you compare the classification report above (unigram model on
# preprocessed text) with the earlier one from unprocessed text
# (1-to-3-gram model), you will find an improvement: accuracy goes
# from 0.81 to 0.88. This suggests that for this particular problem
# preprocessing (removing stop words, lemmatization) improves the
# model. Note that the two runs also differ in ngram_range, so the
# gain cannot be attributed to preprocessing alone.



              precision    recall  f1-score   support

    BUSINESS       0.83      0.88      0.85       276
       CRIME       0.87      0.91      0.89       277
     SCIENCE       0.92      0.83      0.87       276
      SPORTS       0.90      0.88      0.89       276

    accuracy                           0.88      1105
   macro avg       0.88      0.88      0.88      1105
weighted avg       0.88      0.88      0.88      1105



In [88]:
from sklearn.metrics import confusion_matrix

# Confusion matrix for the most recent y_test / y_pred pair. Each row sums to
# that class's support in the classification report (276, 277, 276, 276), so
# rows are the true classes in the order BUSINESS, CRIME, SCIENCE, SPORTS;
# columns are presumably the predicted classes in the same order.
cm = confusion_matrix(y_test, y_pred)
cm

array([[243,  14,  10,   9],
       [ 10, 252,   5,  10],
       [ 30,   9, 230,   7],
       [ 11,  15,   6, 244]], dtype=int64)

In [None]:
#                                        Fake News Detection                ==>    EXERCISE

# This data consists of two columns. - Text - label

# Text is the statements or messages regarding a particular event/situation.

# label feature tells whether the given Text is Fake or Real.

# As there are only 2 classes, this problem comes under the Binary Classification.

In [95]:
import pandas as pd
df = pd.read_csv('Fake_Real_Data NLP__16.csv')
df.head()
# df.label.value_counts()  # --> distribution of labels  # our input output are matched perfectly

Unnamed: 0,Text,label
0,Top Trump Surrogate BRUTALLY Stabs Him In The...,Fake
1,U.S. conservative leader optimistic of common ...,Real
2,"Trump proposes U.S. tax overhaul, stirs concer...",Real
3,Court Forces Ohio To Allow Millions Of Illega...,Fake
4,Democrats say Trump agrees to work on immigrat...,Real


In [None]:
# Pre-processing: add a numeric label column and a cleaned-text column.
from sklearn.preprocessing import LabelEncoder

lable = LabelEncoder().fit(df['label'])
df['label_number'] = lable.transform(df['label'])

# Stop-word / punctuation removal and lemmatization of every article.
df['preprocess_text'] = df['Text'].apply(preprocess)
df.head()

In [None]:

#        Modelling without Pre-processing Text data using pipeline

#  Splitting Data
from sklearn.model_selection import train_test_split

# Raw (un-preprocessed) article text is the feature and the Fake/Real label
# is the target. The text column of this dataset is named "Text" (capital T).
# The split is stratified so both partitions keep the same Fake/Real ratio.
X_train, X_test, y_train, y_test = train_test_split(
    df.Text,
    df.label,
    test_size=0.2,
    random_state=2022,
    stratify=df.label
)


#         Modelling with Pre-processing Text data
# Splitting Data
# from sklearn.model_selection import train_test_split
# X_train, X_test, y_train, y_test = train_test_split(
#     df.preprocess_text,
#     df.label,
#     test_size=0.2,
#     random_state = 2022,
#     stratify=df.label
# )

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import classification_report

# Unigram bag-of-words features feeding a random forest.
# NOTE: the second step is still named 'model _ multi NB' although the
# estimator is a RandomForestClassifier; the name is kept unchanged so any
# existing reference to the step keeps working.
clf = Pipeline([
    ('count_vector with 1 gram', CountVectorizer()),
    # Seeded so that the fitted forest is reproducible between runs.
    ('model _ multi NB', RandomForestClassifier(random_state=2022)),
])
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Report held-out performance; y_pred was previously computed but never used.
print(classification_report(y_test, y_pred))

In [None]:
# Final Observations
# As machine learning algorithms do not work on text data directly,
# we need to convert them into numeric vectors and feed that into models while training.

# In this process, we convert text into a very high dimensional 
# numeric vector using the technique of Bag of words and we use sklearn CountVectorizer for this.

# Without Pre-Processing Data

# From the above in most of the cases, we can see that when we have 
# the count vectorizer above trigrams or at trigrams, the performance
# keeps degrading. The major possible reason for this as the ngram_range
# keeps increasing, the number of dimensions/features 
# (possible combination of words) also increases enormously and
# models have the risk of overfitting and resulting in terrible performance.

# For this reason, models like KNN failed terribly when performed 
# with trigrams and using the euclidean distance. 
# K-Nearest Neighbours(KNN) doesn't work well with high-dimensional
# data because, with a large number of dimensions, it becomes difficult
# for the algorithm to calculate the distance in each dimension. 
# In higher dimensional space, the cost to calculate distance 
# becomes expensive and hence impacts the performance of the model. 
# It performed well for class 1 and had terrible results for Class 0.

# Both recall and F1 score improve when the same KNN model is trained
# with cosine distance instead, because cosine distance is less affected
# by the number of dimensions: it uses the angle between the two text
# vectors, not their magnitudes, to measure similarity.

# With respect to the Naive Bayes and RandomForest models, both performed
# really well, and random forest with trigrams has a slight edge on the recall metric.

# As Random Forest uses Bootstrapping(row and column Sampling)
# with many decision trees and overcomes the high variance and 
# overfitting of high dimensional data and also uses feature 
# importance of words for better classifying the categories.

# The easy calculation of probabilities for the words in the 
# corpus(Bag of words) and storing them in a contingency table
# is the major reason for the Multinomial NaiveBayes to be a
# text classification friendly algorithm.


# With Pre-Processing Data
# Have trained the best model RandomForest on the pre-processed data,
# but RandomForest with trigrams fails to produce the same results here.

# But the same RandomForest with unigram-to-trigram features
# produces excellent results and tops the entire list, with very good F1 and recall scores.

# Machine Learning is like a trial and error scientific method, 
# where we keep trying all the possible algorithms we have and 
# select the one which gives good results and satisfies the 
# requirements like latency, interpretability, etc.