# Exercise for Spacy POS tutorial,

You are parsing a news story from cnbc.com. News story is stores in news_story.txt which is available in this same folder on github. You need to,
i. Extract all NOUN tokens from this story. You will have to read the file in python first to collect all the text and then extract NOUNs in a python list
ii. Extract all numbers (NUM POS type) in a python list
iii. Print a count of all POS tags in this story

In [69]:


token_list = []
text = ''
with open('news_story.txt') as file:
    file = file.readlines()
    for line in file:
        text += line.strip()
            
# Tokenization and getting NOUNs
nlp = spacy.load('en_core_web_sm')
doc = nlp(text)
for token in doc:
    if token.pos_ == 'NOUN':
        token_list.append((token, token.pos_))
print(token_list)

[(Inflation, 'NOUN'), (climb, 'NOUN'), (consumers, 'NOUN'), (brink, 'NOUN'), (expansion, 'NOUN'), (consumer, 'NOUN'), (price, 'NOUN'), (index, 'NOUN'), (measure, 'NOUN'), (prices, 'NOUN'), (goods, 'NOUN'), (services, 'NOUN'), (%, 'NOUN'), (year, 'NOUN'), (estimate, 'NOUN'), (%, 'NOUN'), (gain, 'NOUN'), (ease, 'NOUN'), (peak, 'NOUN'), (level, 'NOUN'), (summer, 'NOUN'), (food, 'NOUN'), (energy, 'NOUN'), (prices, 'NOUN'), (core, 'NOUN'), (%, 'NOUN'), (expectations, 'NOUN'), (%, 'NOUN'), (gain, 'NOUN'), (hopes, 'NOUN'), (inflation, 'NOUN'), (month, 'NOUN'), (month, 'NOUN'), (gains, 'NOUN'), (expectations, 'NOUN'), (%, 'NOUN'), (headline, 'NOUN'), (%, 'NOUN'), (estimate, 'NOUN'), (%, 'NOUN'), (increase, 'NOUN'), (core, 'NOUN'), (outlook, 'NOUN'), (%, 'NOUN'), (gain, 'NOUN'), (price, 'NOUN'), (gains, 'NOUN'), (workers, 'NOUN'), (ground, 'NOUN'), (wages, 'NOUN'), (inflation, 'NOUN'), (%, 'NOUN'), (month, 'NOUN'), (increase, 'NOUN'), (%, 'NOUN'), (earnings, 'NOUN'), (year, 'NOUN'), (earnings, 

In [70]:
# ii. Extract all numbers (NUM POS type) in a python list

numbers = []

for token in doc:
    if token.pos_ == 'NUM':
        numbers.append((token, token.pos_))

print(numbers)

[(8.3, 'NUM'), (8.1, 'NUM'), (6.2, 'NUM'), (6, 'NUM'), (0.3, 'NUM'), (0.2, 'NUM'), (0.6, 'NUM'), (0.4, 'NUM'), (0.1, 'NUM'), (0.3, 'NUM'), (2.6, 'NUM'), (2021, 'NUM'), (1984, 'NUM'), (one, 'NUM'), (two, 'NUM'), (two, 'NUM'), (2, 'NUM')]


In [74]:
# iii. Print a count of all POS tags in this story

count = doc.count_by(spacy.attrs.POS)
count

{92: 95,
 100: 27,
 86: 15,
 85: 39,
 96: 20,
 97: 30,
 90: 34,
 95: 4,
 87: 13,
 89: 10,
 84: 23,
 93: 17,
 94: 4,
 98: 8,
 101: 1}

In [75]:
for k,v in count.items():

    print(doc.vocab[k].text, "|",v)

NOUN | 95
VERB | 27
ADV | 15
ADP | 39
PROPN | 20
PUNCT | 30
DET | 34
PRON | 4
AUX | 13
CCONJ | 10
ADJ | 23
NUM | 17
PART | 4
SCONJ | 8
X | 1


# Displacy

In [80]:
from spacy import displacy 

text = "Hello Mate, I love burgers from McDonald's, that's fasho dude. Elon Musk be earning those G's from Tesla"
doc = nlp(text)
for ent in doc.ents:
    print(ent, "|", ent.label_, '|', spacy.explain(ent.label_))
    

displacy.render(doc, style='ent')

nlp.pipe_labels['ner']

McDonald's | ORG | Companies, agencies, institutions, etc.
Tesla | ORG | Companies, agencies, institutions, etc.


['CARDINAL',
 'DATE',
 'EVENT',
 'FAC',
 'GPE',
 'LANGUAGE',
 'LAW',
 'LOC',
 'MONEY',
 'NORP',
 'ORDINAL',
 'ORG',
 'PERCENT',
 'PERSON',
 'PRODUCT',
 'QUANTITY',
 'TIME',
 'WORK_OF_ART']

# Named Entity Recognition (NER): Exercises

Excercice: 1
Extract all the Geographical (cities, Countries, states) names from a given text

In [111]:
text = """Kiran want to know the famous foods in each state of India. So, he opened Google and search for this question. Google showed that
in Delhi it is Chaat, in Gujarat it is Dal Dhokli, in Tamilnadu it is Pongal, in Andhrapradesh it is Biryani, in Assam it is Papaya Khar,
in Bihar it is Litti Chowkha and so on for all other states"""

doc = nlp(text)

print(nlp.pipe_labels['ner'])

entities = []


for ent in doc.ents:
    if ent.label_ == 'GPE':
        entities.append(ent)
print(entities)
print(len(entities))

['CARDINAL', 'DATE', 'EVENT', 'FAC', 'GPE', 'LANGUAGE', 'LAW', 'LOC', 'MONEY', 'NORP', 'ORDINAL', 'ORG', 'PERCENT', 'PERSON', 'PRODUCT', 'QUANTITY', 'TIME', 'WORK_OF_ART']
[Kiran, India, Delhi, Gujarat, Tamilnadu, Andhrapradesh, Assam, Bihar]
8


Excersie: 2
Extract all the birth dates of cricketers in the given Text

In [109]:
text = """Sachin Tendulkar was born on 24 April 1973, Virat Kholi was born on 5 November 1988, Dhoni was born on 7 July 1981
and finally Ricky ponting was born on 19 December 1974."""

doc = nlp(text)

entities = []

for ent in doc.ents:
    if ent.label_ == 'DATE':
        entities.append(ent)
        
print(entities)
print(len(entities))

[24 April 1973, 5 November 1988, 7 July 1981, 19 December 1974]
4


# Bag of words: Exercises
In this Exercise, you are going to classify whether a given movie review is positive or negative.
you are going to use Bag of words for pre-processing the text and apply different classification algorithms.
Sklearn CountVectorizer has the inbuilt implementations for Bag of Words.

In [166]:
#Import necessary libraries

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from  sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

In [167]:
#1. read the data provided in the same directory with name 'movies_sentiment_data.csv' and store it in df variable
df = pd.read_csv('IMDB Dataset.csv').head(10000)


#2. print the shape of the data
print(df.shape)

#3. print top 5 datapoints
print(df.head())

(10000, 2)
                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive


In [168]:
#creating a new column "Category" which represent 1 if the sentiment is positive or 0 if it is negative
df['category'] = df['sentiment'].apply(lambda x: 1 if x == 'positive' else 0)

data = df['review']
target = df['category']
#check the distribution of 'Category' and see whether the Target labels are balanced or not.
print(df['category'].value_counts())

#Do the 'train-test' splitting with test size of 20%
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(data, target, test_size = 0.2)

x_train.shape

category
1    5028
0    4972
Name: count, dtype: int64


(8000,)

#### Exercise-1

using sklearn pipeline module create a classification pipeline to classify the movie review's positive or negative.
Note:

use CountVectorizer for pre-processing the text.

use Random Forest as the classifier with estimators as 50 and criterion as entropy.

print the classification report.

References:

https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html

https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html

In [179]:
#1. create a pipeline object
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline

model = Pipeline([
    ('vectorizer', CountVectorizer()),                                                    #initializing the vectorizer
    ('random_forest', (RandomForestClassifier(n_estimators=50, criterion='entropy')))      #using the RandomForest classifier
])



#2. fit with X_train and y_train
model.fit(x_train, y_train)


#3. get the predictions for X_test and store it in y_pred
y_pred = model.predict(x_test)


#4. print the classfication report
score = classification_report(y_test, y_pred)
print(score)

              precision    recall  f1-score   support

           0       0.80      0.82      0.81       984
           1       0.82      0.81      0.81      1016

    accuracy                           0.81      2000
   macro avg       0.81      0.81      0.81      2000
weighted avg       0.81      0.81      0.81      2000



#### Exercise-2

using sklearn pipeline module create a classification pipeline to classify the movie review's positive or negative..
Note:

use CountVectorizer for pre-processing the text.
use KNN as the classifier with n_neighbors of 10 and metric as 'euclidean'.
print the classification report.
References:

https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html
https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html

In [181]:
#1. create a pipeline object
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

model = Pipeline([
    ('vectorizer', CountVectorizer()),                                                    #initializing the vectorizer
    ('knn', (KNeighborsClassifier(n_neighbors=10, metric='euclidean')))      #using the KNN classifier
])


#2. fit with X_train and y_train
model.fit(x_train, y_train)


#3. get the predictions for X_test and store it in y_pred
y_pred = model.predict(x_test)

#4. print the classfication report
score = classification_report(y_test, y_pred)
print(score)

              precision    recall  f1-score   support

           0       0.63      0.68      0.65       984
           1       0.66      0.61      0.63      1016

    accuracy                           0.64      2000
   macro avg       0.65      0.65      0.64      2000
weighted avg       0.65      0.64      0.64      2000



#### Exercise-3

using sklearn pipeline module create a classification pipeline to classify the movie review's positive or negative..
Note:

use CountVectorizer for pre-processing the text.
use Multinomial Naive Bayes as the classifier.
print the classification report.
References:

https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html
https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.MultinomialNB.html

In [182]:
#1. create a pipeline object
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

model = Pipeline([
    ('vectorizer', CountVectorizer()),                                                    #initializing the vectorizer
    ('naive_bayes', (MultinomialNB()))      #using the NB classifier
])


#2. fit with X_train and y_train
model.fit(x_train, y_train)


#3. get the predictions for X_test and store it in y_pred
y_pred = model.predict(x_test)

#4. print the classfication report
score = classification_report(y_test, y_pred)
print(score)

              precision    recall  f1-score   support

           0       0.81      0.87      0.84       984
           1       0.87      0.80      0.83      1016

    accuracy                           0.84      2000
   macro avg       0.84      0.84      0.84      2000
weighted avg       0.84      0.84      0.84      2000



# Stop Words: Exercise
Run this cell to import all necessary packages

In [183]:

#import spacy and load the model

import spacy
nlp = spacy.load("en_core_web_sm")


#### Exercise1:

From a Given Text, Count the number of stop words in it.
Print the percentage of stop word tokens compared to all tokens in a given text.

In [203]:
text = '''
Thor: Love and Thunder is a 2022 American superhero film based on Marvel Comics featuring the character Thor, produced by Marvel Studios and 
distributed by Walt Disney Studios Motion Pictures. It is the sequel to Thor: Ragnarok (2017) and the 29th film in the Marvel Cinematic Universe (MCU).
The film is directed by Taika Waititi, who co-wrote the script with Jennifer Kaytin Robinson, and stars Chris Hemsworth as Thor alongside Christian Bale, Tessa Thompson,
Jaimie Alexander, Waititi, Russell Crowe, and Natalie Portman. In the film, Thor attempts to find inner peace, but must return to action and recruit Valkyrie (Thompson),
Korg (Waititi), and Jane Foster (Portman)—who is now the Mighty Thor—to stop Gorr the God Butcher (Bale) from eliminating all gods.
'''

#step1: Create the object 'doc' for the given text using nlp()
doc = nlp(text)


#step2: define the variables to keep track of stopwords count and total words count
from spacy.lang.en import stop_words

stop_words = stop_words.STOP_WORDS


stop_words_count = 0
words_count = 0

#step3: iterate through all the words in the document
for token in doc:
    if token.text.lower() in stop_words:
        stop_words_count += 1
    else:
        words_count += 1
    
#step4: print the count of stop words
print(stop_words_count)

#step5: print the percentage of stop words compared to total words in the text
percentage = stop_words_count / (stop_words_count + words_count)
print('Percentage of stop words:', percentage * 100, '%')

40
Percentage of stop words: 25.0 %


#### Exercise2:

Spacy default implementation considers "not" as a stop word. But in some scenarios removing 'not' will completely change the meaning of the statement/text. For Example, consider these two statements:

- this is a good movie       ----> Positive Statement
- this is not a good movie   ----> Negative Statement
So, after applying stopwords to those 2 texts, both will return "good movie" and does not respect the polarity/sentiments of text.

Now, your task is to remove this stop word "not" in spaCy and help in distinguishing the texts.

Hint: GOOGLE IT! Google is your friend.

In [215]:
#use this pre-processing function to pass the text and to remove all the stop words and finally get the cleaned form
def preprocess(text):
    doc = nlp(text)
    no_stop_words = [token.text for token in doc if not token.is_stop]
    return " ".join(no_stop_words)       

#Step1: remove the stopword 'not' in spacy
nlp.vocab['not'].is_stop = False

#step2: send the two texts given above into the pre-process function and store the transformed texts
text1 = preprocess('this is a good movie')
text2 = preprocess('this is not a good movie')


#step3: finally print those 2 transformed texts
print('Text 1:', text1)
print('Text 2:', text2)

Text 1: good movie
Text 2: not good movie


#### Exercise3:

From a given text, output the most frequently used token after removing all the stop word tokens and punctuations in it.

In [236]:
import string

text = ''' The India men's national cricket team, also known as Team India or the Men in Blue, represents India in men's international cricket.
It is governed by the Board of Control for Cricket in India (BCCI), and is a Full Member of the International Cricket Council (ICC) with Test,
One Day International (ODI) and Twenty20 International (T20I) status. Cricket was introduced to India by British sailors in the 18th century, and the 
first cricket club was established in 1792. India's national cricket team played its first Test match on 25 June 1932 at Lord's, becoming the sixth team to be
granted test cricket status.
'''


#step1: Create the object 'doc' for the given text using nlp()
doc = nlp(text)


#step2: remove all the stop words and punctuations and store all the remaining tokens in a new list
remaining_tokens = []
for token in doc:
  if token.is_stop or token.is_punct:    #check whether a given token is stop word or punctuations
    continue
  remaining_tokens.append(token.text)


#step3: create a new dictionary and get the frequency of words by iterating through the list which contains stored tokens  
frequency_tokens = {}
for token in remaining_tokens:
  if token != '\n' and token != ' ':      #As spacy considers new line and empty spaces as seperate token, it's better to ignore them
    if token not in frequency_tokens:     #if a particular token occurs for the first time, we initialise it to 1
      frequency_tokens[token] = 1
    else:
      frequency_tokens[token] += 1        #if a partcular token is already present, then increment by 1 based on value already presented


#step4: get the maximum frequency word
max_freq_word = max(frequency_tokens.keys(), key=(lambda key: frequency_tokens[key]))


#step5: finally print the result
print(f"Maximum frequency word: {max_freq_word}") 

Maximum frequency word: India


# Bag of n_grams: Exercise

Fake news refers to misinformation or disinformation in the country which is spread through word of mouth and more recently through digital communication such as What's app messages, social media posts, etc.

Fake news spreads faster than Real news and creates problems and fear among groups and in society.

We are going to address these problems using classical NLP techniques and going to classify whether a given message/ text is Real or Fake Message.

You will use a Bag of n-grams to pre-process the text and apply different classification algorithms.

Sklearn CountVectorizer has the inbuilt implementations for Bag of Words.

About Data: Fake News Detection
Credits: https://www.kaggle.com/datasets/clmentbisaillon/fake-and-real-news-dataset

This data consists of two columns. - Text - label

Text is the statements or messages regarding a particular event/situation.

label feature tells whether the given Text is Fake or Real.

As there are only 2 classes, this problem comes under the Binary Classification.

In [7]:
import pandas as pd
df_fake = pd.read_csv(r"C:\Users\teore\OneDrive\Documents\GitHub\NLP_tutorial\Fake.csv")
df_true = pd.read_csv(r"C:\Users\teore\OneDrive\Documents\GitHub\NLP_tutorial\True.csv")

df_fake['category'] = 'Fake'
df_true['category'] = 'True'

df_combined = pd.concat([df_fake, df_true])
df_combined.to_csv(r"C:\Users\teore\OneDrive\Documents\GitHub\NLP_tutorial\Fake_Real_Data.csv")
df_combined

Unnamed: 0,title,text,subject,date,category
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",Fake
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",Fake
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",Fake
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",Fake
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",Fake
...,...,...,...,...,...
21412,'Fully committed' NATO backs new U.S. approach...,BRUSSELS (Reuters) - NATO allies on Tuesday we...,worldnews,"August 22, 2017",True
21413,LexisNexis withdrew two products from Chinese ...,"LONDON (Reuters) - LexisNexis, a provider of l...",worldnews,"August 22, 2017",True
21414,Minsk cultural hub becomes haven from authorities,MINSK (Reuters) - In the shadow of disused Sov...,worldnews,"August 22, 2017",True
21415,Vatican upbeat on possibility of Pope Francis ...,MOSCOW (Reuters) - Vatican Secretary of State ...,worldnews,"August 22, 2017",True


In [8]:
#import pandas library
import pandas as pd

#read the dataset with name "Fake_Real_Data.csv" and store it in a variable df
df = pd.read_csv(r"C:\Users\teore\OneDrive\Documents\GitHub\NLP_tutorial\Fake_Real_Data.csv")

#print the shape of dataframe
print(df.shape)

#print top 5 rows
print(df.head())

#check the distribution of labels 
print(df['subject'].value_counts())

#Add the new column "label_num" which gives a unique number to each of these labels 
df['label_num'] = df['category'].map({'Fake': 0, 'True': 1})

#check the results with top 5 rows
df.head()

(44898, 6)
   Unnamed: 0                                              title  \
0           0   Donald Trump Sends Out Embarrassing New Year’...   
1           1   Drunk Bragging Trump Staffer Started Russian ...   
2           2   Sheriff David Clarke Becomes An Internet Joke...   
3           3   Trump Is So Obsessed He Even Has Obama’s Name...   
4           4   Pope Francis Just Called Out Donald Trump Dur...   

                                                text subject  \
0  Donald Trump just couldn t wish all Americans ...    News   
1  House Intelligence Committee Chairman Devin Nu...    News   
2  On Friday, it was revealed that former Milwauk...    News   
3  On Christmas day, Donald Trump announced that ...    News   
4  Pope Francis used his annual Christmas Day mes...    News   

                date category  
0  December 31, 2017     Fake  
1  December 31, 2017     Fake  
2  December 30, 2017     Fake  
3  December 29, 2017     Fake  
4  December 25, 2017     Fake  
sub

Unnamed: 0.1,Unnamed: 0,title,text,subject,date,category,label_num
0,0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",Fake,0
1,1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",Fake,0
2,2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",Fake,0
3,3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",Fake,0
4,4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",Fake,0


# Modelling without Pre-processing Text data

In [17]:
# Reduce sample
df_sampled = df.sample(n=10_000, random_state=42)

# import train-test-split from sklearn 
from sklearn.model_selection import train_test_split 


#Do the 'train-test' splitting with test size of 20% with random state of 2022 and stratify sampling too
X_train, X_test, y_train, y_test = train_test_split(
    df_sampled['text'], 
    df_sampled['label_num'], 
    test_size=0.2, 
    random_state=2022,
    stratify=df_sampled['label_num']
)

#print the shapes of X_train and X_test
print('train_size', X_train.shape)
print('test_size', X_test.shape)
print('\n')
print(df_sampled['label_num'].value_counts())


train_size (8000,)
test_size (2000,)


label_num
0    5254
1    4746
Name: count, dtype: int64


### Attempt 1 :

using sklearn pipeline module create a classification pipeline to classify the Data.
Note:

using CountVectorizer with unigram, bigram, and trigrams.
use KNN as the classifier with n_neighbors of 10 and metric as 'euclidean' distance.
print the classification report.

In [18]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report

#1. create a pipeline object
knn = Pipeline([
    ('vectorizer', CountVectorizer(ngram_range = (1,3))),
    ('knn', KNeighborsClassifier(n_neighbors=10, metric='euclidean'))
])



#2. fit with X_train and y_train
knn.fit(X_train, y_train)


#3. get the predictions for X_test and store it in y_pred
y_pred = knn.predict(X_test)


#4. print the classfication report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.73      0.76      0.75      1051
           1       0.73      0.69      0.71       949

    accuracy                           0.73      2000
   macro avg       0.73      0.73      0.73      2000
weighted avg       0.73      0.73      0.73      2000



### Attempt 2 :

using the sklearn pipeline module create a classification pipeline to classify the Data.
Note:

using CountVectorizer with unigram, bigram, and trigrams.
use KNN as the classifier with n_neighbors of 10 and metric as 'cosine' distance.
print the classification report.

In [19]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report

#1. create a pipeline object
knn = Pipeline([
    ('vectorizer', CountVectorizer(ngram_range = (1,3))),
    ('knn', KNeighborsClassifier(n_neighbors=10, metric='cosine'))
])



#2. fit with X_train and y_train
knn.fit(X_train, y_train)


#3. get the predictions for X_test and store it in y_pred
y_pred = knn.predict(X_test)


#4. print the classfication report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.61      0.98      0.75      1051
           1       0.93      0.30      0.46       949

    accuracy                           0.66      2000
   macro avg       0.77      0.64      0.60      2000
weighted avg       0.76      0.66      0.61      2000



### Attempt 3 :

using the sklearn pipeline module create a classification pipeline to classify the Data.
Note:

using CountVectorizer with only trigrams.
use RandomForest as the classifier.
print the classification report.

In [20]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

#1. create a pipeline object
rf = Pipeline([
    ('vectorizer', CountVectorizer(ngram_range = (3,3))),
    ('random_forest', RandomForestClassifier())
])



#2. fit with X_train and y_train
rf.fit(X_train, y_train)


#3. get the predictions for X_test and store it in y_pred
y_pred = rf.predict(X_test)


#4. print the classfication report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.92      0.96      0.94      1051
           1       0.95      0.91      0.93       949

    accuracy                           0.94      2000
   macro avg       0.94      0.93      0.94      2000
weighted avg       0.94      0.94      0.94      2000



### Attempt 4 :

using the sklearn pipeline module create a classification pipeline to classify the Data.
Note:

using CountVectorizer with both unigram and bigrams.
use Multinomial Naive Bayes as the classifier with an alpha value of 0.75.
print the classification report.

In [21]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

#1. create a pipeline object
nb = Pipeline([
    ('vectorizer', CountVectorizer(ngram_range = (1,2))),
    ('nb', MultinomialNB(alpha = 0.75))
])



#2. fit with X_train and y_train
nb.fit(X_train, y_train)


#3. get the predictions for X_test and store it in y_pred
y_pred = nb.predict(X_test)


#4. print the classfication report
print(classification_report(y_test, y_pred))

MemoryError: 

### Use text pre-processing to remove stop words, punctuations and apply lemmatization

In [None]:
#use this utility function to get the preprocessed text data
import spacy

# load english language model and create nlp object from it
nlp = spacy.load("en_core_web_sm") 

def preprocess(text):
    # remove stop words and lemmatize the text
    doc = nlp(text)
    filtered_tokens = []
    for token in doc:
        if token.is_stop or token.is_punct:
            continue
        filtered_tokens.append(token.lemma_)
    
    return " ".join(filtered_tokens) 

In [None]:
# create a new column "preprocessed_txt" and use the utility function above to get the clean data

# Separate the data into two classes
df_true = df[df['label_num'] == 1].sample(n=1, random_state=42)
df_fake = df[df['label_num'] == 0].sample(n=1, random_state=42)

# Combine them to create a balanced dataset
df_balanced = pd.concat([df_true, df_fake]).sample(frac=1, random_state=42)  # Shuffle the dataset

# Print the new class distribution
print(df_balanced['label_num'].value_counts())

df_balanced["preprocessed_txt"] = df['text'].apply(preprocess)
# this will take some time, please be patient

In [None]:
#print the top 5 rows
df_balanced.head()

### Build a model with pre processed text

In [None]:
#Do the 'train-test' splitting with test size of 20% with random state of 2022 and stratify sampling too
X_train, X_test, y_train, y_test = train_test_split(
    df['preprocessed_txt'], 
    df['label_num'], 
    test_size=0.2, 
    random_state=2022,
    stratify=df['label_num']
)


from imblearn.under_sampling import RandomUnderSampler

# Define the undersampler
undersampler = RandomUnderSampler(sampling_strategy=1.0, random_state=42)

# Resample both X_train and y_train
X_train_resampled, y_train_resampled = undersampler.fit_resample(X_train.to_frame(), y_train)

# Convert back to Series
X_train = X_train_resampled.squeeze()  # Ensure it's a Series
y_train = y_train_resampled
#Note: Make sure to use only the "preprocessed_txt" column for splitting

Let's check the scores with our best model till now

### Random Forest
### Attempt1 :

using the sklearn pipeline module create a classification pipeline to classify the Data.
Note:

using CountVectorizer with only trigrams.
use RandomForest as the classifier.
print the classification report.

In [None]:
#1. create a pipeline object
rf = Pipeline([('vectorizer', CountVectorizer(ngram_range = (3,3)), 
                ('random_forest', RandomForestClassifier()))])


#2. fit with X_train and y_train
rf.fit(X_train, y_train)


#3. get the predictions for X_test and store it in y_pred
y_pred = rf.predict(X_test)


#4. print the classfication report
print(classfication_report(y_test, y_pred))

### Attempt2 :

using the sklearn pipeline module create a classification pipeline to classify the Data.
Note:

using CountVectorizer with unigram, Bigram, and trigrams.
use RandomForest as the classifier.
print the classification report.

In [None]:
#1. create a pipeline object
rf = Pipeline([('vectorizer', CountVectorizer(ngram_range = (2,3)), 
                ('random_forest', RandomForestClassifier()))])


#2. fit with X_train and y_train
rf.fit(X_train, y_train)


#3. get the predictions for X_test and store it in y_pred
y_pred = rf.predict(X_test)


#4. print the classfication report
print(classfication_report(y_test, y_pred))

In [None]:
#finally print the confusion matrix for the best model
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
cm


from matplotlib import pyplot as plt
import seaborn as sn
plt.figure(figsize = (10,7))
sn.heatmap(cm, annot=True, fmt='d')
plt.xlabel('Prediction')
plt.ylabel('Truth')