# Implementation of Naive Bayes in NLP

## Syed Sajjad Askari
## 2139484

* About Dataset
* import Library
* EDA(Exploratory Data Analysis)
* Text Cleaning
* Data Visualization
* label Encoding
* Data Preprocessing
* Train-Test-Split
* Model Building
* Model Evaluation


## About Dataset

*The IMDB dataset contains 50K movie reviews for natural language processing or text analytics. 
It is a dataset for binary sentiment classification. 
The goal is to build an NLP model for sentiment analysis that predicts whether a review is positive or negative, using either classical classification or deep learning algorithms.*

### Import Lib

In [44]:
import pandas as pd 
import numpy as np 
import seaborn as sns
import matplotlib.pyplot as plt 
from nltk.tokenize import word_tokenize 
from nltk.corpus import stopwords 
import nltk
from nltk.stem import WordNetLemmatizer 
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.feature_extraction.text import TfidfVectorizer
from wordcloud import WordCloud
import warnings
warnings.filterwarnings('ignore')
import re

### Download NLTK packages

In [45]:
# Download the NLTK resources this notebook relies on (only needed the first time)
# NOTE(review): newer NLTK releases may additionally require 'punkt_tab' for word_tokenize — confirm
nltk.download('punkt') # Punkt tokenizer models (not "punctuation")
nltk.download('wordnet')# WordNet data, for lemmatization
nltk.download('stopwords')# stopwords corpus
nltk.download('omw-1.4')# presumably the Open Multilingual WordNet data used alongside WordNet — confirm

In [46]:
import os
# Print the full path of every file found below the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

## Exploratory Data Analysis

In [47]:
df=pd.read_csv('../input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv')
df

In [48]:
df.shape

In [49]:
#check for null values
df.isnull().sum()

In [50]:
#check for duplicated values
df.duplicated().sum()

In [51]:
# Remove exact duplicate rows, then renumber the rows.
# reset_index() keeps the old row labels in a new "index" column.
df=df.drop_duplicates().reset_index()

In [52]:
df.drop("index",axis=1,inplace=True)

In [53]:
df.head()

In [54]:
df['sentiment'].value_counts()

In [55]:
# Count each sentiment class once and reuse the result for both axes.
sentiment_counts=df['sentiment'].value_counts()
sns.barplot(x=sentiment_counts.index,y=sentiment_counts,data=df)
plt.show()

#### The dataset is well balanced between the two classes

## Text Cleaning

### 1. Major Cleaning 

* Remove URLs
* Remove HTML tags
* Remove numbers
* Remove emails


In [56]:
#remove URLS 
def remove_urls(text):
    """Return *text* with URLs removed.

    A URL is anything starting with ``http://``, ``https://`` or ``www.``
    and running up to the next whitespace character. Surrounding whitespace
    is left untouched.
    """
    return re.sub(r'https?://\S+|www\.\S+', r'', text)

#remove HTMLS tags
def remove_html(text):
    """Return *text* with HTML-like tags removed.

    Every shortest ``<...>`` span on a single line is deleted; the text
    between tags is kept.
    """
    return re.sub('<.*?>', r'', text)

#remove numbers
def remove_numbers(text):
    """Return *text* with every run of digits removed.

    The previous implementation used ``str.replace``, which treats its
    argument as literal text, so it only ever removed the characters
    ``\\d+`` and never any digit. ``re.sub`` applies the pattern as a regex.

    Args:
        text: Input string.

    Returns:
        The string without digits; surrounding whitespace is left as is.
    """
    return re.sub(r'\d+', '', text)

#remove emails 
def remove_emails(text):
    """Return *text* with e-mail-like tokens removed.

    A token is any run of non-whitespace characters containing ``@``; one
    trailing whitespace character is removed with it. The previous
    implementation used ``str.replace``, which treats the pattern as literal
    text and therefore never removed anything.

    Args:
        text: Input string.

    Returns:
        The string without ``@`` tokens.
    """
    return re.sub(r"\S*@\S*\s?", '', text)

In [57]:
# Run the coarse cleaners over every review, one after another, in this order.
for cleaner in (remove_urls, remove_html, remove_numbers, remove_emails):
    df['review'] = df['review'].apply(cleaner)

In [58]:
df

In [59]:
# Number of whitespace-separated words in each review at this point
# (after the major cleaning, before clean_text is applied)
df['num_words']=df['review'].apply(lambda x: len(str(x).split()))

In [60]:
df

### 2. Fine Cleaning 

* Convert text to lower case
* Tokenize with `word_tokenize`
* Remove punctuation and stopwords
* Remove special characters (keep alphabetic tokens only)
* Lemmatization
* Return the cleaned text

In [61]:
# Negation words and phrases. clean_text() rewrites each of them to "not" and
# also excludes them from the stopword list, so negation survives the cleaning.
# Repeated entries are harmless: the list is turned into a set before it is used.
lst=['ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't",'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", "won't", 'wouldn', "wouldn't",
'are not','could not','would not','did not','does not','did not','was not','wasnt','does not','had not','have not','is not','might not','must not','need not','shall not','was not','no so','had not','wont',"won't",'do not','is not','isnt',"isn't",'not','never','am not']

In [62]:
def clean_text(text):
    """Normalise one review into a space-joined string of lemmatised tokens.

    Steps: lower-case the text, rewrite every negation listed in the
    module-level ``lst`` to the single token ``not``, tokenize, drop
    stopwords / punctuation / a few auxiliary verbs, keep alphabetic tokens
    only, and lemmatise what is left.

    Args:
        text: Raw review text.

    Returns:
        The cleaned review as one string (may be empty).
    """
    # Local import: the notebook only imports `string` in a later cell.
    import string

    # 1. convert text into lower case
    text = text.lower()

    # 2. Replace all negations by "not".
    # The old code used substring replacement in set-iteration order, which
    # corrupted ordinary words ("ain" inside "again" -> "agnot") and was
    # order dependent. Matching whole words only, longest alternative first
    # ("aren't" before "aren"), makes the rewrite exact and deterministic.
    negations = set(lst)
    ordered = sorted(negations, key=lambda w: (-len(w), w))
    negation_pattern = r"\b(?:" + "|".join(re.escape(w) for w in ordered) + r")\b"
    text = re.sub(negation_pattern, 'not', text)
    text = re.sub(r" {2,}", " ", text)

    # 3. word_tokenize
    tokens = word_tokenize(text.strip())

    # 4. Everything to drop: stopwords minus the negations (so "not" is
    #    kept), single punctuation characters, and a few auxiliary verbs.
    #    A set gives O(1) membership tests per token.
    removable = set(stopwords.words('english')) - negations
    removable.update(string.punctuation)
    removable.update(["would", "could", "should", "will", "have", "had"])

    # 5. Keep purely alphabetic tokens and lemmatise them.
    lemma = WordNetLemmatizer()
    cleaned = [
        lemma.lemmatize(tok)
        for tok in tokens
        if tok not in removable and tok.isalpha()
    ]

    # 6. join text
    return ' '.join(cleaned)

In [63]:
import string
#call function
df['review']=df['review'].apply(clean_text)

In [64]:
# Number of whitespace-separated words in each review after clean_text() has been applied
df['num_words_clean']=df['review'].apply(lambda x: len(str(x).split()))

In [65]:
df

In [66]:
# Histogram of review length (num_words) for positive vs. negative reviews.
# Each series is labelled and a legend is drawn so the overlaid colours can be told apart.
plt.figure(figsize=(12,6))
sns.histplot(df[df['sentiment']=='positive']['num_words'],color='blue',bins=80,label='positive')
sns.histplot(df[df['sentiment']=='negative']['num_words'],color='red',bins=80,label='negative')
plt.legend()
plt.show()

### Label Encoding of Target Feature

In [74]:
# Encode the target: 'positive' -> 1, 'negative' -> 0.
# Series.map leaves NaN for any value that is not a key of the dict.
df['sentiment']=df['sentiment'].map({'positive':1,'negative':0})

In [75]:
#sen_len=[len(word_tokenize(sent)) for sent in df['review']]
#df['sen_len']=sen_len

In [76]:
#max len of words
df['num_words_clean'].max()

In [77]:
df.describe()

In [78]:
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize,sent_tokenize
import re,string
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
from sklearn.model_selection import train_test_split
from nltk import pos_tag
from nltk.corpus import wordnet

## Data Preprocessing

### Input & Output 

In [79]:
#Input & Output 
#X= df.drop(columns=['sentiment','num_words','num_words_clean'],axis=1)
X=df['review']
Y=df['sentiment']

In [80]:
X

In [81]:
Y

In [82]:
# Entire corpus: every cleaned review, in row order, as a plain Python list.
corpus=list(df['review'])

In [83]:
corpus[:3]

### Train-Test split (80/20)

In [84]:
# Hold out 20% of the reviews for testing; random_state fixes the shuffle so the split is reproducible.
from sklearn.model_selection import train_test_split
X_train,X_test,Y_train,Y_test=train_test_split(X,Y,test_size=0.2,random_state=5)

In [85]:
X_train

In [86]:
X_train.shape,X_test.shape

In [87]:
# 95th percentile of the cleaned review length in words:
# 95% of the reviews are at most this long.
np.quantile(df['num_words_clean'],0.95)

## Model Building

### Multinomial NB

The multinomial Naive Bayes classifier is suitable for classification with discrete features (e.g., word counts for text classification)

In [88]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score

### 1- CountVectorizer

In [89]:
#CountVectorizer
# Bag-of-words term counts over 1- to 3-grams, capped at 10000 features.
cv=CountVectorizer(max_features=10000,binary=False,ngram_range=(1,3))
#transformed train reviews
# The vocabulary is learned from the training split only ...
cv_train_reviews=cv.fit_transform(X_train)
#transformed test reviews
# ... and reused unchanged to encode the test split.
cv_test_reviews=cv.transform(X_test)

print('BOW_cv_train:',cv_train_reviews.shape)
print('BOW_cv_test:',cv_test_reviews.shape)

### 2- TfidfVectorizer

In [90]:
#TfidfVectorizer
# TF-IDF weights over 1- to 3-grams, capped at 10000 features.
tv=TfidfVectorizer(max_features=10000,ngram_range=(1,3))
#transformed train reviews
# Fitted on the training split only ...
tv_train_reviews=tv.fit_transform(X_train)
#transformed test reviews
# ... and reused unchanged to encode the test split.
tv_test_reviews=tv.transform(X_test)
print('Tfidf_train:',tv_train_reviews.shape)
print('Tfidf_test:',tv_test_reviews.shape)

### Train model

In [92]:
#training the model with CountVectorizer features
mnb=MultinomialNB()
#fitting the nb for bag of words
mnb_bow=mnb.fit(cv_train_reviews,Y_train)
print(mnb_bow)

In [93]:
#training the model with TfidfVectorizer features
#fitting the nb for tfidf features
mnb_tfidf=mnb.fit(tv_train_reviews,Y_train)
print(mnb_tfidf)

In [94]:
#Predicting the model for bag of words or CountVectorizer
mnb_bow_predict=mnb.predict(cv_test_reviews)
#Predicting the model for tfidf features
mnb_tfidf_predict=mnb.predict(tv_test_reviews)

### Model Evaluation 

In [95]:
#Model Evaluation
# Accuracy on the held-out test labels for each set of predictions.
#Accuracy score for bag of words
mnb_bow_score=accuracy_score(Y_test,mnb_bow_predict)
print("mnb_bow_score :",mnb_bow_score)
#Accuracy score for tfidf features
mnb_tfidf_score=accuracy_score(Y_test,mnb_tfidf_predict)
print("mnb_tfidf_score :",mnb_tfidf_score)

In [96]:
mnb_bow_report = classification_report(Y_test,mnb_bow_predict,target_names = ['0','1'])
print(mnb_bow_report)
cm_cv = confusion_matrix(Y_test,mnb_bow_predict)
cm_cv

In [97]:
mnb_tfidf_report = classification_report(Y_test,mnb_tfidf_predict,target_names = ['0','1'])
print(mnb_tfidf_report)
cm_tv = confusion_matrix(Y_test,mnb_tfidf_predict)
cm_tv

In [98]:
# Wrap the CountVectorizer confusion matrix in a DataFrame (rows = actual, columns = predicted)
cm_cv = pd.DataFrame(cm_cv, index=[0,1], columns=[0,1])
cm_cv.index.name = 'Actual'
cm_cv.columns.name = 'Predicted'
plt.figure(figsize = (7,5))
# fmt='' replaces seaborn's default annotation format, so the counts are printed as plain integers
sns.heatmap(cm_cv,cmap= "Blues",annot = True, fmt='',xticklabels=["Negative Review 0","Positive Reviews 1"],yticklabels=["Negative Review 0","Positive Reviews 1"])

plt.title("CM for MNB Model WIth with CountVectorizer features",color='red',size=14)
plt.show()

In [99]:
# Wrap the TfidfVectorizer confusion matrix in a DataFrame (rows = actual, columns = predicted)
cm_tv = pd.DataFrame(cm_tv, index=[0,1], columns=[0,1])
cm_tv.index.name = 'Actual'
cm_tv.columns.name = 'Predicted'
plt.figure(figsize = (7,5))
# fmt='' replaces seaborn's default annotation format, so the counts are printed as plain integers
sns.heatmap(cm_tv,cmap= "Blues",annot = True, fmt='',xticklabels=["Negative Review 0","Positive Reviews 1"]
            ,yticklabels=["Negative Review 0","Positive Reviews 1"])
plt.title("CM for MNB Model WIth with TfidfVectorizer features",color='red',size=14)
plt.show()

### Accuracy of MNB by BOW and TFIDF 

In [100]:
# Summary table of both accuracies, expressed in percent.
df_results_mnb=pd.DataFrame({"MNB BOW score":[mnb_bow_score],"MNB tfidf score":[mnb_tfidf_score]})
# Scale to percent first and round last: rounding before multiplying by 100
# reintroduces floating-point noise (values such as 85.97000000000001).
df_results_mnb=(df_results_mnb*100).round(2)
df_results_mnb