# Import the Important packages

In [38]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
# sklearn modules
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB # classifier 
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    plot_confusion_matrix,
)
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import LabelEncoder

In [19]:
# text preprocessing modules
from string import punctuation 
# text preprocessing modules
from nltk.tokenize import word_tokenize
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer 
import re #regular expression

In [None]:
nltk.download('stopwords')

In [25]:
# Download dependency
for dependency in (
    "brown",
    "names",
    "wordnet",
    "averaged_perceptron_tagger",
    "universal_tagset",
):
    nltk.download(dependency)
    
import warnings
warnings.filterwarnings("ignore")
# seeding
np.random.seed(123)

[nltk_data] Downloading package brown to
[nltk_data]     C:\Users\ybash\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\brown.zip.
[nltk_data] Downloading package names to
[nltk_data]     C:\Users\ybash\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\names.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ybash\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\wordnet.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\ybash\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.
[nltk_data] Downloading package universal_tagset to
[nltk_data]     C:\Users\ybash\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\universal_tagset.zip.


In [21]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ybash\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

# read dataset

In [8]:
df = pd.read_csv('IMDB.csv')

In [9]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [12]:
#shape of data
df.shape

(50000, 2)

In [14]:
#check null value
df.isnull().sum()

review       0
sentiment    0
dtype: int64

In [17]:
df.sentiment.value_counts()

positive    25000
negative    25000
Name: sentiment, dtype: int64

In [40]:

le=LabelEncoder()
df['sentiment']= le.fit_transform(df['sentiment'])


Unnamed: 0,review,sentiment,cleaned_review
0,One of the other reviewers has mentioned that ...,1,One reviewer mentioned watching Oz episode hoo...
1,A wonderful little production. <br /><br />The...,1,A wonderful little production br br The filmin...
2,I thought this was a wonderful way to spend ti...,1,I thought wonderful way spend time hot summer ...
3,Basically there's a family where a little boy ...,0,Basically family little boy Jake think zombie ...
4,"Petter Mattei's ""Love in the Time of Money"" is...",1,Petter Mattei Love Time Money visually stunnin...


In [41]:
stop_words =  stopwords.words('english')
def text_cleaning(text, remove_stop_words=True, lemmatize_words=True):
    # Clean the text, with the option to remove stop_words and to lemmatize word
    # Clean the text
    text = re.sub(r"[^A-Za-z0-9]", " ", text)
    text = re.sub(r"\'s", " ", text)
    text =  re.sub(r'http\S+',' link ', text)
    text = re.sub(r'\b\d+(?:\.\d+)?\s+', '', text) # remove numbers
        
    # Remove punctuation from text
    text = ''.join([c for c in text if c not in punctuation])
    
    # Optionally, remove stop words
    if remove_stop_words:
        text = text.split()
        text = [w for w in text if not w in stop_words]
        text = " ".join(text)
    # Optionally, shorten words to their stems
    if lemmatize_words:
        text = text.split()
        lemmatizer = WordNetLemmatizer() 
        lemmatized_words = [lemmatizer.lemmatize(word) for word in text]
        text = " ".join(lemmatized_words)
    
    # Return a list of words
    return(text)

In [42]:
df["cleaned_review"] = df["review"].apply(text_cleaning)

In [43]:
df.head()

Unnamed: 0,review,sentiment,cleaned_review
0,One of the other reviewers has mentioned that ...,1,One reviewer mentioned watching Oz episode hoo...
1,A wonderful little production. <br /><br />The...,1,A wonderful little production br br The filmin...
2,I thought this was a wonderful way to spend ti...,1,I thought wonderful way spend time hot summer ...
3,Basically there's a family where a little boy ...,0,Basically family little boy Jake think zombie ...
4,"Petter Mattei's ""Love in the Time of Money"" is...",1,Petter Mattei Love Time Money visually stunnin...


In [44]:
X = df["cleaned_review"]
y = df.sentiment.values

In [45]:
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.15,
    random_state=42,
    shuffle=True,
    stratify=y,
)

In [46]:
sentiment_classifier = Pipeline(steps=[
                               ('pre_processing',TfidfVectorizer(lowercase=False)),
                                 ('naive_bayes',MultinomialNB())
                                 ])

In [47]:
sentiment_classifier.fit(X_train,y_train)

Pipeline(steps=[('pre_processing', TfidfVectorizer(lowercase=False)),
                ('naive_bayes', MultinomialNB())])

In [51]:
y_preds =sentiment_classifier.predict(X_valid)

In [52]:
accuracy_score(y_valid,y_preds)

0.8708

In [60]:
import pickle
# create an iterator object with write permission - model.pkl
with open('sentiment_model_pipeline.pkl', 'wb') as files:
    pickle.dump(sentiment_classifier, files)
