### Import Libraries

In [None]:
import numpy as np
import pandas as pd
import warnings
# Suppress every warning for the rest of the session (deprecation warnings
# included), which keeps cell output short but also hides potential problems.
warnings.filterwarnings('ignore')

In [None]:
import nltk
import re

# Resources needed by nltk.word_tokenize.
nltk.download('punkt') # punkt tokenizer
# Recent NLTK releases load the tokenizer tables from 'punkt_tab' rather than
# 'punkt'; without this download nltk.word_tokenize raises LookupError there.
# Fetching both keeps the notebook working on older and newer NLTK versions.
nltk.download('punkt_tab')

# English stopword list used to filter tokens during cleaning.
nltk.download('stopwords')
from nltk.corpus import stopwords

# WordNet data backing the lemmatizer.
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [None]:
# Colab-only: mount Google Drive at /content/drive so the dataset path used
# when loading the CSV resolves.
from google.colab import drive
drive.mount('/content/drive')

### Import dataset

In [None]:
# Read the review dataset from the mounted Drive into a DataFrame and preview
# the first rows.
# NOTE(review): the path is hard-coded to one Drive folder layout — adjust it
# to wherever your copy of the CSV lives.
data = pd.read_csv('/content/drive/MyDrive/Datasets/imdb_sentiment_analysis_dataset.csv')
data.head()

In [None]:
# Work on a random 15,000-row subset to keep cleaning and training fast.
# A fixed random_state makes the subset — and therefore every downstream
# metric — identical on each Restart & Run All (same seed as the train/test
# split).
data = data.sample(15000, random_state=0).reset_index(drop=True)
data.head()

In [None]:
# (rows, columns) of the frame after sampling.
data.shape

In [None]:
# Split the frame by position into NumPy arrays: first column -> X, second
# column -> y.
# NOTE(review): presumably column 0 is the review text and column 1 the
# sentiment label — confirm against the CSV header.
X = data.iloc[:, 0].values
y = data.iloc[:, 1].values

### Train Test Split

In [None]:
# Hold out 20% of the rows for evaluation; random_state=0 makes the split
# repeatable for a given input order.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [None]:
# Number of training samples and number of test labels.
len(X_train), len(y_test)

### Encoding target variable

In [None]:
# Fit the encoder on the training labels and replace them with integer codes.
# y_test is intentionally left un-encoded: predictions are mapped back to the
# original labels with le.inverse_transform before they are scored.
le = LabelEncoder()
y_train = le.fit_transform(y_train)
y_train

### Cleaning the text data

In [None]:
# Shared lemmatizer instance; clean_data reads it as a global.
wordnet = WordNetLemmatizer()

In [None]:
def clean_data(X):
  """Normalise raw review strings into space-joined, lemmatised tokens.

  Each review is lower-cased, has literal ``<br />`` tags and every
  non-letter character replaced by a space, is tokenised with
  ``nltk.word_tokenize``, filtered against the English stopword list and
  lemmatised with the notebook-level ``wordnet`` lemmatizer.

  Args:
    X: Iterable of review strings.

  Returns:
    A list with one cleaned string per input review, in input order.
  """
  # Build the stopword lookup once. The original evaluated
  # stopwords.words('english') for every token, repeating loop-invariant work
  # and testing membership against a list; a set gives the same answer faster.
  stop_words = set(stopwords.words('english'))

  # Raw strings avoid the invalid "\/" escape the original pattern relied on;
  # the patterns match exactly the same text.
  br_tag = re.compile(r'<br />')
  non_letters = re.compile(r'[^a-zA-Z]')

  corpus = []
  for review in X:
    review = review.lower()
    review = br_tag.sub(' ', review)
    review = non_letters.sub(' ', review)
    words = nltk.word_tokenize(review)
    words = [wordnet.lemmatize(word) for word in words if word not in stop_words]
    corpus.append(' '.join(words))
  return corpus

In [None]:
# Clean the training reviews. X_train is rebound from the raw array to a list
# of cleaned strings.
X_train = clean_data(X_train)

In [None]:
# Inspect the first cleaned training review.
X_train[0]

A vectorizer converts text into numeric features that a model can work with. Here we use TF-IDF.

In [None]:
# Learn the TF-IDF vocabulary and weights from the training corpus only, then
# convert the sparse result to a dense array (later cells index it and call
# len() on it).
# NOTE(review): the dense array has one column per vocabulary term and may be
# very large — confirm it fits in memory for bigger samples.
tfidf = TfidfVectorizer()
X_train = tfidf.fit_transform(X_train).toarray()

In [None]:
# TF-IDF vector of the first training review.
X_train[0]

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Train a multinomial Naive Bayes classifier with default settings on the
# TF-IDF features and the integer-encoded labels.
mnb = MultinomialNB()
mnb.fit(X_train, y_train)

In [None]:
# Apply the same cleaning to the test reviews, then vectorise them with the
# already-fitted TF-IDF (transform, not fit_transform) so the feature columns
# match the ones the model was trained on.
X_test = clean_data(X_test)
X_test = tfidf.transform(X_test).toarray()

In [None]:
# (number of training rows, number of TF-IDF features)
len(X_train), len(X_train[0])

In [None]:
# (number of test rows, number of TF-IDF features)
len(X_test), len(X_test[0])

In [None]:
# Predict integer-encoded class labels for the test set.
y_pred = mnb.predict(X_test)

In [None]:
# Map the integer predictions back to the original label values so they can be
# compared directly with the un-encoded y_test, then show the first ten.
# NOTE(review): the labels are presumably 'positive' / 'negative' — confirm
# against the dataset.
y_pred = le.inverse_transform(y_pred)
print(y_pred[:10])

In [None]:
# Sanity check: there should be one prediction per test label.
len(y_test), len(y_pred)

In [None]:
from sklearn.metrics import accuracy_score

# Fraction of test reviews whose predicted label equals the true label.
accuracy = accuracy_score(y_test, y_pred)
accuracy

##### The model gives an accuracy of around 85.66%

In [None]:
import pickle

# Persist the fitted classifier, TF-IDF vectorizer and label encoder together
# as one list so they can be restored in the same order.
# The context manager closes (and therefore flushes) the file as soon as the
# dump finishes; the original left the handle open, so the file offered for
# download in the next cell could be incomplete.
with open("naivebayes.pkl", 'wb') as file:
  pickle.dump([mnb, tfidf, le], file)

In [None]:
# Colab-only: send the pickled artefacts to the browser as a file download.
from google.colab import files
files.download('naivebayes.pkl')

In [None]:
# Download the app script that is launched with Streamlit at the end of the
# notebook.
# NOTE(review): no cell visible here creates app.py — presumably it is
# uploaded to the Colab working directory by hand; confirm before running.
files.download('app.py')

#### Install Streamlit and restart the runtime

In [None]:
!pip install -q streamlit

In [None]:
# Fetch the ngrok client archive for Linux x86-64.
# NOTE(review): this is the legacy "stable" download URL — confirm it still
# serves a version the ngrok service accepts.
!wget https://bin.equinox.io/c/4VmDzA7iaHb/ngrok-stable-linux-amd64.zip

In [None]:
!unzip ngrok-stable-linux-amd64.zip

In [None]:
# Start an ngrok HTTP tunnel to local port 8501 (Streamlit's default port) as
# a background process, so the cell returns immediately.
get_ipython().system_raw('./ngrok http 8501 &')

In [None]:
# Query ngrok's local API on port 4040 and print the public URL of the first
# tunnel it reports.
# NOTE(review): this presumably fails if the tunnel has not finished starting —
# re-run the cell after a few seconds if it errors.
!curl -s http://localhost:4040/api/tunnels | python3 -c \
    'import sys, json; print("Execute the next cell and the go to the following URL: " +json.load(sys.stdin)["tunnels"][0]["public_url"])'

In [None]:
# Launch the Streamlit app; the cell keeps running for as long as the server
# is up.
# NOTE(review): expects app.py to already exist at /content/app.py — confirm.
!streamlit run /content/app.py