In [1]:
import pandas as pd  # type: ignore
import numpy as np # type: ignore
import matplotlib.pyplot as plt # type: ignore
import re
from nltk.tokenize import word_tokenize # type: ignore
from nltk.corpus import stopwords # type: ignore
from nltk.stem import PorterStemmer, WordNetLemmatizer # type: ignore
from sklearn.feature_extraction.text import TfidfVectorizer # type: ignore
from sklearn.preprocessing import LabelEncoder # type: ignore
from sklearn.model_selection import train_test_split # type: ignore
from keras.utils import to_categorical # type: ignore
from keras import Sequential # type: ignore
from keras.layers import Dense # type: ignore
import joblib # type: ignore
import nltk # type: ignore
import streamlit as st # type: ignore

# Load dataset
df = pd.read_csv('IMDB Dataset.csv')

# Text preprocessing
df['clean_text'] = df['review'].apply(lambda x: re.sub("<.*?>", "", x))
df['clean_text'] = df['clean_text'].apply(lambda x: re.sub(r'[^\w\s]', "", x))
df['clean_text'] = df['clean_text'].str.lower()
df['tokenize_text'] = df['clean_text'].apply(lambda x: word_tokenize(x))

nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
df['filter_text'] = df['tokenize_text'].apply(lambda x: [word for word in x if word not in stop_words])

stem = PorterStemmer()
df['stem_text'] = df['filter_text'].apply(lambda x: [stem.stem(word) for word in x])

lemma = WordNetLemmatizer()
df['lemma_text'] = df['filter_text'].apply(lambda x: [lemma.lemmatize(word) for word in x])

X = df['stem_text']
y = df['sentiment']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Vectorizing text data
tfidf = TfidfVectorizer(lowercase=False)
X_train = tfidf.fit_transform(X_train.apply(lambda x: ''.join(x)))
X_test = tfidf.transform(X_test.apply(lambda x: ''.join(x)))

# Encoding labels
le = LabelEncoder()
y_train = le.fit_transform(y_train)
y_test = le.transform(y_test)
y_train = to_categorical(y_train, num_classes=2)

# Building and training the model
model = Sequential([
    Dense(128, activation='relu', input_shape=(X_train.shape[1],)),
    Dense(64, activation='relu'),
    Dense(32, activation='relu'),
    Dense(2, activation='sigmoid')
])
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.fit(X_train, y_train, epochs=10)

# Saving the model and vectorizer
joblib.dump(model, 'model4.pkl')
joblib.dump(tfidf, 'tfidf4.pkl')

# Loading the model and vectorizer
model = joblib.load('model4.pkl')
tfidf_vector = joblib.load('tfidf4.pkl')

# Defining the sentiment prediction function
stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))

def predict_sentiment(review):
    cleaned_review = re.sub('<.*?>', '', review)
    cleaned_review = re.sub(r'[^\w\s]', '', cleaned_review)
    cleaned_review = cleaned_review.lower()
    tokenized_review = word_tokenize(cleaned_review)
    filtered_review = [word for word in tokenized_review if word not in stop_words]
    stemmed_review = [stemmer.stem(word) for word in filtered_review]
    tfidf_review = tfidf_vector.transform([''.join(stemmed_review)])
    sentiment_prediction = model.predict(tfidf_review)
    if sentiment_prediction[0][1] > 0.6:
        return 'positive'
    else:
        return 'negative'

# Streamlit app
st.title('Sentiment Analysis Chatbot')
st.write('Enter a movie review to get the sentiment prediction (positive/negative).')

review = st.text_input('Enter your review:')
if st.button('Predict'):
    sentiment = predict_sentiment(review)
    st.write(f'The predicted sentiment is: {sentiment}')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\tentu\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/10
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 23ms/step - accuracy: 0.5005 - loss: 0.6932
Epoch 2/10
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 23ms/step - accuracy: 0.9106 - loss: 0.2178
Epoch 3/10
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 23ms/step - accuracy: 0.9999 - loss: 2.1067e-04
Epoch 4/10
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 22ms/step - accuracy: 1.0000 - loss: 2.7593e-06
Epoch 5/10
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 22ms/step - accuracy: 1.0000 - loss: 2.4724e-06
Epoch 6/10
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 23ms/step - accuracy: 1.0000 - loss: 2.1211e-06
Epoch 7/10
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 22ms/step - accuracy: 1.0000 - loss: 1.6829e-06
Epoch 8/10
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 23ms/step - accuracy: 1.0000 - l

2024-08-09 15:10:32.012 
  command:

    streamlit run C:\Users\tentu\anaconda3\Lib\site-packages\ipykernel_launcher.py [ARGUMENTS]
2024-08-09 15:10:32.024 Session state does not function when running a script without `streamlit run`
