<a href="https://colab.research.google.com/github/victorjoseij/Natural_language_processing/blob/main/Sentiment_Analysis_NLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Two short sample texts on the same topic, used to compare similarity measures
doc1 = (
    "Dance, an art form that ignites the soul, where every step tells a story "
    "and every movement expresses emotions.The rhythm of the music becomes the "
    "heartbeat of a dancer, and the stage transforms into a canvas where dreams "
    "take flight."
)

doc2 = (
    "In the realm of dance, passion flows through every movement, weaving tales "
    "of joy, sorrow, and triumph on the stage.To dance is to surrender to the "
    "music, to embrace the freedom of expression, and to find solace in the "
    "graceful cadence of movement."
)

# Turn both documents into term-count vectors (one row per document)
vectorizer = CountVectorizer()
vectorized_docs = vectorizer.fit_transform([doc1, doc2])

# Pairwise similarity matrix over the rows; entry [0, 1] is doc1 vs. doc2
cosine_sim = cosine_similarity(vectorized_docs)

print("Cosine Similarity:", cosine_sim[0, 1])

Cosine Similarity: 0.543155118283426


In [None]:
# Function to calculate Jaccard similarity
def jaccard_similarity(doc1, doc2):
    """Return the Jaccard similarity of two documents' token sets.

    Each document is split on whitespace and the resulting tokens are
    compared verbatim, so letter case and attached punctuation affect
    whether two tokens match.

    Args:
        doc1: First document as a string.
        doc2: Second document as a string.

    Returns:
        ``|A ∩ B| / |A ∪ B|`` as a float in ``[0, 1]``, where ``A`` and ``B``
        are the token sets. When both documents contain no tokens the sets
        are equal, so ``1.0`` is returned instead of dividing by zero.
    """
    tokens1 = set(doc1.split())
    tokens2 = set(doc2.split())
    union = tokens1 | tokens2
    if not union:
        # Both inputs are empty or whitespace-only: avoid ZeroDivisionError
        # and treat two empty token sets as identical.
        return 1.0
    return len(tokens1 & tokens2) / len(union)

# Calculate Jaccard similarity between the two sample documents.
# Tokens are exact whitespace-delimited strings, so case and attached
# punctuation affect which tokens count as shared.
jaccard_sim = jaccard_similarity(doc1, doc2)

print("Jaccard Similarity:", jaccard_sim)

Jaccard Similarity: 0.06779661016949153


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the IMDB review dataset from the CSV file into a DataFrame
df = pd.read_csv("/content/IMDB Dataset.csv")

In [None]:
# Preview the first rows to sanity-check what was loaded
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [None]:
# Show the column labels of the loaded frame
df.columns

Index(['review', 'sentiment'], dtype='object')

In [None]:
import re
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences


# Preprocessing function
def preprocess_text(text):
    """Normalize a raw review string before tokenization.

    Steps, in order: replace HTML tags (such as ``<br />``) with a space,
    lowercase the text, drop every character that is not an ASCII letter or
    whitespace, then collapse whitespace runs to a single space and trim
    both ends.

    Args:
        text: Raw review text as a ``str``.

    Returns:
        The cleaned string: lowercase ASCII letters separated by single
        spaces.
    """
    # Replace HTML tags with a space first. Otherwise only the "<", "/" and
    # ">" characters are stripped below, leaving markup such as "<br />"
    # behind as a bogus "br" token glued between sentences.
    text = re.sub(r'</?[a-zA-Z][^<>]*>', ' ', text)
    # Convert to lowercase
    text = text.lower()
    # Remove special characters and numbers
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Remove extra whitespaces
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Clean every review with preprocess_text, overwriting the 'review' column
df['review'] = df['review'].apply(preprocess_text)

# Features are the cleaned review texts; targets are the sentiment labels
X, y = df['review'].values, df['sentiment'].values

# Hold out 30% of the data, then halve that hold-out into validation and
# test sets (70% / 15% / 15% overall); fixed seeds keep the splits repeatable
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.3, random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42
)

In [None]:
# Tokenization and padding
max_words = 10000  # value passed to the tokenizer as num_words
max_len = 100  # value passed to pad_sequences as maxlen

# The tokenizer is fitted on the training texts only
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X_train)

# Training split: texts -> integer sequences -> padded array
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_train_pad = pad_sequences(X_train_seq, maxlen=max_len)

# Validation split
X_val_seq = tokenizer.texts_to_sequences(X_val)
X_val_pad = pad_sequences(X_val_seq, maxlen=max_len)

# Test split
X_test_seq = tokenizer.texts_to_sequences(X_test)
X_test_pad = pad_sequences(X_test_seq, maxlen=max_len)

# Report the shapes of the padded inputs and of the label arrays
print("X_train shape:", X_train_pad.shape)
print("X_val shape:", X_val_pad.shape)
print("X_test shape:", X_test_pad.shape)
print("y_train shape:", y_train.shape)
print("y_val shape:", y_val.shape)
print("y_test shape:", y_test.shape)

X_train shape: (35000, 100)
X_val shape: (7500, 100)
X_test shape: (7500, 100)
y_train shape: (35000,)
y_val shape: (7500,)
y_test shape: (7500,)



Sentiment Analysis using a Multinomial Naive Bayes classifier (TF-IDF features):



In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

# TF-IDF features: the vectorizer is fitted on the training texts only and
# then reused unchanged for the validation and test texts
vectorizer = TfidfVectorizer(max_features=10000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_val_tfidf = vectorizer.transform(X_val)
X_test_tfidf = vectorizer.transform(X_test)

# Fit a Multinomial Naive Bayes classifier on the training features
bayes_classifier = MultinomialNB().fit(X_train_tfidf, y_train)

# Predict labels for both held-out splits
y_pred_val = bayes_classifier.predict(X_val_tfidf)
y_pred_test = bayes_classifier.predict(X_test_tfidf)

# Accuracy on the validation set
val_accuracy = accuracy_score(y_val, y_pred_val)
print("Validation Accuracy (Bayesian):", val_accuracy)

# Accuracy on the test set
test_accuracy = accuracy_score(y_test, y_pred_test)
print("Test Accuracy (Bayesian):", test_accuracy)

Validation Accuracy (Bayesian): 0.8494666666666667
Test Accuracy (Bayesian): 0.8569333333333333


Sentiment Analysis using RNN:

In [None]:
from sklearn.preprocessing import LabelEncoder

# Encoder that maps the sentiment labels to integer codes
label_encoder = LabelEncoder()

# Learn the label mapping from the training labels, apply the same mapping to
# the validation and test labels, and cast every result to float32
y_train = label_encoder.fit_transform(y_train).astype(np.float32)
y_val = label_encoder.transform(y_val).astype(np.float32)
y_test = label_encoder.transform(y_test).astype(np.float32)

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense

# RNN model: embedding -> simple recurrent layer -> single sigmoid output
model_rnn = Sequential([
    Embedding(input_dim=max_words, output_dim=128, input_length=max_len),
    SimpleRNN(units=128, dropout=0.2),  # dropout for regularization
    Dense(units=1, activation='sigmoid'),
])

model_rnn.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# Fit on the training split, tracking validation metrics after each epoch
history_rnn = model_rnn.fit(
    X_train_pad,
    y_train,
    validation_data=(X_val_pad, y_val),
    epochs=5,
    batch_size=128,
)

# Report accuracy on the validation split, then on the test split
val_loss_rnn, val_accuracy_rnn = model_rnn.evaluate(X_val_pad, y_val)
print("Validation Accuracy (RNN):", val_accuracy_rnn)

test_loss_rnn, test_accuracy_rnn = model_rnn.evaluate(X_test_pad, y_test)
print("Test Accuracy (RNN):", test_accuracy_rnn)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Validation Accuracy (RNN): 0.7991999983787537
Test Accuracy (RNN): 0.8014666438102722


Sentiment Analysis using LSTM:

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

# LSTM model: embedding -> LSTM layer -> single sigmoid output
model_lstm = Sequential([
    Embedding(input_dim=max_words, output_dim=128, input_length=max_len),
    # dropout on the inputs and on the recurrent state for regularization
    LSTM(units=128, dropout=0.2, recurrent_dropout=0.2),
    Dense(units=1, activation='sigmoid'),
])

model_lstm.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# Fit on the training split, tracking validation metrics after each epoch
history_lstm = model_lstm.fit(
    X_train_pad,
    y_train,
    validation_data=(X_val_pad, y_val),
    epochs=5,
    batch_size=128,
)

# Report accuracy on the validation split, then on the test split
val_loss_lstm, val_accuracy_lstm = model_lstm.evaluate(X_val_pad, y_val)
print("Validation Accuracy (LSTM):", val_accuracy_lstm)

test_loss_lstm, test_accuracy_lstm = model_lstm.evaluate(X_test_pad, y_test)
print("Test Accuracy (LSTM):", test_accuracy_lstm)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Validation Accuracy (LSTM): 0.8401333093643188
Test Accuracy (LSTM): 0.8438666462898254
