**Preprocessing:**

1. Tokenization
2. Stemming
3. Lemmatization

In [49]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer

In [50]:
# Download necessary NLTK data
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [51]:
stop_words = set(stopwords.words('english'))
stemmer=PorterStemmer()
lemmatizer=WordNetLemmatizer()

In [68]:
data = [
    "This movie was fantastic! I loved every minute of it.",
    "Terrible movie, complete waste of time.",
    "It was an okay movie, not bad but not great either.",
    "Amazing film! Definitely recommend watching it.",
    "Awful film, I do not recommend it to anyone.",
    "An absolute masterpiece, I was blown away!",
    "One of the worst movies I’ve ever seen.",
    "Great performances and an engaging story.",
    "Boring, predictable, and poorly written.",
    "Enjoyable film with a strong message.",
    "I wouldn't watch this again, not worth the time.",
    "Outstanding cinematography but the plot was weak.",
    "A must-watch for fans of the genre.",
    "Disappointing. I expected much more from it.",
    "Full of heart and emotion, a beautiful film.",
    "I fell asleep halfway through, it was that dull.",
    "A delightful family movie that everyone can enjoy.",
    "Poor acting and terrible special effects.",
    "A thought-provoking film that stays with you.",
    "Not my cup of tea, but some might enjoy it.",
    "I thoroughly enjoyed it from start to finish!",
    "Complete rubbish, couldn't wait for it to end.",
    "A fun ride with plenty of laughs.",
    "Too long and lacked direction.",
    "Great soundtrack but the story didn’t hold up.",
    "A charming movie with lovable characters.",
    "Not as good as I had hoped, very disappointing.",
    "Well-executed with a unique premise.",
    "I regret paying to see this in the theater.",
    "Top-notch action scenes, but not much else.",
    "Beautifully shot but ultimately forgettable.",
    "One of my favorite movies this year!",
    "I couldn't make it past the first 20 minutes.",
    "A compelling drama with outstanding performances.",
    "Too confusing and hard to follow.",
    "A perfect film for a cozy night in.",
    "Ridiculous plot but entertaining nonetheless.",
    "A refreshing take on the superhero genre.",
    "This is one of those movies you watch once and forget.",
    "A rollercoaster of emotions from start to finish.",
    "Didn't live up to the hype, very overrated.",
    "I loved it! Will definitely be watching again.",
    "Waste of time, don’t bother.",
    "A truly captivating film experience.",
    "Poor dialogue and a lackluster story.",
    "A solid entry in the franchise.",
    "Utterly terrible in every aspect.",
    "I highly recommend this to everyone.",
    "Barely watchable, very poorly done.",
    "One of the best movies I've seen in a while.",
    "Uninspired and boring from beginning to end.",
    "The acting saved this otherwise mediocre film.",
    "Just okay, nothing special.",
    "I can't believe this movie got such good reviews.",
    "Brilliant! A fantastic watch.",
    "Complete garbage, don’t waste your money.",
    "A film that truly speaks to the heart.",
    "Just when I thought it couldn’t get worse, it did.",
    "A powerful and moving story.",
    "Could have been so much better.",
    "An instant classic, loved every second.",
    "A dull film with very little excitement.",
    "A true work of art, visually stunning.",
    "One of the worst films I’ve ever seen.",
    "A beautifully crafted movie with great depth.",
    "Absolutely awful, avoid at all costs.",
    "A movie that will inspire generations to come.",
    "Completely forgettable, not worth your time.",
    "An exciting and thrilling adventure.",
    "I’ve never seen a movie this bad.",
    "A touching story with real emotional depth.",
    "Predictable and lacking originality.",
    "It was good but not as great as people say.",
    "A disappointing sequel to a great first movie.",
    "An uplifting and feel-good film.",
    "A total disaster, hard to watch.",
    "It was a decent movie, but not amazing.",
    "I hated every minute of this movie.",
    "Surprisingly good, I didn’t expect to like it.",
    "Poorly acted, poorly written, just bad.",
    "A standout film with great performances.",
    "I wish I could get those two hours back.",
    "Loved the cinematography, but the rest was meh.",
    "A perfect blend of humor and drama.",
    "What a waste of a great cast.",
    "A heartwarming story that left me in tears.",
    "Don't bother with this one, it's not worth it.",
    "An impressive and thrilling action movie.",
    "I didn't enjoy it, not my type of movie.",
    "A must-see for any movie fan."
]




In [69]:
def preprocess(text):
    tokens = word_tokenize(text.lower())
    tokens = [word for word in tokens if word.isalnum() and word not in stop_words]
    stemmed = [stemmer.stem(word) for word in tokens]
    lemmatized = [lemmatizer.lemmatize(word) for word in stemmed]
    return ' '.join(lemmatized)

preprocessed_data = [preprocess(review) for review in data]
print("Preprocessed Data:\n", preprocessed_data)

Preprocessed Data:
 ['movi fantast love everi minut', 'terribl movi complet wast time', 'okay movi bad great either', 'amaz film definit recommend watch', 'aw film recommend anyon', 'absolut masterpiec blown away', 'one worst movi ever seen', 'great perform engag stori', 'bore predict poorli written', 'enjoy film strong messag', 'would watch worth time', 'outstand cinematographi plot weak', 'fan genr', 'disappoint expect much', 'full heart emot beauti film', 'fell asleep halfway dull', 'delight famili movi everyon enjoy', 'poor act terribl special effect', 'film stay', 'cup tea might enjoy', 'thoroughli enjoy start finish', 'complet rubbish could wait end', 'fun ride plenti laugh', 'long lack direct', 'great soundtrack stori hold', 'charm movi lovabl charact', 'good hope disappoint', 'uniqu premis', 'regret pay see theater', 'action scene much el', 'beauti shot ultim forgett', 'one favorit movi year', 'could make past first 20 minut', 'compel drama outstand perform', 'confus hard follo

**Vectorization Techniques:**

Now, we'll convert the preprocessed text into numerical vectors using the following methods:

1. One Hot Encoding
2. Bag of Words
3. TF-IDF
4. CBOW (Word2Vec)

**One Hot Encoding**

In [70]:
from sklearn.preprocessing import OneHotEncoder
import numpy as np

onehot_encoder = OneHotEncoder(sparse=False)
onehot_encoded = onehot_encoder.fit_transform(np.array(preprocessed_data).reshape(-1, 1))
print("One Hot Encoded Data:\n", onehot_encoded)


One Hot Encoded Data:
 [[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]




**Bag of Words**

In [71]:
from sklearn.feature_extraction.text import CountVectorizer

bow_vectorizer = CountVectorizer()
X_bow = bow_vectorizer.fit_transform(preprocessed_data)
print("Bag of Words Representation:\n", X_bow.toarray())


Bag of Words Representation:
 [[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


**TF-IDF**

In [72]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer()
X_tfidf = tfidf_vectorizer.fit_transform(preprocessed_data)
print("TF-IDF Representation:\n", X_tfidf.toarray())


TF-IDF Representation:
 [[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


**Continuous Bag of Words (CBOW)**

In [73]:
import gensim
from gensim.models import Word2Vec

tokenized_data = [review.split() for review in preprocessed_data]

# Train Word2Vec model (CBOW model: sg=0)
cbow_model = Word2Vec(sentences=tokenized_data, vector_size=100, window=5, min_count=1, sg=0)

# To represent each sentence, average the word vectors of all words in the sentence
def get_sentence_vector(sentence, model):
    words = sentence.split()
    word_vectors = [model.wv[word] for word in words if word in model.wv]
    return np.mean(word_vectors, axis=0)

X_cbow = np.array([get_sentence_vector(sentence, cbow_model) for sentence in preprocessed_data])
print("CBOW Word Vectors:\n", X_cbow)


CBOW Word Vectors:
 [[-6.3912629e-04  3.6210469e-03  1.8243576e-04 ... -1.7634958e-03
  -1.5856440e-03  4.4760453e-03]
 [-5.0168391e-03  1.7704182e-04  2.2441749e-03 ...  3.4834102e-03
   4.7718617e-03  4.1062124e-03]
 [ 1.3673218e-03 -7.6835445e-04  1.9242356e-03 ...  6.0570234e-04
   2.1600276e-03  2.7421662e-03]
 ...
 [-1.8390713e-03 -3.9312951e-03  2.9161647e-03 ... -4.6399934e-04
  -4.7269980e-03  3.7453952e-04]
 [ 2.4709068e-03 -4.2579565e-03 -8.6342916e-04 ... -3.3811554e-03
   2.6316105e-03  5.7820170e-03]
 [ 2.3844498e-03  5.0336327e-03 -9.4403536e-04 ...  8.7188859e-04
  -2.8905575e-05  2.7699713e-03]]


**Train a Model and Compare**

We can train a simple logistic regression model and compare the performance of each vectorization technique

In [74]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Using One Hot Encoder

# Train-test split for One Hot Encoded
X_train, X_test, y_train, y_test = train_test_split(onehot_encoded, labels, test_size=0.2, random_state=42)

# Train logistic regression
model = LogisticRegression()
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy using OneHotEncoder: {accuracy}")


Accuracy using OneHotEncoder: 0.3888888888888889


In [75]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Using BOW

# Train-test split for BOW
X_train, X_test, y_train, y_test = train_test_split(X_bow, labels, test_size=0.2, random_state=42)

# Train logistic regression
model = LogisticRegression()
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy using BOW: {accuracy}")

Accuracy using BOW: 0.3888888888888889


In [76]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Using CBOW

# Train-test split for CBOW
X_train, X_test, y_train, y_test = train_test_split(X_cbow, labels, test_size=0.2, random_state=42)

# Train logistic regression
model = LogisticRegression()
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy using CBOW: {accuracy}")

Accuracy using CBOW: 0.3888888888888889


In [77]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Example: Using TF-IDF (You can repeat the same process for OneHot, BoW, and CBOW)

# Train-test split for TF-IDF
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, labels, test_size=0.2, random_state=42)

# Train logistic regression
model = LogisticRegression()
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy using TF-IDF: {accuracy}")


Accuracy using TF-IDF: 0.4444444444444444


**Testing the Model with a New Sentence**

In [78]:
# Sample new sentence for testing
new_sentence = "This film was absolutely fantastic!"

# Preprocess the new sentence
processed_sentence = preprocess(new_sentence)

# Transform the processed sentence using the same TF-IDF vectorizer
# Note: Make sure to use the same vectorizer you fitted on the training data
new_sentence_tfidf = tfidf_vectorizer.transform([processed_sentence])

# Make prediction using the trained model
prediction = model.predict(new_sentence_tfidf)

# Output the prediction
print("Predicted Sentiment:", "Positive" if prediction[0] == 1 else "Negative")


Predicted Sentiment: Positive
