In [None]:
# Import the necessary libraries, download the NLTK resources, and load the dataset:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import nltk

# Fetch the NLTK resources that the preprocessing step relies on:
#   - 'stopwords'          : word lists read via stopwords.words('english')
#   - 'punkt', 'punkt_tab' : tokenizer models for word_tokenize; NLTK >= 3.8.2
#                            looks up 'punkt_tab', older releases look up 'punkt'
#   - 'wordnet', 'omw-1.4' : data for WordNetLemmatizer; some NLTK releases also
#                            need 'omw-1.4' when WordNet is loaded
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('wordnet')
nltk.download('omw-1.4')

# Load dataset (assuming CSV format with 'review_text', 'sentiment' columns)
data = pd.read_csv('movie_reviews.csv')

# Fail right here, with a readable message, if the CSV does not have the two
# columns the rest of the notebook reads.
assert {'review_text', 'sentiment'} <= set(data.columns), (
    "movie_reviews.csv must contain 'review_text' and 'sentiment' columns; "
    f"found {list(data.columns)}"
)

In [None]:
# Preprocess the data:
# Filled on first use by _preprocess_resources(); re-running this cell resets it.
_PREPROCESS_CACHE = {}


def _preprocess_resources():
    """Return the (stop-word set, lemmatizer) pair, building it only once."""
    if not _PREPROCESS_CACHE:
        # Build both objects before storing either, so a failed lookup
        # (e.g. missing NLTK data) never leaves a half-filled cache behind.
        stop_words = frozenset(stopwords.words('english'))
        lemmatizer = WordNetLemmatizer()
        _PREPROCESS_CACHE.update(stop_words=stop_words, lemmatizer=lemmatizer)
    return _PREPROCESS_CACHE['stop_words'], _PREPROCESS_CACHE['lemmatizer']


def preprocess_text(text):
    """Normalise one review into a space-separated string of lemmatized tokens.

    The text is lowercased, tokenized with ``word_tokenize``, stripped of
    English stop words, and each remaining token is lemmatized with
    ``WordNetLemmatizer``.

    Parameters
    ----------
    text : str or scalar
        A single review. Missing values (``None``, ``NaN``, ``pd.NA``) yield an
        empty string instead of raising; any other non-string scalar is
        converted with ``str()`` first.

    Returns
    -------
    str
        The surviving tokens joined by single spaces ('' for missing input).
    """
    if not isinstance(text, str):
        # Empty CSV cells are loaded by pandas as NaN (a float), which has no
        # .lower(); map them to an empty document rather than crashing.
        if pd.isna(text):
            return ''
        text = str(text)

    # The stop-word set and the lemmatizer do not depend on `text`, so they are
    # shared across calls instead of being rebuilt for every row.
    stop_words, lemmatizer = _preprocess_resources()

    # Tokenize the lowercased text
    tokens = word_tokenize(text.lower())

    # Remove stopwords, then lemmatize what is left
    return ' '.join(
        lemmatizer.lemmatize(token)
        for token in tokens
        if token not in stop_words
    )

# Run preprocess_text over every entry of 'review_text' and store the
# results in a new 'processed_review' column
data['processed_review'] = data['review_text'].map(preprocess_text)


In [None]:
# Split the data into training and testing sets:
# the 'processed_review' column is the input, the 'sentiment' column the target.
X, y = data['processed_review'], data['sentiment']

# Hold out 20% of the rows for testing; the fixed random_state makes the
# split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [None]:
# Convert text data to numerical features:
# The vectorizer is fitted on the training split only (fit_transform), and the
# test split is then encoded with that same fitted vectorizer (transform), so
# nothing is learned from the test reviews.
vectorizer = CountVectorizer()
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)


In [None]:
# Select and train a model:
# Using Multinomial Naive Bayes as an example. fit() returns the estimator
# itself, so construction and training are chained into one expression.
model = MultinomialNB().fit(X_train_vectorized, y_train)


In [None]:
# Predictions for the vectorized test split
y_pred = model.predict(X_test_vectorized)

# Evaluation metrics: compute both, then print them in order
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print(f"Classification Report:\n{report}")
