In [2]:
# Import libraries for data handling and preprocessing
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Download NLTK data if not already available
nltk.download('stopwords')
nltk.download('punkt')

print("Libraries imported successfully!")


Libraries imported successfully!


[nltk_data] Downloading package stopwords to C:\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to C:\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
# Load the IMDB dataset from a local CSV file
df = pd.read_csv("archive/IMDB Dataset.csv")

# Check the first few rows of the dataset
print("Dataset loaded successfully!")
df.head()


Dataset loaded successfully!


Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [4]:
import nltk
import re
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Ensure the necessary NLTK resources are downloaded
nltk.download('punkt')
nltk.download('stopwords')

# Load stopwords
stop_words = set(stopwords.words('english'))

# Preprocessing function
def preprocess_text(text):
    # Remove HTML tags
    text = re.sub(r'<.*?>', '', text)
    
    # Remove URLs
    text = re.sub(r'http\S+', '', text)
    
    # Remove non-alphabetic characters (keeping spaces)
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    
    # Convert to lowercase
    text = text.lower()
    
    # Tokenize the text
    tokens = word_tokenize(text)
    
    # Remove stopwords
    tokens = [word for word in tokens if word not in stop_words]
    
    # Join tokens back into a single string
    return " ".join(tokens)

# Assuming df is your DataFrame and it has the 'review' column
df['cleaned_review'] = df['review'].apply(preprocess_text)

# Show the cleaned data
print("Preprocessing completed!")
df[['review', 'cleaned_review', 'sentiment']].head()


[nltk_data] Downloading package punkt to C:\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Preprocessing completed!


Unnamed: 0,review,cleaned_review,sentiment
0,One of the other reviewers has mentioned that ...,one reviewers mentioned watching oz episode yo...,positive
1,A wonderful little production. <br /><br />The...,wonderful little production filming technique ...,positive
2,I thought this was a wonderful way to spend ti...,thought wonderful way spend time hot summer we...,positive
3,Basically there's a family where a little boy ...,basically theres family little boy jake thinks...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",petter matteis love time money visually stunni...,positive


In [5]:
# Show the cleaned data
print(df[['review', 'cleaned_review', 'sentiment']].head())


                                              review  \
0  One of the other reviewers has mentioned that ...   
1  A wonderful little production. <br /><br />The...   
2  I thought this was a wonderful way to spend ti...   
3  Basically there's a family where a little boy ...   
4  Petter Mattei's "Love in the Time of Money" is...   

                                      cleaned_review sentiment  
0  one reviewers mentioned watching oz episode yo...  positive  
1  wonderful little production filming technique ...  positive  
2  thought wonderful way spend time hot summer we...  positive  
3  basically theres family little boy jake thinks...  negative  
4  petter matteis love time money visually stunni...  positive  


In [7]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import nltk
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Ensure the necessary NLTK resources are downloaded
nltk.download('punkt')
nltk.download('stopwords')

# Load stopwords
stop_words = set(stopwords.words('english'))

# Preprocessing function
def preprocess_text(text):
    # Remove HTML tags
    text = re.sub(r'<.*?>', '', text)
    
    # Remove URLs
    text = re.sub(r'http\S+', '', text)
    
    # Remove non-alphabetic characters (keeping spaces)
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    
    # Convert to lowercase
    text = text.lower()
    
    # Tokenize the text
    tokens = word_tokenize(text)
    
    # Remove stopwords
    tokens = [word for word in tokens if word not in stop_words]
    
    # Join tokens back into a single string
    return " ".join(tokens)

# Assuming df is your DataFrame and it has the 'review' column
df['cleaned_review'] = df['review'].apply(preprocess_text)

# Let's train a model (we'll use Naive Bayes here)
X = df['cleaned_review']  # Input feature (text data)
y = df['sentiment']  # Target label (sentiment)

# Split the data into 80% training and 20% testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Convert text data into numerical features using TF-IDF
vectorizer = TfidfVectorizer(max_features=50000, stop_words='english')  
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Train a Naive Bayes classifier
model_nb = MultinomialNB()
model_nb.fit(X_train_tfidf, y_train)

# Use the trained model to predict on test data
y_pred_nb = model_nb.predict(X_test_tfidf)

# Display classification report for Naive Bayes
print("Naive Bayes Classification Report:")
print(classification_report(y_test, y_pred_nb))

# Define a function to classify a review using the trained Naive Bayes model
def classify_naive_bayes(review):
    # Preprocess the review
    review_cleaned = preprocess_text(review)
    
    # Vectorize the review
    review_tfidf = vectorizer.transform([review_cleaned])
    
    # Predict sentiment using Naive Bayes
    return model_nb.predict(review_tfidf)[0]

# Example usage: Get sentiment of a new review
new_review = input("Enter the review for sentiment analysis: ")
sentiment_nb = classify_naive_bayes(new_review)
print(f"Predicted sentiment using Naive Bayes: {sentiment_nb}")


[nltk_data] Downloading package punkt to C:\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Naive Bayes Classification Report:
              precision    recall  f1-score   support

    negative       0.85      0.87      0.86      4961
    positive       0.87      0.85      0.86      5039

    accuracy                           0.86     10000
   macro avg       0.86      0.86      0.86     10000
weighted avg       0.86      0.86      0.86     10000

Predicted sentiment using Naive Bayes: negative


In [9]:
import joblib
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer

# Assuming you have already trained the model and vectorizer
# Save the Naive Bayes model to a file
joblib.dump(model_nb, 'naive_bayes_model.pkl')

# Save the vectorizer to a file
joblib.dump(vectorizer, 'tfidf_vectorizer.pkl')

print("Model and vectorizer saved!")


Model and vectorizer saved!


In [44]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
import joblib
import numpy as np

# Assuming you have the vectorizer and Naive Bayes model already trained
# Preprocess and vectorize the input text
def preprocess_text(text):
    # Add your preprocessing steps here
    return text.lower()

def classify_with_threshold(review, model, vectorizer, threshold=0.2):
    # Preprocess the review
    review_cleaned = preprocess_text(review)
    
    # Vectorize the review
    review_tfidf = vectorizer.transform([review_cleaned])
    
    # Predict probabilities for positive and negative sentiment
    prob = model.predict_proba(review_tfidf)[0]  # Getting probabilities for each class
    
    positive_prob = prob[1]  # Probability for positive sentiment
    negative_prob = prob[0]  # Probability for negative sentiment
    
    # If the probabilities are too close to each other, classify as neutral
    if abs(positive_prob - negative_prob) < threshold:
        return 'neutral'
    elif positive_prob > negative_prob:
        return 'positive'
    else:
        return 'negative'

# Example usage: Get sentiment of a new review
new_review = input("Enter the review for sentiment analysis: ")

# Load saved model and vectorizer
loaded_model = joblib.load('naive_bayes_model.pkl')
loaded_vectorizer = joblib.load('tfidf_vectorizer.pkl')

# Classify review
sentiment = classify_with_threshold(new_review, loaded_model, loaded_vectorizer)
print(f"Predicted sentiment: {sentiment}")


Predicted sentiment: negative


In [45]:
import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import re

# Set the path explicitly for NLTK data
nltk.data.path.append('C:\\nltk_data')

# Print current NLTK data paths to verify if the path was added
print("NLTK data paths:", nltk.data.path)

# Download required NLTK resources
nltk.download('stopwords', download_dir='C:\\nltk_data')
nltk.download('punkt', download_dir='C:\\nltk_data')

# Load stopwords
stop_words = set(stopwords.words('english'))

# Initialize the stemmer
stemmer = PorterStemmer()

# Preprocessing function
def preprocess_text(text):
    # Remove HTML tags
    text = re.sub(r'<.*?>', '', text)

    # Tokenize the text
    tokens = word_tokenize(text.lower())  # Convert to lowercase for consistency

    # Remove stopwords and apply stemming
    tokens = [stemmer.stem(word) for word in tokens if word not in stop_words and word.isalpha()]

    return " ".join(tokens)

# Assuming df is your DataFrame and it has the 'review' column
# Apply preprocessing to the reviews
df['cleaned_review'] = df['review'].apply(preprocess_text)

# Show the cleaned data
print("Preprocessing completed!")
print(df[['review', 'cleaned_review', 'sentiment']].head())


NLTK data paths: ['C:\\Users\\yohan/nltk_data', 'c:\\Python312\\nltk_data', 'c:\\Python312\\share\\nltk_data', 'c:\\Python312\\lib\\nltk_data', 'C:\\Users\\yohan\\AppData\\Roaming\\nltk_data', 'C:\\nltk_data', 'D:\\nltk_data', 'E:\\nltk_data', 'C:\\nltk_data']


[nltk_data] Downloading package stopwords to C:\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to C:\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Preprocessing completed!
                                              review  \
0  One of the other reviewers has mentioned that ...   
1  A wonderful little production. <br /><br />The...   
2  I thought this was a wonderful way to spend ti...   
3  Basically there's a family where a little boy ...   
4  Petter Mattei's "Love in the Time of Money" is...   

                                      cleaned_review sentiment  
0  one review mention watch oz episod hook right ...  positive  
1  wonder littl product film techniqu fashion giv...  positive  
2  thought wonder way spend time hot summer weeke...  positive  
3  basic famili littl boy jake think zombi closet...  negative  
4  petter mattei love time money visual stun film...  positive  


In [46]:
# Take input from the user
new_review = input("Enter the review for sentiment analysis: ")

# Classify the sentiment
sentiment = classify_with_threshold(new_review)
print("Predicted sentiment:", sentiment)


TypeError: classify_with_threshold() missing 2 required positional arguments: 'model' and 'vectorizer'