In [4]:
import pandas as pd
import os
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score
from sklearn.pipeline import Pipeline
import joblib

# Location of the labelled reviews (expects 'review' and 'sentiment' columns)
file_path = r'./Dataset.csv'

# Fail early with an explicit message if the dataset is not where we expect it
if not os.path.exists(file_path):
    raise FileNotFoundError(f"File not found at {file_path}")

# Load the dataset
df = pd.read_csv(file_path)

# Display the first few rows of the dataframe
print("Dataset head:")
print(df.head())

# Pre-process the data
# Report missing values per column before anything is dropped
print("\nChecking for missing values:")
print(df.isnull().sum())

# Drop only rows that lack a review or a label; gaps in columns the model
# never reads should not cost us training examples
df.dropna(subset=['review', 'sentiment'], inplace=True)

# Split data into features and target variable
X = df['review']
y = df['sentiment']

# Encode target variable.
# Series.map turns every label missing from the mapping into NaN, which would
# only surface later as an opaque failure inside fit(); check up front instead.
label_map = {'positive': 1, 'negative': 0}
unexpected_labels = y[~y.isin(list(label_map))].unique().tolist()
if unexpected_labels:
    raise ValueError(
        f"Unexpected sentiment labels {unexpected_labels}; "
        f"expected one of {list(label_map)}"
    )
y = y.map(label_map)

# Split data into training and testing sets (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a pipeline with text vectorization and model training.
# Keeping the vectorizer inside the pipeline means it is fitted on the
# training split only and is re-applied automatically at predict time.
pipeline = Pipeline([
    ('vectorizer', TfidfVectorizer(stop_words='english')),
    ('classifier', MultinomialNB())
])

# Train the model
pipeline.fit(X_train, y_train)

# Make predictions on the held-out split
y_pred = pipeline.predict(X_test)

# Evaluate the model
print("\nModel evaluation:")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Function to make prediction with new data
def predict_sentiment(review):
    """Classify a single review with the module-level ``pipeline``.

    Args:
        review: The review text to classify.

    Returns:
        'positive' when the pipeline predicts label 1, otherwise 'negative'.
    """
    # The pipeline expects an iterable of documents, so wrap the single review
    predicted_label = pipeline.predict([review])[0]
    if predicted_label == 1:
        return 'positive'
    return 'negative'

# Smoke-test the trained pipeline on one hand-written review
new_review = "The movie was fantastic and I loved the storyline."
print("\nPrediction for new review:")
print(f"Review: {new_review}")
# Classify only after the review has been echoed, so the output order is unchanged
predicted_sentiment = predict_sentiment(new_review)
print(f"Sentiment: {predicted_sentiment}")

# Persist the fitted pipeline (vectorizer + classifier) to disk
joblib.dump(pipeline, 'sentiment_analysis_model.pkl')
print("\nModel saved as 'sentiment_analysis_model.pkl'.")


Dataset head:
                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive

Checking for missing values:
review       0
sentiment    0
dtype: int64

Model evaluation:
Accuracy: 0.8652

Classification Report:
              precision    recall  f1-score   support

           0       0.86      0.88      0.87      4961
           1       0.88      0.85      0.86      5039

    accuracy                           0.87     10000
   macro avg       0.87      0.87      0.87     10000
weighted avg       0.87      0.87      0.87     10000


Prediction for new review:
Review: The movie was fantastic and I loved the storyline.
Sentiment: positive

Model saved as 'sentiment_analysis_model