# Data Collection
You can use a publicly available dataset like the IMDb Reviews dataset or Amazon Product Reviews. For this example, I'll use the IMDb Reviews dataset, which is widely used for sentiment analysis.

Dataset Link: IMDb Reviews Dataset

The dataset contains 50,000 movie reviews labeled as positive or negative.

In [33]:
import pandas as pd
import csv
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, f1_score, roc_auc_score, precision_recall_curve, auc

In [34]:
# Download NLTK data (only need to run once)
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [35]:
# 1. Load the dataset
file_path = '/content/IMDB_Dataset.csv'
df = pd.read_csv(file_path, quoting=csv.QUOTE_NONE, on_bad_lines='skip')

# 2. Preprocess the text data
df['review'] = df['review'].fillna('')

# Preprocess the text data
def clean_text(text):
    if isinstance(text, str):  # Ensure the text is a string
        # Lowercase the text
        text = text.lower()
        # Remove special characters and digits
        text = re.sub(r'[^a-zA-Z\s]', '', text)
        # Tokenize text and remove stopwords
        stop_words = set(stopwords.words('english'))
        words = word_tokenize(text)
        words = [word for word in words if word not in stop_words]
        return ' '.join(words)
    else:
        return ''  # Return empty string if it's not a valid string

# Apply text cleaning to the reviews
df['cleaned_review'] = df['review'].apply(clean_text)

# 3. Split the data into features (X) and target (y)
X = df['cleaned_review']  # Reviews (features)
y = df['sentiment'].apply(lambda x: 1 if x == 'positive' else 0)  # Sentiment (target: 1 for positive, 0 for negative)

# Split the dataset into training and testing sets (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 4. Feature Extraction using TF-IDF
vectorizer = TfidfVectorizer(max_features=5000)  # You can adjust max_features as needed
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# 5. Handle class imbalance using SMOTE
smote = SMOTE(random_state=42)
X_res, y_res = smote.fit_resample(X_train_tfidf, y_train)

# 6. Train the model (using Naive Bayes as an example)
model = MultinomialNB()

# Hyperparameter tuning using GridSearchCV
param_grid = {'alpha': [0.1, 0.5, 1.0, 2.0, 5.0]}  # Try different values of alpha
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_res, y_res)

# Get the best model from grid search
best_model = grid_search.best_estimator_

# 7. Make predictions
y_pred = best_model.predict(X_test_tfidf)

# 8. Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

# Evaluate model using ROC-AUC and Precision-Recall AUC
y_probs = best_model.predict_proba(X_test_tfidf)[:, 1]  # Get the probabilities for the positive class
roc_auc = roc_auc_score(y_test, y_probs)
precision_vals, recall_vals, _ = precision_recall_curve(y_test, y_probs)
pr_auc = auc(recall_vals, precision_vals)

# Output the evaluation metrics
print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {f1}')
print(f'ROC-AUC: {roc_auc}')
print(f'Precision-Recall AUC: {pr_auc}')
print('Confusion Matrix:')
print(conf_matrix)

# Example of model performance on a single review
sample_review = "This movie was absolutely fantastic!"
sample_review_cleaned = clean_text(sample_review)
sample_review_tfidf = vectorizer.transform([sample_review_cleaned])
prediction = best_model.predict(sample_review_tfidf)
print(f'Sample review sentiment: {"positive" if prediction == 1 else "negative"}')

Accuracy: 0.9970122497759187
Precision: 0.42857142857142855
Recall: 0.3333333333333333
F1 Score: 0.375
ROC-AUC: 0.44793955129485385
Precision-Recall AUC: 0.1635375911610308
Confusion Matrix:
[[3334    4]
 [   6    3]]
Sample review sentiment: positive


In [36]:
# Save the model and vectorizer for later use (optional)
import joblib
joblib.dump(best_model, 'sentiment_model.pkl')
joblib.dump(vectorizer, 'tfidf_vectorizer.pkl')

['tfidf_vectorizer.pkl']