Capstone Project

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
import re
import nltk
from nltk.corpus import stopwords

# Load the dataset
file_path = "data/imdb.csv"
df = pd.read_csv(file_path)

# Check dataset structure.
# A bare `df.head()` is only rendered when it is the last expression of a
# cell; in the middle of a cell its result is silently discarded, so show it
# explicitly with the notebook's built-in display().
display(df.head())

# Preprocessing helpers

# Cache for NLTK's English stop words; filled on first use so the corpus is
# read once instead of once per review.
_STOP_WORDS = None


def _english_stop_words():
    """Return NLTK's English stop words as a frozenset, loading them at most once."""
    global _STOP_WORDS
    if _STOP_WORDS is None:
        try:
            words = stopwords.words('english')
        except LookupError:
            # The corpus is not installed (e.g. fresh environment): fetch it
            # once and retry so a clean "Run All" does not stop here.
            nltk.download('stopwords', quiet=True)
            words = stopwords.words('english')
        _STOP_WORDS = frozenset(words)
    return _STOP_WORDS


def preprocess_text(text, stop_words=None):
    """Lowercase a review, strip non-letter characters and remove stop words.

    Parameters
    ----------
    text : str
        Raw review text.
    stop_words : collection of str, optional
        Words to drop after cleaning. Defaults to NLTK's English stop-word
        list, which is loaded once and reused across calls.

    Returns
    -------
    str
        The remaining words joined by single spaces.
    """
    if stop_words is None:
        stop_words = _english_stop_words()
    text = text.lower()  # Convert to lowercase
    # Characters are deleted, not replaced by a space, so "scene-stealing"
    # becomes the single token "scenestealing".
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove punctuation and special characters
    return ' '.join(word for word in text.split() if word not in stop_words)

# Clean every review with preprocess_text (the raw text column is overwritten)
df['review'] = df['review'].apply(preprocess_text)

# Encode the labels: positive -> 1, negative -> 0
df['sentiment'] = df['sentiment'].map({'positive': 1, 'negative': 0})

# Stratified 80/20 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    df['review'],
    df['sentiment'],
    test_size=0.2,
    random_state=42,
    stratify=df['sentiment'],
)

# TF-IDF features capped at 5000 terms; the vocabulary is fitted on the
# training split only and then applied to the test split
tfidf = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# Logistic Regression: fit on the training split, predict the test split
log_reg = LogisticRegression()
y_pred_log = log_reg.fit(X_train_tfidf, y_train).predict(X_test_tfidf)

# Linear-kernel SVM: same procedure
svm = SVC(kernel='linear')
y_pred_svm = svm.fit(X_train_tfidf, y_train).predict(X_test_tfidf)

# Function to evaluate models
def evaluate_model(y_true, y_pred, model_name):
    """Print an evaluation summary for one model's predictions.

    Output order: model name, accuracy, precision, recall, F1 score,
    confusion matrix, classification report. Every metric is computed by the
    corresponding scikit-learn function called with its default settings.

    Args:
        y_true: ground-truth labels.
        y_pred: predicted labels, aligned with ``y_true``.
        model_name: text shown on the "Model:" header line.

    Returns:
        None; everything is written to stdout.
    """
    print(f"\nModel: {model_name}")

    # Single-number metrics, printed as "<label> <value>".
    scalar_metrics = (
        ("Accuracy:", accuracy_score),
        ("Precision:", precision_score),
        ("Recall:", recall_score),
        ("F1 Score:", f1_score),
    )
    for label, metric in scalar_metrics:
        print(label, metric(y_true, y_pred))

    # Multi-line reports, each printed under its own heading.
    tabular_reports = (
        ("\nConfusion Matrix:", confusion_matrix),
        ("\nClassification Report:", classification_report),
    )
    for heading, build_report in tabular_reports:
        print(heading)
        print(build_report(y_true, y_pred))

# Report test-set metrics for each fitted model, Logistic Regression first
for model_label, test_predictions in (
    ("Logistic Regression", y_pred_log),
    ("Support Vector Machine", y_pred_svm),
):
    evaluate_model(y_test, test_predictions, model_label)


Model: Logistic Regression
Accuracy: 0.8892
Precision: 0.8827694728560189
Recall: 0.8976
F1 Score: 0.8901229670765569

Confusion Matrix:
[[4404  596]
 [ 512 4488]]

Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.88      0.89      5000
           1       0.88      0.90      0.89      5000

    accuracy                           0.89     10000
   macro avg       0.89      0.89      0.89     10000
weighted avg       0.89      0.89      0.89     10000


Model: Support Vector Machine
Accuracy: 0.8864
Precision: 0.8807646826960978
Recall: 0.8938
F1 Score: 0.8872344649593011

Confusion Matrix:
[[4395  605]
 [ 531 4469]]

Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.88      0.89      5000
           1       0.88      0.89      0.89      5000

    accuracy                           0.89     10000
   macro avg       0.89      0.89      0.89     10000
weighted avg       0.89      0.89      0.89     10000

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
import re
import nltk
from nltk.corpus import stopwords

# Load the dataset
file_path = "data/imdb.csv"
df = pd.read_csv(file_path)

# Check dataset structure.
# A bare `df.head()` is only rendered when it is the last expression of a
# cell; in the middle of a cell its result is silently discarded, so show it
# explicitly with the notebook's built-in display().
display(df.head())

# Preprocessing function
def preprocess_text(text):
    """Normalise a review for vectorisation.

    The text is lowercased, then every character that is neither a letter
    (a-z) nor whitespace is deleted. Whitespace is left as-is and stop words
    are kept.

    Args:
        text: raw review string.

    Returns:
        The cleaned string.
    """
    lowered = text.lower()
    letters_only = re.sub(r'[^a-zA-Z\s]', '', lowered)
    return letters_only

# Clean every review with preprocess_text (the raw text column is overwritten)
df['review'] = df['review'].apply(preprocess_text)

# Encode the labels: positive -> 1, negative -> 0
df['sentiment'] = df['sentiment'].map({'positive': 1, 'negative': 0})

# Stratified 80/20 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    df['review'],
    df['sentiment'],
    test_size=0.2,
    random_state=42,
    stratify=df['sentiment'],
)

# TF-IDF over unigrams and bigrams, capped at 20000 features; the vocabulary
# is fitted on the training split only and then applied to the test split
tfidf = TfidfVectorizer(ngram_range=(1,2), max_features=20000)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# Logistic Regression with C=0.5: fit on the training split, predict the test split
log_reg = LogisticRegression(C=0.5)
y_pred_log = log_reg.fit(X_train_tfidf, y_train).predict(X_test_tfidf)

# Linear-kernel SVM with C=0.5: same procedure
svm = SVC(kernel='linear', C=0.5)
y_pred_svm = svm.fit(X_train_tfidf, y_train).predict(X_test_tfidf)

# Function to evaluate models
def evaluate_model(y_true, y_pred, model_name):
    """Print an evaluation summary for one model's predictions.

    Output order: model name, accuracy, precision, recall, F1 score,
    confusion matrix, classification report. Every metric is computed by the
    corresponding scikit-learn function called with its default settings.

    Args:
        y_true: ground-truth labels.
        y_pred: predicted labels, aligned with ``y_true``.
        model_name: text shown on the "Model:" header line.

    Returns:
        None; everything is written to stdout.
    """
    print(f"\nModel: {model_name}")

    # Single-number metrics, printed as "<label> <value>".
    scalar_metrics = (
        ("Accuracy:", accuracy_score),
        ("Precision:", precision_score),
        ("Recall:", recall_score),
        ("F1 Score:", f1_score),
    )
    for label, metric in scalar_metrics:
        print(label, metric(y_true, y_pred))

    # Multi-line reports, each printed under its own heading.
    tabular_reports = (
        ("\nConfusion Matrix:", confusion_matrix),
        ("\nClassification Report:", classification_report),
    )
    for heading, build_report in tabular_reports:
        print(heading)
        print(build_report(y_true, y_pred))

# Report test-set metrics for each fitted model, Logistic Regression first
for model_label, test_predictions in (
    ("Logistic Regression", y_pred_log),
    ("Support Vector Machine", y_pred_svm),
):
    evaluate_model(y_test, test_predictions, model_label)


Model: Logistic Regression
Accuracy: 0.8979
Precision: 0.892483724600513
Recall: 0.9048
F1 Score: 0.8985996623299235

Confusion Matrix:
[[4455  545]
 [ 476 4524]]

Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.89      0.90      5000
           1       0.89      0.90      0.90      5000

    accuracy                           0.90     10000
   macro avg       0.90      0.90      0.90     10000
weighted avg       0.90      0.90      0.90     10000


Model: Support Vector Machine
Accuracy: 0.9046
Precision: 0.899171270718232
Recall: 0.9114
F1 Score: 0.9052443384982122

Confusion Matrix:
[[4489  511]
 [ 443 4557]]

Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.90      0.90      5000
           1       0.90      0.91      0.91      5000

    accuracy                           0.90     10000
   macro avg       0.90      0.90      0.90     10000
weighted avg       0.90      0.90      0.90     10000

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import re
import nltk
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix

# Load the dataset
file_path = "data/imdb.csv"
df = pd.read_csv(file_path)

# Check dataset structure.
# A bare `df.head()` is only rendered when it is the last expression of a
# cell; in the middle of a cell its result is silently discarded, so show it
# explicitly with the notebook's built-in display().
display(df.head())

# Preprocessing function
def preprocess_text(text):
    """Normalise a review for vectorisation.

    The text is lowercased, then every character that is neither a letter
    (a-z) nor whitespace is deleted. Whitespace is left as-is and stop words
    are kept.

    Args:
        text: raw review string.

    Returns:
        The cleaned string.
    """
    lowered = text.lower()
    letters_only = re.sub(r'[^a-zA-Z\s]', '', lowered)
    return letters_only

# Clean every review with preprocess_text (the raw text column is overwritten)
df['review'] = df['review'].apply(preprocess_text)

# Encode the labels: positive -> 1, negative -> 0
df['sentiment'] = df['sentiment'].map({'positive': 1, 'negative': 0})

# Stratified 80/20 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    df['review'],
    df['sentiment'],
    test_size=0.2,
    random_state=42,
    stratify=df['sentiment'],
)

# TF-IDF over unigrams and bigrams, capped at 20000 features; the vocabulary
# is fitted on the training split only and then applied to the test split
tfidf = TfidfVectorizer(ngram_range=(1,2), max_features=20000)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# Persist the fitted vectorizer so predict_sentiment can reload it from disk
with open("tfidf_vectorizer.pkl", "wb") as f:
    pickle.dump(tfidf, f)

# Logistic Regression with C=0.5: fit on the training split, predict the test split
log_reg = LogisticRegression(C=0.5)
y_pred_log = log_reg.fit(X_train_tfidf, y_train).predict(X_test_tfidf)

# Linear-kernel SVM with C=0.5: same procedure
svm = SVC(kernel='linear', C=0.5)
y_pred_svm = svm.fit(X_train_tfidf, y_train).predict(X_test_tfidf)

# Persist both fitted classifiers, Logistic Regression first
for pickle_path, fitted_model in (
    ("logistic_regression.pkl", log_reg),
    ("svm_model.pkl", svm),
):
    with open(pickle_path, "wb") as f:
        pickle.dump(fitted_model, f)

# Function to evaluate models
def evaluate_model(y_true, y_pred, model_name):
    """Print an evaluation summary for one model's predictions.

    Output order: model name, accuracy, precision, recall, F1 score,
    confusion matrix, classification report. Every metric is computed by the
    corresponding scikit-learn function called with its default settings.

    Args:
        y_true: ground-truth labels.
        y_pred: predicted labels, aligned with ``y_true``.
        model_name: text shown on the "Model:" header line.

    Returns:
        None; everything is written to stdout.
    """
    print(f"\nModel: {model_name}")

    # Single-number metrics, printed as "<label> <value>".
    scalar_metrics = (
        ("Accuracy:", accuracy_score),
        ("Precision:", precision_score),
        ("Recall:", recall_score),
        ("F1 Score:", f1_score),
    )
    for label, metric in scalar_metrics:
        print(label, metric(y_true, y_pred))

    # Multi-line reports, each printed under its own heading.
    tabular_reports = (
        ("\nConfusion Matrix:", confusion_matrix),
        ("\nClassification Report:", classification_report),
    )
    for heading, build_report in tabular_reports:
        print(heading)
        print(build_report(y_true, y_pred))

# Report test-set metrics for each fitted model, Logistic Regression first
for model_label, test_predictions in (
    ("Logistic Regression", y_pred_log),
    ("Support Vector Machine", y_pred_svm),
):
    evaluate_model(y_test, test_predictions, model_label)

# Function to predict sentiment for new reviews
def predict_sentiment(new_reviews):
    """Print the Logistic Regression and SVM sentiment for each review.

    The TF-IDF vectorizer and both classifiers are reloaded from the pickle
    files written earlier in this cell, each review is cleaned with
    ``preprocess_text``, and one result block is printed per review.

    Parameters
    ----------
    new_reviews : str or iterable of str
        Raw review text. A single string is treated as one review rather
        than being iterated character by character. Any iterable is
        materialised first, because it is traversed twice (once to clean,
        once to print).

    Returns
    -------
    None
        Results are printed. Empty input prints nothing and returns before
        any file is opened.
    """
    if isinstance(new_reviews, str):
        new_reviews = [new_reviews]
    else:
        new_reviews = list(new_reviews)
    if not new_reviews:
        # Nothing to classify; also avoids handing scikit-learn zero samples.
        return

    # SECURITY: pickle.load can execute arbitrary code from the file. These
    # files are the ones this notebook wrote itself; never point this at
    # pickles from an untrusted source.
    with open("tfidf_vectorizer.pkl", "rb") as f:
        vectorizer = pickle.load(f)
    with open("logistic_regression.pkl", "rb") as f:
        log_reg_model = pickle.load(f)
    with open("svm_model.pkl", "rb") as f:
        svm_model = pickle.load(f)

    # Apply the same cleaning that was used on the training data.
    new_reviews_cleaned = [preprocess_text(review) for review in new_reviews]
    new_reviews_tfidf = vectorizer.transform(new_reviews_cleaned)

    # Predict using both models
    predictions_log = log_reg_model.predict(new_reviews_tfidf)
    predictions_svm = svm_model.predict(new_reviews_tfidf)

    # Display results (label 1 is positive, anything else is reported as negative)
    for review, sentiment_log, sentiment_svm in zip(new_reviews, predictions_log, predictions_svm):
        print(f"Review: {review}")
        print(f"Logistic Regression Prediction: {'Positive' if sentiment_log == 1 else 'Negative'}")
        print(f"SVM Prediction: {'Positive' if sentiment_svm == 1 else 'Negative'}\n")

# Try the saved models on two hand-written example reviews
new_reviews = [
    "I absolutely loved this movie! The story was amazing and the acting was brilliant.",
    "Worst movie ever. It was a complete waste of time.",
]
predict_sentiment(new_reviews)


Model: Logistic Regression
Accuracy: 0.8979
Precision: 0.892483724600513
Recall: 0.9048
F1 Score: 0.8985996623299235

Confusion Matrix:
[[4455  545]
 [ 476 4524]]

Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.89      0.90      5000
           1       0.89      0.90      0.90      5000

    accuracy                           0.90     10000
   macro avg       0.90      0.90      0.90     10000
weighted avg       0.90      0.90      0.90     10000


Model: Support Vector Machine
Accuracy: 0.9046
Precision: 0.899171270718232
Recall: 0.9114
F1 Score: 0.9052443384982122

Confusion Matrix:
[[4489  511]
 [ 443 4557]]

Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.90      0.90      5000
           1       0.90      0.91      0.91      5000

    accuracy                           0.90     10000
   macro avg       0.90      0.90      0.90     10000
weighted avg       0.90      0.90      0.90     10000

In [11]:
# Three reviews of Gladiator II, one list item per line
new_reviews = [
    "Delighting in razzle-dazzle over historical precision, Gladiator II rigorously entertains all the better for it.",
    "Echoing its predecessor while upping the bloodsport and camp, Gladiator II is an action extravaganza that derives much of its strength and honor from Denzel Washington's scene-stealing performance.",
    "Watching Gladiator II, it is hard not to be a little suspicious about exactly what we are longing for in our Roman fantasies.",
]
predict_sentiment(new_reviews)

Review: Delighting in razzle-dazzle over historical precision, Gladiator II rigorously entertains all the better for it.
Logistic Regression Prediction: Negative
SVM Prediction: Negative

Review: Echoing its predecessor while upping the bloodsport and camp, Gladiator II is an action extravaganza that derives much of its strength and honor from Denzel Washington's scene-stealing performance.
Logistic Regression Prediction: Positive
SVM Prediction: Positive

Review: Watching Gladiator II, it is hard not to be a little suspicious about exactly what we are longing for in our Roman fantasies.
Logistic Regression Prediction: Positive
SVM Prediction: Positive

