In [18]:
import os
import numpy as np
import pickle
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import joblib

In [12]:
# Load the reviews and labels from the file
with open('train_reviews.pkl', 'rb') as f:
    train_reviews, train_labels = pickle.load(f)

with open('test_reviews.pkl', 'rb') as f:
    test_reviews, test_labels = pickle.load(f)
    
with open('unsup_reviews.pkl', 'rb') as f:
    train_reviews_unsup = pickle.load(f)

In [19]:
# Combine all reviews for vectorization
all_reviews = train_reviews + train_reviews_unsup

# Convert the reviews to a document-term matrix
vectorizer = CountVectorizer(max_features=10000, stop_words='english')
X = vectorizer.fit_transform(all_reviews)

# Save the vectorizer
joblib.dump(vectorizer, 'vectorizer_model.pkl')

['vectorizer_model.pkl']

In [20]:
# Fit an LDA model to learn word representations
n_topics = 50
lda = LatentDirichletAllocation(n_components=n_topics, max_iter=10, learning_method='online', random_state=0)
lda.fit(X)

# Save the LDA model
joblib.dump(lda, 'lda_model.pkl')

['lda_model.pkl']

In [21]:
from scipy.special import softmax

# Get document-topic distributions from the LDA model
doc_topic_distributions = lda.transform(X)

# Function to incorporate sentiment annotations into word representations
def incorporate_sentiment(doc_topic_distributions, labels, alpha=0.1):
    n_docs, n_topics = doc_topic_distributions.shape
    sentiment_topic_distributions = np.zeros((n_topics, 2))
    
    # Calculate the sentiment distribution for each topic
    for topic in range(n_topics):
        positive_sum = 0
        negative_sum = 0
        for i in range(n_docs):
            if i < len(labels):  # Only use labeled data
                if labels[i] == 1:
                    positive_sum += doc_topic_distributions[i, topic]
                else:
                    negative_sum += doc_topic_distributions[i, topic]
        sentiment_topic_distributions[topic, 0] = negative_sum
        sentiment_topic_distributions[topic, 1] = positive_sum
    
    # Normalize the distributions
    sentiment_topic_distributions = softmax(sentiment_topic_distributions, axis=1)
    
    # Adjust the topic-word distributions based on sentiment
    topic_word_distributions = lda.components_
    adjusted_topic_word_distributions = topic_word_distributions.copy()
    for topic in range(n_topics):
        for word in range(topic_word_distributions.shape[1]):
            adjusted_topic_word_distributions[topic, word] += alpha * sentiment_topic_distributions[topic, 1]
    
    # Normalize the adjusted topic-word distributions
    adjusted_topic_word_distributions /= adjusted_topic_word_distributions.sum(axis=1)[:, np.newaxis]
    
    return adjusted_topic_word_distributions

# Incorporate sentiment information into word representations
adjusted_topic_word_distributions = incorporate_sentiment(doc_topic_distributions, train_labels)

In [22]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Use the document-topic distributions as features for sentiment classification
X_train = doc_topic_distributions[:len(train_labels)]
y_train = np.array(train_labels)

# Train a logistic regression classifier
classifier = LogisticRegression(max_iter=1000)
classifier.fit(X_train, y_train)

# Save the logistic regression classifier
joblib.dump(classifier, 'classifier_model.pkl')

['classifier_model.pkl']

In [23]:
# Evaluate the classifier
y_train_pred = classifier.predict(X_train)
accuracy = accuracy_score(y_train, y_train_pred)
precision = precision_score(y_train, y_train_pred)
recall = recall_score(y_train, y_train_pred)
f1 = f1_score(y_train, y_train_pred)

print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")

Accuracy: 0.77
Precision: 0.76
Recall: 0.78
F1 Score: 0.77


In [26]:
# Function to predict sentiment of an input review
def predict_sentiment(review, vectorizer, lda, classifier):
    review_vector = vectorizer.transform([review])
    review_topic_distribution = lda.transform(review_vector)
    prediction = classifier.predict(review_topic_distribution)
    sentiment = 'positive' if prediction[0] == 1 else 'negative'
    return sentiment

# Example usage
input_review = input("Enter a review: ")
predicted_sentiment = predict_sentiment(input_review, vectorizer, lda, classifier)
print(f"The predicted sentiment for the review is: {predicted_sentiment}")

Enter a review: bullshit
The predicted sentiment for the review is: positive
