<a href="https://colab.research.google.com/github/udaydaroch/Algorithm-Visualizer/blob/main/Sentiment_Analysis(machine_learning).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install dask pandas scikit-learn nltk



In [18]:
import dask.dataframe as dd
import pandas as pd
import sys

from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split

import hashlib
import math
from random import randint
import json
import numpy as np

import dask.bag as db
import re
from nltk.corpus import stopwords
import nltk
from dask.distributed import Client

import dask
from dask import bag as db
import time
import requests
import gzip

# Download the data
def download_data(url, filename, timeout=60):
    """Stream the resource at ``url`` into the local file ``filename``.

    Args:
        url: HTTP(S) address to fetch.
        filename: Destination path. It is opened in binary write mode, so an
            existing file is overwritten.
        timeout: Seconds passed to ``requests.get`` as the connect/read
            timeout, so a stalled server cannot block the script forever.

    Raises:
        requests.HTTPError: If the server responds with an error status.
        requests.Timeout: If connecting or reading exceeds ``timeout``.
    """
    # Using the response as a context manager releases the connection even
    # when raise_for_status() or a write fails part-way through the stream.
    with requests.get(url, stream=True, timeout=timeout) as response:
        response.raise_for_status()

        with open(filename, 'wb') as f:
            for chunk in response.iter_content(chunk_size=8192):
                # Skip empty chunks so nothing zero-length is written.
                if chunk:
                    f.write(chunk)

    print(f"Downloaded {filename}")

# Extract fields from the downloaded data
def extract_fields(input_filename, output_filename, size):
    """Copy a trimmed version of the first ``size`` records to a JSONL file.

    Reads ``input_filename`` as gzip-compressed JSON lines and, for each
    record, writes one JSON line containing only ``user_id``, ``review``
    (taken from the input's ``text`` key) and ``rating``.

    Args:
        input_filename: Path to a gzip-compressed JSON-lines file.
        output_filename: Path of the plain-text JSONL file to (over)write.
        size: Maximum number of input lines to process.

    Raises:
        json.JSONDecodeError: If an input line is not valid JSON.
    """
    # JSON text is UTF-8; pin the encoding rather than relying on the locale.
    with gzip.open(input_filename, 'rt', encoding='utf-8') as input_file, open(output_filename, 'w', encoding='utf-8') as output_file:
        for i, line in enumerate(input_file):
            # i is zero-based, so stop once exactly `size` lines were handled.
            if i >= size:
                break
            data = json.loads(line)
            # Keys absent from the input record become null in the output.
            filtered_data = {
                'user_id': data.get('user_id'),
                'review': data.get('text'),
                'rating': data.get('rating')
            }
            output_file.write(json.dumps(filtered_data) + '\n')

def tokenize_and_remove_stop_words(data):
    """Parse one JSON record and return its cleaned review text.

    The review is lower-cased, ``<br />`` markers become spaces, every
    character other than ASCII letters and spaces is dropped, and words found
    in the module-level ``STOP_WORDS`` set are removed.

    Args:
        data: A JSON string with optional ``user_id``, ``rating`` and
            ``review`` keys.

    Returns:
        A ``(user_id, rating, review)`` tuple where ``review`` is the cleaned,
        space-joined text (empty when the record has no review).
    """
    data = json.loads(data)
    # `.get('review', '')` only covers a missing key; a JSON null still comes
    # back as None and would crash on `.replace`, so normalise it to ''.
    review = data.get('review') or ''
    review = review.replace('<br />', ' ')
    rating = data.get('rating', '')
    user_id = data.get('user_id', '')
    # Punctuation is deleted (not replaced by a space) before the stop-word
    # filter runs, so e.g. "don't" is compared against STOP_WORDS as "dont".
    review = re.sub(r'[^a-zA-Z ]', '', review.lower())
    review = ' '.join([word for word in review.split() if word not in STOP_WORDS])
    return (user_id, rating, review)  # Return a tuple

# Source dataset. The URL points at a gzip-compressed JSON-lines file; the
# local copy is named '.jsonl' but still holds gzip data (extract_fields opens
# it with gzip.open).
download_url = 'https://datarepo.eng.ucsd.edu/mcauley_group/data/amazon_2023/raw/review_categories/Software.jsonl.gz'
downloaded_filename = 'filter-all-t.jsonl'
filtered_filename = 'filtered_data.jsonl'

# Line limit handed to extract_fields below.
size = 300000

print("Downloading data from url...")
# Fetch the compressed dataset to downloaded_filename
download_data(download_url, downloaded_filename)

print("Downloading stopwords...")
nltk.download('stopwords')
# Module-level set read by tokenize_and_remove_stop_words.
STOP_WORDS = set(stopwords.words('english'))

# Start a dask.distributed client with default arguments.
# NOTE(review): the client is never closed in this cell; that may be why the
# captured output warns "Perhaps you already have a cluster running?" on
# re-runs -- confirm, and consider closing it once the compute() calls finish.
client = Client()

print('Algorithm Started')
# The timer starts after the download, so 'Total Time' excludes it.
start_time = time.time()

print("Step 1: Extracting required fields...")
# Write the trimmed user_id/review/rating records to filtered_filename
extract_fields(downloaded_filename, filtered_filename, size)

print("Step 2: Reading and processing data...")
# Each line of the filtered file is one JSON record; map the cleaning function
# over the bag and name the resulting tuple fields as dataframe columns.
data_bag = db.read_text(filtered_filename, blocksize="10MB")
bag_tokenized = data_bag.map(tokenize_and_remove_stop_words)
df = bag_tokenized.to_dataframe(columns=['user_id', 'rating', 'review'])

# Cast 'rating' to int; it is used below as the classification label.
# NOTE(review): a record with a null rating would presumably fail this cast --
# verify the input always carries 'rating'.
df['rating'] = df['rating'].astype(int)

# Split the data into training and testing sets (80% / 20%, fixed seed)
train_df, test_df = df.random_split([0.8, 0.2], random_state=42)

# Collect to pandas DataFrame (for scikit-learn compatibility)
train_df = train_df.compute()
test_df = test_df.compute()

# Vectorize the reviews: fit TF-IDF on the training split only, then reuse the
# fitted vectorizer to transform the test split.
vectorizer = TfidfVectorizer(max_features=10000)
X_train = vectorizer.fit_transform(train_df['review'])
y_train = train_df['rating']
X_test = vectorizer.transform(test_df['review'])
y_test = test_df['rating']

# Train the model (ratings are the class labels)
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Evaluate the model on the held-out test split
y_pred = model.predict(X_test)
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Additional evaluation metric: Accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

print('Algorithm Completed')
end_time = time.time()
print(f'Total Time: {end_time - start_time} seconds')


Downloading data from url...


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Perhaps you already have a cluster running?
Hosting the HTTP server on port 38273 instead
INFO:distributed.scheduler:State start
INFO:distributed.scheduler:  Scheduler at:     tcp://127.0.0.1:34685
INFO:distributed.scheduler:  dashboard at:  http://127.0.0.1:38273/status
INFO:distributed.nanny:        Start Nanny at: 'tcp://127.0.0.1:40005'
INFO:distributed.nanny:        Start Nanny at: 'tcp://127.0.0.1:36203'


Downloaded filter-all-t.jsonl
Downloading stopwords...


INFO:distributed.scheduler:Register worker <WorkerState 'tcp://127.0.0.1:42133', name: 1, status: init, memory: 0, processing: 0>
INFO:distributed.scheduler:Starting worker compute stream, tcp://127.0.0.1:42133
INFO:distributed.core:Starting established connection to tcp://127.0.0.1:50488
INFO:distributed.scheduler:Register worker <WorkerState 'tcp://127.0.0.1:33053', name: 0, status: init, memory: 0, processing: 0>
INFO:distributed.scheduler:Starting worker compute stream, tcp://127.0.0.1:33053
INFO:distributed.core:Starting established connection to tcp://127.0.0.1:34926
INFO:distributed.scheduler:Receive client connection: Client-f377e294-2e36-11ef-80bd-0242ac1c000c
INFO:distributed.core:Starting established connection to tcp://127.0.0.1:34928


Algorithm Started
Step 1: Extracting required fields...
Step 2: Reading and processing data...
Classification Report:
              precision    recall  f1-score   support

           1       0.60      0.72      0.66      8981
           2       0.30      0.05      0.09      3596
           3       0.40      0.25      0.31      6491
           4       0.41      0.16      0.23      9364
           5       0.71      0.92      0.80     31670

    accuracy                           0.65     60102
   macro avg       0.48      0.42      0.42     60102
weighted avg       0.59      0.65      0.59     60102

Accuracy: 0.65
Algorithm Completed
Total Time: 173.51318621635437 seconds


In [21]:
import joblib

# Persist the trained classifier so it can be reloaded without retraining
model_filename = 'logistic_regression_model.joblib'
joblib.dump(model, model_filename)
print(f"Model saved to {model_filename}")

# Persist the fitted TF-IDF vectorizer alongside it; both are loaded together
# in the prediction cell.
vectorizer_filename = 'tfidf_vectorizer.joblib'
joblib.dump(vectorizer, vectorizer_filename)
print(f"Vectorizer saved to {vectorizer_filename}")


Model saved to logistic_regression_model.joblib
Vectorizer saved to tfidf_vectorizer.joblib


In [26]:
import joblib

# Load the classifier saved by the previous cell.
# NOTE: joblib.load unpickles the file, so only load artifacts you trust.
loaded_model = joblib.load('logistic_regression_model.joblib')
print("Model loaded.")

# Load the TF-IDF vectorizer that was saved with the model
loaded_vectorizer = joblib.load('tfidf_vectorizer.joblib')
print("Vectorizer loaded.")

# Hand-written sample reviews used to spot-check the loaded model; they are
# raw sentences (punctuation and capitalisation intact), not dataset records.
new_reviews = [
    "This is good",
    "Terrible experience, do not buy.",
    "Amazing product, highly recommended!",
    "Could be better, not satisfied.",
    "Excellent service and quality.",
    "Worst purchase ever, regret buying it.",
    "It's an okay product.",
    "Very happy with my purchase, exceeded expectations.",
    "I don't recommend this product.",
    "The worst customer service I've ever experienced.",
    "Absolutely love it, worth every penny!",
    "Disappointing quality, expected more.",
    "Perfect fit and great design.",
    "I wouldn't buy this again.",
    "Top-notch product, highly impressed!",
    "Poor packaging, item arrived damaged.",
    "Best purchase decision I've made recently.",
    "Not as described, misleading information.",
    "Fast shipping, received it sooner than expected.",
    "Could improve, has some flaws.",
    "Outstanding performance, exceeded my expectations.",
    "Avoid this product, waste of money.",
    "Very pleased with the quality and durability.",
    "Overpriced for what it offers.",
    "Great value for money, highly recommend it!",
    "Couldn't be happier with this purchase.",
    "Horrible experience, wouldn't recommend it to anyone.",
    "Exactly what I needed, works perfectly.",
    "Not worth the hype, disappointed.",
    "Definitely a game-changer, impressed with the results.",
    "Poor customer service, no response to queries.",
    "Good product but could be better.",
    "Absolutely awful, complete waste of money.",
    "Highly dissatisfied, expected better.",
    "Works like a charm, very reliable.",
    "Unbelievably bad, regret buying it.",
    "Excellent customer service, very responsive.",
    "Average product, meets expectations.",
    "The best I've ever used, exceeded all expectations!",
    "Not bad, but not great either.",
    "Awful experience, avoid at all costs.",
    "Well-designed and functional.",
    "Wouldn't recommend it, disappointing.",
    "Great features, easy to use.",
    "Very poor quality, broke after a few uses.",
    "Impressed with the performance, highly recommend.",
    "Very disappointing, doesn't work as advertised.",
    "Outstanding quality, worth every penny!",
    "Not what I expected, quite disappointed.",
    "Highly recommended, excellent product.",
    "Total waste of money, regret buying it.",
    "Doesn't meet expectations, quite disappointing.",
    "Excellent value for money, very satisfied.",
    "Disappointed with the quality, not as described.",
    "Exceptional service, very professional.",
    "Decent product for the price.",
    "Absolutely terrible, avoid this product.",
    "Satisfied with the purchase, meets expectations.",
    "Wouldn't buy again, not worth it.",
    "Fantastic product, exceeded my expectations.",
    "Not recommended, poor quality.",
    "Very impressed with the durability and performance.",
    "Disappointing purchase, doesn't work properly.",
    "Highly satisfied, worth every penny.",
    "Poor build quality, wouldn't recommend.",
    "Highly reliable and efficient.",
    "Not worth the money, disappointing.",
    "Excellent build quality, very durable.",
    "Regret buying it, doesn't perform well.",
    "Exactly as described, very happy with it.",
    "Avoid at all costs, terrible experience.",
    "Works like a charm, highly recommended.",
    "Average performance, expected more.",
    "Great customer service, very helpful.",
    "Could be improved, has some flaws.",
    "Absolutely satisfied with the purchase.",
    "Unreliable product, disappointed with it.",
    "Very pleased with the performance and quality.",
    "Complete waste of money, avoid.",
    "Good quality but overpriced.",
    "Perfect for my needs, highly recommend it.",
    "Not satisfied with the product, disappointing.",
    "Very responsive customer service.",
    "Works flawlessly, highly impressed.",
    "Not the best, but gets the job done.",
    "Terrible quality, broke within a week.",
    "Impressed with the features and functionality.",
    "Doesn't live up to expectations, disappointed.",
    "Excellent product, great value for money.",
    "Disappointed with the purchase, doesn't work well.",
    "Excellent performance, very reliable.",
    "Poor customer support, no resolution.",
    "Decent product, meets basic requirements.",
    "Not recommended, better options available.",
    "Highly efficient and user-friendly.",
    "Expected more, quite disappointed.",
    "Outstanding customer service, very responsive.",
    "Below average performance, not recommended.",
    "Good purchase, meets expectations.",
    "Could have been better, slightly disappointed.",
    "Top-quality product, exceeded expectations.",
    "Not worth it, regret buying it.",
    "Excellent build and design, very impressed.",
    "Very disappointing experience, avoid this product.",
]

# The vectorizer was fitted on text already cleaned by
# tokenize_and_remove_stop_words (lower-cased, punctuation deleted, stop words
# removed). Feeding it raw sentences makes the default tokenizer split words
# such as "don't" or "top-notch" differently than at training time, so run the
# same cleaning first. This relies on the helper, STOP_WORDS and json from the
# training cell being defined in the current session.
cleaned_reviews = [
    tokenize_and_remove_stop_words(json.dumps({'review': review}))[2]
    for review in new_reviews
]

new_reviews_transformed = loaded_vectorizer.transform(cleaned_reviews)
predictions = loaded_model.predict(new_reviews_transformed)

# Print predictions next to the original (uncleaned) review text
for review, prediction in zip(new_reviews, predictions):
    print(f"Review: '{review}' -> Predicted Rating: {prediction}")


Model loaded.
Vectorizer loaded.
Review: 'This is good' -> Predicted Rating: 5
Review: 'Terrible experience, do not buy.' -> Predicted Rating: 1
Review: 'Amazing product, highly recommended!' -> Predicted Rating: 5
Review: 'Could be better, not satisfied.' -> Predicted Rating: 5
Review: 'Excellent service and quality.' -> Predicted Rating: 5
Review: 'Worst purchase ever, regret buying it.' -> Predicted Rating: 1
Review: 'It's an okay product.' -> Predicted Rating: 3
Review: 'Very happy with my purchase, exceeded expectations.' -> Predicted Rating: 5
Review: 'I don't recommend this product.' -> Predicted Rating: 5
Review: 'The worst customer service I've ever experienced.' -> Predicted Rating: 1
Review: 'Absolutely love it, worth every penny!' -> Predicted Rating: 5
Review: 'Disappointing quality, expected more.' -> Predicted Rating: 2
Review: 'Perfect fit and great design.' -> Predicted Rating: 5
Review: 'I wouldn't buy this again.' -> Predicted Rating: 1
Review: 'Top-notch product, hi