<a href="https://colab.research.google.com/github/thealonemusk/Product-Review-Analyzer/blob/main/Product_Review_Analyzer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install transformers


Collecting transformers
  Downloading transformers-4.30.2-py3-none-any.whl (7.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m44.4 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.15.1-py3-none-any.whl (236 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m236.8/236.8 kB[0m [31m22.6 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m60.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m59.0 MB/s[0m eta [36m0:00:0

In [None]:
from flask import Flask, request, jsonify
from transformers import pipeline

app = Flask(__name__)

# Load the sentiment analysis model
sentiment_model = pipeline("sentiment-analysis")

# API route for performing sentiment analysis on product reviews
@app.route("/sentiment-analysis", methods=["POST"])
def perform_sentiment_analysis():
    data = request.get_json()  # Get the reviews from the request body
    reviews = data["reviews"]

    # Perform sentiment analysis on each review
    results = sentiment_model(reviews)

    # Extract sentiment labels and scores for each review
    sentiments = [{"review": result["text"], "sentiment": result["label"], "score": result["score"]} for result in results]

    return jsonify(sentiments)

# API route for calculating the overall product quality
@app.route("/product-quality", methods=["POST"])
def calculate_product_quality():
    data = request.get_json()  # Get the sentiment analysis results from the request body
    sentiments = data["sentiments"]

    # Calculate the average sentiment score
    scores = [sentiment["score"] for sentiment in sentiments]
    average_score = sum(scores) / len(scores)

    # Determine the overall sentiment label based on the average score
    if average_score >= 0.6:
        overall_sentiment = "Positive"
    elif average_score >= 0.4:
        overall_sentiment = "Neutral"
    else:
        overall_sentiment = "Negative"

    return jsonify({"overall_sentiment": overall_sentiment, "average_score": average_score})

# API route for providing improvement suggestions
@app.route("/improvement-suggestions", methods=["POST"])
def provide_improvement_suggestions():
    data = request.get_json()  # Get the sentiment analysis results from the request body
    sentiments = data["sentiments"]

    # Identify common negative aspects mentioned in the reviews
    negative_reviews = [sentiment["review"] for sentiment in sentiments if sentiment["sentiment"] == "NEGATIVE"]
    common_negative_aspects = identify_common_negative_aspects(negative_reviews)

    return jsonify({"common_negative_aspects": common_negative_aspects})

def identify_common_negative_aspects(reviews):
    # Your implementation here
    # This function should analyze the negative reviews and identify common negative aspects

    # Example implementation: Extract nouns or keywords from the reviews
    common_aspects = ["customer service", "product durability", "shipping delays"]

    return common_aspects

# Run the Flask application
if __name__ == "__main__":
    app.run()


In [None]:
pip install transformers[torch]

Collecting accelerate>=0.20.2 (from transformers[torch])
  Downloading accelerate-0.20.3-py3-none-any.whl (227 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m227.6/227.6 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate
Successfully installed accelerate-0.20.3


In [None]:
pip install accelerate -U



In [None]:
!pip install --upgrade transformers




In [None]:
import pandas as pd
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
import torch

# Load and preprocess the dataset
dataset = pd.read_csv("testing_dataset.csv")

# Split the dataset into training and validation sets
train_data, val_data = train_test_split(dataset, test_size=0.2, random_state=42)

# Define the labels (sentiments)
labels = list(dataset["Rate"].unique())

# Load the pre-trained tokenizer
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")

# Tokenize the text and encode labels
train_encodings = tokenizer(list(train_data["Review"]), truncation=True, padding=True)
val_encodings = tokenizer(list(val_data["Review"]), truncation=True, padding=True)

train_labels = [labels.index(label) for label in train_data["Rate"]]
val_labels = [labels.index(label) for label in val_data["Rate"]]

# Create PyTorch DataLoader
train_dataset = torch.utils.data.TensorDataset(torch.tensor(train_encodings["input_ids"]),
                                               torch.tensor(train_encodings["attention_mask"]),
                                               torch.tensor(train_labels))

val_dataset = torch.utils.data.TensorDataset(torch.tensor(val_encodings["input_ids"]),
                                             torch.tensor(val_encodings["attention_mask"]),
                                             torch.tensor(val_labels))

# Define the training arguments
training_args = TrainingArguments(
    output_dir="./models",
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=100,
    evaluation_strategy="epoch",
    disable_tqdm=True  # Add this line to suppress the tqdm progress bar if needed
)

# Load the pre-trained model for sequence classification
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=len(labels))

# Create a trainer instance
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

# Train the model
trainer.train()

# Save the trained model
trainer.save_model("./models/sentiment_model")


ImportError: ignored

In [None]:
pip install torch



In [None]:
pip install sklearn


Collecting sklearn
  Using cached sklearn-0.0.post5.tar.gz (3.7 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: sklearn
  Building wheel for sklearn (setup.py) ... [?25l[?25hdone
  Created wheel for sklearn: filename=sklearn-0.0.post5-py3-none-any.whl size=2950 sha256=ab32fc32ba718fe13a52e4cd55b1d0d1c9ff876c541a07c2f7a654b7a7751f34
  Stored in directory: /root/.cache/pip/wheels/38/1f/8d/4f812c590e074c1e928f5cec67bf5053b71f38e2648739403a
Successfully built sklearn
Installing collected packages: sklearn
Successfully installed sklearn-0.0.post5


In [None]:
!pip install accelerate -U




In [3]:
pip install transformers


Collecting transformers
  Downloading transformers-4.30.2-py3-none-any.whl (7.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m47.4 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.15.1-py3-none-any.whl (236 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m236.8/236.8 kB[0m [31m24.6 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m76.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m63.1 MB/s[0m eta [36m0:00:0

In [None]:
pip install transformers[torch]

Collecting accelerate>=0.20.2 (from transformers[torch])
  Downloading accelerate-0.20.3-py3-none-any.whl (227 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m227.6/227.6 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate
Successfully installed accelerate-0.20.3


In [4]:
import pandas as pd
import torch
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split

# Load and preprocess the dataset
dataset = pd.read_csv("testing_dataset.csv")

# Split the dataset into training and validation sets
train_data, val_data = train_test_split(dataset, test_size=0.2, random_state=42)

# Define the labels (sentiments)
labels = list(dataset["Rate"].unique())

# Load the pre-trained tokenizer
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")

# Tokenize the text and encode labels
train_encodings = tokenizer(list(train_data["Review"]), truncation=True, padding=True)
val_encodings = tokenizer(list(val_data["Review"]), truncation=True, padding=True)

train_input_ids = torch.tensor(train_encodings["input_ids"])
train_attention_mask = torch.tensor(train_encodings["attention_mask"])
train_labels = torch.tensor([labels.index(label) for label in train_data["Rate"]])

val_input_ids = torch.tensor(val_encodings["input_ids"])
val_attention_mask = torch.tensor(val_encodings["attention_mask"])
val_labels = torch.tensor([labels.index(label) for label in val_data["Rate"]])

# Create PyTorch DataLoader
train_dataset = torch.utils.data.TensorDataset(train_input_ids, train_attention_mask, train_labels)
val_dataset = torch.utils.data.TensorDataset(val_input_ids, val_attention_mask, val_labels)

# Load the pre-trained model for sequence classification
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=len(labels))

# Define the optimizer
optimizer = AdamW(model.parameters(), lr=1e-5)

# Train the model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.train()

num_epochs = 3
batch_size = 16

train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

for epoch in range(num_epochs):
    epoch_loss = 0.0
    for batch in train_loader:
        input_ids, attention_mask, labels = batch
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()

    avg_loss = epoch_loss / len(train_loader)
    print(f"Epoch {epoch+1}/{num_epochs} - Loss: {avg_loss}")

# Save the trained model
model.save_pretrained("./models/sentiment_model")
tokenizer.save_pretrained("./models/sentiment_model")


Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'pre_classifier.bias', 'classifier.weight', 'pre_classifier.we

Epoch 1/3 - Loss: 1.3595246076583862
Epoch 2/3 - Loss: 1.319258689880371
Epoch 3/3 - Loss: 1.3342052698135376


('./models/sentiment_model/tokenizer_config.json',
 './models/sentiment_model/special_tokens_map.json',
 './models/sentiment_model/vocab.txt',
 './models/sentiment_model/added_tokens.json',
 './models/sentiment_model/tokenizer.json')

In [None]:
import pandas as pd
import torch
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split

# Load and preprocess the dataset
dataset = pd.read_csv("flipkart_product.csv")

# Split the dataset into training and validation sets
train_data, val_data = train_test_split(dataset, test_size=0.2, random_state=42)

# Define the labels (sentiments)
labels = list(dataset["Rate"].unique())

# Load the pre-trained tokenizer
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")

# Tokenize the text and encode labels
train_encodings = tokenizer(list(train_data["Review"]), truncation=True, padding=True)
val_encodings = tokenizer(list(val_data["Review"]), truncation=True, padding=True)

train_input_ids = torch.tensor(train_encodings["input_ids"])
train_attention_mask = torch.tensor(train_encodings["attention_mask"])
train_labels = torch.tensor([labels.index(label) for label in train_data["Rate"]])

val_input_ids = torch.tensor(val_encodings["input_ids"])
val_attention_mask = torch.tensor(val_encodings["attention_mask"])
val_labels = torch.tensor([labels.index(label) for label in val_data["Rate"]])

# Create PyTorch DataLoader
train_dataset = torch.utils.data.TensorDataset(train_input_ids, train_attention_mask, train_labels)
val_dataset = torch.utils.data.TensorDataset(val_input_ids, val_attention_mask, val_labels)

# Load the pre-trained model for sequence classification
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=len(labels))

# Define the optimizer
optimizer = AdamW(model.parameters(), lr=1e-5)

# Train the model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.train()

num_epochs = 3
batch_size = 16

train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

for epoch in range(num_epochs):
    epoch_loss = 0.0
    for batch in train_loader:
        input_ids, attention_mask, labels = batch
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()

    avg_loss = epoch_loss / len(train_loader)
    print(f"Epoch {epoch+1}/{num_epochs} - Loss: {avg_loss}")

# Save the trained model
model.save_pretrained("./models/sentiment_model")
tokenizer.save_pretrained("./models/sentiment_model")
