# Libraries Import

In [4]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import random

In [2]:
!pip install torch

Collecting torch
  Using cached torch-2.0.1-cp310-cp310-manylinux1_x86_64.whl (619.9 MB)
Collecting networkx (from torch)
  Using cached networkx-3.1-py3-none-any.whl (2.1 MB)
Collecting nvidia-cuda-nvrtc-cu11==11.7.99 (from torch)
  Using cached nvidia_cuda_nvrtc_cu11-11.7.99-2-py3-none-manylinux1_x86_64.whl (21.0 MB)
Collecting nvidia-cuda-runtime-cu11==11.7.99 (from torch)
  Using cached nvidia_cuda_runtime_cu11-11.7.99-py3-none-manylinux1_x86_64.whl (849 kB)
Collecting nvidia-cuda-cupti-cu11==11.7.101 (from torch)
  Using cached nvidia_cuda_cupti_cu11-11.7.101-py3-none-manylinux1_x86_64.whl (11.8 MB)
Collecting nvidia-cudnn-cu11==8.5.0.96 (from torch)
  Using cached nvidia_cudnn_cu11-8.5.0.96-2-py3-none-manylinux1_x86_64.whl (557.1 MB)
Collecting nvidia-cublas-cu11==11.10.3.66 (from torch)
  Using cached nvidia_cublas_cu11-11.10.3.66-py3-none-manylinux1_x86_64.whl (317.1 MB)
Collecting nvidia-cufft-cu11==10.9.0.58 (from torch)
  Using cached nvidia_cufft_cu11-10.9.0.58-py3-none-man

In [18]:
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu

Looking in indexes: https://download.pytorch.org/whl/cpu
Collecting torchvision
  Downloading https://download.pytorch.org/whl/cpu/torchvision-0.15.2%2Bcpu-cp310-cp310-linux_x86_64.whl (1.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m17.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting torchaudio
  Downloading https://download.pytorch.org/whl/cpu/torchaudio-2.0.2%2Bcpu-cp310-cp310-linux_x86_64.whl (4.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.1/4.1 MB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:02[0m0m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.5/4.1 MB[0m [31m94.5 MB/s[0m eta [36m0:00:01[0m
Installing collected packages: torchvision, torchaudio
Successfully installed torchaudio-2.0.2+cpu torchvision-0.15.2+cpu


# Data

## Data import

In [7]:
file_path = "../raw_data/reviews_cleaned.csv"
df = pd.read_csv(file_path)

## Data exploration

In [8]:
df.head()

Unnamed: 0,review_content
0,Looks durable Charging is fine tooNo complains
1,"Charging is really fast, good product."
2,Till now satisfied with the quality.
3,This is a good product . The charging speed is...
4,"Good quality, would recommend"


In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11012 entries, 0 to 11011
Data columns (total 1 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   review_content  11008 non-null  object
dtypes: object(1)
memory usage: 86.2+ KB


In [10]:
df.describe()

Unnamed: 0,review_content
count,11008
unique,7668
top,Good
freq,362


In [29]:
df.shape

(11012, 1)

In [30]:
df["review_content"][0]

'Looks durable Charging is fine tooNo complains'

In [11]:
print(df.isna().sum())

review_content    4
dtype: int64


In [12]:
df[df.isna().any(axis=1)]

Unnamed: 0,review_content
5861,
8233,
9122,
9953,


## Data Cleaning

In [13]:
# Drop NAN
df = df.dropna()

In [14]:
print(df.isna().sum())

review_content    0
dtype: int64


# BERT

## Instantiate Model

In [20]:
tokenizer = AutoTokenizer.from_pretrained("nlptown/bert-base-multilingual-uncased-sentiment")

model = AutoModelForSequenceClassification.from_pretrained("nlptown/bert-base-multilingual-uncased-sentiment")

ImportError: 
AutoModelForSequenceClassification requires the PyTorch library but it was not found in your environment. Checkout the instructions on the
installation page: https://pytorch.org/get-started/locally/ and follow the ones that match your environment.
Please note that you may need to restart your runtime after installation.


## Encode and calculate Sentiment

In [32]:
tokens = tokenizer.encode("Ganz gut", return_tensors="pt")

In [33]:
result = model(tokens)

In [34]:
result.logits

tensor([[-2.2008, -1.5792,  0.6085,  1.6348,  1.1937]],
       grad_fn=<AddmmBackward0>)

In [35]:
int(torch.argmax(result.logits))+1

4

## Score our Data

### BERT with Pseudodf

#### Create pseudo dataframe as playground 

In [36]:
# Create some example data with titles
pseudo_data = {
    'Username': ['user1', 'user2', 'user3', 'user4', 'user5', 'user6', 'user7', 'user8', 'user9', 'user10'],
    'ProductID': [101, 101, 101, 102, 102, 103, 104, 104, 105, 106],
    'Title': [
        'Excellent Product',
        'Good Purchase',
        'Average Quality',
        'Very Disappointed',
        'Highly Recommended',
        'Functional but Not Perfect',
        'Waste of Money',
        'Satisfactory Purchase',
        'Exceeded Expectations',
        'Mixed Feelings'],
    'Review': [
        'Excellent',
        'Good product.',
        'Average quality for the price.',
        'Not recommended. Very disappointed.',
        'Awesome! I love it. Highly recommended.',
        'Could be better, but it works.',
        'Terrible. Waste of money.',
        'Satisfactory purchase. No complaints.',
        'This product exceeded my expectations. Great value!',
        'I have mixed feelings about this product.']
}

# Add random additional sentences to some reviews
for i in range(5):
    index = random.randint(0, 9)
    additional_sentences = [
        'I received it on time.',
        'The packaging was damaged, but the product was intact.',
        'The customer service was helpful.',
        'The color is not as described in the picture.',
        'I would buy it again in the future.']
    pseudo_data['Review'][index] += ' ' + ' '.join(random.sample(additional_sentences, random.randint(1, 3)))

# Create a DataFrame named pseudo_df
pseudo_df = pd.DataFrame(pseudo_data)

# Display the DataFrame
pseudo_df


Unnamed: 0,Username,ProductID,Title,Review
0,user1,101,Excellent Product,Excellent
1,user2,101,Good Purchase,Good product. I would buy it again in the futu...
2,user3,101,Average Quality,Average quality for the price.
3,user4,102,Very Disappointed,Not recommended. Very disappointed.
4,user5,102,Highly Recommended,Awesome! I love it. Highly recommended. The pa...
5,user6,103,Functional but Not Perfect,"Could be better, but it works."
6,user7,104,Waste of Money,Terrible. Waste of money.
7,user8,104,Satisfactory Purchase,Satisfactory purchase. No complaints.
8,user9,105,Exceeded Expectations,This product exceeded my expectations. Great v...
9,user10,106,Mixed Feelings,I have mixed feelings about this product. I wo...


#### Loop through data and pass to model

In [37]:
# function to to get a review and pass through the model
def sentiment_score(review):
    tokens = tokenizer.encode(review, return_tensors="pt")
    result = model(tokens)
    return int(torch.argmax(result.logits))+1

In [38]:
pseudo_df["Sentiment Title"] = pseudo_df["Title"].apply(lambda x: sentiment_score(x[:512]))

In [39]:
pseudo_df["Sentiment Review"] = pseudo_df["Review"].apply(lambda x: sentiment_score(x[:512]))

In [40]:
pseudo_df

Unnamed: 0,Username,ProductID,Title,Review,Sentiment Title,Sentiment Review
0,user1,101,Excellent Product,Excellent,5,5
1,user2,101,Good Purchase,Good product. I would buy it again in the futu...,4,4
2,user3,101,Average Quality,Average quality for the price.,3,3
3,user4,102,Very Disappointed,Not recommended. Very disappointed.,1,1
4,user5,102,Highly Recommended,Awesome! I love it. Highly recommended. The pa...,5,5
5,user6,103,Functional but Not Perfect,"Could be better, but it works.",3,3
6,user7,104,Waste of Money,Terrible. Waste of money.,1,1
7,user8,104,Satisfactory Purchase,Satisfactory purchase. No complaints.,4,4
8,user9,105,Exceeded Expectations,This product exceeded my expectations. Great v...,5,5
9,user10,106,Mixed Feelings,I have mixed feelings about this product. I wo...,3,3


### BERT with our data

In [43]:
def sentiment_score(review):
    tokens = tokenizer.encode(review, return_tensors="pt")
    result = model(tokens)
    return int(torch.argmax(result.logits))+1

In [52]:
# Split the DataFrame into batches of 512 rows
batch_size = 512
num_batches = len(df) // batch_size + 1

# Initialize an empty list to store sentiment scores
sentiment_scores = []

# Iterate through the batches
for i in range(num_batches):
    start_idx = i * batch_size
    end_idx = (i + 1) * batch_size
    batch_reviews = df["review_content"].iloc[start_idx:end_idx]
    
    # Iterate through the rows within the batch
    for j, review in enumerate(batch_reviews):
        # Check if the review is NaN or None
        if pd.notna(review):
            sentiment = sentiment_score(review[:512])
            sentiment_scores.append(sentiment)
            
            # Calculate the row number
            row_number = i * batch_size + j + 1
            
            # Print the progress
            print(f"Row {row_number} in Batch {i + 1} is done")

# Add the sentiment scores to the DataFrame
df["Sentiment"] = sentiment_scores

Row 1 in Batch 1 is done
Row 2 in Batch 1 is done
Row 3 in Batch 1 is done
Row 4 in Batch 1 is done
Row 5 in Batch 1 is done
Row 6 in Batch 1 is done
Row 7 in Batch 1 is done
Row 8 in Batch 1 is done
Row 9 in Batch 1 is done
Row 10 in Batch 1 is done
Row 11 in Batch 1 is done
Row 12 in Batch 1 is done
Row 13 in Batch 1 is done
Row 14 in Batch 1 is done
Row 15 in Batch 1 is done
Row 16 in Batch 1 is done
Row 17 in Batch 1 is done
Row 18 in Batch 1 is done
Row 19 in Batch 1 is done
Row 20 in Batch 1 is done
Row 21 in Batch 1 is done
Row 22 in Batch 1 is done
Row 23 in Batch 1 is done
Row 24 in Batch 1 is done
Row 25 in Batch 1 is done
Row 26 in Batch 1 is done
Row 27 in Batch 1 is done
Row 28 in Batch 1 is done
Row 29 in Batch 1 is done
Row 30 in Batch 1 is done
Row 31 in Batch 1 is done
Row 32 in Batch 1 is done
Row 33 in Batch 1 is done
Row 34 in Batch 1 is done
Row 35 in Batch 1 is done
Row 36 in Batch 1 is done
Row 37 in Batch 1 is done
Row 38 in Batch 1 is done
Row 39 in Batch 1 is 

In [53]:
df.head()

Unnamed: 0,review_content,Sentiment
0,Looks durable Charging is fine tooNo complains,4
1,"Charging is really fast, good product.",4
2,Till now satisfied with the quality.,4
3,This is a good product . The charging speed is...,4
4,"Good quality, would recommend",4


## Save results to CSV

In [61]:
csv_path_save = "../raw_data/reviews_analyzed.csv"

In [62]:
df.to_csv(csv_path_save, index=False)

- nothing i am aware of :)

In [54]:
file_path_save = "../raw_data/reviews_analyzed.csv"

In [None]:
df.to_csv(csv_file_path, index=TRUE)


# Function to predict and give alternative suggestion

In [25]:
#from fastapi import FastAPI
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import random
#app = FastAPI()

#@app.get("/")
def index():
    return {"title": "Hello Arthur"}

#@app.get("/predict")
def predict(review):
    tokenizer = AutoTokenizer.from_pretrained("nlptown/bert-base-multilingual-uncased-sentiment")
    model = AutoModelForSequenceClassification.from_pretrained("nlptown/bert-base-multilingual-uncased-sentiment")
    tokens = tokenizer.encode(review, return_tensors="pt")
    result = model(tokens)
    return int(torch.argmax(result.logits))+1

In [8]:
# Raw File
file_path = "../raw_data/summary_keywords_df.csv"
df = pd.read_csv(file_path)

In [61]:
df.head()

Unnamed: 0,product_id,product_name,product_category,summary,keywords,sentiment
0,B002PD61Y4,D-Link DWA-131 300 Mbps Wireless Nano USB Adap...,WirelessUSBAdapters,Good quality tool from d linkWiFi signal is go...,"['dvr works', 'wifi supporting', 'jio wifi', '...",4
1,B002SZEOLG,TP-Link Nano USB WiFi Dongle 150Mbps High Gain...,WirelessUSBAdapters,The wifi dongle is a simple plug & play device...,"['usb tethering', 'best adapter', 'wifi dongle...",4
2,B003B00484,Duracell Plus AAA Rechargeable Batteries (750 ...,RechargeableBatteries,Soldering the connections was bit tricky but i...,"['trimmer battery', 'qt4005 trimmer', 'expensi...",4
3,B003L62T7W,"Logitech B100 Wired USB Mouse, 3 yr Warranty, ...",Mice,The best thing about this mouse is that u can ...,"['mouse quality', 'best mouse', 'mouse budget'...",5
4,B004IO5BMQ,"Logitech M235 Wireless Mouse, 1000 DPI Optical...",Mice,"Good product, but too smaller than the regular...","['sized battery', 'mouse easy', 'mouse', 'mous...",4


In [52]:
df[(df['product_category'] == 'Mice') & (df['sentiment'].isin(good_review_score))].drop(columns=['product_category','keywords','sentiment'])

Unnamed: 0,product_id,product_name,summary
3,B003L62T7W,"Logitech B100 Wired USB Mouse, 3 yr Warranty, ...",The best thing about this mouse is that u can ...
4,B004IO5BMQ,"Logitech M235 Wireless Mouse, 1000 DPI Optical...","Good product, but too smaller than the regular..."
177,B01HJI0FS2,"Dell MS116 1000Dpi USB Wired Optical Mouse, Le...",Good for gaming by the way. Suits my hand the ...
186,B01KK0HU3Y,HP Z3700 Wireless Optical Mouse with USB Recei...,It's so smooth to use this. No pressure on han...
205,B01MQ2A86A,"Logitech M331 Silent Plus Wireless Mouse, 2.4G...",Like : Very soft to hand. No clicking sound. V...
237,B073BRXPZX,"Lenovo 300 Wired Plug & Play USB Mouse, High R...",It does the job and cable is strong enough. De...
332,B07J2NGB69,"Lenovo 400 Wireless Mouse, 1200DPI Optical Sen...",The most important flaw there are times I forg...
343,B07JPX9CR7,"Dell WM118 Wireless Mouse, 2.4 Ghz with USB Na...",The wireless mouse is working well. There is o...
362,B07L9FW9GF,"Zebronics Zeb-Power Wired USB Mouse, 3-Button,...",You can also do decent gaming with this mouse....
467,B07X2L5Z8C,Logitech Pebble M350 Wireless Mouse with Bluet...,It's a good bluetooth mouse. Connects within 1...


In [70]:
def predict_and_recommed(product_category, review):
    #good_review_score = [4, 5]
    if predict(review) < 4:
        if 5 in df['sentiment'].values:
            print(f'It looks like you are not too happy with this product. Try taking a look at these products under {product_category}:')
            return df[(df['product_category'] == product_category) & (df['sentiment'] == 5)].drop(columns=['product_category','keywords','sentiment'])
        else:
            print(f'It looks like you are not too happy with this product. Try taking a look at these products under {product_category}:')
            return df[(df['product_category'] == product_category) & (df['sentiment'] == 4)].drop(columns=['product_category','keywords','sentiment'])
    elif predict(review) == 4:
            print(f'Thank you for your feedback. Here are some other products with better reviews under {product_category}:')
            return df[(df['product_category'] == product_category) & (df['sentiment'] == 5)].drop(columns=['product_category','keywords','sentiment'])
    else:
        print('Great choice!')        

In [71]:
## Input
your_product_category = "Mice"
your_review = "This mouse is pretty good." #"Great mouse" #"This mouse is bad. Would not buy again."

In [72]:
## Output
predict_and_recommed(your_product_category, your_review)

Thank you for your feedback. Here are some other products with better reviews under Mice:


Unnamed: 0,product_id,product_name,summary
3,B003L62T7W,"Logitech B100 Wired USB Mouse, 3 yr Warranty, ...",The best thing about this mouse is that u can ...
177,B01HJI0FS2,"Dell MS116 1000Dpi USB Wired Optical Mouse, Le...",Good for gaming by the way. Suits my hand the ...
186,B01KK0HU3Y,HP Z3700 Wireless Optical Mouse with USB Recei...,It's so smooth to use this. No pressure on han...
467,B07X2L5Z8C,Logitech Pebble M350 Wireless Mouse with Bluet...,It's a good bluetooth mouse. Connects within 1...
680,B08LW31NQ6,Lenovo 600 Bluetooth 5.0 Silent Mouse: Compact...,"This is the perfect mouse for office, its comf..."
845,B098JYT4SY,"Zebronics Zeb-Jaguar Wireless Mouse, 2.4GHz wi...",very good producteasy to handleuse of dpi is a...
861,B099SD8PRP,"Lenovo 130 Wireless Compact Mouse, 1K DPI Opti...",Great mouse for daily office use. After 1 mont...
1121,B09ZHCJDP1,Amazon Basics Wireless Mouse | 2.4 GHz Connect...,The mouse is super comfortable in my hand. The...
