# Libraries Import

In [24]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import random

# Data

## Data import

In [25]:
file_path = "../raw_data/reviews_cleaned.csv"
df = pd.read_csv(file_path)

## Data exploration

In [26]:
df.head()

Unnamed: 0,review_content
0,Looks durable Charging is fine tooNo complains
1,"Charging is really fast, good product."
2,Till now satisfied with the quality.
3,This is a good product . The charging speed is...
4,"Good quality, would recommend"


In [27]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11012 entries, 0 to 11011
Data columns (total 1 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   review_content  11008 non-null  object
dtypes: object(1)
memory usage: 86.2+ KB


In [28]:
df.describe()

Unnamed: 0,review_content
count,11008
unique,7668
top,Good
freq,362


In [29]:
df.shape

(11012, 1)

In [30]:
df["review_content"][0]

'Looks durable Charging is fine tooNo complains'

In [46]:
print(df.isna().sum())

review_content    4
dtype: int64


In [47]:
df[df.isna().any(axis=1)]

Unnamed: 0,review_content
5861,
8233,
9122,
9953,


## Data Cleaning

In [50]:
# Drop NAN
df = df.dropna()

In [51]:
print(df.isna().sum())

review_content    0
dtype: int64


# BERT

## Instantiate Model

In [31]:
tokenizer = AutoTokenizer.from_pretrained("nlptown/bert-base-multilingual-uncased-sentiment")

model = AutoModelForSequenceClassification.from_pretrained("nlptown/bert-base-multilingual-uncased-sentiment")

## Encode and calculate Sentiment

In [32]:
tokens = tokenizer.encode("Ganz gut", return_tensors="pt")

In [33]:
result = model(tokens)

In [34]:
result.logits

tensor([[-2.2008, -1.5792,  0.6085,  1.6348,  1.1937]],
       grad_fn=<AddmmBackward0>)

In [35]:
int(torch.argmax(result.logits))+1

4

## Score our Data

### BERT with Pseudodf

#### Create pseudo dataframe as playground 

In [36]:
# Create some example data with titles
pseudo_data = {
    'Username': ['user1', 'user2', 'user3', 'user4', 'user5', 'user6', 'user7', 'user8', 'user9', 'user10'],
    'ProductID': [101, 101, 101, 102, 102, 103, 104, 104, 105, 106],
    'Title': [
        'Excellent Product',
        'Good Purchase',
        'Average Quality',
        'Very Disappointed',
        'Highly Recommended',
        'Functional but Not Perfect',
        'Waste of Money',
        'Satisfactory Purchase',
        'Exceeded Expectations',
        'Mixed Feelings'],
    'Review': [
        'Excellent',
        'Good product.',
        'Average quality for the price.',
        'Not recommended. Very disappointed.',
        'Awesome! I love it. Highly recommended.',
        'Could be better, but it works.',
        'Terrible. Waste of money.',
        'Satisfactory purchase. No complaints.',
        'This product exceeded my expectations. Great value!',
        'I have mixed feelings about this product.']
}

# Add random additional sentences to some reviews
for i in range(5):
    index = random.randint(0, 9)
    additional_sentences = [
        'I received it on time.',
        'The packaging was damaged, but the product was intact.',
        'The customer service was helpful.',
        'The color is not as described in the picture.',
        'I would buy it again in the future.']
    pseudo_data['Review'][index] += ' ' + ' '.join(random.sample(additional_sentences, random.randint(1, 3)))

# Create a DataFrame named pseudo_df
pseudo_df = pd.DataFrame(pseudo_data)

# Display the DataFrame
pseudo_df


Unnamed: 0,Username,ProductID,Title,Review
0,user1,101,Excellent Product,Excellent
1,user2,101,Good Purchase,Good product. I would buy it again in the futu...
2,user3,101,Average Quality,Average quality for the price.
3,user4,102,Very Disappointed,Not recommended. Very disappointed.
4,user5,102,Highly Recommended,Awesome! I love it. Highly recommended. The pa...
5,user6,103,Functional but Not Perfect,"Could be better, but it works."
6,user7,104,Waste of Money,Terrible. Waste of money.
7,user8,104,Satisfactory Purchase,Satisfactory purchase. No complaints.
8,user9,105,Exceeded Expectations,This product exceeded my expectations. Great v...
9,user10,106,Mixed Feelings,I have mixed feelings about this product. I wo...


#### Loop through data and pass to model

In [37]:
# function to to get a review and pass through the model
def sentiment_score(review):
    tokens = tokenizer.encode(review, return_tensors="pt")
    result = model(tokens)
    return int(torch.argmax(result.logits))+1

In [38]:
pseudo_df["Sentiment Title"] = pseudo_df["Title"].apply(lambda x: sentiment_score(x[:512]))

In [39]:
pseudo_df["Sentiment Review"] = pseudo_df["Review"].apply(lambda x: sentiment_score(x[:512]))

In [40]:
pseudo_df

Unnamed: 0,Username,ProductID,Title,Review,Sentiment Title,Sentiment Review
0,user1,101,Excellent Product,Excellent,5,5
1,user2,101,Good Purchase,Good product. I would buy it again in the futu...,4,4
2,user3,101,Average Quality,Average quality for the price.,3,3
3,user4,102,Very Disappointed,Not recommended. Very disappointed.,1,1
4,user5,102,Highly Recommended,Awesome! I love it. Highly recommended. The pa...,5,5
5,user6,103,Functional but Not Perfect,"Could be better, but it works.",3,3
6,user7,104,Waste of Money,Terrible. Waste of money.,1,1
7,user8,104,Satisfactory Purchase,Satisfactory purchase. No complaints.,4,4
8,user9,105,Exceeded Expectations,This product exceeded my expectations. Great v...,5,5
9,user10,106,Mixed Feelings,I have mixed feelings about this product. I wo...,3,3


### BERT with our data

In [43]:
def sentiment_score(review):
    tokens = tokenizer.encode(review, return_tensors="pt")
    result = model(tokens)
    return int(torch.argmax(result.logits))+1

In [52]:
# Split the DataFrame into batches of 512 rows
batch_size = 512
num_batches = len(df) // batch_size + 1

# Initialize an empty list to store sentiment scores
sentiment_scores = []

# Iterate through the batches
for i in range(num_batches):
    start_idx = i * batch_size
    end_idx = (i + 1) * batch_size
    batch_reviews = df["review_content"].iloc[start_idx:end_idx]
    
    # Iterate through the rows within the batch
    for j, review in enumerate(batch_reviews):
        # Check if the review is NaN or None
        if pd.notna(review):
            sentiment = sentiment_score(review[:512])
            sentiment_scores.append(sentiment)
            
            # Calculate the row number
            row_number = i * batch_size + j + 1
            
            # Print the progress
            print(f"Row {row_number} in Batch {i + 1} is done")

# Add the sentiment scores to the DataFrame
df["Sentiment"] = sentiment_scores

Row 1 in Batch 1 is done
Row 2 in Batch 1 is done
Row 3 in Batch 1 is done
Row 4 in Batch 1 is done
Row 5 in Batch 1 is done
Row 6 in Batch 1 is done
Row 7 in Batch 1 is done
Row 8 in Batch 1 is done
Row 9 in Batch 1 is done
Row 10 in Batch 1 is done
Row 11 in Batch 1 is done
Row 12 in Batch 1 is done
Row 13 in Batch 1 is done
Row 14 in Batch 1 is done
Row 15 in Batch 1 is done
Row 16 in Batch 1 is done
Row 17 in Batch 1 is done
Row 18 in Batch 1 is done
Row 19 in Batch 1 is done
Row 20 in Batch 1 is done
Row 21 in Batch 1 is done
Row 22 in Batch 1 is done
Row 23 in Batch 1 is done
Row 24 in Batch 1 is done
Row 25 in Batch 1 is done
Row 26 in Batch 1 is done
Row 27 in Batch 1 is done
Row 28 in Batch 1 is done
Row 29 in Batch 1 is done
Row 30 in Batch 1 is done
Row 31 in Batch 1 is done
Row 32 in Batch 1 is done
Row 33 in Batch 1 is done
Row 34 in Batch 1 is done
Row 35 in Batch 1 is done
Row 36 in Batch 1 is done
Row 37 in Batch 1 is done
Row 38 in Batch 1 is done
Row 39 in Batch 1 is 

In [53]:
df.head()

Unnamed: 0,review_content,Sentiment
0,Looks durable Charging is fine tooNo complains,4
1,"Charging is really fast, good product.",4
2,Till now satisfied with the quality.,4
3,This is a good product . The charging speed is...,4
4,"Good quality, would recommend",4


## Save results to CSV

In [61]:
csv_path_save = "../raw_data/reviews_analyzed.csv"

In [62]:
df.to_csv(csv_path_save, index=False)

# To Dos

- nothing i am aware of :)

In [54]:
file_path_save = "../raw_data/reviews_analyzed.csv"

In [None]:
df.to_csv(csv_file_path, index=TRUE)