# Libraries Import

In [1]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import random

  from .autonotebook import tqdm as notebook_tqdm


# Data

## Data import

In [2]:
# Location of the raw CSV, given relative to the notebook's working directory.
file_path = "../raw_data/amazon.csv"
# Read the whole file into a DataFrame; the following cells refer to it as `df`.
df = pd.read_csv(file_path)

## Data exploration

In [None]:
# Preview the first rows to get a feel for the columns and their values.
df.head()

In [None]:
# Print column names, non-null counts and dtypes.
df.info()

In [None]:
# Summary statistics per column (pandas chooses the columns based on dtype).
df.describe()

In [None]:
# (number of rows, number of columns)
df.shape

In [None]:
# Inspect the text stored in the "review_content" column under row label 0.
df["review_content"][0]

# BERT

## Instantiate Model

In [None]:
# Hugging Face checkpoint shared by the tokenizer and the classifier; naming it
# once keeps the two loads pointing at the same model.
checkpoint = "nlptown/bert-base-multilingual-uncased-sentiment"

tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

## Encode and calculate Sentiment

In [None]:
# Encode a short German sample sentence; return_tensors="pt" asks for a PyTorch tensor.
tokens = tokenizer.encode("Ganz gut", return_tensors="pt")

In [None]:
# Forward pass through the classifier; the output's `.logits` is inspected in the next cells.
result = model(tokens)

In [None]:
# Raw (unnormalized) score for each sentiment class.
result.logits

In [None]:
# argmax yields the 0-based index of the highest-scoring class; +1 turns it into
# a 1-based score (presumably the model's star rating - confirm on the model card).
int(torch.argmax(result.logits))+1

## Score our Data

### Create pseudo-data to test

In [None]:
# Build a small hand-written review set (10 rows) so the scoring steps can be
# tried out without touching the real data.
pseudo_data = {
    'Username': ['user1', 'user2', 'user3', 'user4', 'user5', 'user6', 'user7', 'user8', 'user9', 'user10'],
    'ProductID': [101, 101, 101, 102, 102, 103, 104, 104, 105, 106],
    'Title': [
        'Excellent Product',
        'Good Purchase',
        'Average Quality',
        'Very Disappointed',
        'Highly Recommended',
        'Functional but Not Perfect',
        'Waste of Money',
        'Satisfactory Purchase',
        'Exceeded Expectations',
        'Mixed Feelings'],
    'Review': [
        'Excellent',
        'Good product.',
        'Average quality for the price.',
        'Not recommended. Very disappointed.',
        'Awesome! I love it. Highly recommended.',
        'Could be better, but it works.',
        'Terrible. Waste of money.',
        'Satisfactory purchase. No complaints.',
        'This product exceeded my expectations. Great value!',
        'I have mixed feelings about this product.']
}

# Filler sentences that may be appended to a review. Defined once, outside the
# loop, because the list does not change between iterations.
additional_sentences = [
    'I received it on time.',
    'The packaging was damaged, but the product was intact.',
    'The customer service was helpful.',
    'The color is not as described in the picture.',
    'I would buy it again in the future.']

# Lengthen five randomly picked reviews (the same row can be picked more than
# once) by appending one to three distinct filler sentences. The index bound is
# derived from the data so it stays correct if rows are added or removed.
reviews = pseudo_data['Review']
for _i in range(5):
    index = random.randint(0, len(reviews) - 1)
    extra_count = random.randint(1, 3)
    reviews[index] += ' ' + ' '.join(random.sample(additional_sentences, extra_count))

# Create a DataFrame named pseudo_df
pseudo_df = pd.DataFrame(pseudo_data)

# Display the DataFrame
pseudo_df


### Loop through data and pass to model

In [None]:
def sentiment_score(review):
    """Return the predicted sentiment class of ``review`` as a 1-based int.

    The text is encoded with the notebook-level ``tokenizer`` and classified
    with the notebook-level ``model``. The index of the largest logit is
    shifted by one so that the lowest class is reported as 1.

    Args:
        review: The text to score.

    Returns:
        The 1-based index of the highest-scoring class.
    """
    # truncation=True caps the input at the model's maximum length, measured in
    # tokens, so over-long reviews are shortened instead of failing in the model.
    tokens = tokenizer.encode(review, return_tensors="pt", truncation=True)
    # Inference only: no gradients are needed, so skip autograd bookkeeping.
    with torch.no_grad():
        result = model(tokens)
    return int(torch.argmax(result.logits)) + 1

In [None]:
# Score every title and store the result in a new column.
# NOTE(review): x[:512] keeps the first 512 *characters*, which is not the same
# as a 512-token limit - presumably meant to respect the limit noted in the To Dos.
pseudo_df["Sentiment Title"] = pseudo_df["Title"].apply(lambda x: sentiment_score(x[:512]))

In [None]:
# Score every review text and store the result in a new column.
# NOTE(review): x[:512] keeps the first 512 *characters*, which is not the same
# as a 512-token limit - presumably meant to respect the limit noted in the To Dos.
pseudo_df["Sentiment Review"] = pseudo_df["Review"].apply(lambda x: sentiment_score(x[:512]))

In [None]:
# Display the frame, now including the "Sentiment Title" and "Sentiment Review" columns.
pseudo_df

# To Dos

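- The model accepts at most 512 tokens per input, and we currently score only the first 512 characters of each text. Longer reviews should be split into chunks of at most 512 tokens, scored chunk by chunk, and the chunk scores combined.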