In [1]:
import pandas as pd
import numpy as np

# Import and read the Dataset

In [2]:
# Read the data datasets

# Dataset from Sep-Oct 2018 (5k datapoints)
test_data = pd.read_csv("/Users/tiagovhp/Ironhack/Week_7/Amazon_SentimentAnalysis/Dataset/Datafiniti_Amazon_Consumer_Reviews_of_Amazon_Products.csv")

# Dataset from Feb-Apr 2019 (28332 datapoints)
train_data = pd.read_csv("/Users/tiagovhp/Ironhack/Week_7/Amazon_SentimentAnalysis/Dataset/Datafiniti_Amazon_Consumer_Reviews_of_Amazon_Products_May19.csv")

In [3]:
# Separate data into training & test
# Keep only relevant columns
## reviews.title, reviews.text, reviews.rating, reviews.doRecommend
## Combine reviews.doRecommend, reviews.title and reviews.text into 1 single column: concatenate with " "

# Train data
train = pd.DataFrame(train_data['reviews.text'], columns = ['reviews.text'])
train['reviews.title'] = train_data['reviews.title']
#train['reviews.doRecommend'] = train_data['reviews.doRecommend']
train['combined.review'] = train['reviews.title'] + ' ' + train['reviews.text']
#train['combined.review'] = train_data['reviews.doRecommend'] + ' ' + train['combined.review']
train = train.drop(columns=["reviews.title","reviews.text"])
train['reviews.rating'] = train_data['reviews.rating']
display(train.head())

# Test data
test = pd.DataFrame(test_data['reviews.text'], columns = ['reviews.text'])
test['reviews.title'] = test_data['reviews.title']
test['combined.review'] = test['reviews.title'] + ' ' + test['reviews.text']
test = test.drop(columns=["reviews.title","reviews.text"])
test['reviews.rating'] = test_data['reviews.rating']
display(test.head())

Unnamed: 0,combined.review,reviews.rating
0,... 3 of them and one of the item is bad quali...,3
1,... always the less expensive way to go for pr...,4
2,... are not Duracell but for the price i am ha...,5
3,... as well as name brand batteries at a much ...,5
4,... batteries are very long lasting the price ...,5


Unnamed: 0,combined.review,reviews.rating
0,Too small I thought it would be as big as smal...,3
1,Great light reader. Easy to use at the beach T...,5
2,Great for the price Didnt know how much i'd us...,4
3,A Great Buy I am 100 happy with my purchase. I...,5
4,Solid entry-level Kindle. Great for kids Solid...,5


In [4]:
# Convert scores from [1,2,3] to -1, [4] to 0 and 5 to 1
train['score'] = train['reviews.rating'].apply(lambda rating: -1 if rating <= 3 else (0 if rating == 4 else 1))
test['score'] = test['reviews.rating'].apply(lambda rating: -1 if rating <= 3 else (0 if rating == 4 else 1))

# Convert the combined.review column to string
train['combined.review'] = train['combined.review'].astype(str)
test['combined.review'] = test['combined.review'].astype(str)

# Rename columns
train = train.rename(columns={'combined.review':'text', 'score':'label'})
test = test.rename(columns={'combined.review':'text', 'score':'label'})

# Drop review.rating
train = train.drop(columns=['reviews.rating'])
test = test.drop(columns=['reviews.rating'])

train

Unnamed: 0,text,label
0,... 3 of them and one of the item is bad quali...,-1
1,... always the less expensive way to go for pr...,0
2,... are not Duracell but for the price i am ha...,1
3,... as well as name brand batteries at a much ...,1
4,... batteries are very long lasting the price ...,1
...,...,...
28327,Xmas gift I got 2 of these for my 8 yr old twi...,1
28328,yes it is a great tablet. I bought this for my...,0
28329,You get a lot for the price! Very nice for lig...,1
28330,You get the entire World for less than $100! T...,1


Clean the sentences:
- Remove ponctuation special characters (without substituting accented characters)
- Replace multiple spaces with single space
- Remove english stopwords
- Convert to lowercase

In [5]:
# Text cleaning

# Define a function to clean a sentence
def clean_sentence(sentence):
    import re

    # Remove any special character and replace them space " "
    cleaned_sentence = re.sub(r'[^a-zA-Z0-9\s]', " ", sentence)

    # Convert to lowercase
    cleaned_sentence = cleaned_sentence.lower()

    # Remove stopwords
    import nltk
    from nltk.corpus import stopwords
    ## Stopwords in english
    stopwords = stopwords.words("english")
    ## Remove stopwords from sentence
    for word in stopwords:
        cleaned_sentence = re.sub(r'\b' + re.escape(word) + r'\b', "", cleaned_sentence)
    
     # Replace multiple spaces with a single space
    cleaned_sentence = re.sub(r'\s+', ' ', cleaned_sentence)
    return cleaned_sentence

# Define a function to clean sentences from list
def clean_sentences(list_of_sentences):
    cleaned_sentences_list = []
    for sentence in list_of_sentences:
        cleaned_sentence = clean_sentence(sentence)
        cleaned_sentences_list.append(cleaned_sentence)
    return cleaned_sentences_list


# Clean all text from train and test
train_text_clean = clean_sentences(train['text'])
train['text']= train_text_clean

test_text_clean = clean_sentences(test['text'])
test['text']= test_text_clean

# Inspect the final train and test data
display(train.head(10))
display(test.head(10))

Unnamed: 0,text,label
0,3 one item bad quality missing backup spring ...,-1
1,always less expensive way go products like bu...,0
2,duracell price happy well duracell price happy,1
3,well name brand batteries much better seem wo...,1
4,batteries long lasting price great batteries ...,1
5,batteries christmas amazonbasics cell good no...,1
6,batteries ordered past pleased ive problame b...,1
7,batteries last quite perfect nothing say well...,1
8,hold amount high power juice like energizer d...,-1
9,done well appear good shelf life amazonbasics...,0


Unnamed: 0,text,label
0,small thought would big small paper turn like...,-1
1,great light reader easy use beach kindle light...,1
2,great price didnt know much use kindle went lo...,0
3,great buy 100 happy purchase caught sale real...,1
4,solid entry level kindle great kids solid entr...,1
5,good ebook make excellent ebook reader expect ...,1
6,light weight makes world difference taking boo...,1
7,good quality bought kindle 2 months ago batte...,0
8,best ebook amazon kindle always best ebook upg...,1
9,great product beyond expectation even show mus...,1


In [6]:
# Convert my Dataframe into Hugging Face object

from datasets import Dataset

train = Dataset.from_pandas(train)
test = Dataset.from_pandas(test)


  from .autonotebook import tqdm as notebook_tqdm


# Load the Hugging Face tokenizer (from distilBert)

In [7]:
import numpy
from transformers import DistilBertTokenizer

# Load the DistilBERT tokenizer
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Tokenizer function
def tok_function(example):
    return tokenizer(example['text'], padding = 'max_length',truncation=True)

tokenized_data = tok_function(train)

# Tokenize our datasets
train_tok = train.map(tok_function,batched=True)
test_tok = test.map(tok_function,batched=True)

Map: 100%|██████████| 28332/28332 [00:04<00:00, 5785.81 examples/s]
Map: 100%|██████████| 5000/5000 [00:00<00:00, 5401.71 examples/s]


# Load Hugging Face pre-trained Model

In [8]:
from transformers import DistilBertForSequenceClassification

# Load the pre-trained DistilBERT model with Classification head for 3 classes
distil_model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=3)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Set Up the trainer

In [9]:
from transformers import Trainer, TrainingArguments

# Define training arguments
training_args = TrainingArguments(
    output_dir="/Users/tiagovhp/results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
)

# Initialize Trainer
trainer = Trainer(
    model=distil_model,
    args=training_args,
    train_dataset=train_tok,
    eval_dataset=test,
)



# Train the model

In [10]:
# Fine-tune DistilBert on my data

trainer.train()

  5%|▍         | 500/10626 [07:39<2:35:55,  1.08it/s]

{'loss': 0.4702, 'grad_norm': 2.8669793605804443, 'learning_rate': 1.9058912102390367e-05, 'epoch': 0.14}


  9%|▉         | 1000/10626 [15:11<2:17:00,  1.17it/s]

{'loss': 0.3718, 'grad_norm': 9.447548866271973, 'learning_rate': 1.811782420478073e-05, 'epoch': 0.28}


 14%|█▍        | 1500/10626 [22:25<2:10:02,  1.17it/s]

{'loss': 0.3779, 'grad_norm': 3.9057464599609375, 'learning_rate': 1.717673630717109e-05, 'epoch': 0.42}


 19%|█▉        | 2000/10626 [29:32<2:02:12,  1.18it/s]

{'loss': 0.361, 'grad_norm': 3.0919992923736572, 'learning_rate': 1.6235648409561456e-05, 'epoch': 0.56}


 24%|██▎       | 2500/10626 [36:31<1:53:31,  1.19it/s]

{'loss': 0.3508, 'grad_norm': 4.461452960968018, 'learning_rate': 1.5294560511951817e-05, 'epoch': 0.71}


 28%|██▊       | 3000/10626 [43:42<1:50:58,  1.15it/s]

{'loss': 0.344, 'grad_norm': 3.09033465385437, 'learning_rate': 1.435347261434218e-05, 'epoch': 0.85}


 33%|███▎      | 3500/10626 [50:50<1:38:17,  1.21it/s]

{'loss': 0.335, 'grad_norm': 6.722323894500732, 'learning_rate': 1.3412384716732544e-05, 'epoch': 0.99}


 33%|███▎      | 3542/10626 [51:25<1:30:18,  1.31it/s]

ValueError: You have to specify either input_ids or inputs_embeds

# Evalutate the model

In [None]:
# Evaluate the model

results = trainer.evaluate()
print(results)

100%|██████████| 125/125 [00:34<00:00,  3.65it/s]

{'eval_loss': 1.0380381345748901, 'eval_runtime': 34.7923, 'eval_samples_per_second': 28.742, 'eval_steps_per_second': 3.593, 'epoch': 3.0}





In [None]:
# Make predctions / test
predictions, label_ids, metrics = trainer.predict(test_tok)

100%|██████████| 125/125 [00:34<00:00,  3.58it/s]


In [None]:
# Convert predictions from logits to class labels
predicted_labels = np.argmax(predictions, axis=1)

In [None]:
# Map predicted labels if necessary (original labels were -1, 0, 1)
predicted_labels_mapped = predicted_labels - 1 

In [None]:
# Print predictions for the first few entries
for i in range(5):
    print(f"Text: {test_tok[i]['text']}, Predicted label: {predicted_labels_mapped[i]}, True label: {test_tok[i]['label']}")


Text:  small thought would big small paper turn like palm think small read comfortable regular kindle would definitely recommend paperwhite instead , Predicted label: 0, True label: -1
Text: great light reader easy use beach kindle light easy use especially beach , Predicted label: 0, True label: 1
Text: great price didnt know much use kindle went lower end im happy even little dark, Predicted label: 0, True label: 0
Text:  great buy 100 happy purchase caught sale really good price normally real book person 1 year old loves ripping pages kindle prevents extremely portable fits better purse giant book loaded lots books finish one start another without go store serves needs picked one paperwhite price unbeatable difference could see one backlit simple book light dollar tree solves issue second kindle first old keyboard model put fell love keyboard lol likely last , Predicted label: 0, True label: 1
Text: solid entry level kindle great kids solid entry level kindle great kids gifted kid f

In [None]:
from sklearn.metrics import classification_report

# Calculate performance metrics
print(classification_report(label_ids, predicted_labels_mapped, target_names=['Negative', 'Neutral', 'Positive']))

              precision    recall  f1-score   support

    Negative       0.00      0.00      0.00        58
     Neutral       0.21      0.98      0.34       209
    Positive       0.00      0.00      0.00       733

    accuracy                           0.20      1000
   macro avg       0.07      0.33      0.11      1000
weighted avg       0.04      0.20      0.07      1000



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
