# Importing Libraries

In [7]:
import numpy as np
import pandas as pd

# Loading the dataset

In [8]:
from datasets import load_dataset

ds = load_dataset("Kenneth12/productreviewsentiment")

train.csv: 0.00B [00:00, ?B/s]

validation.csv: 0.00B [00:00, ?B/s]

test.csv: 0.00B [00:00, ?B/s]

testing%20-%20Sheet1%20%283%29.csv: 0.00B [00:00, ?B/s]

Generating train split:   0%|          | 0/2099 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/599 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/301 [00:00<?, ? examples/s]

In [9]:
ds

DatasetDict({
    train: Dataset({
        features: ['Text', 'Label'],
        num_rows: 2099
    })
    validation: Dataset({
        features: ['Text', 'Label'],
        num_rows: 599
    })
    test: Dataset({
        features: ['Text', 'Label'],
        num_rows: 301
    })
})

In [10]:
df = ds.copy()

# Data Cleaning

## Training data cleaning

In [11]:
df_train = pd.DataFrame(df['train'])
df_train.head()

Unnamed: 0,Text,Label
0,McCann's makes oatmeal for every oatmeal conno...,positive
1,This wasn't in stock the last time I looked. I...,positive
2,Buyer Beware Please! This sweetener is not for...,negative
3,I have been drinking Royal King 100% Natural O...,positive
4,I had heard a little about this product from t...,positive


In [12]:
df_train['Label'].value_counts()

Label
positive     1584
negative      383
neutral       130
positiive       1
positve         1
Name: count, dtype: int64

In [13]:
# Define a mapping for misspelled labels to correct ones
corrections = {
    'positiive': 'positive',
    'positve': 'positive',
}

# Replace misspelled labels with correct ones
df_train['Label'] = df_train['Label'].replace(corrections)

In [14]:
df_train['Label'].value_counts()

Label
positive    1586
negative     383
neutral      130
Name: count, dtype: int64

## Validation data cleaning

In [15]:
df_val = pd.DataFrame(df['validation'])
df_val.head()

Unnamed: 0,Text,Label
0,I love chai. I used to buy loose leaf chai fro...,negative
1,Someone brought a couple of Lindt Chocolates t...,positive
2,"I'd been told this stuff tasted like Spam, but...",positive
3,"If you've never had real Swiss fondue, you're ...",positive
4,I feed Hills Science diet to my pet because I ...,positive


In [16]:
df_val['Label'].value_counts()

Label
positive    447
negative    152
Name: count, dtype: int64

## Test data cleaning

In [17]:
df_test = pd.DataFrame(df['test'])
df_test.head()

Unnamed: 0,Text,Label
0,I tried this The Switch Black Cherry flavor as...,positive
1,The Switch is an alternative to juice or soda....,positive
2,This is better than any commercially available...,positive
3,"First, this is a household that VERY rarely dr...",positive
4,"The Switch soda's are a, mostly, excellent alt...",positive


In [18]:
df_test['Label'].value_counts()

Label
positive     233
negative      40
neutral       24
0              2
positiive      1
1              1
Name: count, dtype: int64

In [19]:
# Define a mapping for misspelled labels to correct ones
corrections = {
    'positiive': 'positive',
    '0' :  'negative',
    '1' :  'neutral'
}

# Replace misspelled labels with correct ones
df_test['Label'] = df_test['Label'].replace(corrections)

In [20]:
df_test['Label'].value_counts()

Label
positive    234
negative     42
neutral      25
Name: count, dtype: int64

# Combining the data to reconsider for split

In [21]:
import pandas as pd
from sklearn.model_selection import train_test_split

combined_df = pd.concat([df_train, df_val], ignore_index=True)


# Now do stratified split on combined data
train_new, val_new = train_test_split(
    combined_df,
    test_size=0.2,
    stratify=combined_df['Label'],  # to maintain class balance
    random_state=42
)

print(train_new['Label'].value_counts())
print("************************************")
print(val_new['Label'].value_counts())


Label
positive    1626
negative     428
neutral      104
Name: count, dtype: int64
************************************
Label
positive    407
negative    107
neutral      26
Name: count, dtype: int64


In [22]:
import pandas as pd


target_count = 400

# Separate each class
positive_df = train_new[train_new['Label'] == 'positive']
negative_df = train_new[train_new['Label'] == 'negative']
neutral_df = train_new[train_new['Label'] == 'neutral']

# Undersample positive to 400
positive_sampled = positive_df.sample(target_count, random_state=42)


if len(negative_df) > target_count:
    negative_sampled = negative_df.sample(target_count, random_state=42)
else:
    negative_sampled = negative_df

# Oversample neutral to 400 by sampling with replacement
neutral_oversampled = neutral_df.sample(target_count, replace=True, random_state=42)

# Combine all
train_balanced = pd.concat([positive_sampled, negative_sampled, neutral_oversampled]).reset_index(drop=True)

# Check new counts
print(train_balanced['Label'].value_counts())


Label
positive    400
negative    400
neutral     400
Name: count, dtype: int64


In [23]:
train_balanced.sample(5)

Unnamed: 0,Text,Label
294,We have older cats with one having hyperthyroi...,positive
376,There is known design flaw having to do with t...,positive
49,I got a single can of this as a sample from th...,positive
812,I brewed this coffee with the recommended rati...,neutral
539,Was shocked at the skinny size of this item!! ...,negative


# Converting back to hugging face format

In [24]:
from datasets import Dataset

# Map string labels to integers
def map_labels(df):
    label2id = {'negative': 0, 'neutral': 1, 'positive': 2}
    df = df.copy()
    df['label'] = df['Label'].map(label2id)
    return df

train_balanced_mapped = map_labels(train_balanced)
val_new_mapped = map_labels(val_new)
df_test_mapped = map_labels(df_test)  # Assuming df_test also has same columns

# Convert to Hugging Face datasets
train_dataset = Dataset.from_pandas(train_balanced_mapped)
val_dataset = Dataset.from_pandas(val_new_mapped)
test_dataset = Dataset.from_pandas(df_test_mapped)


In [25]:
# Columns you want to keep
keep_cols = ['Text', 'Label', 'label']

# Clean train, val, and test datasets
train_dataset = train_dataset.remove_columns([col for col in train_dataset.column_names if col not in keep_cols])
val_dataset = val_dataset.remove_columns([col for col in val_dataset.column_names if col not in keep_cols])
test_dataset = test_dataset.remove_columns([col for col in test_dataset.column_names if col not in keep_cols])

print(train_dataset)
print(val_dataset)
print(test_dataset)


Dataset({
    features: ['Text', 'Label', 'label'],
    num_rows: 1200
})
Dataset({
    features: ['Text', 'Label', 'label'],
    num_rows: 540
})
Dataset({
    features: ['Text', 'Label', 'label'],
    num_rows: 301
})


In [26]:
from datasets import DatasetDict


dataset_dict = DatasetDict({
    "train": train_dataset,
    "validation": val_dataset,
    "test": test_dataset
})

print(dataset_dict)


DatasetDict({
    train: Dataset({
        features: ['Text', 'Label', 'label'],
        num_rows: 1200
    })
    validation: Dataset({
        features: ['Text', 'Label', 'label'],
        num_rows: 540
    })
    test: Dataset({
        features: ['Text', 'Label', 'label'],
        num_rows: 301
    })
})


In [27]:

dataset_dict = dataset_dict.map(lambda x: x, remove_columns=["Label"])

# Check features after removal
print(dataset_dict)


Map:   0%|          | 0/1200 [00:00<?, ? examples/s]

Map:   0%|          | 0/540 [00:00<?, ? examples/s]

Map:   0%|          | 0/301 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['Text', 'label'],
        num_rows: 1200
    })
    validation: Dataset({
        features: ['Text', 'label'],
        num_rows: 540
    })
    test: Dataset({
        features: ['Text', 'label'],
        num_rows: 301
    })
})


# Preprocessing

In [28]:
from transformers import AutoTokenizer

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment-latest")

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

In [29]:
# Define a tokenization function
def preprocess_function(examples):
    # Tokenize the text column, truncating/padding as needed
    return tokenizer(examples["Text"], truncation=True, padding="max_length", max_length=128)

# Apply tokenization to each split using map()
dataset_dict = dataset_dict.map(preprocess_function, batched=True)


Map:   0%|          | 0/1200 [00:00<?, ? examples/s]

Map:   0%|          | 0/540 [00:00<?, ? examples/s]

Map:   0%|          | 0/301 [00:00<?, ? examples/s]

In [30]:
# Optional: set the format to pytorch tensors if you want to use Trainer directly
dataset_dict.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])

In [31]:
dataset_dict

DatasetDict({
    train: Dataset({
        features: ['Text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 1200
    })
    validation: Dataset({
        features: ['Text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 540
    })
    test: Dataset({
        features: ['Text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 301
    })
})

In [32]:
dataset_dict["train"] = dataset_dict["train"].remove_columns(["Text"])
dataset_dict["validation"] = dataset_dict["validation"].remove_columns(["Text"])
dataset_dict["test"] = dataset_dict["test"].remove_columns(["Text"])


In [33]:
dataset_dict

DatasetDict({
    train: Dataset({
        features: ['label', 'input_ids', 'attention_mask'],
        num_rows: 1200
    })
    validation: Dataset({
        features: ['label', 'input_ids', 'attention_mask'],
        num_rows: 540
    })
    test: Dataset({
        features: ['label', 'input_ids', 'attention_mask'],
        num_rows: 301
    })
})

In [34]:
def reorder_columns(ds, order):
    data = {k: ds[k] for k in order}
    return ds.from_dict(data)

new_order = ["input_ids", "attention_mask", "label"]

dataset_dict["train"] = reorder_columns(dataset_dict["train"], new_order)
dataset_dict["validation"] = reorder_columns(dataset_dict["validation"], new_order)
dataset_dict["test"] = reorder_columns(dataset_dict["test"], new_order)

print(dataset_dict["train"].column_names)


['input_ids', 'attention_mask', 'label']


In [35]:
dataset_dict

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'label'],
        num_rows: 1200
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'label'],
        num_rows: 540
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'label'],
        num_rows: 301
    })
})

In [36]:
from transformers import DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Model building

In [37]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(
    'cardiffnlp/twitter-roberta-base-sentiment-latest',
    num_labels=3
)

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [38]:
from transformers import TrainingArguments

training_args = TrainingArguments("/kaggle/working/")   #chaning working directory as per need

training_args = TrainingArguments(
    output_dir="/kaggle/working/",
    report_to="none",  
    logging_dir="/kaggle/working/logs",       
    save_strategy="epoch",               
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
)

In [41]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset_dict["train"],
    eval_dataset=dataset_dict["validation"],
    processing_class=tokenizer,
    data_collator=data_collator
)

In [42]:
trainer.train()

Step,Training Loss


TrainOutput(global_step=225, training_loss=0.577191162109375, metrics={'train_runtime': 55.3523, 'train_samples_per_second': 65.038, 'train_steps_per_second': 4.065, 'total_flos': 236802075955200.0, 'train_loss': 0.577191162109375, 'epoch': 3.0})

# Inference

In [67]:
predictions = trainer.predict(dataset_dict["validation"])
print(predictions.predictions.shape, predictions.label_ids.shape)

#predictions = trainer.predict(dataset_dict["test"])
#print(predictions.predictions.shape, predictions.label_ids.shape)

(540, 3) (540,)


In [68]:
preds = np.argmax(predictions.predictions, axis=-1)
labels = predictions.label_ids

In [49]:
!pip install evaluate

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting evaluate
  Downloading evaluate-0.4.5-py3-none-any.whl.metadata (9.5 kB)
Collecting fsspec>=2021.05.0 (from fsspec[http]>=2021.05.0->evaluate)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading evaluate-0.4.5-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2025.3.0-py3-none-any.whl (193 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: fsspec, evaluate
  Attempting uninstall: fsspec
    Found existing installation: fsspec 2025.5.1
    Uninstalling fsspec-2025.5.1:
      Successfully uninstalled fsspec-2025.5.1
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
bigframes 2.8.0 requires google-cloud

# Evaluation

In [70]:
import evaluate

# Load all required metrics
accuracy = evaluate.load("accuracy")
f1 = evaluate.load("f1")
#precision = evaluate.load("precision")
#recall = evaluate.load("recall")

# Compute predictions and labels
acc_score = accuracy.compute(predictions=preds, references=labels)
f1_score = f1.compute(predictions=preds, references=labels, average="weighted")
#precision_score = precision.compute(predictions=preds, references=labels, average="weighted")
#recall_score = recall.compute(predictions=preds, references=labels, average="weighted")

# Print results
print("Accuracy:", acc_score)
print("F1 Score:", f1_score)
#print("Precision:", precision_score)
#print("Recall:", recall_score)


Accuracy: {'accuracy': 0.7407407407407407}
F1 Score: {'f1': 0.7604697532214547}


In [71]:
#Save the model
model.save_pretrained("/kaggle/working/final_sentiment_model")
tokenizer.save_pretrained("/kaggle/working/final_sentiment_tokenizer")

('/kaggle/working/final_sentiment_tokenizer/tokenizer_config.json',
 '/kaggle/working/final_sentiment_tokenizer/special_tokens_map.json',
 '/kaggle/working/final_sentiment_tokenizer/vocab.json',
 '/kaggle/working/final_sentiment_tokenizer/merges.txt',
 '/kaggle/working/final_sentiment_tokenizer/added_tokens.json',
 '/kaggle/working/final_sentiment_tokenizer/tokenizer.json')

# Push to hugging face

In [73]:
from huggingface_hub import notebook_login
notebook_login()


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [74]:
model.push_to_hub("suryaummadi/review-roberta-customer-experience-analytics")
tokenizer.push_to_hub("suryaummadi/review-roberta-customer-experience-analytics")

Uploading...:   0%|          | 0.00/499M [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

CommitInfo(commit_url='https://huggingface.co/suryaummadi/review-roberta-customer-experience-analytics/commit/57dc453b332427851f32dbd745c9955a1c195ff3', commit_message='Upload tokenizer', commit_description='', oid='57dc453b332427851f32dbd745c9955a1c195ff3', pr_url=None, repo_url=RepoUrl('https://huggingface.co/suryaummadi/review-roberta-customer-experience-analytics', endpoint='https://huggingface.co', repo_type='model', repo_id='suryaummadi/review-roberta-customer-experience-analytics'), pr_revision=None, pr_num=None)

# Loading from hugging face and working

In [78]:
from transformers import pipeline

pipe = pipeline(
    "text-classification",
    model="suryaummadi/review-roberta-customer-experience-analytics",
    return_all_scores=True
)

def get_top_sentiment(text):
    outputs = pipe(text)
    scores = outputs[0]

    # Find label with highest score
    top = max(scores, key=lambda x: x['score'])
    label = top['label']  # labels are already 'negative', 'neutral', 'positive'
    percentage = top['score'] * 100

    return f"The customer feels {percentage:.1f}% {label} about the product."

# Example usage
review = "The product quality is excellent and I love using it every day."
print(get_top_sentiment(review))


Device set to use cuda:0


The customer feels 98.0% positive about the product.


In [80]:
pipe('fuck your product')

[[{'label': 'negative', 'score': 0.9853103160858154},
  {'label': 'neutral', 'score': 0.005511525087058544},
  {'label': 'positive', 'score': 0.009178121574223042}]]

In [None]:
##End