In [1]:
import torch

# Check if GPU is available
if torch.cuda.is_available():
    device = torch.device("cuda")  # Use GPU
    print(f"GPU is available. Using {torch.cuda.get_device_name(0)}.")
else:
    device = torch.device("cpu")   # Use CPU
    print("GPU is not available. Using CPU.")

# Example: Moving a tensor or model to the device (GPU or CPU)
# tensor = torch.tensor([1, 2, 3]).to(device)
# model = model.to(device)  # If you have a model

# Example tensor to verify it's on the correct device
tensor = torch.tensor([1, 2, 3], device=device)
print(f"Tensor is on: {tensor.device}")


GPU is available. Using NVIDIA GeForce RTX 3070 Ti Laptop GPU.
Tensor is on: cuda:0


In [2]:
import torch
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification

# Number of labels for multilabel classification (set this based on your task)
num_classes = 5  # Change this to the actual number of classes you have

# Load tokenizer
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

# Load pre-trained DistilBERT model for multilabel classification
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=num_classes)
model.config.problem_type = "multi_label_classification"  # Set the problem type

# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Print confirmation
print(f'Model is loaded with {num_classes} output classes and using device: {device}')


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model is loaded with 5 output classes and using device: cuda


In [3]:
import pandas as pd

# Read the CSV file
file_path = 'combined_dfv4.csv'
df = pd.read_csv(file_path)

# Define the list of possible output labels
output_labels = ['Delivery', 'Product Quality', 'Price', 'Customer Service']

# Function to extract labels from the 'Aspect 1', 'Aspect 2', 'Aspect 3' columns
def extract_labels(row):
    labels = []
    for aspect in ['Aspect 1', 'Aspect 2', 'Aspect 3', 'Aspect 4']:
        if row[aspect] in output_labels:
            labels.append(row[aspect])
    return labels

# Create a new column with the list of output labels
df['Output Labels'] = df.apply(extract_labels, axis=1)

# Display the first few rows to check the result
print(df[['Combined Text', 'Month of Response Date','Aspect 1', 'Aspect 2', 'Aspect 3', 'Aspect 4', 'Output Labels']].head())


                                       Combined Text Month of Response Date  \
0  Reasonable priced with a high capacity of prints.                 Sep-23   
1                     Quick delivery, easy to order!                 Dec-23   
2  I bought HP ink for my printer and it is the o...                 Jul-23   
3  Best price for good ink cartridge. Easy to ins...                 Mar-24   
4  always buy brand have had such bad luck with a...                 Aug-23   

           Aspect 1  Aspect 2 Aspect 3 Aspect 4              Output Labels  
0               NaN       NaN    Price      NaN                    [Price]  
1               NaN  Delivery      NaN      NaN                 [Delivery]  
2               NaN       NaN      NaN      NaN                         []  
3  Customer Service       NaN    Price      NaN  [Customer Service, Price]  
4               NaN       NaN      NaN      NaN                         []  


In [9]:
print(df.dtypes)

Survey ID                      int64
Product Name                  object
Print Customer Region         object
LTR                            int64
Source Type                   object
Survey language               object
Review Source                 object
Star Rating                  float64
Product Family                object
Supplies Family               object
Printer Family                object
Model Name                    object
Combined Text                 object
Ink Supply Type               object
token_count                    int64
Month of Response Date        object
predicted_level1              object
predicted_probabilities       object
predicted_aspect              object
max_predicted_probability    float64
max_predicted_aspect          object
0                            float64
Aspect 1                      object
Aspect 2                      object
Aspect 3                      object
Aspect 4                      object
Output Labels                 object
d

In [4]:
# Filter for 'Mar-24' only
df = df[df['Month of Response Date'] == "Apr-24"]
df = df[df['Output Labels'] != "[]"]

# Filter the DataFrame to exclude rows where 'Output Labels' is an empty list
df = df[df['Output Labels'].map(lambda x: len(x) > 0)]
df.head()

Unnamed: 0,Survey ID,Product Name,Print Customer Region,LTR,Source Type,Survey language,Review Source,Star Rating,Product Family,Supplies Family,...,predicted_probabilities,predicted_aspect,max_predicted_probability,max_predicted_aspect,0,Aspect 1,Aspect 2,Aspect 3,Aspect 4,Output Labels
18,110020182,HP 63 Black Original Ink Cartridge,US,10,Web Reviews,English,Walmart,5.0,Supplies - Ink,Dolmen Refresh,...,"['0.8352197918762199', '0.7786848755893012']","Delivery, 0",0.83522,Delivery,0.0,,Delivery,,,[Delivery]
108,123460320,HP 65XL Black Original Ink Cartridge,US,10,Web Reviews,English,Walmart,5.0,Supplies - Ink,Dolmen Refresh,...,['0.7908739113381595'],Delivery,0.790874,Delivery,,,Delivery,,,[Delivery]
141,133472112,HP 64XL High Yield Tri-color Original Ink Cart...,US,10,Web Reviews,English,Walmart,5.0,Supplies - Ink,Centaur,...,"['0.8070593486001005', '0.7121404761904764', '...","Delivery, Customer Service, Price",0.807059,Delivery,,Customer Service,Delivery,Price,,"[Customer Service, Delivery, Price]"
155,134626563,HP 65XL Black Original Ink Cartridge,US,10,Web Reviews,English,Walmart,5.0,Supplies - Ink,Dolmen Refresh,...,"['0.7097935292855333', '0.761109698797934']","Customer Service, Product Quality",0.76111,Product Quality,,Customer Service,,,Product Quality,"[Customer Service, Product Quality]"
312,172491173,HP 65XL Black Original Ink Cartridge,US,10,Web Reviews,English,Walmart,5.0,Supplies - Ink,Dolmen Refresh,...,['0.7133459591878705'],Product Quality,0.713346,Product Quality,,,,,Product Quality,[Product Quality]


In [5]:
# Convert 'Output Labels' into binary vectors
def labels_to_vector(row):
    label_vector = [1 if label in row['Output Labels'] else 0 for label in output_labels]
    return label_vector

df['Label Vectors'] = df.apply(labels_to_vector, axis=1)

# Prepare text data and labels for the model
texts = df['Combined Text'].tolist()
labels = df['Label Vectors'].tolist()

# Tokenize the text data using DistilBertTokenizerFast
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
encodings = tokenizer(texts, truncation=True, padding=True, max_length=128)

# Convert labels to torch tensors
labels = torch.tensor(labels)

# Create the dataset with the original DataFrame
class MultilabelDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels, original_df):
        self.encodings = encodings
        self.labels = labels
        self.original_df = original_df

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

# Create the dataset
dataset = MultilabelDataset(encodings, labels, df)

# Displaying the first few rows for validation
df.head()

Unnamed: 0,Survey ID,Product Name,Print Customer Region,LTR,Source Type,Survey language,Review Source,Star Rating,Product Family,Supplies Family,...,predicted_aspect,max_predicted_probability,max_predicted_aspect,0,Aspect 1,Aspect 2,Aspect 3,Aspect 4,Output Labels,Label Vectors
18,110020182,HP 63 Black Original Ink Cartridge,US,10,Web Reviews,English,Walmart,5.0,Supplies - Ink,Dolmen Refresh,...,"Delivery, 0",0.83522,Delivery,0.0,,Delivery,,,[Delivery],"[1, 0, 0, 0]"
108,123460320,HP 65XL Black Original Ink Cartridge,US,10,Web Reviews,English,Walmart,5.0,Supplies - Ink,Dolmen Refresh,...,Delivery,0.790874,Delivery,,,Delivery,,,[Delivery],"[1, 0, 0, 0]"
141,133472112,HP 64XL High Yield Tri-color Original Ink Cart...,US,10,Web Reviews,English,Walmart,5.0,Supplies - Ink,Centaur,...,"Delivery, Customer Service, Price",0.807059,Delivery,,Customer Service,Delivery,Price,,"[Customer Service, Delivery, Price]","[1, 0, 1, 1]"
155,134626563,HP 65XL Black Original Ink Cartridge,US,10,Web Reviews,English,Walmart,5.0,Supplies - Ink,Dolmen Refresh,...,"Customer Service, Product Quality",0.76111,Product Quality,,Customer Service,,,Product Quality,"[Customer Service, Product Quality]","[0, 1, 0, 1]"
312,172491173,HP 65XL Black Original Ink Cartridge,US,10,Web Reviews,English,Walmart,5.0,Supplies - Ink,Dolmen Refresh,...,Product Quality,0.713346,Product Quality,,,,,Product Quality,[Product Quality],"[0, 1, 0, 0]"


## Updated using Multilabel metrics

In [22]:
import pandas as pd
import numpy as np
from transformers import DistilBertForSequenceClassification, AdamW
from torch.utils.data import DataLoader, random_split
import torch
from sklearn.metrics import f1_score, precision_score, recall_score, hamming_loss, classification_report

# Assuming 'dataset' is your prepared dataset

# Split the dataset into training and validation sets (80% train, 20% validation)
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

# Create data loaders for training and validation
batch_size = 16
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

# Load pre-trained DistilBERT model for multilabel classification (number of classes)
output_labels = ['Delivery', 'Product Quality', 'Price', 'Customer Service']
num_classes = len(output_labels)
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=num_classes)

# Use GPU if available
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)

# Optimizer
optimizer = AdamW(model.parameters(), lr=5e-5)

# Initialize lists to store metrics
train_losses = []
val_losses = []
metrics_list = []

# Training loop
epochs = 3
for epoch in range(epochs):
    model.train()
    total_loss = 0
    true_labels = []
    predictions = []

    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].float().to(device)

        # Forward pass
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits  # Correctly access the logits

        # Calculate loss
        loss_fct = torch.nn.BCEWithLogitsLoss()
        loss = loss_fct(logits, labels)
        total_loss += loss.item()

        # Store true labels and predictions for metrics
        true_labels.append(labels.detach().cpu().numpy())
        predictions.append((torch.sigmoid(logits) > 0.5).detach().cpu().numpy())

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # Calculate average loss for this epoch
    avg_loss = total_loss / len(train_loader)
    train_losses.append(avg_loss)

    # Concatenate predictions and true labels for the training set
    true_labels_concat = np.concatenate(true_labels)
    predictions_concat = np.concatenate(predictions)

    # Validation phase
    model.eval()
    val_total_loss = 0
    val_true_labels = []
    val_predictions = []

    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].float().to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits  # Correctly access the logits

            val_loss = loss_fct(logits, labels)
            val_total_loss += val_loss.item()

            # Store true labels and predictions for metrics
            val_true_labels.append(labels.detach().cpu().numpy())
            val_predictions.append((torch.sigmoid(logits) > 0.5).detach().cpu().numpy())

    # Calculate average loss for validation
    avg_val_loss = val_total_loss / len(val_loader)
    val_losses.append(avg_val_loss)

    # Concatenate predictions and true labels for the validation set
    val_true_labels_concat = np.concatenate(val_true_labels)
    val_predictions_concat = np.concatenate(val_predictions)

    # Compute performance metrics (macro and micro) for validation
    f1_macro = f1_score(val_true_labels_concat, val_predictions_concat, average='macro')
    f1_micro = f1_score(val_true_labels_concat, val_predictions_concat, average='micro')
    precision_macro = precision_score(val_true_labels_concat, val_predictions_concat, average='macro')
    precision_micro = precision_score(val_true_labels_concat, val_predictions_concat, average='micro')
    recall_macro = recall_score(val_true_labels_concat, val_predictions_concat, average='macro')
    recall_micro = recall_score(val_true_labels_concat, val_predictions_concat, average='micro')
    hamming = hamming_loss(val_true_labels_concat, val_predictions_concat)

    # Log metrics for this epoch
    metrics_list.append({
        'Epoch': epoch + 1,
        'Train Loss': avg_loss,
        'Validation Loss': avg_val_loss,
        'F1 Score (Macro)': f1_macro,
        'F1 Score (Micro)': f1_micro,
        'Precision (Macro)': precision_macro,
        'Precision (Micro)': precision_micro,
        'Recall (Macro)': recall_macro,
        'Recall (Micro)': recall_micro,
        'Hamming Loss': hamming
    })

    # Generate classification reports for both training and validation
    print(f"\nEpoch {epoch + 1} Classification Report (Training):")
    print(classification_report(true_labels_concat, predictions_concat, target_names=output_labels))

    print(f"\nEpoch {epoch + 1} Classification Report (Validation):")
    print(classification_report(val_true_labels_concat, val_predictions_concat, target_names=output_labels))

# Save training metrics to Excel
metrics_df = pd.DataFrame(metrics_list)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  item['labels'] = torch.tensor(self.labels[idx])



Epoch 1 Classification Report (Training):
                  precision    recall  f1-score   support

        Delivery       0.76      0.53      0.62       638
 Product Quality       0.70      0.52      0.60      1274
           Price       0.91      0.73      0.81       791
Customer Service       0.63      0.60      0.62      1640

       micro avg       0.71      0.59      0.65      4343
       macro avg       0.75      0.60      0.66      4343
    weighted avg       0.72      0.59      0.65      4343
     samples avg       0.63      0.62      0.60      4343


Epoch 1 Classification Report (Validation):
                  precision    recall  f1-score   support

        Delivery       0.84      0.65      0.73       158
 Product Quality       0.72      0.72      0.72       358
           Price       0.78      1.00      0.88       185
Customer Service       0.70      0.65      0.67       413

       micro avg       0.74      0.73      0.73      1114
       macro avg       0.76      0.75

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  item['labels'] = torch.tensor(self.labels[idx])



Epoch 2 Classification Report (Training):
                  precision    recall  f1-score   support

        Delivery       0.83      0.77      0.80       638
 Product Quality       0.77      0.76      0.76      1274
           Price       0.95      0.95      0.95       791
Customer Service       0.76      0.71      0.73      1640

       micro avg       0.81      0.78      0.79      4343
       macro avg       0.83      0.80      0.81      4343
    weighted avg       0.81      0.78      0.79      4343
     samples avg       0.83      0.82      0.80      4343


Epoch 2 Classification Report (Validation):
                  precision    recall  f1-score   support

        Delivery       0.87      0.63      0.73       158
 Product Quality       0.73      0.82      0.77       358
           Price       0.89      0.97      0.93       185
Customer Service       0.70      0.78      0.73       413

       micro avg       0.76      0.80      0.78      1114
       macro avg       0.80      0.80

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  item['labels'] = torch.tensor(self.labels[idx])



Epoch 3 Classification Report (Training):
                  precision    recall  f1-score   support

        Delivery       0.90      0.86      0.88       638
 Product Quality       0.84      0.83      0.84      1274
           Price       0.97      0.96      0.97       791
Customer Service       0.84      0.78      0.81      1640

       micro avg       0.87      0.84      0.86      4343
       macro avg       0.89      0.86      0.87      4343
    weighted avg       0.87      0.84      0.86      4343
     samples avg       0.90      0.88      0.87      4343


Epoch 3 Classification Report (Validation):
                  precision    recall  f1-score   support

        Delivery       0.84      0.65      0.74       158
 Product Quality       0.76      0.77      0.76       358
           Price       0.98      0.85      0.91       185
Customer Service       0.73      0.74      0.74       413

       micro avg       0.79      0.75      0.77      1114
       macro avg       0.83      0.75

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [23]:
print(metrics_df)


   Epoch  Train Loss  Validation Loss  F1 Score (Macro)  F1 Score (Micro)  \
0      1    0.404305         0.353598          0.749849          0.733995   
1      2    0.269680         0.298846          0.791624          0.779617   
2      3    0.202863         0.320394          0.785958          0.772769   

   Precision (Macro)  Precision (Micro)  Recall (Macro)  Recall (Micro)  \
0           0.758338           0.737319        0.754486        0.730700   
1           0.795554           0.757191        0.800435        0.803411   
2           0.828506           0.792453        0.751801        0.754039   

   Hamming Loss  
0      0.162804  
1      0.139625  
2      0.136313  


In [24]:
# Save the tokenizer
tokenizer.save_pretrained('models/distilbert multilabels_v4df_mar-24')
print(f'Tokenizer saved to models/distilbert multilabels_v4df_mar-24')

# Save the entire model with configuration for future use
model.save_pretrained('models/distilbert multilabels_v4df_mar-24')
print(f'Model with configuration saved to models/distilbert multilabels_v4df_mar-24')


Tokenizer saved to models/distilbert multilabels_v4df_mar-24
Model with configuration saved to models/distilbert multilabels_v4df_mar-24


In [8]:
import os
import torch
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from torch.utils.data import DataLoader, Dataset  # Import Dataset

# Function to prepare dataset from text
def prepare_dataset(texts, tokenizer, max_length=512):
    encodings = tokenizer(texts, truncation=True, padding=True, max_length=max_length, return_tensors="pt")
    return list(zip(encodings['input_ids'], encodings['attention_mask']))

# Custom dataset for batching
class TextDataset(Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __len__(self):
        return len(self.encodings)

    def __getitem__(self, idx):
        return {
            'input_ids': self.encodings[idx][0],
            'attention_mask': self.encodings[idx][1]
        }

# Function to load model, predict, and update DataFrame
def predict_and_update_dataframe(model_path, df, text_column='Combined Text', output_labels=None, batch_size=16):
    # Load the tokenizer using relative paths
    try:
        tokenizer = DistilBertTokenizer.from_pretrained(model_path)
    except Exception as e:
        print(f"Error loading tokenizer: {e}")
        return

    # Instantiate the model and load the weights
    try:
        model = DistilBertForSequenceClassification.from_pretrained(model_path, num_labels=len(output_labels))
    except Exception as e:
        print(f"Error loading model or weights: {e}")
        return

    # Move model to device (use CPU if GPU is not available or out of memory)
    # device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    device = torch.device('cpu')
    model.to(device)
    model.eval()

    # Prepare the dataset from the DataFrame
    dataset = df[text_column].tolist()  # Assuming 'Combined Text' contains the text data
    encodings = prepare_dataset(dataset, tokenizer)

    # Use DataLoader to load data in batches
    data_loader = DataLoader(TextDataset(encodings), batch_size=batch_size, shuffle=False)

    predictions = []

    # Perform inference in batches
    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            preds = torch.sigmoid(logits).cpu().numpy()

            # Convert predictions to label columns
            for pred in preds:
                labels = [output_labels[i] for i in range(len(output_labels)) if pred[i] >= 0.5]  # Threshold of 0.5
                predictions.append(labels)

    # Add predictions back to the DataFrame
    df['Predicted Labels'] = predictions

    # Save the updated DataFrame to an Excel file
    df.to_excel('predicted_labels_v4df_apr-24.xlsx', index=False)
    print('Predictions saved to predicted_labels_v4df_apr-24.xlsx')

    # Save the filtered DataFrame to a CSV file
    df.to_csv('predicted_labels_v4df_apr-24.csv', index=False)
    print('Filtered predictions saved to predicted_labels_v4df_apr-24.csv')


model_path = 'models/distilbert multilabels_v4df_mar-24'

predict_and_update_dataframe(
    model_path,
    df,
    output_labels=['Delivery', 'Product Quality', 'Price', 'Customer Service'],
    batch_size=8  
)


Predictions saved to predicted_labels_v4df_apr-24.xlsx
Filtered predictions saved to predicted_labels_v4df_apr-24.csv


In [11]:
import ollama

def extract_sentiment_expression_llama(review, aspects, model_name='llama3.1'):
    # Store the answers
    answers = {}

    # Iterate over the provided aspects to construct the prompt
    for aspect in aspects:
        prompt = f"""
        Review: "{review}"
        Aspect: "{aspect}"
        What is the sentiment (positive, negative) for this aspect? Return the sentiment identified only. 
        """

        # Use the Ollama API to generate the sentiment expression
        response = ollama.chat(
            model=model_name, 
            messages=[{"role": "user", "content": prompt}]
        )

        # Extract the sentiment expression from the response
        result_text = response['message']['content']
        answers[aspect] = result_text.strip()

    return answers

# Example usage
review = "The quality is good but the price is expensive."
aspects = ["quality", "price"]

# Extract sentiment expressions for the review and specified aspects
sentiment_expressions = extract_sentiment_expression_llama(review, aspects)
print(f"Review: '{review}' => Sentiment Expressions: {sentiment_expressions}")


Review: 'The quality is good but the price is expensive.' => Sentiment Expressions: {'quality': 'Negative', 'price': 'Negative'}


# need to change the code below to a df, so that we dont re-ingest as a new df

In [2]:
import pandas as pd

# Read the Excel file
file_path = 'predicted_labels_v4df_mar-24.xlsx'
df = pd.read_excel(file_path)
df.head()

Unnamed: 0,Survey ID,Product Name,Print Customer Region,LTR,Source Type,Survey language,Review Source,Star Rating,Product Family,Supplies Family,...,max_predicted_probability,max_predicted_aspect,0,Aspect 1,Aspect 2,Aspect 3,Aspect 4,Output Labels,Label Vectors,Predicted Labels
0,101618952,HP 63 Black Original Ink Cartridge,US,10,Web Reviews,English,Walmart,5.0,Supplies - Ink,Dolmen Refresh,...,0.954531,0,,Customer Service,,Price,,"['Customer Service', 'Price']","[0, 0, 1, 1]","['Price', 'Customer Service']"
1,110317001,HP 63XL High Yield Black Original Ink Cartridge,US,10,Web Reviews,English,HP US,5.0,Supplies - Ink,Dolmen Refresh,...,0.766199,Delivery,,,Delivery,,,['Delivery'],"[1, 0, 0, 0]",['Delivery']
2,113652950,HP 63 Black Original Ink Cartridge,US,10,Web Reviews,English,Walmart,5.0,Supplies - Ink,Dolmen Refresh,...,0.809284,Price,,,,Price,,['Price'],"[0, 0, 1, 0]",['Price']
3,117483634,HP 65XL Tri-color Original Ink Cartridge,US,10,Web Reviews,English,Walmart,5.0,Supplies - Ink,Dolmen Refresh,...,0.780703,Price,,,,Price,,['Price'],"[0, 0, 1, 0]",['Price']
4,118156397,HP 951XL 3-pack High Yield Cyan/Magenta/Yellow...,US,10,Web Reviews,English,Walmart,5.0,Supplies - Ink,Nesta+,...,0.807181,Delivery,,,Delivery,,,['Delivery'],"[1, 0, 0, 0]",['Delivery']


In [6]:
import ast

# Convert the 'Sentiment Expressions' column from string representation to dictionary
df['Predicted Labels'] = df['Predicted Labels'].apply(ast.literal_eval)

In [8]:
df.head()
print(df.dtypes)

Survey ID                      int64
Product Name                  object
Print Customer Region         object
LTR                            int64
Source Type                   object
Survey language               object
Review Source                 object
Star Rating                  float64
Product Family                object
Supplies Family               object
Printer Family                object
Model Name                    object
Combined Text                 object
Ink Supply Type               object
token_count                    int64
Month of Response Date        object
predicted_level1              object
predicted_probabilities       object
predicted_aspect              object
max_predicted_probability    float64
max_predicted_aspect          object
0                            float64
Aspect 1                      object
Aspect 2                      object
Aspect 3                      object
Aspect 4                      object
Output Labels                 object
L

In [12]:
# Applying the function to the DataFrame
def process_dataframe(df):
    # Since the 'Predicted Labels' are already lists, we rename this step
    df['Aspect List'] = df['Predicted Labels']  # Copy over the already processed list

    # Apply the sentiment extraction function to each row
    df['Sentiment Expressions'] = df.apply(
        lambda row: extract_sentiment_expression_llama(row['Combined Text'], row['Aspect List']), axis=1
    )

    return df

# Apply to df
df_new = process_dataframe(df)

# Save the new DataFrame to an Excel file
df_new.to_excel('processed_sentiment_data_v4df_mar-24.xlsx', index=False)

print("Data saved to 'processed_sentiment_v4df_mar-24.xlsx'")


Data saved to 'processed_sentiment_v4df_mar-24.xlsx'


need to change the code so that it will auto-add the timestamp to the excel file

In [None]:
# import ast

# Example of how your data might look
# df['Sentiment Expressions'] = ["{'Delivery': 'Positive'}", "{'Price': 'Negative'}"]

# Convert the 'Sentiment Expressions' column to lists of dictionaries
# df_new['Sentiment Expressions'] = df['Sentiment Expressions'].apply(
#    lambda x: [ast.literal_eval(x)]
# )

# Display the first few rows of the DataFrame
# print(df_new.head())


In [13]:
# Function to process the sentiment expression and assign the label
def process_sentiment_label(sentiment):
    if any(keyword in sentiment for keyword in ['Negative', 'Concern', 'Not Satisfied', 'Mixed']):
        return 'Negative'
    elif any(keyword in sentiment for keyword in ['Positive', 'Satisfied']):
        return 'Positive'
    else:
        return 'Neutral'

# Function to expand rows for each aspect and sentiment
def expand_rows_for_aspects(df):
    expanded_rows = []

    # Iterate through each row in the DataFrame
    for index, row in df.iterrows():
        # Get the sentiment dictionary from the row
        sentiments = row['Sentiment Expressions']  # This is a dict e.g. {'Price': 'Positive', 'Customer Service': 'Negative'}

        # Check if sentiments is a dictionary and not empty
        if isinstance(sentiments, dict) and sentiments:
            # Iterate over each aspect in the sentiment dictionary
            for aspect, sentiment_expression in sentiments.items():
                new_row = row.copy()  # Copy the current row
                
                # Create a new column for the current aspect
                new_row['Aspect'] = aspect
                
                # Create a new column for the sentiment label based on the sentiment expression
                new_row['Predicted Sentiment'] = process_sentiment_label(sentiment_expression)
                
                # Append the new row to the list
                expanded_rows.append(new_row)
        else:
            # If there are no sentiments, append the original row without modifications
            expanded_rows.append(row)

    # Create a new DataFrame from the expanded rows
    expanded_df = pd.DataFrame(expanded_rows)
    
    # Filter out rows where 'Predicted Sentiment' is blank or NaN
    expanded_df = expanded_df[expanded_df['Predicted Sentiment'].notna() & (expanded_df['Predicted Sentiment'] != '')]
    
    return expanded_df


# 'df_new' contains the columns 'Sentiment Expressions', which is a dictionary of aspects and sentiments
df_expanded = expand_rows_for_aspects(df_new)

In [42]:
print(df_expanded.dtypes)

Survey ID                      int64
Product Name                  object
Print Customer Region         object
LTR                            int64
Source Type                   object
Survey language               object
Review Source                 object
Star Rating                  float64
Product Family                object
Supplies Family               object
Printer Family                object
Model Name                    object
Combined Text                 object
Ink Supply Type               object
token_count                    int64
Month of Response Date        object
predicted_level1              object
predicted_probabilities       object
predicted_aspect              object
max_predicted_probability    float64
max_predicted_aspect          object
0                            float64
Aspect 1                      object
Aspect 2                      object
Aspect 3                      object
Aspect 4                      object
Output Labels                 object
L

In [14]:
def score_to_sentiment(row):
    if not pd.isna(row['LTR']):
        # use LTR (0-10)
        if row['LTR'] <= 4:
            return 'Negative'
        else:
            return 'Positive'
    elif not pd.isna(row['Star Rating']):
        # USe Star Rating (1-5)
        if row['Star Rating'] <=2:
            return 'Negative'
        else:
            return 'Positive'
    else:
        return 'Unknown'

df_expanded['Actual Sentiment'] = df_expanded.apply(score_to_sentiment, axis=1)

# Save the expanded DataFrame to an Excel file
df_expanded.to_excel('expanded_sentiment_data_mar-24.xlsx', index=False)

print("Expanded data saved to 'expanded_sentiment_data_mar-24.xlsx'")

Expanded data saved to 'expanded_sentiment_data_mar-24.xlsx'


In [None]:
## this is for mar'24 
from sklearn.metrics import confusion_matrix, classification_report

# 'df_expanded' is the DataFrame with 'Actual Sentiment' and 'Predicted Sentiment' columns
def calculate_confusion_matrix(df):
    # Check if the 'Actual Sentiment' and 'Predicted Sentiment' columns exist
    if 'Actual Sentiment' not in df.columns or 'Predicted Sentiment' not in df.columns:
        raise ValueError("Columns 'Actual Sentiment' and 'Predicted Sentiment' are required in the DataFrame")

    # Extract the actual and predicted sentiments
    actual_sentiments = df['Actual Sentiment']
    predicted_sentiments = df['Predicted Sentiment']

    # Calculate the confusion matrix
    cm = confusion_matrix(actual_sentiments, predicted_sentiments, labels=['Positive', 'Negative', 'Neutral'])

    # Generate a classification report
    report = classification_report(actual_sentiments, predicted_sentiments, target_names=['Positive', 'Negative', 'Neutral'])

    return cm, report

# Calculate the confusion matrix and classification report
conf_matrix, class_report = calculate_confusion_matrix(df_expanded)

# Display the results
print("Confusion Matrix:")
print(conf_matrix)
print("\nClassification Report:")
print(class_report)


In [48]:
## this is for apr'24 
from sklearn.metrics import confusion_matrix, classification_report

# 'df_expanded' is the DataFrame with 'Actual Sentiment' and 'Predicted Sentiment' columns
def calculate_confusion_matrix(df):
    # Check if the 'Actual Sentiment' and 'Predicted Sentiment' columns exist
    if 'Actual Sentiment' not in df.columns or 'Predicted Sentiment' not in df.columns:
        raise ValueError("Columns 'Actual Sentiment' and 'Predicted Sentiment' are required in the DataFrame")

    # Extract the actual and predicted sentiments
    actual_sentiments = df['Actual Sentiment']
    predicted_sentiments = df['Predicted Sentiment']

    # Calculate the confusion matrix
    cm = confusion_matrix(actual_sentiments, predicted_sentiments, labels=['Positive', 'Negative', 'Neutral'])

    # Generate a classification report
    report = classification_report(actual_sentiments, predicted_sentiments, target_names=['Positive', 'Negative', 'Neutral'])

    return cm, report

# Calculate the confusion matrix and classification report
conf_matrix, class_report = calculate_confusion_matrix(df_expanded)

# Display the results
print("Confusion Matrix:")
print(conf_matrix)
print("\nClassification Report:")
print(class_report)


Confusion Matrix:
[[1387 1640   86]
 [   5 2038    8]
 [   0    0    0]]

Classification Report:
              precision    recall  f1-score   support

    Positive       0.55      0.99      0.71      2051
    Negative       0.00      0.00      0.00         0
     Neutral       1.00      0.45      0.62      3113

    accuracy                           0.66      5164
   macro avg       0.52      0.48      0.44      5164
weighted avg       0.82      0.66      0.65      5164



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
