# Natural Language Inference (NLI) for Aspect-Based Sentiment Analysis (ABSA)

In [8]:
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline

# Model and tokenizer initialization (only once)
def initialize_nli_model(model_name='MoritzLaurer/DeBERTa-v3-base-mnli-fever-anli'):
    """Load an NLI checkpoint and wrap it in a zero-shot-classification pipeline.

    Args:
        model_name: Model identifier (or local path) passed to
            ``from_pretrained`` for both the tokenizer and the model.

    Returns:
        tuple: ``(nli_pipeline, device)`` where ``device`` is the integer the
        pipeline was created with: ``0`` when ``torch.cuda.is_available()``
        is true, otherwise ``-1``.
    """
    # The pipeline API takes an integer device: a GPU ordinal, or -1 for CPU.
    device = 0 if torch.cuda.is_available() else -1
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Do not call ``model.to(device)`` here: -1 is a pipeline convention, not a
    # valid torch device index, so that call fails on CPU-only machines.
    # Passing ``device`` to ``pipeline`` below is what places the model.
    model = AutoModelForSequenceClassification.from_pretrained(model_name)

    # Create a pipeline for NLI, specifying the device
    nli_pipeline = pipeline("zero-shot-classification", model=model, tokenizer=tokenizer, device=device)

    return nli_pipeline, device

# Function to extract sentiment expressions
def extract_sentiment_expression_nli(review, aspects, nli_pipeline):
    """Classify the sentiment of each aspect of a review with zero-shot NLI.

    Args:
        review: Review text used as the NLI premise.
        aspects: Iterable of aspect names to score against the review.
        nli_pipeline: Callable following the zero-shot-classification call
            convention (``sequences``, ``candidate_labels``,
            ``hypothesis_template``) that returns a dict whose ``'labels'``
            list has the best-scoring entry first.

    Returns:
        dict: Maps each aspect to ``'positive'`` or ``'negative'``. The dict is
        empty when ``aspects`` is empty or when ``review`` is not a non-blank
        string (for example a NaN read from an empty spreadsheet cell); in
        that case the pipeline is not called.
    """
    # Define possible labels for sentiment
    candidate_labels = ['positive', 'negative']

    # A missing or blank review gives the model no premise; skip it instead
    # of letting the pipeline raise part-way through a long batch run.
    if not isinstance(review, str) or not review.strip():
        return {}

    answers = {}

    for aspect in aspects:
        # Full hypothesis sentence -> the bare label it stands for.
        hypotheses = {
            f"The sentiment for the aspect '{aspect}' is {label}": label
            for label in candidate_labels
        }

        response = nli_pipeline(
            sequences=review,   # Premise: the review text
            candidate_labels=list(hypotheses),  # Hypotheses, already full sentences
            # The pipeline's default template wraps every label as
            # "This example is {}.", which would garble these sentences.
            # "{}" makes the model see each hypothesis exactly as written.
            hypothesis_template="{}",
        )

        # The first returned label is the best-scoring hypothesis.
        best_hypothesis = response['labels'][0]
        # Fall back to the trailing word if the pipeline altered the string.
        answers[aspect] = hypotheses.get(best_hypothesis, best_hypothesis.split()[-1])

    return answers

# Example usage
review = "The quality is good but the price is expensive."
aspects = ["quality", "price"]

# Initialize the model and pipeline once
nli_pipeline, device = initialize_nli_model()

# Extract sentiment expressions for the review and specified aspects
sentiment_expressions = extract_sentiment_expression_nli(review, aspects, nli_pipeline)
print(f"Review: '{review}' => Sentiment Expressions: {sentiment_expressions}")

# Checks GPU availability and sets the device to GPU (or CPU)

# Check number of GPUs
num_gpus = torch.cuda.device_count()
print("Number of GPU(s):", num_gpus)

# Check GPU name
if num_gpus > 0:
    print("GPU Name:", torch.cuda.get_device_name(0))
else:
    print("No GPU found.")

# Set device to GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Review: 'The quality is good but the price is expensive.' => Sentiment Expressions: {'quality': 'positive', 'price': 'negative'}
Number of GPU(s): 1
GPU Name: NVIDIA GeForce RTX 3070 Ti
Using device: cuda


In [44]:
import pandas as pd

# Load the input workbook into a DataFrame
file_path = 'predicted_labels_v4df_apr-24.xlsx'
df = pd.read_excel(io=file_path)
# Preview the first five rows
df.head(n=5)

Unnamed: 0,Survey ID,Product Name,Print Customer Region,LTR,Source Type,Survey language,Review Source,Star Rating,Product Family,Supplies Family,...,max_predicted_probability,max_predicted_aspect,0,Aspect 1,Aspect 2,Aspect 3,Aspect 4,Output Labels,Label Vectors,Predicted Labels
0,110020182,HP 63 Black Original Ink Cartridge,US,10,Web Reviews,English,Walmart,5.0,Supplies - Ink,Dolmen Refresh,...,0.83522,Delivery,0.0,,Delivery,,,['Delivery'],"[1, 0, 0, 0]",['Delivery']
1,123460320,HP 65XL Black Original Ink Cartridge,US,10,Web Reviews,English,Walmart,5.0,Supplies - Ink,Dolmen Refresh,...,0.790874,Delivery,,,Delivery,,,['Delivery'],"[1, 0, 0, 0]",['Delivery']
2,133472112,HP 64XL High Yield Tri-color Original Ink Cart...,US,10,Web Reviews,English,Walmart,5.0,Supplies - Ink,Centaur,...,0.807059,Delivery,,Customer Service,Delivery,Price,,"['Customer Service', 'Delivery', 'Price']","[1, 0, 1, 1]","['Delivery', 'Price']"
3,134626563,HP 65XL Black Original Ink Cartridge,US,10,Web Reviews,English,Walmart,5.0,Supplies - Ink,Dolmen Refresh,...,0.76111,Product Quality,,Customer Service,,,Product Quality,"['Customer Service', 'Product Quality']","[0, 1, 0, 1]","['Product Quality', 'Customer Service']"
4,172491173,HP 65XL Black Original Ink Cartridge,US,10,Web Reviews,English,Walmart,5.0,Supplies - Ink,Dolmen Refresh,...,0.713346,Product Quality,,,,,Product Quality,['Product Quality'],"[0, 1, 0, 0]",['Customer Service']


In [45]:
# Optional: uncomment to keep only the first 1000 rows for a quick run
# df = df.head(1000)
df.shape[0]

3959

In [46]:
import ast

# Convert the 'Predicted Labels' column from its string representation
# (e.g. "['Delivery', 'Price']") into real Python lists.
# Only strings are parsed, so re-running this cell on already-converted values
# leaves them unchanged instead of raising ValueError from ast.literal_eval.
df['Predicted Labels'] = df['Predicted Labels'].apply(
    lambda value: ast.literal_eval(value) if isinstance(value, str) else value
)

In [47]:
# Applying the function to the DataFrame
def process_dataframe(df, nli_pipeline, text_column='Combined Text', aspect_column='Predicted Labels'):
    """Add per-aspect sentiment predictions to every row of ``df``.

    Two columns are written onto ``df`` itself (the frame is modified in
    place and the same object is returned):

    * ``'Aspect List'`` - a copy of ``df[aspect_column]``.
    * ``'Sentiment Expressions'`` - for each row, the dict returned by
      ``extract_sentiment_expression_nli(text, aspect_list, nli_pipeline)``.

    Args:
        df: DataFrame holding the review text and the aspects per review.
        nli_pipeline: Pipeline object forwarded unchanged to
            ``extract_sentiment_expression_nli``.
        text_column: Name of the column holding the review text.
        aspect_column: Name of the column holding each row's aspects.

    Returns:
        The same ``df`` object, with the two columns added.

    Raises:
        KeyError: If ``text_column`` or ``aspect_column`` is not in ``df``.
    """
    # Fail before any model call if an input column is absent.
    missing = [col for col in (text_column, aspect_column) if col not in df.columns]
    if missing:
        raise KeyError(f"process_dataframe: missing required column(s): {missing}")

    # Copy over the aspect column to 'Aspect List'
    df['Aspect List'] = df[aspect_column]

    # Walk the two columns positionally rather than with df.apply(axis=1):
    # no per-row Series is built, and an empty frame simply yields an empty
    # column (apply on an empty frame returns a DataFrame, not a Series).
    df['Sentiment Expressions'] = [
        extract_sentiment_expression_nli(text, aspect_list, nli_pipeline)
        for text, aspect_list in zip(df[text_column], df['Aspect List'])
    ]

    return df

# Build the zero-shot pipeline (this loads the model again and rebinds both names)
nli_pipeline, device = initialize_nli_model()

# Annotate every review; process_dataframe writes its columns onto `df` and
# returns that same object, so `df_new` and `df` refer to one DataFrame
df_new = process_dataframe(df=df, nli_pipeline=nli_pipeline)

# Persist the annotated table, leaving out the index column
# NOTE(review): the file name says "[roBERTa]" while the default checkpoint is a DeBERTa-v3 model - confirm the naming
df_new.to_excel(excel_writer='[roBERTa] processed_sentiment_data_v4df_Apr-24_NLI.xlsx', index=False)

print("Data saved successfully.")


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Data saved successfully.


In [48]:
# Function to process the sentiment expression and assign the label
def process_sentiment_label(sentiment):
    """Map a raw sentiment expression to ``'Negative'``, ``'Positive'`` or ``'Neutral'``.

    Matching on strings is case-insensitive, so ``'Positive'`` and
    ``'positive'`` are treated alike. ``'negative'`` is checked first and
    therefore wins when both keywords are present.

    Args:
        sentiment: Sentiment expression, normally a string. Non-string
            containers are searched for the exact keywords as-is.

    Returns:
        str: ``'Negative'``, ``'Positive'``, or ``'Neutral'`` when neither
        keyword is found or the value cannot be searched (e.g. None or NaN).
    """
    # Normalise case for text only; other containers are searched unchanged.
    haystack = sentiment.lower() if isinstance(sentiment, str) else sentiment

    try:
        if 'negative' in haystack:
            return 'Negative'
        if 'positive' in haystack:
            return 'Positive'
    except TypeError:
        # None, NaN and other non-containers do not support `in`.
        return 'Neutral'

    return 'Neutral'

# Function to expand rows for each aspect and sentiment
def expand_rows_for_aspects(df):
    """Expand each row into one row per (aspect, sentiment) pair.

    For every row whose ``'Sentiment Expressions'`` value is a non-empty dict,
    one copy of the row is produced per dict item, with two extra columns:
    ``'Aspect'`` (the dict key) and ``'Predicted Sentiment'`` (the dict value
    passed through ``process_sentiment_label``). Copies keep the index label
    of the row they came from.

    Rows without a non-empty dict are carried over unchanged and are then
    removed by the final filter unless they already hold a non-blank
    ``'Predicted Sentiment'`` value.

    Args:
        df: DataFrame with a ``'Sentiment Expressions'`` column.

    Returns:
        pandas.DataFrame: The expanded rows. When nothing could be expanded
        (empty input, or no row had a non-empty dict) an empty frame with the
        input's columns plus ``'Aspect'`` and ``'Predicted Sentiment'`` is
        returned.
    """
    expanded_rows = []

    # Iterate through each row in the DataFrame
    for _, row in df.iterrows():
        sentiments = row['Sentiment Expressions']

        if isinstance(sentiments, dict) and sentiments:
            for aspect, sentiment_expression in sentiments.items():
                new_row = row.copy()  # keep the source row untouched
                new_row['Aspect'] = aspect
                new_row['Predicted Sentiment'] = process_sentiment_label(sentiment_expression)
                expanded_rows.append(new_row)
        else:
            # No sentiments: keep the original row; the filter below decides its fate.
            expanded_rows.append(row)

    expanded_df = pd.DataFrame(expanded_rows)

    # If no row was expanded the column was never created, and filtering on it
    # would raise KeyError. Return a correctly-shaped empty frame instead.
    if 'Predicted Sentiment' not in expanded_df.columns:
        extra_columns = [
            name for name in ('Aspect', 'Predicted Sentiment') if name not in df.columns
        ]
        return pd.DataFrame(columns=list(df.columns) + extra_columns)

    # Filter out rows where 'Predicted Sentiment' is blank or NaN
    has_label = expanded_df['Predicted Sentiment'].notna() & (expanded_df['Predicted Sentiment'] != '')
    return expanded_df[has_label]


# Turn `df_new` into one row per aspect, driven by its 'Sentiment Expressions' dicts
df_expanded = expand_rows_for_aspects(df=df_new)

# Write the expanded table to an Excel file, leaving out the index column
df_expanded.to_excel(excel_writer='[roBERTa] expanded_sentiment_data_apr_NLI-24.xlsx', index=False)

print("Expanded data saved")

Expanded data saved
