In [5]:
!pip install -q transformers

In [6]:
import pandas as pd
from tqdm.auto import tqdm

In [7]:
import torch

# Check if CUDA (GPU support) is available
if torch.cuda.is_available():
    device = torch.device("cuda")
    print(f"Using GPU: {torch.cuda.get_device_name(0)}")
else:
    device = torch.device("cpu")
    print("Using CPU")

Using GPU: NVIDIA GeForce GTX 980M


from transformers import pipeline

classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

In [9]:
# Inform the transformers library to use the specified device (GPU or CPU)
from transformers import pipeline
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli", device=0 if device.type == 'cuda' else -1)

In [10]:
sequence_to_classify = "one day I will see the world"
candidate_labels = ['travel', 'cooking', 'dancing']

classifier(sequence_to_classify, candidate_labels)

{'sequence': 'one day I will see the world',
 'labels': ['travel', 'dancing', 'cooking'],
 'scores': [0.9938651323318481, 0.0032737706787884235, 0.0028610231820493937]}

**If more than one candidate label can be correct.**

In [11]:
candidate_labels = ['travel', 'cooking', 'dancing', 'exploration']
classifier(sequence_to_classify, candidate_labels, multi_label=True)

{'sequence': 'one day I will see the world',
 'labels': ['travel', 'exploration', 'dancing', 'cooking'],
 'scores': [0.9945111274719238,
  0.9383887648582458,
  0.005706172436475754,
  0.0018193108262494206]}

In [12]:
sequence_to_classify = "Donald Trump will be next president"
candidate_labels = ['science', 'politics', 'history']

classifier(sequence_to_classify, candidate_labels)

{'sequence': 'Donald Trump will be next president',
 'labels': ['politics', 'history', 'science'],
 'scores': [0.8404949903488159, 0.15547983348369598, 0.004025168716907501]}

In [13]:
#download_dir = 'D:\\downloads'
download_dir = 'D:\\downloads\\amazon_customer_reviews'
print(download_dir)

D:\downloads\amazon_customer_reviews


In [14]:
import os

In [15]:
os.listdir(download_dir)

['4ee3300b-8d78-46ac-8abf-72edb1f4f5db',
 'amazon_categories.csv',
 'amazon_products.csv',
 'Amazon_products.zip',
 'amazon_product_data',
 'amazon_product_data.zip',
 'amazon_reviews_pickle_paragraphs.pkl',
 'amazon_reviews_pickle_paragraphs.xlsx',
 'amazon_reviews_pickle_paragraphs_20240129.pkl',
 'amazon_reviews_pickle_paragraphs_20240219.pkl',
 'amazon_reviews_pickle_paragraphs_20240219.xlsx',
 'amazon_reviews_pickle_paragraphs_similarity_20240219.pkl',
 'amazon_reviews_pickle_paragraphs_similarity_reduced_20240219.pkl',
 'amazon_reviews_pickle_sentences',
 'amazon_reviews_pickle_sentences_20240129.pkl',
 'amazon_reviews_pickle_sentences_20240219.pkl',
 'amazon_reviews_pickle_sentences_similarity_20240219.pkl',
 'amazon_reviews_pickle_sentences_similarity_20240219_index_paragraph.faiss',
 'amazon_reviews_pickle_sentences_similarity_20240219_index_sentence.faiss',
 'amazon_reviews_pickle_sentences_similarity_20240219_index_summary.faiss',
 'chroma.sqlite3',
 'Reviews.csv',
 'selecte

In [16]:
df_sentence_filename = os.path.join(download_dir,"amazon_reviews_pickle_sentences_20240219.pkl")
df_sentence = pd.read_pickle(df_sentence_filename)

In [17]:
df_sentence.columns

Index(['Id', 'ProductId', 'UserId', 'ProfileName', 'HelpfulnessNumerator',
       'HelpfulnessDenominator', 'Score', 'Time', 'Summary', 'P_index',
       'S_sentence_number', 'Sentence', 'Summary_vector', 'Sentence_vector'],
      dtype='object')

In [18]:
sample_series = df_sentence.iloc[0:500000]['Sentence']

In [19]:
type(sample_series)
print(sample_series)

0         I have bought several of the Vitality canned d...
1         The product looks more like a stew than a proc...
2         My Labrador is finicky and she appreciates thi...
3         Product arrived labeled as Jumbo Salted Peanut...
4         Not sure if this was an error or if the vendor...
                                ...                        
499995    I went to my local warehouse store that starts...
499996    Seriously it is the same stuff they use in the...
499997    If you ever visit the Carmike 14 in Tyler, TX ...
499998    My girlfriend and I love this stuff, and she w...
499999    This buttery salt tastes just like what you fi...
Name: Sentence, Length: 500000, dtype: object


In [20]:
sample_series

0         I have bought several of the Vitality canned d...
1         The product looks more like a stew than a proc...
2         My Labrador is finicky and she appreciates thi...
3         Product arrived labeled as Jumbo Salted Peanut...
4         Not sure if this was an error or if the vendor...
                                ...                        
499995    I went to my local warehouse store that starts...
499996    Seriously it is the same stuff they use in the...
499997    If you ever visit the Carmike 14 in Tyler, TX ...
499998    My girlfriend and I love this stuff, and she w...
499999    This buttery salt tastes just like what you fi...
Name: Sentence, Length: 500000, dtype: object

In [21]:
# Iterating over the Series
#for index, sentence in sample_series.items():
#    print(f"Index {index}: {sentence}")

# Define your intent tags
candidate_labels = [
    "Quality Appreciation",
    "Product Description",
    "Product Appearance",
    "Preference Expression",
    "Packaging Issue",
    "Comparison",
    "Complaint",
    "misrepresentation",
    "Historical Mention",
    "Taste/Flavor Comment",
    "Product Ingredients",
    
]
threshold = 0.8
# Iterate over each sentence and classify
for i, sentence in sample_series.items():
    print(f"Index {i}: {sentence}")
    result = classifier(sentence, candidate_labels, multi_label=True)
    intents = result["labels"]
    scores = result["scores"]
    # Print the most likely intent for each sentence
    print("Predicted intent:", intents[0], "with score:", scores[0])
    print("All intents and scores:", list(zip(intents, scores)))
    print("---")

In [22]:
# Ensure tqdm is setup for pandas explicitly
tqdm.pandas(desc="Classifying sentences")

def classify_intent_with_scores_progress(df, text_column, candidate_labels, threshold=0.5):
    """
    Classify the intent of texts in the specified column of a DataFrame, including scores,
    with progress tracking using tqdm.

    Parameters:
    - df: The DataFrame containing the text to classify.
    - text_column: The name of the column with text to classify.
    - candidate_labels: A list of candidate intent labels.
    - threshold: The confidence threshold to consider a label as valid.

    Returns:
    - DataFrame with an additional column for the predicted intent and scores.
    """
    # Define a helper function to get intents with scores above the threshold
    def get_intents_with_scores(text):
        result = classifier(text, candidate_labels, multi_label=True)
        intents_scores = [(intent, f"{score:.2f}") for intent, score in zip(result["labels"], result["scores"]) if score > threshold]
        return ', '.join([f"{intent}: {score}" for intent, score in intents_scores]) if intents_scores else 'No Intent Exceeds Threshold'
    
    # Apply the helper function to the specified column with progress tracking
    df['Predicted Intent and Scores'] = df[text_column].progress_apply(get_intents_with_scores)
    
    return df

# Initialize the pipeline with GPU support if available
device = 0 if torch.cuda.is_available() else -1
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli", device=device)

def classify_intent_with_scores_batch(df, text_column, candidate_labels, threshold=0.5, batch_size=8):
    """
    Classify the intent of texts in the specified column of a DataFrame using batch processing,
    including scores, with progress tracking using tqdm.

    Parameters:
    - df: The DataFrame containing the text to classify.
    - text_column: The name of the column with text to classify.
    - candidate_labels: A list of candidate intent labels.
    - threshold: The confidence threshold to consider a label as valid.
    - batch_size: Number of texts to process in each batch.

    Returns:
    - DataFrame with an additional column for the predicted intent and scores.
    """
    # Prepare a new column for the predicted intents and scores
    df['Predicted Intent and Scores'] = ''

    # Process in batches
    for start in tqdm(range(0, len(df), batch_size), desc="Processing batches"):
        end = start + batch_size
        batch_texts = df[text_column][start:end].tolist()
        results = classifier(batch_texts, candidate_labels, multi_label=True)
        
        # Process each result in the batch
        for i, result in enumerate(results):
            intents_scores = [(intent, f"{score:.2f}") for intent, score in zip(result["labels"], result["scores"]) if score > threshold]
            predicted_intents_scores = ', '.join([f"{intent}: {score}" for intent, score in intents_scores]) if intents_scores else 'No Intent Exceeds Threshold'
            df.loc[start+i, 'Predicted Intent and Scores'] = predicted_intents_scores

    return df

In [23]:
threshold = 0.8

In [24]:
candidate_labels = [
    "Quality Appreciation",
    "Product Description",
    "Product Appearance",
    "Preference Expression",
    "Packaging Issue",
    "Comparison",
    "Complaint",
    "Misrepresentation",
    "Historical Mention",
    "Taste/Flavor Comment",
    "Product Ingredients",
    "Value for Money",
    "Purchase Recommendation",
    "Usage Experience",
    "Product Efficacy",
    "Health and Safety Concerns",
    "Customer Service Experience",
    "Repeat Purchase Intention",
    "Emotional Response",
    "Environmental/Sustainability Mention",
]

In [25]:
df = sample_series.to_frame(name='Sentence')

In [1]:
type(df)

NameError: name 'df' is not defined

In [27]:
df

Unnamed: 0,Sentence
0,I have bought several of the Vitality canned d...
1,The product looks more like a stew than a proc...
2,My Labrador is finicky and she appreciates thi...
3,Product arrived labeled as Jumbo Salted Peanut...
4,Not sure if this was an error or if the vendor...
...,...
499995,I went to my local warehouse store that starts...
499996,Seriously it is the same stuff they use in the...
499997,"If you ever visit the Carmike 14 in Tyler, TX ..."
499998,"My girlfriend and I love this stuff, and she w..."


In [28]:
type(df["Sentence"])

pandas.core.series.Series

In [29]:
# Assuming 'df' is your DataFrame and 'Sentence' is the column with text
df_with_intents_scores_progress = classify_intent_with_scores_progress(df, 'Sentence', candidate_labels, threshold=0.8)
print(df_with_intents_scores_progress)

Classifying sentences:   0%|          | 0/500000 [00:00<?, ?it/s]



                                                 Sentence  \
0       I have bought several of the Vitality canned d...   
1       The product looks more like a stew than a proc...   
2       My Labrador is finicky and she appreciates thi...   
3       Product arrived labeled as Jumbo Salted Peanut...   
4       Not sure if this was an error or if the vendor...   
...                                                   ...   
499995  I went to my local warehouse store that starts...   
499996  Seriously it is the same stuff they use in the...   
499997  If you ever visit the Carmike 14 in Tyler, TX ...   
499998  My girlfriend and I love this stuff, and she w...   
499999  This buttery salt tastes just like what you fi...   

                              Predicted Intent and Scores  
0       Value for Money: 0.98, Quality Appreciation: 0...  
1       Product Appearance: 0.96, Taste/Flavor Comment...  
2       Value for Money: 0.98, Comparison: 0.87, Purch...  
3       Misrepresentation: 

In [30]:
df_with_intents_scores_progress

Unnamed: 0,Sentence,Predicted Intent and Scores
0,I have bought several of the Vitality canned d...,"Value for Money: 0.98, Quality Appreciation: 0..."
1,The product looks more like a stew than a proc...,"Product Appearance: 0.96, Taste/Flavor Comment..."
2,My Labrador is finicky and she appreciates thi...,"Value for Money: 0.98, Comparison: 0.87, Purch..."
3,Product arrived labeled as Jumbo Salted Peanut...,"Misrepresentation: 1.00, Packaging Issue: 0.95..."
4,Not sure if this was an error or if the vendor...,No Intent Exceeds Threshold
...,...,...
499995,I went to my local warehouse store that starts...,"Value for Money: 0.98, Comparison: 0.87, Repea..."
499996,Seriously it is the same stuff they use in the...,Comparison: 0.92
499997,"If you ever visit the Carmike 14 in Tyler, TX ...",Value for Money: 0.81
499998,"My girlfriend and I love this stuff, and she w...","Value for Money: 0.85, Quality Appreciation: 0.82"


In [32]:
df_with_intents_scores_progress

Unnamed: 0,Sentence,Predicted Intent and Scores
0,I have bought several of the Vitality canned d...,"Value for Money: 0.98, Quality Appreciation: 0..."
1,The product looks more like a stew than a proc...,"Product Appearance: 0.96, Taste/Flavor Comment..."
2,My Labrador is finicky and she appreciates thi...,"Value for Money: 0.98, Comparison: 0.87, Purch..."
3,Product arrived labeled as Jumbo Salted Peanut...,"Misrepresentation: 1.00, Packaging Issue: 0.95..."
4,Not sure if this was an error or if the vendor...,No Intent Exceeds Threshold
...,...,...
499995,I went to my local warehouse store that starts...,"Value for Money: 0.98, Comparison: 0.87, Repea..."
499996,Seriously it is the same stuff they use in the...,Comparison: 0.92
499997,"If you ever visit the Carmike 14 in Tyler, TX ...",Value for Money: 0.81
499998,"My girlfriend and I love this stuff, and she w...","Value for Money: 0.85, Quality Appreciation: 0.82"


In [31]:
df_with_intents_scores_progress_filename = os.path.join(download_dir,"amazon_reviews_pickle_sentences_with_intents_scores_progress_20240219.pkl")
df_with_intents_scores_progress.to_pickle(df_with_intents_scores_progress_filename)

In [82]:
df_with_intents.iloc[0]["Predicted Intent and Scores"]

'Quality Appreciation: 0.96, Comparison: 0.80'

In [93]:
df_with_intents.iloc[0]

Sentence                       I have bought several of the Vitality canned d...
Predicted Intent and Scores         Quality Appreciation: 0.96, Comparison: 0.80
Name: 0, dtype: object

In [108]:
# Example paragraph
paragraph = """
The quick brown fox jumps over the lazy dog. This old sentence is famous for containing every letter of the English alphabet. A clever way to demonstrate fonts, it has been used for decades by typographers and designers.
"""

# Candidate labels
candidate_labels = ["Typography", "Animal Behavior", "English Language", "Design"]

# Classify the paragraph
results = classifier(paragraph, candidate_labels)
print(results)



{'sequence': '\nThe quick brown fox jumps over the lazy dog. This old sentence is famous for containing every letter of the English alphabet. A clever way to demonstrate fonts, it has been used for decades by typographers and designers.\n', 'labels': ['Typography', 'English Language', 'Animal Behavior', 'Design'], 'scores': [0.4202216565608978, 0.3329188823699951, 0.12616656720638275, 0.12069287151098251]}
