In [2]:
import pandas as pd

# Relative parquet paths of the PAWS "labeled_final" files, keyed by split name.
splits = {
    'train': 'labeled_final/train-00000-of-00001.parquet',
    'test': 'labeled_final/test-00000-of-00001.parquet',
    'validation': 'labeled_final/validation-00000-of-00001.parquet',
}

# Only the test split is read; the hf:// URL points at the dataset repository.
df = pd.read_parquet(
    "hf://datasets/google-research-datasets/paws/" + splits["test"]
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [3]:
# Display the loaded test split; the notebook elides the middle rows of a long frame.
df

Unnamed: 0,id,sentence1,sentence2,label
0,1,This was a series of nested angular standards ...,"This was a series of nested polar scales , so ...",0
1,2,His father emigrated to Missouri in 1868 but r...,"His father emigrated to America in 1868 , but ...",0
2,3,"In January 2011 , the Deputy Secretary General...","In January 2011 , FIBA Asia deputy secretary g...",1
3,4,"Steiner argued that , in the right circumstanc...",Steiner held that the spiritual world can be r...,0
4,5,"Luciano Williames Dias ( born July 25 , 1970 )...",Luciano Williames Dias ( born 25 July 1970 ) i...,0
...,...,...,...,...
7995,7996,"The company has branches in Tokyo , based in t...",The company has branches in Tokyo based in Sai...,1
7996,7997,Muara Teweh ( abbreviated : MTW ) is a city lo...,Teweh ( abbreviated : MTW ) is a city located ...,0
7997,7998,The modern coat of arms of Bavaria was designe...,The modern coat of arms of Bavaria was designe...,1
7998,7999,"Former President , Brenda Kuecks , received a ...","In 2013 , former President Brenda Kuecks recei...",0


In [4]:
# Draw 500 rows per label (fixed seed) so the evaluation set is balanced.
df_0, df_1 = (
    df[df['label'] == label_value].sample(n=500, random_state=42)
    for label_value in (0, 1)
)

# Stack both samples, shuffle them with the same seed, and renumber the rows 0..999.
sentence_df = (
    pd.concat([df_0, df_1])
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# Sanity check: first rows plus the per-label counts (expected 500 / 500).
print(sentence_df.head())
print(sentence_df['label'].value_counts())


     id                                          sentence1  \
0  2397  Østfold Arbeiderblad was a Norwegian newspaper...   
1  2411  Born in Sydney , she grew up in Albury and wen...   
2  7110  The Nationals wore the uniforms of the Washing...   
3  4701  On a smaller scale , Ohio hosts minor league b...   
4  2090  The province with the lowest crime rate in 200...   

                                           sentence2  label  
0  Østfold Arbeiderblad was a Norwegian newspaper...      1  
1  Born in Sydney , she grew up in Albury and in ...      1  
2  The Nationals wore the uniforms of the 1924 Wa...      1  
3  On a smaller scale , Ohio hosts Minor League B...      1  
4  The province with the lowest crime rate for th...      0  
label
1    500
0    500
Name: count, dtype: int64


Open-source Llama (llama3-8b-8192 via Groq)

In [17]:
import os
from getpass import getpass

# Never store the key in the notebook itself. Reuse GROQ_API_KEY when the
# environment already provides it; otherwise ask for it interactively
# (getpass does not echo the typed value into the cell output).
if not os.environ.get("GROQ_API_KEY"):
    os.environ["GROQ_API_KEY"] = getpass("Enter your Groq API key: ")

In [9]:
import os
import pandas as pd
from groq import Groq
from tqdm import tqdm 
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Shared Groq client; the key is read from the GROQ_API_KEY environment variable.
client = Groq(api_key=os.environ.get("GROQ_API_KEY"))

def classify_sentence_pair(client, sentence1, sentence2, model="llama3-8b-8192"):
    """Ask a chat model whether two sentences have the same meaning.

    Args:
        client: Object exposing ``chat.completions.create(...)``.
        sentence1: First sentence of the pair.
        sentence2: Second sentence of the pair.
        model: Model identifier forwarded to the completion call.

    Returns:
        1 if the stripped reply starts with "1", 0 if it starts with "0".
        Any other reply, and any exception raised while calling the client
        or reading the reply, is reported with ``print`` and also yields 0.
    """
    prompt = (
        f"Determine if the following two sentences have the same meaning.\n\n"
        f"Sentence 1: \"{sentence1}\"\n"
        f"Sentence 2: \"{sentence2}\"\n\n"
        f"Respond with '1' if they mean the same and '0' if they mean different. DONT GIVE ANYTHING ELSE, JUST OUTPUT 1 or 0"
    )
    user_message = {"role": "user", "content": prompt}

    try:
        completion = client.chat.completions.create(
            messages=[user_message],
            model=model,
            max_tokens=1,   # a single token is enough for "0" / "1"
            temperature=0,
        )
        answer = completion.choices[0].message.content.strip()

        # Only the first character decides the label.
        verdict = {"1": 1, "0": 0}.get(answer[:1])
        if verdict is None:
            print(f"Unexpected response: {answer}")
            return 0
        return verdict
    except Exception as e:
        print(f"Error classifying pair ({sentence1}, {sentence2}): {e}")
        return 0

def main():
    """Classify every pair in ``sentence_df`` and print evaluation metrics.

    Reads the module-level ``sentence_df`` and ``client``, writes the model
    outputs into a ``prediction`` column of ``sentence_df`` (in place), then
    prints accuracy, the confusion matrix and the classification report
    computed against the ``label`` column.
    """
    sentence_pairs = tqdm(
        zip(sentence_df['sentence1'], sentence_df['sentence2']),
        total=len(sentence_df),
        desc="Classifying sentence pairs",
    )
    # One sequential API call per pair, in row order.
    sentence_df['prediction'] = [
        classify_sentence_pair(client, first, second)
        for first, second in sentence_pairs
    ]

    y_true = sentence_df['label']
    y_pred = sentence_df['prediction']
    accuracy = accuracy_score(y_true, y_pred)
    conf_matrix = confusion_matrix(y_true, y_pred)
    class_report = classification_report(y_true, y_pred)

    print(f"Accuracy: {accuracy * 100:.2f}%")
    print("Confusion Matrix:")
    print(conf_matrix)
    print("Classification Report:")
    print(class_report)



# Run the PAWS evaluation; the recorded run over 1,000 pairs took about 35 minutes.
main()


Classifying sentence pairs: 100%|██████████| 1000/1000 [35:15<00:00,  2.12s/it]

Accuracy: 56.90%
Confusion Matrix:
[[ 76 424]
 [  7 493]]
Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.15      0.26       500
           1       0.54      0.99      0.70       500

    accuracy                           0.57      1000
   macro avg       0.73      0.57      0.48      1000
weighted avg       0.73      0.57      0.48      1000






# Phrase Similarity

In [12]:
from datasets import load_dataset

# Load the PiC phrase_similarity dataset; the result holds train, validation and test splits.
ds = load_dataset("PiC/phrase_similarity")

README.md:   0%|          | 0.00/5.47k [00:00<?, ?B/s]

phrase_similarity.py:   0%|          | 0.00/4.75k [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/1.42M [00:00<?, ?B/s]

PS-hard/validation/0000.parquet:   0%|          | 0.00/202k [00:00<?, ?B/s]

PS-hard/test/0000.parquet:   0%|          | 0.00/403k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/7004 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [13]:
# Show the split names, feature columns and row counts of the loaded dataset.
ds

DatasetDict({
    train: Dataset({
        features: ['phrase1', 'phrase2', 'sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 7004
    })
    validation: Dataset({
        features: ['phrase1', 'phrase2', 'sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 1000
    })
    test: Dataset({
        features: ['phrase1', 'phrase2', 'sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 2000
    })
})

In [None]:
import time
import os
import pandas as pd
from groq import Groq
from tqdm import tqdm
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from concurrent.futures import ThreadPoolExecutor, as_completed
from datasets import DatasetDict, load_dataset
import random

# Shared Groq client; the key is read from the GROQ_API_KEY environment variable.
client = Groq(api_key=os.environ.get("GROQ_API_KEY"))

def classify_sentence_pair(client, phrase1, sentence1, phrase2, sentence2, model="llama3-8b-8192"):
    """Ask a chat model whether two phrases mean the same in their sentences.

    Rate-limit errors are retried with exponential backoff (2, 4, 8, 16
    seconds between the five attempts). No sleep happens after the final
    attempt, because there is no further request to wait for.

    Args:
        client: Object exposing ``chat.completions.create(...)``.
        phrase1: Phrase of the first pair.
        sentence1: Sentence in which ``phrase1`` is used.
        phrase2: Phrase of the second pair.
        sentence2: Sentence in which ``phrase2`` is used.
        model: Model identifier forwarded to the completion call.

    Returns:
        1 if the stripped reply starts with "1", 0 if it starts with "0".
        Unexpected replies, non-rate-limit errors and exhausted retries are
        reported with ``print`` and yield 0.
    """
    prompt = (
        f"Determine if the following two pairs of phrases have the same meaning given the usage of these phrases in corresponding sentences.\n\n"
        f"Pair 1:\nPhrase: \"{phrase1}\"\nSentence: \"{sentence1}\"\n\n"
        f"Pair 2:\nPhrase: \"{phrase2}\"\nSentence: \"{sentence2}\"\n\n"
        f"Respond with '1' if they mean the same and '0' if they mean different in the given sentence. JUST OUTPUT 1 or 0."
    )

    retry_attempts = 5
    wait_time = 2
    for attempt in range(retry_attempts):
        try:
            response = client.chat.completions.create(
                messages=[
                    {
                        "role": "user",
                        "content": prompt,
                    }
                ],
                model=model,
                max_tokens=1,   # a single token is enough for "0" / "1"
                temperature=0
            )
            content = response.choices[0].message.content.strip()
            if content.startswith("1"):
                return 1
            elif content.startswith("0"):
                return 0
            else:
                print(f"Unexpected response: '{content}' for Pair 1 and Pair 2.")
                return 0
        except Exception as e:
            # Rate limits are recognised by the error text; anything else is final.
            if "rate_limit_exceeded" not in str(e):
                print(f"Error classifying pair ({phrase1}, {sentence1} vs {phrase2}, {sentence2}): {e}")
                return 0
            if attempt == retry_attempts - 1:
                # Last attempt failed: give up now instead of sleeping for a
                # retry that will never be made.
                break
            retry_after = wait_time * (2 ** attempt)  # Exponential backoff
            print(f"Rate limit reached. Retrying in {retry_after:.2f} seconds...")
            time.sleep(retry_after)

    print(f"Exceeded retry attempts for pair ({phrase1}, {sentence1} vs {phrase2}, {sentence2}). Returning default value.")
    return 0

def process_test_split(dataset_split, cache, max_workers=10):
    """Classify a balanced 1,000-row sample of a split using a thread pool.

    Args:
        dataset_split: Data convertible with ``pd.DataFrame(...)`` that has
            ``phrase1``, ``sentence1``, ``phrase2``, ``sentence2`` and
            ``label`` columns.
        cache: Mutable mapping from ``(phrase1, sentence1, phrase2,
            sentence2)`` to an earlier prediction; read and updated here.
        max_workers: Number of worker threads issuing requests.

    Returns:
        The sampled, shuffled DataFrame with an added ``prediction`` column.

    Raises:
        ValueError: If ``label`` contains anything other than 0 and 1.
    """
    frame = pd.DataFrame(dataset_split)

    observed_labels = set(frame['label'].unique())
    if not observed_labels <= {0, 1}:
        raise ValueError("Labels must be binary (0 or 1).")

    # 500 rows per label with a fixed seed, label 0 first, then label 1.
    per_label_samples = [
        frame[frame['label'] == label_value].sample(n=500, random_state=42)
        for label_value in (0, 1)
    ]

    # Renumber, shuffle with the same seed, and renumber again so the final
    # index is 0..n-1 and can address the predictions list directly.
    df_sampled = (
        pd.concat(per_label_samples)
        .reset_index(drop=True)
        .sample(frac=1, random_state=42)
        .reset_index(drop=True)
    )

    predictions = [0] * len(df_sampled)

    def worker(index, row):
        # Returns (row position, prediction), using the cache when possible.
        key = (row['phrase1'], row['sentence1'], row['phrase2'], row['sentence2'])
        if key in cache:
            return index, cache[key]
        pred = classify_sentence_pair(client, *key)
        cache[key] = pred
        return index, pred

    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        # Queue one task per sampled row.
        pending = [
            pool.submit(worker, position, record)
            for position, record in df_sampled.iterrows()
        ]

        # Results arrive in completion order; the returned position puts
        # each prediction back into row order.
        finished_tasks = tqdm(
            as_completed(pending),
            total=len(pending),
            desc="Classifying test entries",
        )
        for task in finished_tasks:
            position, pred = task.result()
            predictions[position] = pred

    df_sampled['prediction'] = predictions
    return df_sampled

def main():
    """Evaluate the model on a balanced sample of the phrase-similarity test split.

    Takes ``ds['test']``, has ``process_test_split`` classify it with a fresh
    empty cache and 10 workers, then prints accuracy, the confusion matrix and
    the classification report for the returned predictions.
    """
    print("Selecting 1,000 random test entries with equal label distribution (500 each)...")

    # A new cache per run: nothing is reused from earlier executions.
    processed_test_df = process_test_split(ds['test'], {}, max_workers=10)

    y_true = processed_test_df['label']
    y_pred = processed_test_df['prediction']
    accuracy = accuracy_score(y_true, y_pred)
    conf_matrix = confusion_matrix(y_true, y_pred)
    class_report = classification_report(y_true, y_pred)

    print(f"\n--- Evaluation Metrics for Selected Test Entries ---")
    print(f"Accuracy: {accuracy * 100:.2f}%")
    print("Confusion Matrix:")
    print(conf_matrix)
    print("Classification Report:")
    print(class_report)

# Start the phrase-similarity evaluation when this code runs as the main module.
if __name__ == "__main__":
    main()


The output of this cell was removed because it was too large: the retry logic and the debugging print statements produce many lines. The results are given in the report.