In [4]:
#import packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
sns.set_theme(style="whitegrid")

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [5]:
# Load all datasets

# Main dataset: conversation data (gzip-compressed JSON Lines, one record per line).
# Used in the "Conversation Data" section.
# NOTE: a bare `df.head(5)` used to sit in the middle of this cell; only the last
# expression of a cell is displayed, so it had no effect and was removed.
# The next cell previews the frame.
df = pd.read_json(
    "../data/training-set/chatbot-arena-conversations.jsonl.gz",
    lines=True,
    compression="gzip"
)

# Auxiliary datasets

# Embedding data (NumPy .npy arrays), used in the "Embedding Data" section
prompt_embeddings = np.load(
    "../data/training-set/chatbot-arena-prompts-embeddings.npy"
)

response_a_embeddings = np.load(
    "../data/training-set/chatbot-arena-model_a_response-embeddings.npy"
)

response_b_embeddings = np.load(
    "../data/training-set/chatbot-arena-model_b_response-embeddings.npy"
)

# Topic modeling and hardness score data (gzip-compressed JSON Lines),
# used in the "Topic Modeling and Hardness Score Data" section
topic_and_hardness = pd.read_json(
    "../data/training-set/chatbot-arena-gpt3-scores.jsonl.gz",
    lines=True,
    compression="gzip"
)

In [6]:
# Preview the first rows of the conversation data right after loading
df.head()

Unnamed: 0,question_id,model_a,model_b,winner,judge,conversation_a,conversation_b
0,58210e39b3fd4441a2bd4a518bb44c2d,chatglm-6b,koala-13b,model_b,arena_user_973,[{'content': 'What is the difference between O...,[{'content': 'What is the difference between O...
1,2564acd09e3942fd97657d05282d4389,oasst-pythia-12b,alpaca-13b,tie,arena_user_973,[{'content': 'Why did my parent not invite me ...,[{'content': 'Why did my parent not invite me ...
2,90bfd142157948aba01931726c888e7f,koala-13b,oasst-pythia-12b,model_b,arena_user_973,"[{'content': 'Fuji vs. Nikon, which is better?...","[{'content': 'Fuji vs. Nikon, which is better?..."
3,a7c5accc53e649a3bc6b2e41d962ebc4,vicuna-13b,oasst-pythia-12b,model_b,arena_user_973,[{'content': 'How to build an arena for chatbo...,[{'content': 'How to build an arena for chatbo...
4,adf27e819a3c494cb6e993f0c660e097,vicuna-13b,koala-13b,model_a,arena_user_973,"[{'content': 'When is it today?', 'role': 'use...","[{'content': 'When is it today?', 'role': 'use..."


In [7]:
# Split each conversation into the prompt and the two model responses.
# Only the first two messages are read: element 0 is taken as the prompt and
# element 1 as the model's reply; any later turns are ignored.
# The prompt is read from conversation_a only -- the previews above show the same
# opening message in conversation_b (TODO confirm this holds for every row).
# No length columns are created here; token counts are added in the tokenization cell.
df["prompt"] = df["conversation_a"].str[0].str["content"]
df["response_a"] = df["conversation_a"].str[1].str["content"]
df["response_b"] = df["conversation_b"].str[1].str["content"]

In [8]:
# Prompt and response tokenization
# (`re` comes from the notebook's import cell; it is not re-imported here.)

# \b\w+\b captures words by looking for word boundaries (\b) and word characters (\w+)
pattern = r"\b\w+\b"
# Compile once so the same pattern object is reused for every row and column
word_regex = re.compile(pattern)

# source text column -> (token-list column, token-count column)
token_columns = {
    'prompt': ('prompt_tokens', 'prompt_token_length'),
    'response_a': ('a_response_tokens', 'a_response_token_length'),
    'response_b': ('b_response_tokens', 'b_response_token_length'),
}

# Apply regex tokenization to each text column (all token columns are added first,
# then all length columns, so the resulting column order is unchanged)
for text_col, (tokens_col, length_col) in token_columns.items():
    df[tokens_col] = df[text_col].apply(lambda text: word_regex.findall(text))

# Calculate token counts
for tokens_col, length_col in token_columns.values():
    df[length_col] = df[tokens_col].apply(len)

df.head(1)


Unnamed: 0,question_id,model_a,model_b,winner,judge,conversation_a,conversation_b,prompt,response_a,response_b,prompt_tokens,a_response_tokens,b_response_tokens,prompt_token_length,a_response_token_length,b_response_token_length
0,58210e39b3fd4441a2bd4a518bb44c2d,chatglm-6b,koala-13b,model_b,arena_user_973,[{'content': 'What is the difference between O...,[{'content': 'What is the difference between O...,What is the difference between OpenCL and CUDA?,OpenCL and CUDA are two different programming ...,OpenCL and CUDA are both programming languages...,"[What, is, the, difference, between, OpenCL, a...","[OpenCL, and, CUDA, are, two, different, progr...","[OpenCL, and, CUDA, are, both, programming, la...",8,141,302


In [9]:
import textstat

# Define functions for each readability score
def flesch_kincaid_score(text):
    """Return textstat's Flesch-Kincaid grade for ``text``.

    Thin wrapper around ``textstat.flesch_kincaid_grade`` so it can be passed
    directly to ``Series.apply``.
    """
    grade = textstat.flesch_kincaid_grade(text)
    return grade

def gunning_fog_score(text):
    """Return textstat's Gunning fog index for ``text``.

    Thin wrapper around ``textstat.gunning_fog`` so it can be passed directly
    to ``Series.apply``.
    """
    fog_index = textstat.gunning_fog(text)
    return fog_index

def smog_score(text):
    """Return textstat's SMOG index for ``text``.

    Thin wrapper around ``textstat.smog_index`` so it can be passed directly
    to ``Series.apply``.
    """
    smog = textstat.smog_index(text)
    return smog


In [10]:
# Readability scores for each response: one new column per (response, metric) pair.
# Each entry is (new column, source text column, scoring function); the list order
# determines the order in which the columns are appended to df.
readability_columns = [
    ('a_response_flesch_kincaid', 'response_a', flesch_kincaid_score),
    ('a_response_gunning_fog', 'response_a', gunning_fog_score),
    ('a_response_smog', 'response_a', smog_score),
    ('b_response_flesch_kincaid', 'response_b', flesch_kincaid_score),
    ('b_response_gunning_fog', 'response_b', gunning_fog_score),
    ('b_response_smog', 'response_b', smog_score),
]

for target_col, source_col, scorer in readability_columns:
    df[target_col] = df[source_col].apply(scorer)


In [11]:
import textstat
from lexical_diversity import lex_div as ld
from collections import Counter

# Lexical Richness: Type-Token Ratio (unique words / total words)
def type_token_ratio(text):
    """Return the share of distinct whitespace-separated words in ``text``.

    Words are compared exactly as written (case and attached punctuation are
    kept). Text with no words yields 0.
    """
    words = text.split()
    if not words:
        return 0
    return len(set(words)) / len(words)

# Lexical Richness: Lexical Diversity
def lexical_diversity(text):
    """Return ``ld.ttr`` computed over the whitespace-separated words of ``text``.

    Other measures from ``lex_div`` could be substituted here. In the previews
    shown in this notebook the resulting column carries the same values as the
    ``type_token_ratio`` column.
    """
    return ld.ttr(text.split())

# Average Syllable Count per Word
def average_syllable_count(text):
    """Return the mean ``textstat.syllable_count`` over the words of ``text``.

    Words are obtained by whitespace splitting; text with no words yields 0.
    """
    words = text.split()
    if not words:
        return 0
    syllables_per_word = [textstat.syllable_count(word) for word in words]
    return sum(syllables_per_word) / len(words)

# Complex Word Count (words with more than three syllables)
def complex_word_count(text):
    """Count the whitespace-separated words of ``text`` with more than three syllables.

    Syllables are counted per word with ``textstat.syllable_count``; a word
    with exactly three syllables is not counted.
    """
    complex_words = [
        word for word in text.split() if textstat.syllable_count(word) > 3
    ]
    return len(complex_words)


In [12]:
# Lexical richness, syllable count, and complex word count for each response.
# Each entry is (new column, source text column, feature function); the list order
# determines the order in which the columns are appended to df.
lexical_feature_columns = [
    ('a_response_ttr', 'response_a', type_token_ratio),
    ('a_response_lexical_diversity', 'response_a', lexical_diversity),
    ('a_response_avg_syllable_count', 'response_a', average_syllable_count),
    ('a_response_complex_word_count', 'response_a', complex_word_count),
    ('b_response_ttr', 'response_b', type_token_ratio),
    ('b_response_lexical_diversity', 'response_b', lexical_diversity),
    ('b_response_avg_syllable_count', 'response_b', average_syllable_count),
    ('b_response_complex_word_count', 'response_b', complex_word_count),
]

for target_col, source_col, feature_fn in lexical_feature_columns:
    df[target_col] = df[source_col].apply(feature_fn)


In [13]:
# List every column now present in df to check that the feature columns were added
df.columns

Index(['question_id', 'model_a', 'model_b', 'winner', 'judge',
       'conversation_a', 'conversation_b', 'prompt', 'response_a',
       'response_b', 'prompt_tokens', 'a_response_tokens', 'b_response_tokens',
       'prompt_token_length', 'a_response_token_length',
       'b_response_token_length', 'a_response_flesch_kincaid',
       'a_response_gunning_fog', 'a_response_smog',
       'b_response_flesch_kincaid', 'b_response_gunning_fog',
       'b_response_smog', 'a_response_ttr', 'a_response_lexical_diversity',
       'a_response_avg_syllable_count', 'a_response_complex_word_count',
       'b_response_ttr', 'b_response_lexical_diversity',
       'b_response_avg_syllable_count', 'b_response_complex_word_count'],
      dtype='object')

In [14]:
# Preview the first rows with the readability and lexical feature columns included
df.head()

Unnamed: 0,question_id,model_a,model_b,winner,judge,conversation_a,conversation_b,prompt,response_a,response_b,...,b_response_gunning_fog,b_response_smog,a_response_ttr,a_response_lexical_diversity,a_response_avg_syllable_count,a_response_complex_word_count,b_response_ttr,b_response_lexical_diversity,b_response_avg_syllable_count,b_response_complex_word_count
0,58210e39b3fd4441a2bd4a518bb44c2d,chatglm-6b,koala-13b,model_b,arena_user_973,[{'content': 'What is the difference between O...,[{'content': 'What is the difference between O...,What is the difference between OpenCL and CUDA?,OpenCL and CUDA are two different programming ...,OpenCL and CUDA are both programming languages...,...,12.01,14.2,0.664179,0.664179,1.753731,9,0.483221,0.483221,1.701342,18
1,2564acd09e3942fd97657d05282d4389,oasst-pythia-12b,alpaca-13b,tie,arena_user_973,[{'content': 'Why did my parent not invite me ...,[{'content': 'Why did my parent not invite me ...,Why did my parent not invite me to their wedding?,It is possible that your parent did not invite...,It is likely that they wanted to keep the gues...,...,11.32,11.7,0.555556,0.555556,1.333333,2,0.67033,0.67033,1.373626,1
2,90bfd142157948aba01931726c888e7f,koala-13b,oasst-pythia-12b,model_b,arena_user_973,"[{'content': 'Fuji vs. Nikon, which is better?...","[{'content': 'Fuji vs. Nikon, which is better?...","Fuji vs. Nikon, which is better?",Both Fuji and Nikon are popular camera brands ...,This is a subjective question and the answer d...,...,11.54,12.6,0.636364,0.636364,1.715909,15,0.480769,0.480769,1.580769,10
3,a7c5accc53e649a3bc6b2e41d962ebc4,vicuna-13b,oasst-pythia-12b,model_b,arena_user_973,[{'content': 'How to build an arena for chatbo...,[{'content': 'How to build an arena for chatbo...,How to build an arena for chatbots?,Building an arena for chatbots can be done by ...,Building an arena for chatbots is a great way ...,...,11.89,12.5,0.498127,0.498127,1.554307,11,0.5,0.5,1.496429,10
4,adf27e819a3c494cb6e993f0c660e097,vicuna-13b,koala-13b,model_a,arena_user_973,"[{'content': 'When is it today?', 'role': 'use...","[{'content': 'When is it today?', 'role': 'use...",When is it today?,"I'm sorry, I cannot determine the current date...","Today is February 23, 2023.",...,2.0,0.0,0.711538,0.711538,1.346154,0,1.0,1.0,1.6,0


In [15]:
def jaccard_similarity(tokens_a, tokens_b):
    """Return the Jaccard similarity of two token collections.

    Both inputs are reduced to sets, so duplicates are ignored. The result is
    |A & B| / |A | B|, or 0 when both inputs are empty.
    """
    vocab_a, vocab_b = set(tokens_a), set(tokens_b)
    combined = vocab_a | vocab_b
    if not combined:
        return 0
    return len(vocab_a & vocab_b) / len(combined)


In [16]:
# Jaccard similarity between the two responses, and between the prompt and each response.
# Each entry is (new column, left token column, right token column).
jaccard_pairs = [
    ('response_jaccard_similarity', 'a_response_tokens', 'b_response_tokens'),
    ('prompt_a_jaccard_similarity', 'prompt_tokens', 'a_response_tokens'),
    ('prompt_b_jaccard_similarity', 'prompt_tokens', 'b_response_tokens'),
]

for target_col, left_col, right_col in jaccard_pairs:
    # Bind the column names as defaults so each lambda uses this iteration's pair
    df[target_col] = df.apply(
        lambda row, left=left_col, right=right_col: jaccard_similarity(row[left], row[right]),
        axis=1,
    )

In [17]:
# Binary flag: 1 when the prompt contains a question mark anywhere, otherwise 0
df['is_question'] = df['prompt'].apply(lambda prompt_text: int('?' in prompt_text))

In [18]:
def keyword_overlap_count(prompt_tokens, response_tokens):
    """Return how many distinct tokens appear in both token collections.

    Duplicates within either input are counted once; comparison is exact
    (case-sensitive).
    """
    shared = set(prompt_tokens) & set(response_tokens)
    return len(shared)


In [25]:
# Keyword overlap counts: prompt vs. each response, then response A vs. response B.
# Each entry is (new column, first token column, second token column).
overlap_pairs = [
    ('prompt_a_keyword_overlap', 'prompt_tokens', 'a_response_tokens'),
    ('prompt_b_keyword_overlap', 'prompt_tokens', 'b_response_tokens'),
    ('response_ab_keyword_overlap', 'a_response_tokens', 'b_response_tokens'),
]

for target_col, first_col, second_col in overlap_pairs:
    # Bind the column names as defaults so each lambda uses this iteration's pair
    df[target_col] = df.apply(
        lambda row, first=first_col, second=second_col: keyword_overlap_count(row[first], row[second]),
        axis=1,
    )


In [28]:
def unique_word_count(tokens):
    """Return the number of distinct tokens in ``tokens`` (exact, case-sensitive match)."""
    distinct_tokens = set(tokens)
    return len(distinct_tokens)


In [29]:
# Number of unique words in the prompt and in both responses.
# Maps each new count column to the token column it is derived from.
unique_word_columns = {
    'prompt_unique_words': 'prompt_tokens',
    'a_response_unique_words': 'a_response_tokens',
    'b_response_unique_words': 'b_response_tokens',
}

for target_col, tokens_col in unique_word_columns.items():
    df[target_col] = df[tokens_col].apply(unique_word_count)

In [30]:
# Preview the first rows with the similarity, overlap and unique-word columns included
df.head()

Unnamed: 0,question_id,model_a,model_b,winner,judge,conversation_a,conversation_b,prompt,response_a,response_b,...,response_jaccard_similarity,prompt_a_jaccard_similarity,prompt_b_jaccard_similarity,is_question,prompt_a_keyword_overlap,prompt_b_keyword_overlap,response_ab_keyword_overlap,prompt_unique_words,a_response_unique_words,b_response_unique_words
0,58210e39b3fd4441a2bd4a518bb44c2d,chatglm-6b,koala-13b,model_b,arena_user_973,[{'content': 'What is the difference between O...,[{'content': 'What is the difference between O...,What is the difference between OpenCL and CUDA?,OpenCL and CUDA are two different programming ...,OpenCL and CUDA are both programming languages...,...,0.180851,0.054945,0.036496,1,5,5,34,8,88,134
1,2564acd09e3942fd97657d05282d4389,oasst-pythia-12b,alpaca-13b,tie,arena_user_973,[{'content': 'Why did my parent not invite me ...,[{'content': 'Why did my parent not invite me ...,Why did my parent not invite me to their wedding?,It is possible that your parent did not invite...,It is likely that they wanted to keep the gues...,...,0.207547,0.1,0.028986,1,7,2,22,10,67,61
2,90bfd142157948aba01931726c888e7f,koala-13b,oasst-pythia-12b,model_b,arena_user_973,"[{'content': 'Fuji vs. Nikon, which is better?...","[{'content': 'Fuji vs. Nikon, which is better?...","Fuji vs. Nikon, which is better?",Both Fuji and Nikon are popular camera brands ...,This is a subjective question and the answer d...,...,0.256983,0.046729,0.02459,1,5,3,46,6,106,119
3,a7c5accc53e649a3bc6b2e41d962ebc4,vicuna-13b,oasst-pythia-12b,model_b,arena_user_973,[{'content': 'How to build an arena for chatbo...,[{'content': 'How to build an arena for chatbo...,How to build an arena for chatbots?,Building an arena for chatbots can be done by ...,Building an arena for chatbots is a great way ...,...,0.301587,0.04918,0.047619,1,6,6,57,7,121,125
4,adf27e819a3c494cb6e993f0c660e097,vicuna-13b,koala-13b,model_a,arena_user_973,"[{'content': 'When is it today?', 'role': 'use...","[{'content': 'When is it today?', 'role': 'use...",When is it today?,"I'm sorry, I cannot determine the current date...","Today is February 23, 2023.",...,0.02439,0.025,0.125,1,1,1,1,4,37,5


In [31]:
# Phrases whose presence marks a text as polite / negative. Entries may span several
# words ("thank you") or contain an apostrophe ("can't").
politeness_indicators = ["please", "thank you", "could you", "kindly", "would you mind"]
negative_words = ["no", "not", "never", "can't", "won't", "don't", "hate", "bad"]

# Same word pattern the notebook uses to tokenize prompts and responses, so that
# each phrase is split exactly like the text it is compared against.
_WORD_PATTERN = r"\b\w+\b"


def _contains_any_phrase(tokens, phrases):
    """Return True if any phrase occurs in ``tokens`` as a run of consecutive tokens.

    A plain membership test (``phrase in tokens``) cannot work for these lists:
    the token lists hold single ``\\w+`` words, so multi-word phrases and
    contractions (tokenized as e.g. ``["can", "t"]``) never equal one token, and
    capitalised words such as "Please" or "No" are missed. Here each phrase is
    tokenized with the same pattern and compared case-insensitively against
    every window of the token list.

    Args:
        tokens: list of word tokens; a raw string is also accepted and is
            tokenized first.
        phrases: iterable of phrase strings to look for.
    """
    if isinstance(tokens, str):
        tokens = re.findall(_WORD_PATTERN, tokens)
    lowered = [str(token).lower() for token in tokens]

    for phrase in phrases:
        phrase_tokens = re.findall(_WORD_PATTERN, phrase.lower())
        width = len(phrase_tokens)
        if width == 0:
            # A phrase with no word characters can never match anything
            continue
        for start in range(len(lowered) - width + 1):
            if lowered[start:start + width] == phrase_tokens:
                return True
    return False


def contains_politeness(tokens):
    """Return 1 if ``tokens`` contains any entry of ``politeness_indicators``, else 0."""
    return 1 if _contains_any_phrase(tokens, politeness_indicators) else 0


def contains_negative(tokens):
    """Return 1 if ``tokens`` contains any entry of ``negative_words``, else 0."""
    return 1 if _contains_any_phrase(tokens, negative_words) else 0


In [32]:
# Apply politeness and negativity indicators to the prompt and both responses.
# Each entry is (new column, token column, indicator function); the list order
# determines the order in which the columns are appended to df.
indicator_columns = [
    ('prompt_contains_politeness', 'prompt_tokens', contains_politeness),
    ('a_response_contains_politeness', 'a_response_tokens', contains_politeness),
    ('b_response_contains_politeness', 'b_response_tokens', contains_politeness),
    ('prompt_contains_negative', 'prompt_tokens', contains_negative),
    ('a_response_contains_negative', 'a_response_tokens', contains_negative),
    ('b_response_contains_negative', 'b_response_tokens', contains_negative),
]

for target_col, tokens_col, indicator_fn in indicator_columns:
    df[target_col] = df[tokens_col].apply(indicator_fn)


In [34]:
# Preview the first rows with the politeness and negativity indicator columns included
df.head()

Unnamed: 0,question_id,model_a,model_b,winner,judge,conversation_a,conversation_b,prompt,response_a,response_b,...,response_ab_keyword_overlap,prompt_unique_words,a_response_unique_words,b_response_unique_words,prompt_contains_politeness,a_response_contains_politeness,b_response_contains_politeness,prompt_contains_negative,a_response_contains_negative,b_response_contains_negative
0,58210e39b3fd4441a2bd4a518bb44c2d,chatglm-6b,koala-13b,model_b,arena_user_973,[{'content': 'What is the difference between O...,[{'content': 'What is the difference between O...,What is the difference between OpenCL and CUDA?,OpenCL and CUDA are two different programming ...,OpenCL and CUDA are both programming languages...,...,34,8,88,134,0,0,0,0,0,1
1,2564acd09e3942fd97657d05282d4389,oasst-pythia-12b,alpaca-13b,tie,arena_user_973,[{'content': 'Why did my parent not invite me ...,[{'content': 'Why did my parent not invite me ...,Why did my parent not invite me to their wedding?,It is possible that your parent did not invite...,It is likely that they wanted to keep the gues...,...,22,10,67,61,0,0,0,1,1,0
2,90bfd142157948aba01931726c888e7f,koala-13b,oasst-pythia-12b,model_b,arena_user_973,"[{'content': 'Fuji vs. Nikon, which is better?...","[{'content': 'Fuji vs. Nikon, which is better?...","Fuji vs. Nikon, which is better?",Both Fuji and Nikon are popular camera brands ...,This is a subjective question and the answer d...,...,46,6,106,119,0,0,0,0,0,1
3,a7c5accc53e649a3bc6b2e41d962ebc4,vicuna-13b,oasst-pythia-12b,model_b,arena_user_973,[{'content': 'How to build an arena for chatbo...,[{'content': 'How to build an arena for chatbo...,How to build an arena for chatbots?,Building an arena for chatbots can be done by ...,Building an arena for chatbots is a great way ...,...,57,7,121,125,0,0,0,0,0,0
4,adf27e819a3c494cb6e993f0c660e097,vicuna-13b,koala-13b,model_a,arena_user_973,"[{'content': 'When is it today?', 'role': 'use...","[{'content': 'When is it today?', 'role': 'use...",When is it today?,"I'm sorry, I cannot determine the current date...","Today is February 23, 2023.",...,1,4,37,5,0,0,0,0,1,0


In [None]:
from transformers import pipeline
import torch

# Use the first CUDA device when one is available, otherwise run on CPU (-1)
device = 0 if torch.cuda.is_available() else -1

# Load the sentiment-analysis pipeline with truncation set to 512 tokens and GPU support
sentiment_model = pipeline(
    "sentiment-analysis", 
    model="distilbert-base-uncased-finetuned-sst-2-english", 
    framework="pt",
    max_length=512,  # Set to 512 tokens
    truncation=True,
    batch_size=8,
    device=device  # Specify GPU if available
)


def _signed_sentiment(result):
    """Map one pipeline result to a signed score.

    Returns ``+score`` when the label is 'POSITIVE' and ``-score`` for any
    other label.
    """
    return result['score'] if result['label'] == 'POSITIVE' else -result['score']


# Define a function to get sentiment score using token lists
def get_transformer_sentiment_from_tokens(tokens):
    """Return the signed sentiment score for a single list of tokens.

    The tokens are joined with single spaces and passed to ``sentiment_model``.
    """
    text = " ".join(tokens)
    return _signed_sentiment(sentiment_model(text)[0])


def get_transformer_sentiment_from_token_lists(token_lists):
    """Return signed sentiment scores for many token lists in one pipeline call.

    Calling the pipeline once per row sends one text at a time, so the
    ``batch_size`` configured above never takes effect. Passing the full list
    lets the pipeline batch the texts itself. Scores are returned in input order.
    """
    texts = [" ".join(tokens) for tokens in token_lists]
    if not texts:
        return []
    return [_signed_sentiment(result) for result in sentiment_model(texts)]


# Apply to token columns in your DataFrame (one batched call per column)
df['prompt_sentiment'] = get_transformer_sentiment_from_token_lists(df['prompt_tokens'])
df['a_response_sentiment'] = get_transformer_sentiment_from_token_lists(df['a_response_tokens'])
df['b_response_sentiment'] = get_transformer_sentiment_from_token_lists(df['b_response_tokens'])