# Introduction 📜

✔️ What is the objective of this notebook?

The goal is to create a robust and efficient solution to predict users' preference of LLM responses using LightGBM and TF-IDF vectorization.

---

✔️ What does this notebook cover?

- `Data Loading & EDA`

- `Theory behind TF-IDF`

- `Data Preprocessing`

- `Model Training`
       
- `Model Inference`
     

# Imports 📦

In [None]:
# Handle warning messages
# NOTE: the 'ignore' filter is global, so every warning raised after this
# cell (including deprecation warnings from libraries) is silenced.
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Data preprocessing
import numpy as np
import pandas as pd
from pathlib import Path

# Data visualization
import plotly.graph_objects as go
from sklearn.metrics import confusion_matrix

# Model development
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold

# TF-IDF Vectorization
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer

# Similarity/distance features for TF-IDF vectors
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances, laplacian_kernel

# Configuration ⚙️

In [None]:
class CFG:
    """Notebook-wide settings: input files, plotting, TF-IDF and LightGBM options."""

    # Competition input files
    train_data = Path("/kaggle/input/lmsys-chatbot-arena/train.csv")
    test_data = Path("/kaggle/input/lmsys-chatbot-arena/test.csv")
    subm_data = Path("/kaggle/input/lmsys-chatbot-arena/sample_submission.csv")

    # Plotly colorscale used for the confusion-matrix heatmap
    colorscale = "peach"

    # TF-IDF / SVD settings
    components = 32   # number of SVD components kept per text column
    ngrams = (1, 7)   # (min_n, max_n) range of n-gram lengths
    max_freq = 0.95   # n-grams present in more than 95% of the documents are dropped
    min_freq = 10     # n-grams present in fewer than 10 documents are dropped

    # Training loop settings
    num_classes = 3
    early_stop = 50   # rounds passed to the early-stopping callback
    log_steps = 100   # period passed to the evaluation-logging callback

    # Hyperparameters handed to LightGBM as-is
    params = dict(
        objective="multiclass",
        colsample_bytree=0.8,
        colsample_bynode=0.8,
        metric="multiclass",
        learning_rate=0.02,
        extra_trees=True,
        num_rounds=3000,
        reg_lambda=1.3,
        num_classes=num_classes,
        num_leaves=64,
        reg_alpha=0.1,
        device="cpu",
        max_depth=6,
        max_bin=128,
        verbose=-1,
        seed=42,
    )

# Exploratory Data Analysis (EDA) 🗃️

In [None]:
class EDA:
    """Helpers for loading the CSV tables and plotting winner-class statistics."""

    def read_data(self, path):
        """Read a CSV file, print its shape, preview the first 3 rows and return it."""
        frame = pd.read_csv(path)

        print(f"The shape of the dataframe is: {frame.shape}")
        display(frame.head(3))

        return frame

    def pie_chart(self, data):
        """Draw a pie chart of the winner-class counts and display them as a table."""
        # Number of rows flagged for each winner column
        counts = {
            column: data[column].sum()
            for column in ('winner_model_a', 'winner_model_b', 'winner_tie')
        }

        # Slice colors and the names used to identify them
        palette = ['#a89192', '#8083a8', '#a8c28c']  # creme, light blue, mint
        identifiers = ['Creme', 'Light Blue', 'Mint']

        pie = go.Pie(
            labels=identifiers,
            values=list(counts.values()),
            textinfo='percent',
            hole=0.1,
            marker=dict(colors=palette, line=dict(color='#FFFFFF')),
        )
        fig = go.Figure(data=[pie])

        # Transparent background, no margins, no legend
        fig.update_layout(
            plot_bgcolor='rgba(0,0,0,0)',
            paper_bgcolor='rgba(0,0,0,0)',
            margin=dict(l=0, r=0, t=0, b=0),
            showlegend=False,
        )
        fig.show()

        # Tabular view of the same counts, keyed by the slice identifiers
        counts_df = pd.DataFrame(list(counts.items()), columns=['Class', 'Count'])
        counts_df['Identifier'] = identifiers
        display(counts_df)

    def response_length(self, data):
        """Plot the average response word count (both responses) per winner class."""
        # Work on a copy so the caller's dataframe gains no helper columns
        annotated = data.copy()

        # Whitespace-delimited word count of each response
        for side in ('a', 'b'):
            annotated[f'word_count_{side}'] = annotated[f'response_{side}'].apply(
                lambda text: len(str(text).split())
            )

        # Per winner class: mean of the two per-column means, truncated to int
        count_columns = ['word_count_a', 'word_count_b']
        word_counts = {}
        for winner in ('winner_model_a', 'winner_model_b', 'winner_tie'):
            rows = annotated.loc[annotated[winner] == 1, count_columns]
            word_counts[winner] = int(rows.mean().mean())

        # Hover text shown for each bar
        hover_texts = []
        for key, value in word_counts.items():
            hover_texts.append(f"Word Count: {value}<br>{key}")

        bars = go.Bar(
            x=list(word_counts.keys()),  # winner class labels on the x-axis
            y=list(word_counts.values()),
            marker=dict(color=['#a89192', '#8083a8', '#a8c28c']),
            hovertext=hover_texts,
            hoverinfo='text',
            orientation='v',  # vertical bars
        )
        fig = go.Figure(data=[bars])

        fig.update_layout(
            title='Average Response Word Count by Winner Class',
            xaxis_title='',
            yaxis_title='Average Response Word Count',
            plot_bgcolor='rgba(0,0,0,0)',
            paper_bgcolor='rgba(0,0,0,0)',
            xaxis=dict(showticklabels=False),  # hide x-axis tick labels
        )
        fig.show()

In [None]:
# Helper instance used by the loading and plotting cells below
eda = EDA()

In [None]:
# Load the training table (prints its shape and previews 3 rows)
train_data = eda.read_data(CFG.train_data)

In [None]:
# Load the test table (prints its shape and previews 3 rows)
test_data = eda.read_data(CFG.test_data)

In [None]:
# Load the sample submission table (prints its shape and previews 3 rows)
subm_data = eda.read_data(CFG.subm_data)

In [None]:
# Pie chart and count table of the three winner columns in the training data
print("Distribution of classes (winners):")
eda.pie_chart(train_data)

In [None]:
# Plot the average response word count (responses A and B combined) per winner class
eda.response_length(train_data)

# Theory 📒

✔️ **Term Frequency - Inverse Document Frequency** or **TF-IDF** vectorization is used in text mining and information retrieval to assess the importance of words in a document relative to a corpus. This technique transforms text data into a numerical format suitable for machine learning algorithms.

---

✔️ **Components of TF-IDF**

1. Term Frequency (TF):

   - *Definition:* Measures the frequency of a term in a document.
   
   - *Formula:* $ \text{TF}(t,d) = \frac{f_{t,d}}{\sum\limits_{t' \in d} f_{t',d}} $ , where $ f_{t,d} $ is the frequency of term $ t $ in document $ d $.

2. Inverse Document Frequency (IDF):

   - *Definition:* Measures the importance of a term across the entire corpus.
   
   - *Formula:* $ \text{IDF}(t) = \log \left( \frac{N}{1 + n_t} \right) $ , where $ N $ is the total number of documents, and $ n_t $ is the number of documents containing term $ t $.

3. TF-IDF Score:

   - *Definition:* Product of TF and IDF scores.
   
   - *Formula:* $ \text{TF-IDF}(t,d) = \text{TF}(t,d) \times \text{IDF}(t) $
   
---

✔️ ***N-grams* explained**

*N-grams* are contiguous sequences of $ n $ items (tokens) extracted from a text document. They provide a more comprehensive representation of the language structure and context compared to individual words.

*Formula:* $ N\text{-grams} = [t_1, t_2, ..., t_n] $

*Example:* Setting `ngrams = (1, 3)` means we extract every contiguous sequence of 1, 2, or 3 tokens from the text document, i.e. all unigrams, bigrams, and trigrams.

For instance, consider the sentence: "I love coding."

With `ngrams = (1, 3)`, the n-grams extracted from this sentence would include:

   * Unigrams (1-grams): ["I"], ["love"], ["coding"]
    
   * Bigrams (2-grams): ["I love"], ["love coding"]
    
   * Trigrams (3-grams): ["I love coding"]

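This way, $ N\text{-grams} $ capture not only individual tokens but also phrases and contextual information within the text. Note that the example above uses words as tokens, whereas the vectorizer configured in this notebook works on characters, so its n-grams are character sequences.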
  
---
   
✔️ **Steps of TF-IDF**

1. Tokenization:

   - *Definition:* Breaks text into tokens.
   
   - *Example:* "I love coding" -> ["I", "love", "coding"]

2. Document Frequency Calculation:

   - *Definition:* Counts the number of documents containing each term.
   
   - *Example:* "love" appears in 1 document out of 1.

3. TF-IDF Calculation:

   - *Definition:* Computes the TF-IDF score for each term in each document.
   
   - *Example:* With `ngrams = (1, 3)`, the term "love" appears in Document 1, so its TF-IDF score is the product of its TF in that document and its IDF across the corpus.

4. Vectorization:

   - *Definition:* Represents each document as a vector of TF-IDF scores.
   
   - *Example:* Each document becomes a high-dimensional vector where each dimension corresponds to a unique term or n-gram.


# Data Preprocessing 🛠️

In [None]:
class DataPreprocessing:
    """Feature engineering for prompt/response pairs.

    Provides hand-crafted length / vocabulary / missing-turn features, TF-IDF
    features reduced with TruncatedSVD, prompt-to-response similarity and
    distance features, and a helper that merges the three winner columns into
    a single class label.
    """

    @staticmethod
    def retrieve_none(vals):
        """Return 1 if any turn in ``vals`` is missing (``None``), else 0.

        ``vals`` may be a list of turns or the raw string read from the CSV.
        A string is decoded as JSON on a best-effort basis so that ``null``
        entries are detected; iterating over the raw string would only yield
        characters, which are never ``None``.
        NOTE(review): assumes raw responses are JSON-encoded lists such as
        '["text", null]' -- confirm against the dataset. Strings that are not
        valid JSON lists, and non-iterable values, yield 0.
        """
        if isinstance(vals, str):
            import json  # local import keeps this block self-contained

            try:
                vals = json.loads(vals)
            except ValueError:
                return 0
            if not isinstance(vals, list):
                return 0

        try:
            return int(any(val is None for val in vals))
        except TypeError:
            # Non-iterable cell (e.g. a float NaN): nothing to inspect
            return 0

    @staticmethod
    def retrieve_length(vals):
        """Return the total number of characters of the strings in ``vals``.

        For a plain string this is simply its length; for a list only the
        string items are counted. Non-iterable values yield 0.
        """
        if isinstance(vals, str):
            return len(vals)

        try:
            items = iter(vals)
        except TypeError:
            return 0
        return sum(len(val) for val in items if isinstance(val, str))

    @staticmethod
    def retrieve_nuniques(vals):
        """Return the number of unique whitespace-separated words in a string.

        Non-string input yields 0.
        """
        if isinstance(vals, str):
            return len(set(vals.split()))
        return 0

    @staticmethod
    def clean_response(text):
        """Join a list of turns into one string, writing ``None`` as 'NONE'.

        Anything that is not a list is returned unchanged.
        """
        if isinstance(text, list):
            return ' '.join([str(item) if item is not None else 'NONE' for item in text])

        return text

    def add_features(self, data):
        """Add length, unique-word and missing-turn features to ``data``.

        The columns are added to ``data`` in place and the same dataframe is
        returned. Ratios are undefined (NaN) when the corresponding mean is 0.
        """
        # Character length of each response
        data["response_a_len"] = data["response_a"].apply(self.retrieve_length)
        data["response_b_len"] = data["response_b"].apply(self.retrieve_length)

        # Unique word count of each response
        data["response_a_unique"] = data["response_a"].apply(self.retrieve_nuniques)
        data["response_b_unique"] = data["response_b"].apply(self.retrieve_nuniques)

        # Length difference, mean length, and difference relative to the mean
        data["response_len_diff"] = data["response_a_len"] - data["response_b_len"]
        data["response_len_mean"] = (data["response_a_len"] + data["response_b_len"]) / 2
        data["response_diff_ratio"] = data["response_len_diff"] / data["response_len_mean"]

        # Same three statistics for the unique word counts
        data["response_unique_diff"] = data["response_a_unique"] - data["response_b_unique"]
        data["response_unique_mean"] = (data["response_a_unique"] +
                                        data["response_b_unique"]) / 2
        data["response_unique_ratio"] = (data["response_unique_diff"] /
                                         data["response_unique_mean"])

        # Missing-turn flags and their difference
        data["a_has_none"] = data["response_a"].apply(self.retrieve_none)
        data["b_has_none"] = data["response_b"].apply(self.retrieve_none)
        data["has_none_diff"] = data["a_has_none"] - data["b_has_none"]

        return data

    @staticmethod
    def calculate_cosine_similarity(tfidf_matrix,
                                    prompt_idx,
                                    response_a_idx,
                                    response_b_idx):
        """Return the cosine similarity of the prompt row with each response row.

        All three indices are row positions in ``tfidf_matrix``.
        Returns ``(similarity_pa, similarity_pb)``.
        """
        prompt_vec = tfidf_matrix[prompt_idx].reshape(1, -1)

        # Cosine similarity between prompt (p) and response_a (a)
        similarity_pa = cosine_similarity(
            prompt_vec,
            tfidf_matrix[response_a_idx].reshape(1, -1)
        )[0][0]

        # Cosine similarity between prompt (p) and response_b (b)
        similarity_pb = cosine_similarity(
            prompt_vec,
            tfidf_matrix[response_b_idx].reshape(1, -1)
        )[0][0]

        return similarity_pa, similarity_pb

    @staticmethod
    def calculate_distances(tfidf_matrix,
                            prompt_idx,
                            response_a_idx,
                            response_b_idx,
                            distance_metric):
        """Apply ``distance_metric`` to the prompt row and each response row.

        ``distance_metric`` is a pairwise function taking two 2-D arrays and
        returning a matrix (e.g. ``euclidean_distances``, ``laplacian_kernel``).
        Returns ``(distance_pa, distance_pb)``.
        """
        prompt_vec = tfidf_matrix[prompt_idx].reshape(1, -1)

        # Distance between prompt (p) and response_a (a)
        distance_pa = distance_metric(
            prompt_vec,
            tfidf_matrix[response_a_idx].reshape(1, -1)
        )[0][0]

        # Distance between prompt (p) and response_b (b)
        distance_pb = distance_metric(
            prompt_vec,
            tfidf_matrix[response_b_idx].reshape(1, -1)
        )[0][0]

        return distance_pa, distance_pb

    def create_tfidf_features(self, train, test, ngrams, min_freq, max_freq, components):
        """Add SVD-reduced TF-IDF features and prompt/response similarity features.

        A character-level TF-IDF vectorizer is fitted on the prompts and both
        responses of train and test together, reduced to ``components``
        dimensions with TruncatedSVD, and the reduced vectors are attached as
        ``svd_prompts_i`` / ``svd_response_a_i`` / ``svd_response_b_i``
        columns. Cosine similarity, Euclidean distance and Laplacian kernel
        values between each prompt and its two responses are added as well.

        Returns new ``(train, test)`` dataframes; the inputs are not modified.
        """
        # Initialize TF-IDF Vectorizer
        tfidf_vectorizer = TfidfVectorizer(analyzer='char',
                                           ngram_range=ngrams,
                                           min_df=min_freq,
                                           max_df=max_freq,
                                           lowercase=False,
                                           sublinear_tf=True)

        # Combine train and test data into a single DataFrame
        full_data = pd.concat([train, test], ignore_index=True)

        # Clean and prepare the text columns
        text_cols = ['prompt', 'response_a', 'response_b']
        for col in text_cols:
            full_data[col] = full_data[col].apply(self.clean_response)

        # Stack the three text columns into one corpus: all prompts first,
        # then all response_a, then all response_b
        full_corpus = pd.concat([full_data[col] for col in text_cols], ignore_index=True)

        # Compute the TF-IDF matrix and reduce its dimensionality
        tfidf_matrix = tfidf_vectorizer.fit_transform(full_corpus)
        svd = TruncatedSVD(n_components=components, random_state=42)
        reduced_matrix = svd.fit_transform(tfidf_matrix)

        # Each text column occupies a block of len_full consecutive rows
        len_full = len(full_data)
        len_train = len(train)
        blocks = {
            'prompts': reduced_matrix[:len_full],
            'response_a': reduced_matrix[len_full:2 * len_full],
            'response_b': reduced_matrix[2 * len_full:],
        }

        def svd_frame(rows, index):
            # Columns are interleaved per component (prompts, a, b) to keep
            # the feature order stable
            columns = {}
            for i in range(components):
                for name, block in blocks.items():
                    columns[f'svd_{name}_{i}'] = block[rows, i]
            return pd.DataFrame(columns, index=index)

        # Within each block the train rows come first, followed by the test rows
        train = pd.concat([train, svd_frame(slice(None, len_train), train.index)], axis=1)
        test = pd.concat([test, svd_frame(slice(len_train, None), test.index)], axis=1)

        # Similarity and distance features. For the prompt at row r of
        # reduced_matrix, its responses live at r + len_full and r + 2 * len_full;
        # test prompts start at row len_train.
        for df, offset in ((train, 0), (test, len_train)):
            triples = [(row, row + len_full, row + 2 * len_full)
                       for row in range(offset, offset + len(df))]

            cosine = [self.calculate_cosine_similarity(reduced_matrix, p, a, b)
                      for p, a, b in triples]
            df['similarity_pa'] = [pa for pa, _ in cosine]
            df['similarity_pb'] = [pb for _, pb in cosine]

            euclidean = [self.calculate_distances(reduced_matrix, p, a, b,
                                                  euclidean_distances)
                         for p, a, b in triples]
            df['euclidean_pa'] = [pa for pa, _ in euclidean]
            df['euclidean_pb'] = [pb for _, pb in euclidean]

            laplacian = [self.calculate_distances(reduced_matrix, p, a, b,
                                                  laplacian_kernel)
                         for p, a, b in triples]
            df['laplacian_pa'] = [pa for pa, _ in laplacian]
            df['laplacian_pb'] = [pb for _, pb in laplacian]

        return train, test

    def merge_label(self, row):
        """Map the one-hot winner columns of ``row`` to a class id.

        Returns 0 for model A, 1 for model B and 2 for a tie.

        Raises:
            ValueError: if none of the three winner columns equals 1.
        """
        if row["winner_model_a"] == 1:
            return 0
        if row["winner_model_b"] == 1:
            return 1
        if row["winner_tie"] == 1:
            return 2
        raise ValueError("The value is invalid.")

In [None]:
# Helper instance used by the feature-engineering cells below
dp = DataPreprocessing()

In [None]:
# Add length, unique-word-count and has-None features
# (add_features adds the columns to the dataframe it is given and returns it)
train_data = dp.add_features(train_data)
test_data = dp.add_features(test_data)

In [None]:
# Extract TF-IDF features, reduce them with TruncatedSVD, and add
# prompt/response similarity and distance features
train_data, test_data = dp.create_tfidf_features(train_data, 
                                                 test_data, 
                                                 CFG.ngrams,
                                                 CFG.min_freq, 
                                                 CFG.max_freq, 
                                                 CFG.components)

In [None]:
# Collapse the three one-hot winner columns into a single integer target
# (0 = model A wins, 1 = model B wins, 2 = tie)
train_data["target"] = train_data[
    ["winner_model_a", "winner_model_b", "winner_tie"]
].apply(dp.merge_label, axis=1)

# Model Development 🧠

In [None]:
class ModelDevelopment:
    """Cross-validated LightGBM training and a confusion-matrix plot."""

    def train_lgb(self, train_data, test_data, feature_cols, params, early_stop, log_steps,
                  n_splits=5):
        """Train one LightGBM model per stratified fold and average the predictions.

        Args:
            train_data: dataframe holding ``feature_cols`` and a "target" column.
            test_data: dataframe holding ``feature_cols``.
            feature_cols: list of feature column names.
            params: LightGBM parameter dict passed to ``lgb.train``.
            early_stop: number of rounds for the early-stopping callback.
            log_steps: period for the evaluation-logging callback.
            n_splits: number of stratified folds (default 5).

        Returns:
            ``(train_preds, test_preds)``: predictions on the full training
            and test matrices, averaged over the fold models. Every fold model
            predicts on the whole training matrix, so ``train_preds`` is
            mostly in-sample and is not an out-of-fold estimate.
        """
        # Extract feature values and target labels as plain arrays, so the
        # positional fold indices below never depend on the dataframe index
        X_train = train_data[feature_cols].values
        X_test = test_data[feature_cols].values
        Y_train = train_data["target"].to_numpy()

        # Per-fold predictions
        train_preds_list = []
        test_preds_list = []

        cv = StratifiedKFold(n_splits=n_splits, random_state=42, shuffle=True)
        for train_index, valid_index in cv.split(X_train, Y_train):
            # Split the training data into training and validation sets for the current fold
            x_train, x_valid = X_train[train_index], X_train[valid_index]
            y_train, y_valid = Y_train[train_index], Y_train[valid_index]

            # Create LightGBM dataset objects for training and validation
            train = lgb.Dataset(x_train, y_train)
            valid = lgb.Dataset(x_valid, y_valid, reference=train)

            # Train the model on the current fold
            model = lgb.train(
                params,
                train,
                valid_sets=[train, valid],
                feature_name=feature_cols,
                callbacks=[lgb.early_stopping(early_stop),
                           lgb.log_evaluation(log_steps)])

            # Predict on the full train and test matrices
            train_preds_list.append(model.predict(X_train))
            test_preds_list.append(model.predict(X_test))

        # Average predictions over the folds
        train_preds = np.mean(train_preds_list, axis=0)
        test_preds = np.mean(test_preds_list, axis=0)

        return train_preds, test_preds

    def plot_cm(self, y_true, y_pred, labels, colorscale, zmax=20000):
        """Plot the confusion matrix of ``y_pred`` against ``y_true`` as a heatmap.

        Args:
            y_true: true class labels.
            y_pred: predicted class labels.
            labels: class labels, used for the matrix order and both axes.
            colorscale: Plotly colorscale name.
            zmax: upper bound of the color range (default 20000); larger
                counts are drawn with the top color.
        """
        cm = confusion_matrix(y_true, y_pred, labels=labels)

        # Heatmap with the counts written in the cells; the hover text is
        # built from z by the hover template
        fig = go.Figure(data=go.Heatmap(
            z=cm,
            x=labels,
            y=labels,
            colorscale=colorscale,
            zmin=0,
            zmax=zmax,
            text=cm,
            texttemplate="%{text:.0f}",
            hovertemplate="True: %{y}<br>Predicted: %{x}<br>Count: %{z:,.0f}<extra></extra>",
        ))

        # Transparent background and square aspect ratio
        fig.update_layout(
            plot_bgcolor='rgba(0,0,0,0)',
            paper_bgcolor='rgba(0,0,0,0)',
            xaxis_title="Predicted Labels",
            yaxis_title="True Labels",
            xaxis=dict(constrain='domain'),
            yaxis=dict(constrain='domain', scaleanchor='x'),
            width=650,
            height=650,
            margin=dict(t=65, b=65, l=65, r=65)
        )

        # Show the plot
        fig.show()

In [None]:
# Helper instance used by the training and evaluation cells below
md = ModelDevelopment()

In [None]:
# Define label columns
label_cols = ["winner_model_a", "winner_model_b", "winner_tie"]

# Define the list of columns to exclude from the training features:
# identifiers, raw text, the one-hot labels and the merged target
excluded_features = ['id', 
                     'model_a', 
                     'model_b', 
                     'prompt', 
                     'response_a', 
                     'response_b',
                     'winner_model_a', 
                     'winner_model_b', 
                     'winner_tie', 
                     'target', 
                     'fold_id']

# Every remaining column of train_data is used as a model feature
features = [col for col in train_data.columns if col not in excluded_features]

In [None]:
# Train LightGBM with cross-validation; returns fold-averaged class
# probabilities for the train and test rows
train_preds, test_preds = md.train_lgb(train_data, 
                                       test_data, 
                                       features,
                                       CFG.params, 
                                       CFG.early_stop, 
                                       CFG.log_steps)

In [None]:
# Confusion matrix for (mean) predictions on train data;
# the predicted class is the argmax over the averaged class probabilities
md.plot_cm(train_data['target'], np.argmax(train_preds, axis=1), [0, 1, 2], CFG.colorscale)

# Submit Predictions 💡

In [None]:
# Assign the predicted test class probabilities to the label columns of the
# submission dataframe (the prediction columns follow the order of label_cols)
subm_data[label_cols] = test_preds

# Save the submission dataframe and display the first 3 rows
subm_data.to_csv("submission.csv", index=False)
display(subm_data.head(3))