In [1]:
# Install kaleido, the engine plotly relies on for static image export
# (the plotting helpers below call fig.write_image).
# NOTE(review): the version is not pinned; the recorded run installed 0.2.1.
!pip install -U kaleido

Collecting kaleido
  Downloading kaleido-0.2.1-py2.py3-none-manylinux1_x86_64.whl.metadata (15 kB)
Downloading kaleido-0.2.1-py2.py3-none-manylinux1_x86_64.whl (79.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.9/79.9 MB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: kaleido
Successfully installed kaleido-0.2.1


In [2]:
import kaleido

In [3]:
# Mount Google Drive: every results file and output folder used in this
# notebook lives under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
# %%writefile "/content/drive/MyDrive/Technion - LLM Research 2024/Coding Environment /harness_eval_large_scale/lm_harness_visualization.py"

import json
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from google.colab import drive
import plotly.graph_objects as go
import plotly.express as px
import scipy.stats as stats

class LmHarnessVis:
    def __init__(self, file_path):
        self.file_path = file_path
        self.drive_mounted = False
        if not self.drive_mounted:
            drive.mount('/content/drive')
            self.drive_mounted = True

    def mount_drive(self):
        if not self.drive_mounted:
            drive.mount('/content/drive')
            self.drive_mounted = True

    def read_res_json(self, file_name):
        json_data = []
        full_path = f"{self.file_path}/{file_name}"
        with open(full_path, 'r') as file:
            for line in file:
                json_data.append(json.loads(line))
        return json_data


    def create_res_df(self, json_data):
        data = []
        for entry in json_data:
            category = entry['doc']['Category']
            sub_category = entry['doc']['Sub-Category']
            responses = entry.get('filtered_resps', [])
            likelihoods = np.array([float(resp[0]) for resp in responses])
            true_indices = [i for i, resp in enumerate(responses) if resp[1] == "True"]

            # Apply softmax to the likelihoods to convert them to probabilities
            probabilities = self.softmax(likelihoods)
            confidence_percentages = probabilities * 100

            # Extract confidence percentages for 'True' answers
            true_confidences = [confidence_percentages[i] for i in true_indices]

            mean_confidence = np.mean(true_confidences) if true_confidences else 0
            correct = entry.get('acc', 0)

            data.append({
                'Category': category,
                'Sub-Category': sub_category,
                'Total Responses': len(responses),
                'True Responses': len(true_confidences),
                'Mean Confidence (%)': mean_confidence,
                'Correct': correct,

            })
        return pd.DataFrame(data)


    def calculate_accuracy(self, df, category_level='Sub-Category'):
        groupby_columns = ['Category', 'Sub-Category'] if category_level == 'Sub-Category' else ['Category']
        accuracy_data = df.groupby(groupby_columns).agg(
            Total_Questions=('Correct', 'size'),
            Correct_Answers=('Correct', 'sum')
        )
        accuracy_data['Accuracy (%)'] = (accuracy_data['Correct_Answers'] / accuracy_data['Total_Questions']) * 100
        return accuracy_data

    def total_accuracy(self, df):
        total_questions = df['Correct'].count()
        total_correct_answers = df['Correct'].sum()
        total_accuracy = (total_correct_answers / total_questions) * 100
        return total_accuracy

    def softmax(self, x):
        e_x = np.exp(x - np.max(x))
        return e_x / e_x.sum()

    def plot_average_likelihood(self, df, model_name, path=None):
        """Show a bar chart of the mean confidence per sub-category.

        Bars are sorted by ascending mean; the standard deviation appears in
        the hover text only.

        Args:
            df: DataFrame with 'Sub-Category' and 'Mean Confidence (%)' columns.
            model_name: Label used in the chart title (and so in the file name).
            path: Optional prefix for a PNG export. It is concatenated with
                the file name as-is, so a directory must end with a separator.
        """
        # Group by 'Sub-Category' to calculate mean and std of 'Mean Confidence (%)'.
        # The local is not called `stats` so it does not shadow `scipy.stats`.
        confidence_stats = df.groupby('Sub-Category')['Mean Confidence (%)'].agg(['mean', 'std'])
        stats_sorted = confidence_stats.sort_values(by='mean', ascending=True)

        # Use a subtle color palette for each bar
        colors = [f"rgba({color[0]*255}, {color[1]*255}, {color[2]*255}, 0.8)" for color in sns.color_palette("Paired", len(stats_sorted))]

        # Create a bar chart using plotly
        fig = go.Figure()

        # Add bars with mean shown above and std only in hover info
        fig.add_trace(go.Bar(
            x=stats_sorted.index,
            y=stats_sorted['mean'],
            marker=dict(color=colors),
            text=[f"{val:.2f}%" for val in stats_sorted['mean']],  # Mean to display above bars
            textposition='outside',  # Position mean text above bars
            hovertext=[f"Mean: {val:.2f}%<br>Std: {std:.2f}%" for val, std in zip(stats_sorted['mean'], stats_sorted['std'])],
            hoverinfo="text"  # Display mean and std in hover only
        ))

        # Update layout for readability
        fig.update_layout(
            title=f"{model_name} Confidence by Sub-Category",
            xaxis_title="Sub-Category",
            yaxis_title="Average Confidence (%) of Model Answers",
            yaxis=dict(range=[0, 110]),
            template="plotly_white",
            title_x=0.5,
            title_font_size=20
        )

        # Save the chart as PNG if a path is provided
        if path:
            # Use the title as the file name, replacing spaces with underscores
            file_name = f"{fig.layout.title.text.replace(' ', '_')}.png"
            full_path = f"{path}{file_name}"
            fig.write_image(full_path, format="png", width=1600, height=1200, scale=2)

        # Display the chart (once; it used to be shown twice)
        fig.show()



    def create_bar_chart(self, df, model_name, accuracy_df, path=None):
        """Show a grouped bar chart of accuracy per sub-category, one trace per category.

        Args:
            df: Not used by this method; kept so existing calls keep working.
            model_name: Label used in the chart title (and so in the file name).
            accuracy_df: Frame with an 'Accuracy (%)' column and 'Category' /
                'Sub-Category' available either as columns or as the index.
            path: Optional prefix for a PNG export. It is concatenated with
                the file name as-is, so a directory must end with a separator.

        Side effects:
            If 'Category' is not a column, ``accuracy_df`` has its index reset
            in place, so the caller's DataFrame is modified as well.
        """
        # Ensure 'Category' is a column in accuracy_df (done in place, see docstring)
        if 'Category' not in accuracy_df.columns:
            accuracy_df.reset_index(inplace=True)

        # Define the custom color palette
        color_palette = px.colors.qualitative.Dark24 + px.colors.qualitative.Set2

        fig = go.Figure()

        # One trace per category, holding that category's sub-category bars
        for i, category in enumerate(accuracy_df['Category'].unique()):
            sub_category_df = accuracy_df[accuracy_df['Category'] == category]

            fig.add_trace(go.Bar(
                x=sub_category_df['Sub-Category'],
                y=sub_category_df['Accuracy (%)'],
                name=category,
                text=[f"{acc:.1f}%" for acc in sub_category_df['Accuracy (%)']],
                textposition='outside',
                marker=dict(
                    # Cycle through the palette if categories exceed colors
                    color=color_palette[i % len(color_palette)],
                    opacity=0.8,
                    line=dict(color='black', width=0.8)  # Add border to bars
                )
            ))

        # Update layout for readability
        fig.update_layout(
            barmode='group',  # Group bars by category
            title=f"{model_name} Accuracy Chart",
            xaxis_title="Sub-Categories",
            yaxis_title="Accuracy (%)",
            yaxis=dict(range=[0, 110]),
            legend_title_text='Categories',
            template="plotly_white",
            title_x=0.5,
            title_font_size=20
        )

        # Save the chart as PNG if a path is provided
        if path:
            # Use the title as the file name, replacing spaces with underscores
            file_name = f"{fig.layout.title.text.replace(' ', '_')}.png"
            full_path = f"{path}{file_name}"
            fig.write_image(full_path, format="png", width=1200, height=900, scale=2)
        fig.show()





    def create_comparison_bar_chart(self, df_before, df_after, model_name_1, model_name_2, path=None):
        """Show side-by-side accuracy bars of two models for every sub-category.

        Args:
            df_before: Accuracy frame of the first model. Needs 'Accuracy (%)'
                and 'Sub-Category'; the latter may be a column or part of the
                index (as returned by ``calculate_accuracy``).
            df_after: Accuracy frame of the second model, same layout.
            model_name_1: Name of the first model (left, more opaque bars).
            model_name_2: Name of the second model (right, lighter bars).
            path: Optional prefix for a PNG export. It is concatenated with
                the file name as-is, so a directory must end with a separator.

        The input frames are not modified.
        """
        def _flat(frame):
            # Expose index levels as columns; reset_index() returns a new
            # frame, so the caller's data is left untouched.
            return frame if 'Sub-Category' in frame.columns else frame.reset_index()

        before = _flat(df_before)
        after = _flat(df_after)

        # Unique sub-categories in order of first appearance (first model, then second)
        sub_categories = pd.concat([before['Sub-Category'], after['Sub-Category']]).unique()

        # Use a vibrant color palette
        color_palette = px.colors.qualitative.Dark24 + px.colors.qualitative.Set2

        # (frame, x offset, legend suffix, label name, opacity) for each model;
        # the offsets place the two bars left and right of the tick position.
        model_specs = (
            (before, -0.2, "Before", model_name_1, 0.8),
            (after, 0.2, "After", model_name_2, 0.5),
        )

        fig = go.Figure()

        for i, sub_category in enumerate(sub_categories):
            # Both bars of a sub-category share one color; cycle through the
            # palette when there are more sub-categories than colors.
            color = color_palette[i % len(color_palette)]

            for frame, offset, suffix, model_name, opacity in model_specs:
                sub_data = frame[frame['Sub-Category'] == sub_category]
                if sub_data.empty:
                    continue
                fig.add_trace(go.Bar(
                    x=[i + offset],
                    y=sub_data['Accuracy (%)'],
                    name=f"{model_name} ({suffix})",
                    marker=dict(color=color, opacity=opacity),
                    width=0.4,  # Increased bar width for better visibility
                    text=[f"{acc:.1f}%" for acc in sub_data['Accuracy (%)']],
                    textposition='outside',
                    textfont=dict(size=16, color="black")
                ))

        # Layout customization
        fig.update_layout(
            title=f"Comparison of {model_name_1} and {model_name_2} Accuracy by Sub-Category",
            xaxis=dict(
                title='Sub-Categories',
                tickvals=list(range(len(sub_categories))),
                ticktext=sub_categories,
                tickangle=45,
                tickfont=dict(size=11)  # Smaller font size for sub-category labels
            ),
            yaxis=dict(
                title='Accuracy (%)',
                range=[0, 110],  # Extend range to go beyond 100%
                tickformat=".0f"
            ),
            showlegend=False,
            barmode='group',
            bargap=0,  # No gaps between grouped bars
            template="plotly_white",
            font=dict(size=14)  # General font size for axis labels
        )

        # Save the chart as PNG if a path is provided
        if path:
            # Use the title as the file name, replacing spaces with underscores
            file_name = f"{fig.layout.title.text.replace(' ', '_')}.png"
            full_path = f"{path}{file_name}"
            fig.write_image(full_path, format="png", width=1200, height=900, scale=2)
        fig.show()


    def plot_compare_likelihood(self, df1, df2, model_name_1, model_name_2, path=None, with_std_bar=False):
        """Show grouped bars comparing the mean confidence of two models per sub-category.

        Args:
            df1: Frame of the first model with 'Sub-Category' and
                'Mean Confidence (%)' columns.
            df2: Frame of the second model, same layout.
            model_name_1: Legend name of the first model.
            model_name_2: Legend name of the second model.
            path: Optional prefix for a PNG export. It is concatenated with
                the file name as-is, so a directory must end with a separator.
            with_std_bar: When True, the standard deviation is drawn as error bars.

        Sub-categories missing from one model, and undefined standard
        deviations, are plotted as 0. Bars are ordered by the first model's mean.
        """
        def _per_sub_category(frame):
            # Mean and standard deviation of the confidence per sub-category
            return frame.groupby('Sub-Category')['Mean Confidence (%)'].agg(['mean', 'std'])

        merged = _per_sub_category(df1).join(
            _per_sub_category(df2), lsuffix='_df1', rsuffix='_df2', how='outer'
        )
        ordered = merged.fillna(0).sort_values(by='mean_df1', ascending=True)

        fig = go.Figure()

        # (column suffix, legend name, bar color) for each model, in trace order
        series_specs = (
            ('_df1', model_name_1, 'rgba(55, 128, 191, 0.7)'),
            ('_df2', model_name_2, 'rgba(255, 153, 51, 0.7)'),
        )
        for suffix, label, bar_color in series_specs:
            means = ordered[f'mean{suffix}']
            spread = ordered[f'std{suffix}'] if with_std_bar else None
            fig.add_trace(go.Bar(
                x=ordered.index,
                y=means,
                name=label,
                error_y=dict(type='data', array=spread, visible=with_std_bar),
                marker_color=bar_color,
                text=[f"{val:.1f}%" for val in means],
                textposition='outside'
            ))

        # Grouped layout shared by both traces
        fig.update_layout(
            barmode='group',
            title=f"Compare {model_name_1} and {model_name_2} Confidence Score by Sub-Category",
            xaxis_title="Sub-Category",
            yaxis_title="Mean Confidence (%) in Model Answer",
            yaxis=dict(range=[0, 110]),
            template="plotly_white",
            title_x=0.5,
            title_font_size=20
        )

        # Export as PNG, named after the title, when a path prefix is given
        if path:
            png_name = f"{fig.layout.title.text.replace(' ', '_')}.png"
            fig.write_image(f"{path}{png_name}", format="png", width=1200, height=900, scale=2)

        fig.show()



    def create_comparison_table(self, df1, df2, df3, df4, model_name_1, model_name_2, model_name_3, model_name_4, title, path=None):
        """Show (and return) a table comparing correct answers of four models per sub-category.

        Question counts come from ``df1`` only; every model's percentage is
        its summed 'Correct' divided by that count. A 'Grand Total' row is
        appended at the bottom.

        Args:
            df1, df2, df3, df4: Per-question frames with 'Sub-Category' and
                'Correct' columns, one per model.
            model_name_1 .. model_name_4: Column labels for the models;
                expected to be distinct.
            title: Table title, also used for the file name.
            path: Optional prefix for a PNG export. It is concatenated with
                the file name as-is, so a directory must end with a separator.

        Returns:
            plotly.graph_objects.Figure: The table figure.
        """
        model_frames = [
            (model_name_1, df1),
            (model_name_2, df2),
            (model_name_3, df3),
            (model_name_4, df4),
        ]

        # Questions per sub-category, taken from the first frame
        question_counts = df1.groupby('Sub-Category').size().rename('Question Count')
        comparison_df = question_counts.to_frame()
        comparison_df['Question %'] = (question_counts / question_counts.sum() * 100).map("{:.2f}%".format)

        # Number of correct answers per model (aligned on the sub-category index)
        for model_name, frame in model_frames:
            comparison_df[model_name] = frame.groupby('Sub-Category')['Correct'].sum()

        # Percentage of correct answers for each model in each sub-category
        for model_name, _ in model_frames:
            comparison_df[f'{model_name} %'] = (
                comparison_df[model_name] / comparison_df['Question Count'] * 100
            ).fillna(0).map("{:.2f}%".format)

        # Build the total row in the same column order as the table
        total_questions = comparison_df['Question Count'].sum()
        model_totals = {name: comparison_df[name].sum() for name, _ in model_frames}
        totals = {'Question Count': [total_questions], 'Question %': ['100.00%']}
        for model_name, model_total in model_totals.items():
            totals[model_name] = [model_total]
        for model_name, model_total in model_totals.items():
            totals[f'{model_name} %'] = [f"{(model_total / total_questions * 100):.2f}%"]
        total_row = pd.DataFrame(totals, index=['Grand Total'])

        # Append the total row. concat drops the index name because the two
        # indexes are named differently, so restore it before resetting;
        # otherwise the first column would be headed 'index'.
        comparison_df = pd.concat([comparison_df, total_row])
        comparison_df = comparison_df.rename_axis('Sub-Category').reset_index()

        # Create the Plotly table
        fig = go.Figure(data=[go.Table(
            header=dict(
                values=list(comparison_df.columns),
                fill_color='paleturquoise',
                align='left',
                font=dict(size=14)  # Larger font size for headers
            ),
            cells=dict(
                values=[comparison_df[col] for col in comparison_df.columns],
                fill_color='lavender',
                align='left',
                font=dict(size=12)  # Larger font size for cells
            )
        )])

        fig.update_layout(title_text=title, title_x=0.5, title_font=dict(size=18))  # Larger title font size

        # Save the table as PNG if a path is provided
        if path:
            # Use the title as the file name, replacing spaces with underscores
            file_name = f"{title.replace(' ', '_')}.png"
            full_path = f"{path}{file_name}"
            fig.write_image(full_path, format="png", width=1200, height=900, scale=2)

        fig.show()
        return fig





    def plot_confidence_accuracy_correlation(self, df, title, path=None):
        """Show a scatter plot of accuracy against mean confidence, one point per sub-category.

        The correlation between the two series is appended to the title and an
        OLS trend line is drawn in black.

        Args:
            df: Frame with 'Sub-Category', 'Mean Confidence (%)' and 'Correct' columns.
            title: Plot title (the correlation is appended on a second line);
                also used for the file name.
            path: Optional prefix for a PNG export. It is concatenated with
                the file name as-is, so a directory must end with a separator.
        """
        # Per-sub-category averages; accuracy is expressed in percent
        by_sub_category = df.groupby('Sub-Category')
        mean_confidence = by_sub_category['Mean Confidence (%)'].mean()
        accuracy_pct = by_sub_category['Correct'].mean() * 100

        correlation_df = pd.DataFrame({
            'Sub-Category': mean_confidence.index,
            'Mean Confidence (%)': mean_confidence.values,
            'Accuracy (%)': accuracy_pct.values
        })

        corr_value = correlation_df['Mean Confidence (%)'].corr(correlation_df['Accuracy (%)'])

        # Main scatter: one color per sub-category
        fig = px.scatter(
            correlation_df,
            x='Mean Confidence (%)',
            y='Accuracy (%)',
            color='Sub-Category',
            title=f"{title} <br>Correlation: {corr_value:.2f}",
            labels={"Mean Confidence (%)": "Mean Confidence (%)", "Accuracy (%)": "Accuracy (%)"}
        )

        # Fit the OLS line on an uncolored scatter and borrow its second trace
        ols_fig = px.scatter(correlation_df, x='Mean Confidence (%)', y='Accuracy (%)', trendline="ols")
        fit_line = ols_fig.data[1]
        fit_line.marker.color = 'black'
        fit_line.name = 'Trend Line'
        fig.add_trace(fit_line)

        fig.update_layout(
            title_x=0.5,
            title_font_size=20,
            template="plotly_white",
            showlegend=True
        )

        # Export as PNG, named after the title, when a path prefix is given
        if path:
            png_name = f"{title.replace(' ', '_')}.png"
            fig.write_image(f"{path}{png_name}", format="png", width=1200, height=900, scale=2)
        fig.show()






---



---



---



Mistral 7B:

In [7]:
# Output folders for the Mistral 7B figures; each ends with "/" because the
# plotting helpers append the file name directly to the given path.
basis_path_mistral_7b = "/content/drive/MyDrive/Technion - LLM Research 2024/Coding Environment /Resources/RAG resouces/Graphs & Drawings/full_corpus_recursive_split/Mistral_7B"
partial_1st_path_mistral_7b = f"{basis_path_mistral_7b}/partial_benchmark_start_1/"
partial_5th_path_mistral_7b = f"{basis_path_mistral_7b}/partial_benchmark_start_5/"
custom_corpus_path_mistral_7b = f"{basis_path_mistral_7b}/custom_corpus/"


# No RAG visualization Mistral_7B:
# Load the per-sample results (one JSON document per line) of the run without RAG.
# mistral_7b_no_RAG_vis = LmHarnessVis(mistral_7b_no_RAG_results_path+"/mistralai__Mistral-7B-v0.1")
mistral_7b_no_RAG_vis = LmHarnessVis("/content/drive/MyDrive/Technion - LLM Research 2024/Coding Environment /Resources/RAG resouces/lm_harness_results/Mistral_7B/no_RAG_results/partial_1st_path/mistralai__Mistral-7B-v0.1")
json_mistral_7b_no_RAG = mistral_7b_no_RAG_vis.read_res_json("samples_partial_sysengbench_prompt_eng_2024-11-15T10-22-02.958834.json")

# Alternative input (partial_5th_path run), currently disabled:
# mistral_7b_no_RAG_vis = LmHarnessVis("/content/drive/MyDrive/Technion - LLM Research 2024/Coding Environment /Resources/RAG resouces/lm_harness_results/Mistral_7B/no_RAG_results/partial_5th_path/mistralai__Mistral-7B-v0.1")
# json_mistral_7b_no_RAG = mistral_7b_no_RAG_vis.read_res_json("samples_partial_sysengbench_prompt_eng_2024-11-15T15-21-35.447939.json")


# Per-question frame -> per-sub-category accuracy -> overall accuracy (rounded to 2 decimals)
mistral_7b_no_RAG_df = mistral_7b_no_RAG_vis.create_res_df(json_mistral_7b_no_RAG)
mistral_7b_no_RAG_sub_acc_sub_df = mistral_7b_no_RAG_vis.calculate_accuracy(mistral_7b_no_RAG_df)
total_acc_mistral_7b_no_RAG = np.round(mistral_7b_no_RAG_vis.total_accuracy(mistral_7b_no_RAG_df),2)
# Note: create_bar_chart() resets the index of the accuracy frame in place, so
# afterwards 'Category' and 'Sub-Category' are ordinary columns of it.
mistral_7b_no_RAG_vis.create_bar_chart(mistral_7b_no_RAG_df,f'Mistral_7B_no_rag total acc: {total_acc_mistral_7b_no_RAG}',mistral_7b_no_RAG_sub_acc_sub_df,partial_1st_path_mistral_7b)
mistral_7b_no_RAG_vis.plot_average_likelihood(mistral_7b_no_RAG_df,"Mistral_7B_no_rag",partial_1st_path_mistral_7b)
mistral_7b_no_RAG_vis.plot_confidence_accuracy_correlation(mistral_7b_no_RAG_df,"Confident to Accuracy correlation - Mistral_7B no RAG",partial_1st_path_mistral_7b)

# RAG visualization Mistral_7B:
# Load the per-sample results (one JSON document per line) of the run with RAG.
mistral_7b_RAG_vis = LmHarnessVis("/content/drive/MyDrive/Technion - LLM Research 2024/Coding Environment /Resources/RAG resouces/lm_harness_results/Mistral_7B/RAG_results/partial_1st_path/mistralai__Mistral-7B-v0.1")
json_mistral_7b_RAG = mistral_7b_RAG_vis.read_res_json("samples_partial_sysengbench_prompt_eng_2024-11-27T10-30-31.710391.json")

# Alternative input (partial_5th_path run), currently disabled:
# mistral_7b_RAG_vis = LmHarnessVis("/content/drive/MyDrive/Technion - LLM Research 2024/Coding Environment /Resources/RAG resouces/lm_harness_results/Mistral_7B/RAG_results/partial_5th_path/mistralai__Mistral-7B-v0.1")
# json_mistral_7b_RAG = mistral_7b_RAG_vis.read_res_json("samples_partial_sysengbench_prompt_eng_2024-11-15T15-41-08.482236.json")

# Alternative input (custom_corpus run), currently disabled:
# mistral_7b_RAG_vis = LmHarnessVis("/content/drive/MyDrive/Technion - LLM Research 2024/Coding Environment /Resources/RAG resouces/lm_harness_results/Mistral_7B/RAG_results/custom_corpus")
# json_mistral_7b_RAG = mistral_7b_RAG_vis.read_res_json("samples_partial_sysengbench_2024-11-14T10-31-14.641784.json")

# Same steps as for the no-RAG run; figures go to the same output folder.
mistral_7b_df_RAG = mistral_7b_RAG_vis.create_res_df(json_mistral_7b_RAG)
mistral_7b_RAG_sub_acc_sub_df = mistral_7b_RAG_vis.calculate_accuracy(mistral_7b_df_RAG)
total_acc_mistral_7b_RAG = np.round(mistral_7b_RAG_vis.total_accuracy(mistral_7b_df_RAG),2)
mistral_7b_RAG_vis.create_bar_chart(mistral_7b_df_RAG,f'Mistral_7B_RAG total acc: {total_acc_mistral_7b_RAG}',mistral_7b_RAG_sub_acc_sub_df,partial_1st_path_mistral_7b)
mistral_7b_RAG_vis.plot_average_likelihood(mistral_7b_df_RAG,"Mistral_7B_RAG",partial_1st_path_mistral_7b)
mistral_7b_RAG_vis.plot_confidence_accuracy_correlation(mistral_7b_df_RAG,"Confident to Accuracy correlation - Mistral_7B RAG",partial_1st_path_mistral_7b)

# RAG vs No RAG comparison:
# The first call takes the two per-sub-category accuracy frames, the second
# the two per-question frames.
mistral_7b_no_RAG_vis.create_comparison_bar_chart(mistral_7b_no_RAG_sub_acc_sub_df,mistral_7b_RAG_sub_acc_sub_df,"Mistral_7B_no_rag","Mistral_7B_rag",partial_1st_path_mistral_7b)
mistral_7b_no_RAG_vis.plot_compare_likelihood(mistral_7b_no_RAG_df,mistral_7b_df_RAG,"Mistral_7B_no_rag","Mistral_7B_rag",partial_1st_path_mistral_7b)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).




---



---



---



Llama 2 13B:

In [None]:
# Output folders for the Llama 2 13B figures; each ends with "/" because the
# plotting helpers append the file name directly to the given path.
basis_path_llama_2_13b = "/content/drive/MyDrive/Technion - LLM Research 2024/Coding Environment /Resources/RAG resouces/Graphs & Drawings/full_corpus_recursive_split/Llama_2_13B"
partial_1st_path_llama_2_13b = f"{basis_path_llama_2_13b}/partial_benchmark_start_1/"
partial_5th_path_llama_2_13b = f"{basis_path_llama_2_13b}/partial_benchmark_start_5/"
custom_corpus_path_llama_2_13b = f"{basis_path_llama_2_13b}/custom_corpus/"


# No RAG visualization Llama_2_13B:
# Load the per-sample results (one JSON document per line) of the run without RAG.
llama_2_13b_no_RAG_vis = LmHarnessVis("/content/drive/MyDrive/Technion - LLM Research 2024/Coding Environment /Resources/RAG resouces/lm_harness_results/Llama_2_13B/no_RAG_results/partial_1st_path/meta-llama__Llama-2-13b-chat-hf")
json_llama_2_13_no_RAG = llama_2_13b_no_RAG_vis.read_res_json("samples_partial_sysengbench_prompt_eng_2024-11-14T16-06-25.251975.json")

# Alternative input (partial_5th_path run), currently disabled:
# llama_2_13b_no_RAG_vis = LmHarnessVis("/content/drive/MyDrive/Technion - LLM Research 2024/Coding Environment /Resources/RAG resouces/lm_harness_results/Llama_2_13B/no_RAG_results/partial_5th_path/meta-llama__Llama-2-13b-chat-hf")
# json_llama_2_13_no_RAG = llama_2_13b_no_RAG_vis.read_res_json("samples_partial_sysengbench_prompt_eng_2024-11-15T15-26-46.107117.json")


# Only the per-question frame is built here; the plotting calls below are commented out.
llama_2_13_no_RAG_df = llama_2_13b_no_RAG_vis.create_res_df(json_llama_2_13_no_RAG)
# llama_2_13_no_RAG_sub_acc_sub_df = llama_2_13b_no_RAG_vis.calculate_accuracy(llama_2_13_no_RAG_df)
# total_acc_llama_2_13_no_RAG = np.round(llama_2_13b_no_RAG_vis.total_accuracy(llama_2_13_no_RAG_df),2)
# llama_2_13b_no_RAG_vis.create_bar_chart(llama_2_13_no_RAG_df,f'Llama_2_13B_no_rag total acc: {total_acc_llama_2_13_no_RAG}',llama_2_13_no_RAG_sub_acc_sub_df,custom_corpus_path_llama_2_13b)
# llama_2_13b_no_RAG_vis.plot_average_likelihood(llama_2_13_no_RAG_df,"Llama_2_13B_no_rag",custom_corpus_path_llama_2_13b)
# llama_2_13b_no_RAG_vis.plot_confidence_accuracy_correlation(llama_2_13_no_RAG_df,"Confident to Accuracy correlation - Llama_2_13B no RAG",custom_corpus_path_llama_2_13b)


# RAG visualization Llama_2_13B:
# Load the per-sample results (one JSON document per line) of the run with RAG.
llama_2_13b_RAG_vis = LmHarnessVis("/content/drive/MyDrive/Technion - LLM Research 2024/Coding Environment /Resources/RAG resouces/lm_harness_results/Llama_2_13B/RAG_results/partial_1st_path/meta-llama__Llama-2-13b-chat-hf")
json_llama_2_13_RAG = llama_2_13b_RAG_vis.read_res_json("samples_partial_sysengbench_prompt_eng_2024-11-15T12-59-30.220476.json")

# Alternative input (partial_5th_path run), currently disabled:
# llama_2_13b_RAG_vis = LmHarnessVis("/content/drive/MyDrive/Technion - LLM Research 2024/Coding Environment /Resources/RAG resouces/lm_harness_results/Llama_2_13B/RAG_results/partial_5th_path/meta-llama__Llama-2-13b-chat-hf")
# json_llama_2_13_RAG = llama_2_13b_RAG_vis.read_res_json("samples_partial_sysengbench_prompt_eng_2024-11-15T15-38-45.114743.json")

# Alternative input (custom_corpus_partial run), currently disabled:
# llama_2_13b_RAG_vis = LmHarnessVis("/content/drive/MyDrive/Technion - LLM Research 2024/Coding Environment /Resources/RAG resouces/lm_harness_results/Llama_2_13B/RAG_results/custom_corpus_partial")
# json_llama_2_13_RAG = llama_2_13b_RAG_vis.read_res_json("samples_partial_sysengbench_prompt_eng_2024-11-15T11-19-57.314650.json")

# Only the per-question frame is built here; the plotting calls below are commented out.
llama_2_13_df_RAG = llama_2_13b_RAG_vis.create_res_df(json_llama_2_13_RAG)
# llama_2_13_RAG_sub_acc_sub_df = llama_2_13b_RAG_vis.calculate_accuracy(llama_2_13_df_RAG)
# total_acc_llama_2_13_RAG = np.round(llama_2_13b_RAG_vis.total_accuracy(llama_2_13_df_RAG),2)
# llama_2_13b_RAG_vis.create_bar_chart(llama_2_13_df_RAG,f'Llama_2_13B_RAG total acc: {total_acc_llama_2_13_RAG}',llama_2_13_RAG_sub_acc_sub_df,custom_corpus_path_llama_2_13b)
# llama_2_13b_RAG_vis.plot_average_likelihood(llama_2_13_df_RAG,"Llama_2_13B_RAG",custom_corpus_path_llama_2_13b)
# llama_2_13b_RAG_vis.plot_confidence_accuracy_correlation(llama_2_13_df_RAG,"Confident to Accuracy correlation - Llama_2_13B RAG",custom_corpus_path_llama_2_13b)

# # RAG vs No RAG comparison:
# llama_2_13b_RAG_vis.create_comparison_bar_chart(llama_2_13_no_RAG_sub_acc_sub_df,llama_2_13_RAG_sub_acc_sub_df,"Llama_2_13B_no_rag","Llama_2_13B_rag",custom_corpus_path_llama_2_13b)
# llama_2_13b_RAG_vis.plot_compare_likelihood(llama_2_13_no_RAG_df,llama_2_13_df_RAG,"Llama_2_13B_no_rag","Llama_2_13B_rag",custom_corpus_path_llama_2_13b)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).




---



---



---



Orca 2 7B:

In [None]:
# Output folders for the Orca 2 7B figures; each ends with "/" because the
# plotting helpers append the file name directly to the given path.
basis_path_orca_2 ="/content/drive/MyDrive/Technion - LLM Research 2024/Coding Environment /Resources/RAG resouces/Graphs & Drawings/full_corpus_recursive_split/Orca_2_7B"
partial_1st_path_orca_2 = basis_path_orca_2 +"/partial_benchmark_start_1/"
# The name used to be misspelled ("gorca"); the old spelling is kept as an
# alias in case other cells still refer to it.
partial_1st_path_gorca_2 = partial_1st_path_orca_2
partial_5th_path_orca_2 = basis_path_orca_2 +"/partial_benchmark_start_5/"
custom_corpus_path_orca_2 = basis_path_orca_2 +"/custom_corpus/"


# No RAG visualization Orca_2:
# Load the per-sample results (one JSON document per line) of the run without RAG.
orca_2_no_RAG_vis = LmHarnessVis("/content/drive/MyDrive/Technion - LLM Research 2024/Coding Environment /Resources/RAG resouces/lm_harness_results/Orca_2_7B/no_RAG_results/partial_1st_path/microsoft__Orca-2-7b")
json_orca_2_no_RAG = orca_2_no_RAG_vis.read_res_json("samples_partial_sysengbench_prompt_eng_2024-11-15T10-42-50.239223.json")

# Alternative input (partial_5th_path run), currently disabled:
# orca_2_no_RAG_vis = LmHarnessVis("/content/drive/MyDrive/Technion - LLM Research 2024/Coding Environment /Resources/RAG resouces/lm_harness_results/Orca_2_7B/no_RAG_results/partial_5th_path/microsoft__Orca-2-7b")
# json_orca_2_no_RAG = orca_2_no_RAG_vis.read_res_json("samples_partial_sysengbench_prompt_eng_2024-11-15T09-11-45.281239.json")



# Only the per-question frame is built here; the plotting calls below are commented out.
orca_2_no_RAG_df = orca_2_no_RAG_vis.create_res_df(json_orca_2_no_RAG)
# orca_2_no_RAG_sub_acc_sub_df = orca_2_no_RAG_vis.calculate_accuracy(orca_2_no_RAG_df)
# total_acc_orca_2_no_RAG = np.round(orca_2_no_RAG_vis.total_accuracy(orca_2_no_RAG_df),2)
# orca_2_no_RAG_vis.create_bar_chart(orca_2_no_RAG_df,f'Orca_2_no_rag total acc: {total_acc_orca_2_no_RAG}',orca_2_no_RAG_sub_acc_sub_df,custom_corpus_path_orca_2)
# orca_2_no_RAG_vis.plot_average_likelihood(orca_2_no_RAG_df,"Orca_2_no_rag",custom_corpus_path_orca_2)
# orca_2_no_RAG_vis.plot_confidence_accuracy_correlation(orca_2_no_RAG_df,"Confident to Accuracy correlation - Orca_2 no RAG",custom_corpus_path_orca_2)

# RAG visualization Orca_2:
# Load the per-sample results (one JSON document per line) of the run with RAG.
orca_2_RAG_vis = LmHarnessVis("/content/drive/MyDrive/Technion - LLM Research 2024/Coding Environment /Resources/RAG resouces/lm_harness_results/Orca_2_7B/RAG_results/partial_1st_path/microsoft__Orca-2-7b")
json_orca_2_RAG = orca_2_RAG_vis.read_res_json("samples_partial_sysengbench_prompt_eng_2024-11-14T16-42-02.056195.json")

# Alternative input (partial_5th_path run), currently disabled:
# orca_2_RAG_vis = LmHarnessVis("/content/drive/MyDrive/Technion - LLM Research 2024/Coding Environment /Resources/RAG resouces/lm_harness_results/Orca_2_7B/RAG_results/partial_5th_path/microsoft__Orca-2-7b")
# json_orca_2_RAG = orca_2_RAG_vis.read_res_json("samples_partial_sysengbench_prompt_eng_2024-11-14T16-49-23.878589.json")

# Alternative input (custom_corpus run), currently disabled:
# orca_2_RAG_vis = LmHarnessVis("/content/drive/MyDrive/Technion - LLM Research 2024/Coding Environment /Resources/RAG resouces/lm_harness_results/Orca_2_7B/RAG_results/custom_corpus")
# json_orca_2_RAG = orca_2_RAG_vis.read_res_json("samples_partial_sysengbench_prompt_eng_2024-11-15T11-04-38.044703.json")

# Only the per-question frame is built here; the plotting calls below are commented out.
orca_2_df_RAG = orca_2_RAG_vis.create_res_df(json_orca_2_RAG)
# orca_2_RAG_sub_acc_sub_df = orca_2_RAG_vis.calculate_accuracy(orca_2_df_RAG)
# total_acc_orca_2_RAG = np.round(orca_2_RAG_vis.total_accuracy(orca_2_df_RAG),2)
# orca_2_RAG_vis.create_bar_chart(orca_2_df_RAG,f'Orca_2_RAG total acc: {total_acc_orca_2_RAG}',orca_2_RAG_sub_acc_sub_df,custom_corpus_path_orca_2)
# orca_2_RAG_vis.plot_average_likelihood(orca_2_df_RAG,"Orca_2_RAG",custom_corpus_path_orca_2)
# orca_2_RAG_vis.plot_confidence_accuracy_correlation(orca_2_df_RAG,"Confident to Accuracy correlation - Orca_2 RAG",custom_corpus_path_orca_2)

# # RAG vs No RAG comparison:
# orca_2_no_RAG_vis.create_comparison_bar_chart(orca_2_no_RAG_sub_acc_sub_df,orca_2_RAG_sub_acc_sub_df,"Orca_2_no_RAG","Orca_2_RAG",custom_corpus_path_orca_2)
# orca_2_no_RAG_vis.plot_compare_likelihood(orca_2_no_RAG_df,orca_2_df_RAG,"Orca_2_no_RAG","Orca_2_RAG",custom_corpus_path_orca_2)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).




---



---



---



Gemma 2 2B:

In [None]:
# Output folders for the Gemma 2 2B figures (one sub-folder per benchmark variant).
basis_path_gemma_2 = "/content/drive/MyDrive/Technion - LLM Research 2024/Coding Environment /Resources/RAG resouces/Graphs & Drawings/full_corpus_recursive_split/Gemma_2_2B"
partial_1st_path_gemma_2 = f"{basis_path_gemma_2}/partial_benchmark_start_1/"
partial_5th_path_gemma_2 = f"{basis_path_gemma_2}/partial_benchmark_start_5/"
custom_corpus_path_gemma_2 = f"{basis_path_gemma_2}/custom_corpus/"


# ---- Gemma 2, no RAG ----
# Active input: the partial_1st_path run. Each LmHarnessVis(...) construction also calls
# drive.mount('/content/drive') from __init__.
gemma_2_no_RAG_vis = LmHarnessVis(
    "/content/drive/MyDrive/Technion - LLM Research 2024/Coding Environment /Resources/RAG resouces/lm_harness_results/Gemma_2_2B/no_RAG_results/partial_1st_path/google__gemma-2-2b-it"
)
json_gemma_2_no_RAG = gemma_2_no_RAG_vis.read_res_json(
    "samples_partial_sysengbench_prompt_eng_2024-11-21T10-11-39.938950.json"
)

# Disabled alternative input: the partial_5th_path run (enable instead of the pair above).
# gemma_2_no_RAG_vis = LmHarnessVis("/content/drive/MyDrive/Technion - LLM Research 2024/Coding Environment /Resources/RAG resouces/lm_harness_results/Gemma_2_2B/no_RAG_results/partial_5th_path/google__gemma-2-2b-it")
# json_gemma_2_no_RAG = gemma_2_no_RAG_vis.read_res_json("samples_partial_sysengbench_prompt_eng_2024-11-21T10-21-43.085026.json")


# Results table, accuracy summary and overall accuracy rounded to two decimals.
gemma_2_no_RAG_df = gemma_2_no_RAG_vis.create_res_df(json_gemma_2_no_RAG)
gemma_2_no_RAG_sub_acc_sub_df = gemma_2_no_RAG_vis.calculate_accuracy(gemma_2_no_RAG_df)
total_acc_gemma_2_no_RAG = np.round(
    gemma_2_no_RAG_vis.total_accuracy(gemma_2_no_RAG_df),
    2,
)

# NOTE(review): the input above comes from a partial_1st_path folder, yet every figure call below
# receives custom_corpus_path_gemma_2 (partial_1st_path_gemma_2 is never used) — confirm that this
# is the intended output folder.
gemma_2_no_RAG_vis.create_bar_chart(
    gemma_2_no_RAG_df,
    f'Gemma_2_no_rag total acc: {total_acc_gemma_2_no_RAG}',
    gemma_2_no_RAG_sub_acc_sub_df,
    path=custom_corpus_path_gemma_2,
)
gemma_2_no_RAG_vis.plot_average_likelihood(
    gemma_2_no_RAG_df,
    "Gemma_2_no_rag",
    path=custom_corpus_path_gemma_2,
)
gemma_2_no_RAG_vis.plot_confidence_accuracy_correlation(
    gemma_2_no_RAG_df,
    "Confident to Accuracy correlation - Gemma_2 no RAG",
    custom_corpus_path_gemma_2,
)


# ---- Gemma 2, RAG ----
# Active input: the partial_1st_path RAG run.
gemma_2_RAG_vis = LmHarnessVis(
    "/content/drive/MyDrive/Technion - LLM Research 2024/Coding Environment /Resources/RAG resouces/lm_harness_results/Gemma_2_2B/RAG_results/partial_1st_path/google__gemma-2-2b-it"
)
json_gemma_2_RAG = gemma_2_RAG_vis.read_res_json(
    "samples_partial_sysengbench_prompt_eng_2024-11-21T10-19-30.651800.json"
)

# Disabled alternative input: the partial_5th_path RAG run.
# gemma_2_RAG_vis = LmHarnessVis("/content/drive/MyDrive/Technion - LLM Research 2024/Coding Environment /Resources/RAG resouces/lm_harness_results/Gemma_2_2B/RAG_results/partial_5th_path/google__gemma-2-2b-it")
# json_gemma_2_RAG = gemma_2_RAG_vis.read_res_json("samples_partial_sysengbench_prompt_eng_2024-11-21T10-22-53.730365.json")

# Disabled alternative input: the RAG run stored directly under RAG_results.
# gemma_2_RAG_vis = LmHarnessVis("/content/drive/MyDrive/Technion - LLM Research 2024/Coding Environment /Resources/RAG resouces/lm_harness_results/Gemma_2_2B/RAG_results/google__gemma-2-2b-it")
# json_gemma_2_RAG = gemma_2_RAG_vis.read_res_json("samples_partial_sysengbench_prompt_eng_2024-11-21T10-25-12.248356.json")

# RAG results table; the comparison-table cell passes gemma_2_df_RAG to create_comparison_table.
gemma_2_df_RAG = gemma_2_RAG_vis.create_res_df(json_gemma_2_RAG)
# Disabled: accuracy summary and the three RAG figures.
# gemma_2_RAG_sub_acc_sub_df = gemma_2_RAG_vis.calculate_accuracy(gemma_2_df_RAG)
# total_acc_gemma_2_RAG = np.round(gemma_2_RAG_vis.total_accuracy(gemma_2_df_RAG),2)
# gemma_2_RAG_vis.create_bar_chart(gemma_2_df_RAG,f'Gemma_2_RAG total acc: {total_acc_gemma_2_RAG}',gemma_2_RAG_sub_acc_sub_df,custom_corpus_path_gemma_2)
# gemma_2_RAG_vis.plot_average_likelihood(gemma_2_df_RAG,"Gemma_2_RAG",custom_corpus_path_gemma_2)
# gemma_2_RAG_vis.plot_confidence_accuracy_correlation(gemma_2_df_RAG,"Confident to Accuracy correlation - Gemma_2 RAG",custom_corpus_path_gemma_2)


# Disabled: side-by-side charts; they need gemma_2_RAG_sub_acc_sub_df from the disabled lines above.
# # RAG vs No RAG comparison:
# gemma_2_no_RAG_vis.create_comparison_bar_chart(gemma_2_no_RAG_sub_acc_sub_df,gemma_2_RAG_sub_acc_sub_df,"Gemma_2_no_RAG","Gemma_2_RAG",custom_corpus_path_gemma_2)
# gemma_2_no_RAG_vis.plot_compare_likelihood(gemma_2_no_RAG_df,gemma_2_df_RAG,"Gemma_2_no_RAG","Gemma_2_RAG",custom_corpus_path_gemma_2)




Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).




---



---



---



Comparison table:

In [None]:
# Folder handed to create_comparison_table as its last argument.
path = "/content/drive/MyDrive/Technion - LLM Research 2024/Coding Environment /Resources/RAG resouces/Graphs & Drawings/full_corpus_recursive_split/"
# Constructed with an empty file_path; the call below only receives already-built result tables.
# (Construction still calls drive.mount('/content/drive') from __init__.)
vis = LmHarnessVis("")

# Result tables and their display names, in matching model order.
# mistral_7b_df_RAG and llama_2_13_df_RAG are not created in the cells shown here — they are expected
# to exist from the Mistral / Llama cells earlier in the notebook, so run those first.
comparison_frames_RAG = (mistral_7b_df_RAG, orca_2_df_RAG, llama_2_13_df_RAG, gemma_2_df_RAG)
comparison_model_names = ("Mistral 7B", "Orca 2 7B", "Llama 2 13B", "Gemma 2 2B")

# Disabled variants — enable the one whose title matches the result files loaded in the model cells.
# vis.create_comparison_table(mistral_7b_no_RAG_df,orca_2_no_RAG_df,llama_2_13_no_RAG_df,gemma_2_no_RAG_df,"Mistral 7B","Orca 2 7B","Llama 2 13B","Gemma 2 2B","160 Question start at 1 no RAG",path)
# vis.create_comparison_table(mistral_7b_no_RAG_df,orca_2_no_RAG_df,llama_2_13_no_RAG_df,gemma_2_no_RAG_df,"Mistral 7B","Orca 2 7B","Llama 2 13B","Gemma 2 2B","160 Question start at 5 no RAG",path)
# vis.create_comparison_table(mistral_7b_df_RAG,orca_2_df_RAG,llama_2_13_df_RAG,gemma_2_df_RAG,"Mistral 7B","Orca 2 7B","Llama 2 13B","Gemma 2 2B","160 Question start at 1 RAG - Custom corpus",path)
# Active variant: RAG results titled "160 Question start at 1 RAG".
vis.create_comparison_table(
    *comparison_frames_RAG,
    *comparison_model_names,
    "160 Question start at 1 RAG",
    path,
)
# vis.create_comparison_table(mistral_7b_df_RAG,orca_2_df_RAG,llama_2_13_df_RAG,gemma_2_df_RAG,"Mistral 7B","Orca 2 7B","Llama 2 13B","Gemma 2 2B","160 Question start at 5 RAG",path)





Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).




---



---



---

