# Review analysis

In this notebook, we conduct all general analyses from Section 5 of the paper. For the analysis of the second review, and the reasoning behind the subsequent omission of the Expertise dimension from these analyses, see '2nd_review_analysis.ipynb'.

In [81]:
# Load the data
from pathlib import Path
import pandas as pd
import numpy as np

data_csv = Path("reviews.csv")

# Compile the review data on demand when the CSV has not been generated yet
if not data_csv.exists():
    from result_compiler import main as data_compiler
    data_compiler()

# Venue (source) ordering used throughout: AAAI, IJCAI, then ICML, NeurIPS, then JAIR, JMLR
source_order = ["AAAI", "IJCAI", "ICML", "NeurIPS", "JAIR", "JMLR"]

# Read the reviews, make the venue an ordered categorical and discard the stored index column
df = (
    pd.read_csv(data_csv)
    .assign(source=lambda reviews: pd.Categorical(reviews["source"], categories=source_order))
    .drop(columns="index")
)

# Import the analysis/visualisation libraries
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
import latex
# Disable MathJax in the Kaleido export scope
# NOTE(review): presumably to keep MathJax artefacts out of the exported PDFs — confirm
pio.kaleido.scope.mathjax = None

# Lift pandas' display limits for column width, column count and row count
for display_option in ("display.max_colwidth", "display.max_columns", "display.max_rows"):
    pd.set_option(display_option, None)

# Default plot layout shared by the figures in this notebook
review_categories = [
    "Implementation",
    "Data",
    "Configuration",
    "Experimental Procedure",
    "Expertise",
]
width = 450
height = 200
layout = {
    "boxmode": "group",
    # Base figure size scaled by 1/0.6
    "width": width * (1/0.6),
    "height": height * (1/0.6),
    "font": {"family": "serif", "size": 15},
    "margin": {"t": 10, "b": 10, "l": 10, "r": 10},
    "xaxis": {"tickangle": 0},
    # The y-axis is titled "Cost" and spans 1 to 10 in unit steps
    "yaxis": {"title": "Cost", "range": [1, 10], "dtick": 1},
    "legend_title": None,
    # Horizontal legend anchored at the top-right corner of the plot area
    "legend": {"orientation": "h", "yanchor": "bottom", "y": 1, "xanchor": "right", "x": 1},
}



Use of plotly.io.kaleido.scope.mathjax is deprecated and support will be removed after September 2025.
Please use plotly.io.defaults.mathjax instead.




In [82]:
# Analysis functions for generating tables and graphs

def display_table_statistics(df: pd.DataFrame, columns: list) -> None:
    """Display summary statistics for the selected columns of a dataframe.

    One row is shown per selected column (in the column order of ``df``) with its
    mean, standard deviation, the three quartiles and the variance, all rounded
    to four decimals.

    Args:
        df: The dataframe holding the data.
        columns: Names of the columns to summarise; every other column is ignored.
    """
    selected = df[[name for name in df.columns if name in columns]]
    summary = pd.DataFrame(
        {
            "mean": selected.mean(),
            "std": selected.std(),
            "q1": selected.quantile(q=0.25),
            "q2": selected.quantile(q=0.5),
            "q3": selected.quantile(q=0.75),
            "variance": selected.var(),
        }
    )
    display(summary.round(4))

def display_box_plot(df: pd.DataFrame, layout: dict, title: str, labels: list) -> None:
    """Draw box plots of the given columns, coloured by year, and show the figure.

    Args:
        df: The dataframe to plot; it is read through the "year" column and the columns in ``labels``.
        layout: Plotly layout settings applied to the figure.
        title: The title of the figure.
        labels: The columns plotted on the y-axis.
            NOTE(review): the same list is forwarded as the plotly express ``labels``
            argument, which is documented as a dict — confirm this is intended.
    """
    box_figure = px.box(
        df,
        y=labels,
        color="year",
        title=title,
        labels=labels,
    )
    box_figure.update_layout(layout)
    box_figure.show()

def display_bar_plot(df: pd.DataFrame, layout: dict, title: str, labels: list) -> None:
    """Draw a grouped bar plot of the given columns per year and show the figure.

    Args:
        df: The dataframe to plot; it is read through the "year" column and the columns in ``labels``.
        layout: Base Plotly layout settings; the dict passed in is not modified.
        title: Used as the y-axis title (the figure itself gets no title).
        labels: The columns plotted on the y-axis.
            NOTE(review): the same list is forwarded as the plotly express ``labels``
            argument, which is documented as a dict — confirm this is intended.
    """
    bar_figure = px.bar(df, x="year", y=labels, labels=labels)
    # Shallow copy of the base layout with the bar-specific overrides on top
    bar_layout = {
        **layout,
        "barmode": "group",
        "yaxis": dict(range=[1, 350], dtick=50, title=title),
    }
    bar_figure.update_layout(bar_layout)
    bar_figure.show()

def display_correlation_matrix(df: pd.DataFrame, filter=None, threshold=0.25) -> None:
    """Display the Kendall correlation matrix of the numeric columns as a styled table.

    Coefficients equal to 1.0 are blanked out, and cells whose absolute value
    exceeds ``threshold`` are rendered in bold.

    Args:
        df: The dataframe to create the correlation matrix from.
        filter: Column labels to drop from the matrix columns (the rows are kept).
        threshold: Absolute correlation above which a cell is shown in bold.
    """
    def dfstyle(val):
        # Bold for non-float cells and for coefficients beyond the threshold;
        # NaN never compares greater than the threshold, so masked cells stay normal.
        return [
            "font-weight: bold" if (not isinstance(v, float) or abs(v) > threshold) else "font-weight: normal"
            for v in val
        ]

    # Kendall is selected because it makes no assumptions about the data distributions,
    # and it is robust to outliers and non-linearity.
    correlation = df.corr(method="kendall", numeric_only=True)
    if filter:
        correlation = correlation.drop(columns=filter)
    correlation = correlation.mask(correlation == 1.0)
    display(correlation.style.apply(dfstyle))

def display_correlation_heatmap(df: pd.DataFrame, title: str, show_title_plot=False, filter=None) -> None:
    """Create a Kendall correlation heatmap, save it as a PDF and show it.

    The figure is written to ``latex/plots/<title>.pdf``; the target directory is
    created when it does not exist yet.

    Args:
        df: The dataframe whose numeric columns are correlated.
        title: Used for the output file name and, optionally, as the figure title.
        show_title_plot: Whether to render ``title`` on the figure itself.
        filter: Labels to drop from both the rows and the columns of the matrix.
    """
    corr_df = df.corr(method="kendall", numeric_only=True).round(2)
    if filter:
        corr_df = corr_df.drop(index=filter, columns=filter)
    # TODO: Mark the 'significant' results bold face?
    fig = px.imshow(
        corr_df,
        text_auto=True,
        zmin=-1,
        zmax=1,
        color_continuous_scale="Viridis",
        title=title if show_title_plot else None,
    )
    output_file = f"latex/plots/{title}.pdf"
    # write_image does not create missing directories, so make sure the target exists
    Path(output_file).parent.mkdir(parents=True, exist_ok=True)
    fig.write_image(output_file)
    fig.show()

def display_covariance_heatmap(df: pd.DataFrame, title: str, filter=None) -> None:
    """Create a covariance heatmap from the selected data and show it.

    The diagonal (the variances) is set to zero so that only the covariances
    between different columns are shown.

    Args:
        df: The dataframe whose numeric columns are compared.
        title: The title of the figure.
        filter: Labels to drop from both the rows and the columns of the matrix.
    """
    cov_df = df.cov(numeric_only=True).round(2)
    if filter:
        cov_df = cov_df.drop(index=filter, columns=filter)
    # Mask the variance on an explicit copy: writing through `.values` only works while
    # pandas hands out a writable view, and fails on the read-only arrays returned
    # under Copy-on-Write.
    masked = cov_df.to_numpy(copy=True)
    np.fill_diagonal(masked, 0)
    cov_df = pd.DataFrame(masked, index=cov_df.index, columns=cov_df.columns)
    fig = px.imshow(cov_df, text_auto=True, color_continuous_scale="Viridis", title=title)
    fig.show()

## General Metrics

In this section of the notebook, we extract general metrics over all sources, such as the implementation rate and data set statistics. We also count the number of theoretical papers and *drop them from the dataset*, as they are not part of any further analysis.



In [83]:
# Create a general dataset statistics table
header = source_order
years = list(df["year"].unique())


def _paper_counts(subset: pd.DataFrame) -> str:
    """Format a subset as "<applicable> (<theoretical>)" paper counts for the summary table."""
    applicable_count = len(subset[subset["theoretical"] == False])
    theoretical_count = len(subset[subset["theoretical"] == True])
    return f"{applicable_count} ({theoretical_count})"


# One row per year with one cell per venue, followed by a total row over all years
values = [
    [_paper_counts(df[(df["source"] == source) & (df["year"] == year)]) for source in header]
    for year in years
]
values.append([_paper_counts(df[df["source"] == source]) for source in header])

# Counts are computed outside the f-strings: reusing the enclosing quote type inside
# an f-string is a syntax error before Python 3.12.
applicable_total = len(df[df["theoretical"] == False])
gen_stats = latex.latex_table(columns=header, rows=years + ["Total"], values=values, label="tab:data_summary", column_char="r",
                              caption="A summary of the collected data, where the first number represents the number of applicable reviews "
                                      "and the number of excluded (theoretical) papers is represented in brackets. "
                                      f"We collected {applicable_total} applicable reviews out of {len(df)} total papers reviewed.")
latex.compile_latex(gen_stats, "tables/general_statistics")

# Measure the amount of theoretical papers that we will have to discount
theoretical_papers = df["theoretical"].sum()
print(f"Total number of theoretical papers: {theoretical_papers}. Percentage: {(theoretical_papers / len(df) * 100).round(4)}%")

# Keep only the applicable (non-theoretical) papers; build a new frame instead of
# mutating a filtered selection in place.
df = df[df["theoretical"] == False].drop(columns=["theoretical"])
print(f"Total number of applicable papers: {len(df)}")

# Measure the amount of papers that have implementation URLS
implementation_url_count = df["implementation_url"].sum()
print(f"Total number of implementation URLs: {implementation_url_count}. Percentage: {(implementation_url_count / len(df) * 100).round(4)}%")

# Measure the amount of data sets used and how many were public vs private
avg = (df["public_datasets"].sum() / df["total_datasets"].sum() * 100).round(2)
title = f"Public data sets vs Total Data sets ({avg}%)"
plot = px.bar(df, x="public_datasets", y="total_datasets", title=title)
plot.show()

# NOTE(review): a paper with zero data sets in total also has public_datasets == 0 and
# is counted here as "only private data" — confirm this cannot occur in the data.
private_only_count = len(df[df["public_datasets"] == 0])
print("Number of papers / percentage that only use private data:", private_only_count, round(private_only_count / len(df) * 100, 2))

# Replace the two raw data set counts by the fraction of public data sets
df = (
    df
    .assign(public_data_fraction=df["public_datasets"] / df["total_datasets"])
    .drop(columns=["public_datasets", "total_datasets"])
)

# Calculate the papers with implementation url and cost >= 5 based on the guideline
bad_url_count = len(df[(df["implementation_url"] == True) & (df["Implementation"] >= 5)])
print(f"The bad URL count is: {bad_url_count}")

# Calculate the average rating for each paper (Excluding expertise)
df = df.assign(Average=df[review_categories[0:-1]].mean(axis=1))

Total number of theoretical papers: 134. Percentage: 14.7091%
Total number of applicable papers: 777
Total number of implementation URLs: 477. Percentage: 61.39%


Number of papers / percentage that only use private data: 31 3.99
The bad URL count is: 47


Now we verify the distribution of the reviews, to check which statistical tests can be applied

In [84]:
# Import the submodule explicitly: a bare `import scipy` does not guarantee that
# `scipy.stats` is available as an attribute on every SciPy version.
import scipy.stats

# Check the normality distribution of every review dimension (excluding Expertise) per venue
for source in source_order:
    print(f"{source}:")
    # Select the venue once instead of once per dimension
    source_reviews = df[df["source"] == source]
    for dimension in review_categories[0:-1]:
        normality_assumption = scipy.stats.normaltest(source_reviews[dimension].to_numpy())
        if normality_assumption.pvalue < 0.05:
            print(f"\t{source} {dimension} is NOT normally distributed ({normality_assumption.statistic:.2f}, {normality_assumption.pvalue:.2f})")
        else:
            print(f"\t{source} {dimension} IS normally distributed ({normality_assumption})")

AAAI:
	AAAI Implementation is NOT normally distributed (1821.31, 0.00)
	AAAI Data is NOT normally distributed (36.11, 0.00)
	AAAI Configuration is NOT normally distributed (15.86, 0.00)
	AAAI Experimental Procedure is NOT normally distributed (36.61, 0.00)
IJCAI:
	IJCAI Implementation is NOT normally distributed (1429.60, 0.00)
	IJCAI Data is NOT normally distributed (37.79, 0.00)
	IJCAI Configuration is NOT normally distributed (20.90, 0.00)
	IJCAI Experimental Procedure is NOT normally distributed (66.52, 0.00)
ICML:
	ICML Implementation is NOT normally distributed (400.27, 0.00)
	ICML Data is NOT normally distributed (12.57, 0.00)
	ICML Configuration is NOT normally distributed (55.77, 0.00)
	ICML Experimental Procedure is NOT normally distributed (33.17, 0.00)
NeurIPS:
	NeurIPS Implementation is NOT normally distributed (23.17, 0.00)
	NeurIPS Data is NOT normally distributed (33.66, 0.00)
	NeurIPS Configuration is NOT normally distributed (35.49, 0.00)
	NeurIPS Experimental Procedu

We find that none of the review dimensions are normally distributed for any venue; thus we use the KS test and the permutation test, as they make no assumptions regarding the distributions.

Now we first conduct an analysis on all award-winning papers, and *drop them from the dataset afterwards*.

In [85]:
# Compare award papers per source and check if they are in or out of distribution
import itertools
import scipy.stats  # explicit submodule import; a bare `import scipy` may not expose `scipy.stats`

conference_source_order = ["AAAI", "IJCAI", "ICML", "NeurIPS"]  # Not all venues were applicable for awards
years = df["year"].unique()

all_combinations = [(source, year) for source, year in itertools.product(source_order, years)]
all_combination_title = [f"{source}, {year}" for source, year in all_combinations]

# Fixed seed so that the resampled permutation p-values are reproducible across runs
permutation_rng = np.random.default_rng(0)


def statistic_func(x, y, axis):
    """Difference of the sample means, vectorised over `axis` for the permutation test."""
    return np.mean(x, axis=axis) - np.mean(y, axis=axis)


def _format_test_result(statistic: float, pvalue: float) -> str:
    """Format a test result as "statistic (p-value)", with significant statistics (p < 0.05) in boldface."""
    if pvalue < 0.05:
        return "\\textbf{" + f"{statistic:.2f}" + "} " + f"({pvalue:.2f})"
    return f"{statistic:.2f} ({pvalue:.2f})"


ks_row = []
permutation_row = []
for source in conference_source_order:
    # Split the average cost of this venue's papers by whether the paper received an award
    source_reviews = df[df["source"] == source]
    awarded = source_reviews[source_reviews["awards"].notna()]["Average"].to_numpy()
    not_awarded = source_reviews[source_reviews["awards"].isna()]["Average"].to_numpy()

    kstest_res = scipy.stats.kstest(not_awarded, awarded)
    ks_row.append(_format_test_result(kstest_res.statistic, kstest_res.pvalue))

    # One-sided test: is the mean cost of awarded papers lower than that of the other papers?
    permutation_test_res = scipy.stats.permutation_test([awarded, not_awarded],
                                                        statistic_func,
                                                        alternative="less",
                                                        random_state=permutation_rng)
    if permutation_test_res.pvalue < 0.05:
        print(f"Awards of {source} are significantly less costly ({permutation_test_res.statistic:.2f}, {permutation_test_res.pvalue:.2f})")
    permutation_row.append(_format_test_result(permutation_test_res.statistic, permutation_test_res.pvalue))

ks_table = latex.latex_table(conference_source_order, ["KS-Test result", "Permutation test result"], [ks_row, permutation_row], column_char="r",
                             caption="Kolmogorov--Smirnov test and Permutation test per source between papers with and without awards. The P-values are in brackets. The statistically significant results are highlighted in \\textbf{bold}.",
                             label="tab:ks_test_sources_awards")

latex.compile_latex(ks_table, "tables/ks_test_awards")

# Remove awards from population
df = df[df["awards"].isna()]

Awards of IJCAI are significantly less costly (-1.46, 0.01)


After reading the papers, we had a sneaking suspicion... 

In [91]:
# Random statistic for fun
def average_title_length(df: pd.DataFrame, source: str, year: int) -> float:
    """Return the mean title length of the papers of one venue and year.

    Args:
        df: The reviews; it is read through the "source", "year" and "title" columns.
        source: The venue to select.
        year: The publication year to select.

    Returns:
        The mean number of characters per title, rounded to two decimals
        (NaN when no paper matches). Missing titles are skipped.
    """
    titles = df.query(f"source == '{source}' and year == {year}")["title"]
    return round(titles.str.len().mean(), 2)
# Report the average title length per venue (in alphabetical order) for each reviewed year
for venue in ("AAAI", "ICML", "IJCAI", "JAIR", "JMLR", "NeurIPS"):
    print(f"{venue} average title lengths over 3 years:",
          *(average_title_length(df, venue, review_year) for review_year in (2022, 2023, 2024)))


AAAI average title lengths over 3 years: 76.18 74.6 82.33
ICML average title lengths over 3 years: 67.62 72.87 74.6
IJCAI average title lengths over 3 years: 74.0 73.9 78.07
JAIR average title lengths over 3 years: 71.49 74.86 71.94
JMLR average title lengths over 3 years: 73.15 68.21 71.39
NeurIPS average title lengths over 3 years: 63.42 75.02 73.47


Interesting! But let's not delve into the statistical significance of it ;)

Now we calculate the average and standard deviations of each venue/dimension combination and create several tables to use for the paper

In [98]:
# Correlation heatmap over all venues, with shorter display names for two columns
heatmap_column_names = {"implementation_url": "URL", "public_data_fraction": "Public Data"}
display_correlation_heatmap(
    df.rename(columns=heatmap_column_names),
    "Correlation Heatmap",
    filter=["Average", "year"],
)



Support for Kaleido versions less than 1.0.0 is deprecated and will be removed after September 2025.
Please upgrade Kaleido to version 1.0.0 or greater (`pip install 'kaleido>=1.0.0'` or `pip install 'plotly[kaleido]'`).




In [99]:
# Create a correlation heatmap per source as subplots
# NOTE(review): this module-level name shadows the builtin `filter` for the rest of the notebook.
filter = ["year", "Average"]
correlation_display_names = {"implementation_url": "URL", "public_data_fraction": "Public Data"}

# Arrange subfigures axis for paper
#enable_axis = [(True, False), (False, False), (True, False), (False, False), (True, True), (False, True)]
for index, source in enumerate(source_order):
    source_reviews = df[df["source"] == source]
    # Kendall correlation of this venue's reviews, without the filtered labels and with display names
    corr_df = (
        source_reviews
        .corr(method="kendall", numeric_only=True)
        .round(2)
        .drop(index=filter, columns=filter)
        .rename(index=correlation_display_names, columns=correlation_display_names)
    )
    sub_fig = px.imshow(
        corr_df,
        text_auto=True,
        zmin=-1,
        zmax=1,
        color_continuous_scale="Viridis",
        title=f"{source}",
    )
    #if not enable_axis[index][0]:
    #    sub_fig.update_yaxes(showticklabels=False)
    #if not enable_axis[index][1]:
    #    sub_fig.update_xaxes(showticklabels=False)
    sub_fig.write_image(f"latex/plots/{source}_correlation.pdf")
# NOTE(review): show() runs after the loop, so only the heatmap of the last source is
# displayed in the notebook (every source is still written to PDF) — confirm this is intended.
sub_fig.show()



Support for Kaleido versions less than 1.0.0 is deprecated and will be removed after September 2025.
Please upgrade Kaleido to version 1.0.0 or greater (`pip install 'kaleido>=1.0.0'` or `pip install 'plotly[kaleido]'`).




Support for Kaleido versions less than 1.0.0 is deprecated and will be removed after September 2025.
Please upgrade Kaleido to version 1.0.0 or greater (`pip install 'kaleido>=1.0.0'` or `pip install 'plotly[kaleido]'`).




Support for Kaleido versions less than 1.0.0 is deprecated and will be removed after September 2025.
Please upgrade Kaleido to version 1.0.0 or greater (`pip install 'kaleido>=1.0.0'` or `pip install 'plotly[kaleido]'`).




Support for Kaleido versions less than 1.0.0 is deprecated and will be removed after September 2025.
Please upgrade Kaleido to version 1.0.0 or greater (`pip install 'kaleido>=1.0.0'` or `pip install 'plotly[kaleido]'`).




Support for Kaleido versions less than 1.0.0 is deprecated and will be removed after September 202