In [103]:
# Load the data
from collections import Counter
from pathlib import Path

import numpy as np
import pandas as pd

data_csv = Path("reviews.csv")

# Compile the review CSV on demand when it does not exist yet.
if not data_csv.exists():
    from result_compiler import main as data_compiler
    data_compiler()

df = pd.read_csv(data_csv)

# Sort the sources in the DataFrame as AAAI, IJCAI then ICML, NeurIPS and JAIR, JMLR
source_order = ["AAAI", "IJCAI", "ICML", "NeurIPS", "JAIR", "JMLR"]
df["source"] = pd.Categorical(df["source"], source_order)
# Drop the index column (rebinding instead of mutating in place keeps the
# cell safe to reason about when it is re-run).
df = df.drop(columns="index")
# Drop the theoretical papers. The explicit copy makes `df` an independent
# frame, so adding columns to it later does not write into a slice of the
# frame returned by read_csv (pandas' SettingWithCopyWarning).
df = df[df["theoretical"] == False].copy()  # noqa: E712 - element-wise comparison

# Load the enriched data
enriched_data_csv = Path("enriched_data.csv")
enriched_df = pd.read_csv(enriched_data_csv)

# One title per line, surrounding whitespace removed. Reading inside a
# context manager guarantees the file handle is closed again.
with Path("unfindable.txt").open() as unfindable_file:
    missing_titles_enriched = [line.strip() for line in unfindable_file]

In [104]:
# Create a mapping between titles
import difflib

# Titles of the enriched data, kept in their original order so the fuzzy
# matching sees the same candidates as when given the column itself.
enriched_titles = enriched_df["title"].tolist()
# NOTE: `title in enriched_df["title"]` tests the Series *index labels*, not
# the stored titles, so exact matches are checked against a set of the values.
enriched_title_set = set(enriched_titles)
unfindable_titles = set(missing_titles_enriched)

# Maps a title from `df` to the title of its counterpart in `enriched_df`.
mapping_to_enriched_title = {}
for title in df["title"]:
    if title in unfindable_titles:  # Is not present, was not findable on scopus
        continue
    if title in enriched_title_set:  # Easy case: identical title
        mapping_to_enriched_title[title] = title
        continue
    # Fall back to the best fuzzy match with a similarity of at least 0.6.
    close_matches = difflib.get_close_matches(title, enriched_titles, cutoff=0.6)
    if not close_matches:
        print("Could not match:", title)
        continue
    mapping_to_enriched_title[title] = close_matches[0]

# Number of mapped titles vs. number of distinct enriched titles they map to;
# a difference means several titles share one enriched title.
print(len(mapping_to_enriched_title))
print(len(set(mapping_to_enriched_title.values())))

Could not match: Entropy Estimation via Normalizing Flow
Could not match: SVFI: Spiking-Based Video Frame Interpolation for High-Speed Motion
Could not match: Align, Perturb and Decouple: Toward Better Leverage of Difference Information for RSI Change Detection
Could not match: Convergence in Multi-Issue Iterative Voting under Uncertainty
Could not match: HEBO: Pushing The Limits of Sample-Efficient Hyper-parameter Optimisation
Could not match: Underspecification Presents Challenges for Credibility in Modern Machine Learning
Could not match: On the Effect of Initialization: The Scaling Path of 2-Layer Neural Networks
Could not match: Nonparametric Inference under B-bits Quantization
Could not match: Mean-Square Analysis of Discretized ItÃ´ Diffusions for Heavy-tailed Sampling
Could not match: Convergence for nonconvex ADMM, with applications to CT imaging
Could not match: Localized Debiased Machine Learning: Efficient Inference on Quantile Treatment Effects and Beyond
Could not match: 

In [105]:
# Print every title whose mapped enriched title is also the target of at
# least one other title. Counting the targets once up front avoids rescanning
# all mapping values for each key.
enriched_title_usage = Counter(mapping_to_enriched_title.values())
for key, value in mapping_to_enriched_title.items():
    if enriched_title_usage[value] > 1:
        print(key, ":", value)
        

In [106]:
# Create the citation-count and country columns, initially empty (None) for every row
for enrichment_column in ("citation_count", "country"):
    df[enrichment_column] = None

def most_frequent(l: list):
    """Return the element that occurs most often in ``l``.

    Ties are resolved in favour of the element that appears first in ``l``.
    Taking the maximum over ``set(l)`` instead would break ties by set
    iteration order, which for strings changes between interpreter runs.

    Args:
        l: Non-empty list of hashable elements.

    Returns:
        The most common element of ``l``.

    Raises:
        ValueError: If ``l`` is empty.
    """
    # Counter keeps first-seen order and max() returns the first maximal
    # item, so equally frequent elements are ranked by position in ``l``.
    counts = Counter(l)
    return max(counts, key=counts.__getitem__)

# Copy the citation count and the most frequent affiliation country from the
# matched enriched row onto every row of `df` carrying that title.
for title, mapping in mapping_to_enriched_title.items():
    # All enriched rows with the matched title; `.values[0]` below takes the first one.
    enriched_row = enriched_df[enriched_df["title"] == mapping]
    # Row mask computed once and reused for both column assignments.
    review_rows = df["title"] == title
    df.loc[review_rows, "citation_count"] = enriched_row["citedby_count"].values[0]
    aff = enriched_row["affiliation_country"].values[0]
    # Non-string values (presumably NaN for a missing affiliation) leave the country unset.
    if isinstance(aff, str):
        countries = aff.split(";")
        df.loc[review_rows, "country"] = most_frequent(countries)  # Most frequent country affiliation

In [107]:
# Per-paper mean over the review categories; the trailing "Expertise" entry is excluded
review_categories = ["Implementation", "Data", "Configuration", "Experimental Procedure", "Expertise"]
rated_categories = review_categories[:-1]
df["Average"] = df[rated_categories].mean(axis="columns")

In [108]:
# Remove every column that should not enter the correlation analysis.
df_reduce = df.drop(
    columns=[
        'source', 'year', 'title', 'authors', 'keywords', 'pdf_path', 'awards',
        'theoretical', 'implementation_url', 'public_datasets', 'total_datasets',
        'Implementation', 'Data', 'Configuration', 'Experimental Procedure', 'Expertise',
    ]
)
# Only use the ones with citation values. The copy makes the column assignment
# below write to an independent frame instead of a slice (SettingWithCopyWarning).
df_reduce = df_reduce[df_reduce["citation_count"].notna()].copy()
# Encode each distinct country value as an integer id, numbered in order of
# first appearance.
# NOTE(review): these ids are arbitrary labels, so a rank correlation computed
# on them depends on that ordering - interpret the country column with care.
country_id = {v: i for i, v in enumerate(df_reduce["country"].unique())}
# Every value in the column is a key of `country_id` by construction, so a
# plain lookup via map() suffices and avoids the downcasting FutureWarning
# that Series.replace() emits for this object -> integer conversion.
# NOTE(review): confirm rows without a country (None) still receive their id.
df_reduce["country"] = df_reduce["country"].map(country_id)

correlation = df_reduce.corr(method="kendall")
print(correlation)

                citation_count   country   Average
citation_count        1.000000 -0.063923  0.045194
country              -0.063923  1.000000 -0.161919
Average               0.045194 -0.161919  1.000000


  df_reduce["country"] = df_reduce["country"].replace(country_id)


# SCOPUS Data: Our thoughts on this study

We also conducted a brief analysis in which we enriched our collected data with data from SCOPUS, using Pybliometrics. We were able to enrich 854 reviews. From this data, we used the citation count and the most frequent country among the authors' affiliations to calculate the Kendall correlation with the average rating. We found that the citation count shows practically no correlation with the average rating ($0.05$), whereas the affiliation country shows only a weak negative correlation ($-0.16$). We recognise that claims based on this meta-information could easily be misunderstood, particularly where the correlation involves sensitive information such as the country of affiliation, and we therefore do not attach any claims or conclusions to it.