# Bar Graph Classification

Teng-Jui Lin

- Created: 2023-05-27
- Edited: 2024-09-28

In [None]:
from bar_constants import *
set_save_fig_rc()
%config InlineBackend.figure_format = 'retina'
%matplotlib inline

## Calculate article statistics

### Import Zotero article data

In [None]:
# import zotero article data (one csv export per journal)
# read every file first and concatenate once, instead of re-copying the
# accumulated dataframe on each loop iteration
journal_dfs_ = [
    pd.read_csv(os.path.join(ZOTERO_PATH, f'{journal}.csv'))
    for journal in JOURNALS
]
# the leading empty dataframe keeps the result defined when JOURNALS is empty
articles_df = pd.concat([pd.DataFrame(), *journal_dfs_])
articles_df.shape

### Calculate number of authors

In [None]:
# split the ';'-delimited author string into one column per author
authors_df_ = articles_df['Author'].str.split(';', expand=True)
# the number of authors is the number of non-missing cells in each row
num_authors_series = authors_df_.notna().sum(axis=1)
articles_df[NUM_AUTHORS_LABEL] = num_authors_series

In [None]:
# count the words in each article title
NUM_WORDS_TITLE_LABEL = 'Title Word Count'
# split on any run of whitespace instead of on a single space, so that
# repeated, leading, or trailing spaces do not produce empty "words"
title_df_ = articles_df['Title'].str.split(expand=True)
# the number of words is the number of non-missing cells in each row
title_series = title_df_.notna().sum(axis=1)
articles_df[NUM_WORDS_TITLE_LABEL] = title_series

### Add article statistics labels

In [None]:
def assign_boolean(df, regex, label, assigned_bool):
    """Set ``df[label]`` to ``assigned_bool`` on rows whose manual tags match ``regex``.

    The match is done with ``Series.str.contains`` on the ``MANUAL_TAGS_LABEL``
    column. ``df`` is modified in place; nothing is returned.

    Rows with missing manual tags are treated as not matching, so they keep
    their current value instead of making the boolean mask invalid.
    """
    # na=False: a missing tag string would otherwise put NaN into the mask,
    # which .loc refuses to use for boolean indexing
    tag_matches = df[MANUAL_TAGS_LABEL].str.contains(regex, na=False)
    df.loc[tag_matches, label] = assigned_bool

In [None]:
# mutually exclusive label: does the article have at least 1 bar graph?
# every article starts as True; the flag is cleared for articles whose
# manual tags match NO_BAR_GRAPH_REGEX_LABEL
articles_df[HAS_BAR_GRAPH_LABEL] = True
assign_boolean(
    df=articles_df,
    regex=NO_BAR_GRAPH_REGEX_LABEL,
    label=HAS_BAR_GRAPH_LABEL,
    assigned_bool=False,
)

In [None]:
# mutually exclusive labels: does the article have at least 1 misused bar graph?
# kept as two separate columns because articles without any bar graph
# belong to neither category
misuse_labels_ = [
    (NO_MISUSE_LABEL, NO_MISUSE_REGEX_LABEL),
    (HAS_MISUSE_LABEL, HAS_MISUSE_REGEX_LABEL),
]
for label_, regex_ in misuse_labels_:
    articles_df[label_] = False
    assign_boolean(articles_df, regex_, label_, True)

In [None]:
# non-mutually exclusive labels, one boolean column per problem type
# (zero, log, others); one article can carry several of them
problem_labels_ = [
    (ZERO_PROBLEM_LABEL, ZERO_PROBLEM_REGEX_LABEL),
    (LOG_PROBLEM_LABEL, LOG_PROBLEM_REGEX_LABEL),
    (OTHER_PROBLEM_LABEL, OTHER_PROBLEM_REGEX_LABEL),
]
for label_, regex_ in problem_labels_:
    articles_df[label_] = False
    assign_boolean(articles_df, regex_, label_, True)

In [None]:
# write the labeled article table to csv
articles_df.to_csv(path_or_buf=articles_df_filepath)

### Calculate cumulative article statistics

In [None]:
def get_article_stats(df, cond):
    """Return the per-publication article count for the rows of ``df`` selected by ``cond``.

    ``cond`` is a boolean mask over ``df``. The result is a Series indexed by
    the values of the ``PUBLICATION_LABEL`` column.
    """
    selected_df = df[cond]
    counts_per_publication = selected_df.groupby(PUBLICATION_LABEL).count()
    # count() gives the non-missing values of every column; the first column
    # is taken as the article count
    # NOTE(review): presumably the first column has no missing values - confirm
    return counts_per_publication.iloc[:, 0]

In [None]:
# number of articles per publication, overall and for each article label
articles_stat_df = pd.DataFrame()
articles_stat_df[NUM_ARTICLES_LABEL] = articles_df.groupby(PUBLICATION_LABEL).count().iloc[:, 0]
# each entry: (count column to create, boolean mask selecting the articles)
stat_conditions_ = [
    (NUM_ARTICLES_WITH_BAR_GRAPH_LABEL, articles_df[HAS_BAR_GRAPH_LABEL] == True),
    # spelled `== False` rather than `~column == True`, which gives the right
    # mask only because `~` binds tighter than `==`
    (NUM_ARTICLES_WITHOUT_BAR_GRAPH_LABEL, articles_df[HAS_BAR_GRAPH_LABEL] == False),
    (NUM_ARTICLES_CORRECT_BAR_GRAPH_LABEL, articles_df[NO_MISUSE_LABEL] == True),
    (NUM_ARTICLES_INCORRECT_BAR_GRAPH_LABEL, articles_df[HAS_MISUSE_LABEL] == True),
    (NUM_ARTICLES_ZERO_PROBLEM_LABEL, articles_df[ZERO_PROBLEM_LABEL] == True),
    (NUM_ARTICLES_LOG_PROBLEM_LABEL, articles_df[LOG_PROBLEM_LABEL] == True),
    (NUM_ARTICLES_OTHER_PROBLEM_LABEL, articles_df[OTHER_PROBLEM_LABEL] == True),
]
for stat_label_, cond_ in stat_conditions_:
    articles_stat_df[stat_label_] = get_article_stats(articles_df, cond_)
# a publication without any article in a category is absent from that count
articles_stat_df = articles_stat_df.fillna(0)
# append a row holding the column sums over all publications
articles_stat_df.loc[TOTAL_LABEL, :] = articles_stat_df.sum(axis=0)

In [None]:
# turn the counts into percentages
# each entry: (percentage column to create, count column, count column used as denominator)
percent_specs_ = [
    (PERCENT_ARTICLES_WITHOUT_BAR_GRAPH_LABEL, NUM_ARTICLES_WITHOUT_BAR_GRAPH_LABEL, NUM_ARTICLES_LABEL),
    (PERCENT_ARTICLES_WITH_BAR_GRAPH_LABEL, NUM_ARTICLES_WITH_BAR_GRAPH_LABEL, NUM_ARTICLES_LABEL),
    (PERCENT_ARTICLES_CORRECT_BAR_GRAPH_LABEL, NUM_ARTICLES_CORRECT_BAR_GRAPH_LABEL, NUM_ARTICLES_WITH_BAR_GRAPH_LABEL),
    (PERCENT_ARTICLES_INCORRECT_BAR_GRAPH_LABEL, NUM_ARTICLES_INCORRECT_BAR_GRAPH_LABEL, NUM_ARTICLES_WITH_BAR_GRAPH_LABEL),
    (PERCENT_ARTICLES_ZERO_PROBLEM_LABEL, NUM_ARTICLES_ZERO_PROBLEM_LABEL, NUM_ARTICLES_INCORRECT_BAR_GRAPH_LABEL),
    (PERCENT_ARTICLES_LOG_PROBLEM_LABEL, NUM_ARTICLES_LOG_PROBLEM_LABEL, NUM_ARTICLES_INCORRECT_BAR_GRAPH_LABEL),
    (PERCENT_ARTICLES_OTHER_PROBLEM_LABEL, NUM_ARTICLES_OTHER_PROBLEM_LABEL, NUM_ARTICLES_INCORRECT_BAR_GRAPH_LABEL),
]
for percent_label_, count_label_, base_label_ in percent_specs_:
    articles_stat_df[percent_label_] = articles_stat_df[count_label_] / articles_stat_df[base_label_] * 100

In [None]:
# set the total row aside so that it does not take part in the sorting
articles_stat_total_series = articles_stat_df.loc[TOTAL_LABEL]
articles_stat_df = articles_stat_df.drop(index=TOTAL_LABEL)

In [None]:
# order publications by their percentage of articles with misused bar
# graphs (largest first), then put the total row back as the last row
articles_stat_df = articles_stat_df.sort_values(
    by=PERCENT_ARTICLES_INCORRECT_BAR_GRAPH_LABEL,
    ascending=False,
)
articles_stat_df.loc[TOTAL_LABEL, :] = articles_stat_total_series
articles_stat_df

In [None]:
# write the per-publication statistics table to csv
articles_stat_df.to_csv(path_or_buf=articles_stat_df_filepath)

### Extract sub-dataframes of interest and key statistics

In [None]:
# column groups of the statistics table used by the plots below
# share of articles with / without bar graphs
percent_bar_df = articles_stat_df[[
    PERCENT_ARTICLES_WITH_BAR_GRAPH_LABEL,
    PERCENT_ARTICLES_WITHOUT_BAR_GRAPH_LABEL,
]]
# share of articles with / without misused bar graphs
percent_bar_correct_df = articles_stat_df[[
    PERCENT_ARTICLES_INCORRECT_BAR_GRAPH_LABEL,
    PERCENT_ARTICLES_CORRECT_BAR_GRAPH_LABEL,
]]
# share of each problem type
percent_bar_incorrect_df = articles_stat_df[[
    PERCENT_ARTICLES_ZERO_PROBLEM_LABEL,
    PERCENT_ARTICLES_LOG_PROBLEM_LABEL,
    PERCENT_ARTICLES_OTHER_PROBLEM_LABEL,
]]
# article counts that serve as the sample size n of each panel
num_total_articles_series = articles_stat_df[NUM_ARTICLES_LABEL]
num_articles_bar_graph_series = articles_stat_df[NUM_ARTICLES_WITH_BAR_GRAPH_LABEL]
num_articles_misused_bar_graph_series = articles_stat_df[NUM_ARTICLES_INCORRECT_BAR_GRAPH_LABEL]

In [None]:
# save raw data to csv, one file per extracted table / series
csv_outputs_ = [
    (percent_bar_df, percent_bar_df_filepath),
    (percent_bar_correct_df, percent_bar_correct_df_filepath),
    (percent_bar_incorrect_df, percent_bar_incorrect_df_filepath),
    (num_total_articles_series, num_total_articles_series_filepath),
    (num_articles_bar_graph_series, num_articles_bar_graph_series_filepath),
    (num_articles_misused_bar_graph_series, num_articles_misused_bar_graph_series_filepath),
]
for table_, csv_path_ in csv_outputs_:
    table_.to_csv(csv_path_)

In [None]:
# overall share of articles with / without bar graphs
# mutually exclusive categories; percentages relative to the number of all articles
percent_bar_df.loc[TOTAL_LABEL]

In [None]:
# overall share of articles with / without misused bar graphs
# mutually exclusive categories; percentages relative to the number of
# articles with at least 1 bar graph
percent_bar_correct_df.loc[TOTAL_LABEL]

In [None]:
# overall share of each problem type
# non-mutually exclusive categories; percentages relative to the number of
# articles with at least 1 incorrect bar graph
# these percentages need not sum to 100 because one article can have
# several types of problems
percent_bar_incorrect_df.loc[TOTAL_LABEL]

### Visualize prevalence of misused bar graph

In [None]:
alpha = 0.85
fig, axs = plt.subplots(1, 3, figsize=(15, 5), sharey=True)
# one horizontal bar chart per subplot
# each entry: (percentage table, stack the bars?, draw the default legend?, bar colors)
bar_panels_ = [
    (
        percent_bar_df,
        True,
        False,
        [ARTICLES_WITH_BAR_GRAPH_COLOR, ARTICLES_WITHOUT_BAR_GRAPH_COLOR],
    ),
    (
        percent_bar_correct_df,
        True,
        False,
        [ARTICLES_INCORRECT_BAR_GRAPH_COLOR, ARTICLES_CORRECT_BAR_GRAPH_COLOR],
    ),
    (
        percent_bar_incorrect_df,
        False,
        True,
        [ARTICLES_ZERO_PROBLEM_COLOR, ARTICLES_LOG_PROBLEM_COLOR, ARTICLES_OTHER_PROBLEM_COLOR],
    ),
]
for ax_, (panel_df_, stacked_, legend_, colors_) in zip(axs, bar_panels_):
    panel_df_.plot(
        kind='barh',
        stacked=stacked_,
        ax=ax_,
        legend=legend_,
        alpha=alpha,
        color=colors_,
    )

# settings shared by all three subplots: percentage axis from 0 to 100,
# square plotting area
for ax_ in axs:
    ax_.set_xlim(0, 100)
    ax_.set_box_aspect(1)
    ax_.set_xlabel('Percentage')

# individual plot settings: no y axis label on the first subplot
axs[0].set_ylabel('')

# legends, centered above each subplot
# each entry: (axes, legend labels, extra legend keyword arguments)
legend_panels_ = [
    (axs[0], ['Bar graphs', 'No bar graphs'], {}),
    (axs[1], ['Visualization mistake', 'No visualization mistake'], {}),
    (axs[2], [ZERO_LABEL, LOG_LABEL, OTHERS_LABEL], {'ncol': 2}),
]
for ax_, legend_labels_, extra_kwargs_ in legend_panels_:
    ax_.legend(
        legend_labels_,
        bbox_to_anchor=(0.5, 1.02),
        loc='lower center',
        **extra_kwargs_,
    )

# overlay the sample size n of every bar onto subplots 1/3, 2/3 and 3/3
# each entry: (axes, count series, x position of the text, text color)
n_overlay_specs_ = [
    (axs[0], num_total_articles_series, 5, 'w'),
    (axs[1], num_articles_bar_graph_series, 3, 'w'),
    (axs[2], num_articles_misused_bar_graph_series, 87, 'k'),
]
for ax_, count_series_, x_text_, text_color_ in n_overlay_specs_:
    for i, count_ in enumerate(count_series_):
        # a missing count cannot be converted to int; leave that bar
        # unlabeled instead of catching the resulting ValueError
        if pd.isna(count_):
            continue
        ax_.text(x_text_, i - 0.2, f'n = {int(count_)}', fontsize=8, color=text_color_)

# subplot 2/3: dashed vertical line at the misuse percentage of the total row
total_misuse_percentage = percent_bar_correct_df.loc[TOTAL_LABEL, PERCENT_ARTICLES_INCORRECT_BAR_GRAPH_LABEL]
axs[1].axvline(x=total_misuse_percentage, color='k', linewidth=1, linestyle='--')
plt.tight_layout()

## Frequency of mistakes per article

### Optional: Construct structured files for quantity annotation

Run only when needed.

Warning: Setting `generate_annotation` to `True` generates a fresh quantity annotation sheet and overwrites any existing `annotation.xlsx`.

In [None]:
# build one annotation table per (mistake, journal) folder from the names
# of the png figures it contains; tables are only written to disk when
# generate_annotation is True
generate_annotation = False
file_df = pd.DataFrame()
for mistake in MISTAKES:
    for journal in JOURNALS:
        filepath = f'{MISUSED_BAR_FIG_PATH}/{journal}/{mistake}'
        file_list = pd.Series(sorted(os.listdir(filepath)))
        # keep png figures only; match the literal extension, since as a
        # regex the '.' in '.png' would match any character
        file_list = file_list[file_list.str.endswith('.png')]
        file_list = file_list.str.replace('.png', '', regex=False)
        # the file name without extension is used as the figure index
        figidx = file_list.copy()
        file_list = file_list.str.replace('fig', '', regex=False)
        # the remaining name is split at '_' into DOI, mistake and figure id
        new_file_df = file_list.str.split('_', expand=True)
        new_file_df.columns = ['DOI', 'Mistake', 'Fig ID']
        new_file_df['Journal'] = journal
        new_file_df['Fig Index'] = figidx
        new_file_df = new_file_df.set_index('Fig Index')
        if generate_annotation:
            # overwrites an existing annotation sheet in this folder
            new_file_df.to_excel(os.path.join(filepath, 'annotation.xlsx'))

### Import quantity annotation data

In [None]:
# read the annotation sheet of every (mistake, journal) folder and stack
# them in a single concat, instead of re-copying the accumulated dataframe
# on each loop iteration
annotation_dfs_ = [
    pd.read_excel(os.path.join(f'{MISUSED_BAR_FIG_PATH}/{journal}/{mistake}', 'annotation.xlsx'))
    for mistake in MISTAKES
    for journal in JOURNALS
]
# the leading empty dataframe keeps the result defined when there is nothing to read
bar_annot_df = pd.concat([pd.DataFrame(), *annotation_dfs_]).reset_index(drop=True)
bar_annot_df.shape

In [None]:
# write the combined annotation table to csv
bar_annot_df.to_csv(path_or_buf=bar_annot_df_filepath)

### Quantifying graph-level bias

In [None]:
# How many graphs with the same (Mistake, Measurand Level I) appear in the
# same article (DOI)? i.e. how often is a graph with the same mistake
# repeated within one article, and how much graph-level bias can that cause?
# `Journal` is redundant as a grouping key but is kept for later use
graph_bias_count_df = (
    bar_annot_df
    .groupby(['Journal', 'DOI', 'Mistake', 'Measurand Level I'])
    .count()
    .iloc[:, 0]
    .rename('Count')
    .reset_index()
)
graph_bias_count_df

In [None]:
# bin edges at half-integers so that every integer count has its own bin;
# np.arange excludes the stop value, so stop at max + 1.5 to keep the bin
# that holds the largest count
bins = np.arange(0.5, graph_bias_count_df['Count'].max() + 1.5, 1)
fig, ax = plt.subplots(figsize=(5, 5))
sns.histplot(
    graph_bias_count_df,
    x='Count',
    bins=bins,
    ax=ax,
)
ax.set_xlim(0, 25)

In [None]:
# bin edges at half-integers so that every integer count has its own bin;
# np.arange excludes the stop value, so stop at max + 1.5 to keep the bin
# that holds the largest count
bins = np.arange(0.5, graph_bias_count_df['Count'].max() + 1.5, 1)
fig, ax = plt.subplots(figsize=(5, 5))
# same histogram as before, split by mistake type
sns.histplot(
    graph_bias_count_df,
    x='Count',
    hue='Mistake',
    bins=bins,
    hue_order=['log', 'zero'],
    palette=[ARTICLES_LOG_PROBLEM_COLOR, ARTICLES_ZERO_PROBLEM_COLOR],
    ax=ax,
)
ax.set_xlim(0, 25)

In [None]:
# probability plot of the counts against a geometric distribution
# with shape parameter 0.23
axis_max_ = 60
fig, ax = plt.subplots(figsize=(5, 5))
res = scipy.stats.probplot(
    graph_bias_count_df['Count'],
    sparams=(0.23,),
    dist=scipy.stats.geom,
    plot=ax,
)
# dashed identity line as visual reference
ax.plot([0, axis_max_], [0, axis_max_], 'k--')
ax.set_box_aspect(1)
ax.set_aspect('equal')
ax.set_xlim(0, axis_max_)
ax.set_ylim(0, axis_max_)

### Quantifying frequency of making mistakes

In [None]:
# After graph-level bias mitigation, how many times does each article (DOI)
# make each type of mistake? Each row of graph_bias_count_df is counted
# once, regardless of how many graphs it stands for
# `Journal` is redundant as a grouping key but is kept for later use
mistake_count_df = (
    graph_bias_count_df
    .groupby(['Journal', 'DOI', 'Mistake'])['Count']
    .count()
    .reset_index()
)
mistake_count_df

In [None]:
# bin edges at half-integers so that every integer count has its own bin;
# np.arange excludes the stop value, so stop at max + 1.5 to keep the bin
# that holds the largest count
bins = np.arange(0.5, mistake_count_df['Count'].max() + 1.5, 1)
fig, ax = plt.subplots(figsize=(5, 5))
sns.histplot(
    mistake_count_df,
    x='Count',
    bins=bins,
    ax=ax,
)
ax.set_xlim(0, 7)

In [None]:
# bin edges at half-integers so that every integer count has its own bin;
# np.arange excludes the stop value, so stop at max + 1.5 to keep the bin
# that holds the largest count
bins = np.arange(0.5, mistake_count_df['Count'].max() + 1.5, 1)
fig, ax = plt.subplots(figsize=(5, 5))
# same histogram as before, split by mistake type
sns.histplot(
    mistake_count_df,
    x='Count',
    hue='Mistake',
    bins=bins,
    hue_order=['log', 'zero'],
    palette=[ARTICLES_LOG_PROBLEM_COLOR, ARTICLES_ZERO_PROBLEM_COLOR],
    ax=ax,
)
ax.set_xlim(0, 7)

In [None]:
# probability plot of the counts against a geometric distribution
# with shape parameter 0.7
xmax = 8
ymax = xmax
fig, ax = plt.subplots(figsize=(5, 5))
res = scipy.stats.probplot(
    mistake_count_df['Count'],
    sparams=(0.7,),
    dist=scipy.stats.geom,
    plot=ax,
)
# dashed identity line as visual reference
ax.plot([0, xmax], [0, ymax], 'k--')
ax.set_box_aspect(1)
ax.set_aspect('equal')
ax.set_xlim(0, xmax)
ax.set_ylim(0, ymax)

### Visualize frequency of mistakes after graph-level bias mitigation

In [None]:
# bin edges at half-integers so that every integer count has its own bin;
# np.arange excludes the stop value, so stop at max + 1.5 to keep the bin
# that holds the largest count; both histograms share these bins
bins = np.arange(0.5, graph_bias_count_df['Count'].max() + 1.5, 1)
fig, axs = plt.subplots(1, 4, figsize=(15, 5))
# subplots 1 and 2: count histograms before and after mitigation,
# drawn with identical axis ranges so that they can be compared
# each entry: (axes, count table, subplot title)
hist_panels_ = [
    (axs[0], graph_bias_count_df, 'Before graph-level bias mitigation'),
    (axs[1], mistake_count_df, 'After graph-level bias mitigation'),
]
for ax_, count_df_, title_ in hist_panels_:
    sns.histplot(
        count_df_,
        x='Count',
        hue='Mistake',
        bins=bins,
        hue_order=['log', 'zero'],
        palette=[ARTICLES_LOG_PROBLEM_COLOR, ARTICLES_ZERO_PROBLEM_COLOR],
        ax=ax_,
    )
    ax_.set_xlim(0.5, 25.5)
    ax_.set_ylim(0, 450)
    ax_.set_title(title_)
    ax_.set_xlabel('Misused graph count')
    ax_.set_ylabel('Frequency')

# subplot 3: fraction of rows with a count of at most i, before and after mitigation
graph_bias_counts_ = graph_bias_count_df['Count']
mistake_counts_ = mistake_count_df['Count']
mistake_count_df_time = [
    sum(mistake_counts_ <= i) / len(mistake_counts_)
    for i in range(mistake_counts_.max() + 1)
]
graph_bias_count_df_time = [
    sum(graph_bias_counts_ <= i) / len(graph_bias_counts_)
    for i in range(graph_bias_counts_.max() + 1)
]

# each entry: (counts, cumulative fractions, line color, legend label)
cumulative_curves_ = [
    (graph_bias_counts_, graph_bias_count_df_time, 'tab:blue', 'Before'),
    (mistake_counts_, mistake_count_df_time, 'tab:orange', 'After'),
]
for counts_, fractions_, color_, label_ in cumulative_curves_:
    axs[2].step(range(counts_.max() + 1), fractions_, lw=1.5, color=color_, label=label_)
    # dot at the largest count, where the cumulative fraction reaches 1
    axs[2].plot(counts_.max(), 1, '.', color=color_)
axs[2].set_xlabel('Misused graph count')
axs[2].set_ylabel('Cumulative fraction')
axs[2].set_title('Cumulative fraction comparison')
axs[2].set_xlim(0, 80)
axs[2].set_ylim(0, 1.2)
axs[2].axhline(1, color='k', lw=1, ls='--')
axs[2].legend(title='Bias mitigation', loc='lower right')
# write the largest count of each curve above the plot area
# each entry: (x position in axes coordinates, counts, text color)
max_annotations_ = [
    (0.85, graph_bias_counts_, 'tab:blue'),
    (0.15, mistake_counts_, 'tab:orange'),
]
for x_text_, counts_, color_ in max_annotations_:
    axs[2].text(
        x_text_,
        0.9,
        "$x_{\\max}$" + f" = {counts_.max()}",
        ha='center',
        transform=axs[2].transAxes,
        color=color_,
    )

# subplot 4: per-journal linear fit
# x: number of articles (distinct DOI) with zeroing and log mistakes for each journal
rows_per_article_ = bar_annot_df.groupby(['Journal', 'DOI']).count().iloc[:, 0].reset_index()
x = rows_per_article_.groupby(['Journal']).count().iloc[:, 0].values
# y: number of rows of mistake_count_df for each journal
# NOTE(review): this counts rows, not the sum of the 'Count' column -
# confirm it is the intended "number of misused bar graphs"
# NOTE(review): x and y are paired by position, which assumes that both
# groupbys yield the same journals in the same order - confirm
y = mistake_count_df.groupby('Journal').count().iloc[:, 0].values
# linear fit
fit_ = scipy.stats.linregress(x, y)
slope, intercept, rvalue, pvalue = fit_.slope, fit_.intercept, fit_.rvalue, fit_.pvalue
rsquared = rvalue ** 2
# linear fit points
x_fit = np.arange(0, 250)
y_fit = slope * x_fit + intercept
print(f'y = {slope:.3}x + {intercept:.3}')
print(f'R2 = {rsquared:.3}')
print(f'P value = {pvalue:.3}')

axs[3].plot(x, y, '.', color='tab:orange', alpha=0.75)
axs[3].plot(x_fit, y_fit, color='tab:orange', ls='--')
axs[3].set_xlabel('Number of articles with \nzeroing and log mistakes')
axs[3].set_ylabel('Number of misused bar graphs')
axs[3].set_title('After graph-level bias mitigation')
axs[3].set_xlim(0, 250)
axs[3].set_ylim(0, 250)
axs[3].set_box_aspect(1)
# fit equation, R^2 and P value written inside the plot area
fit_text_ = f"$y$ = {slope:.3}$x$ + {intercept:.3}\n$R^2$ = {rsquared:.3}\n$P$ = {pvalue:.1e}"
axs[3].text(0.45, 0.2, fit_text_, ha='left', transform=axs[3].transAxes)

# square plotting area for all four subplots
for ax_ in axs[:4]:
    ax_.set_box_aspect(1)
plt.tight_layout()
# fig.savefig('figures/subpanels/ex-fig-9-bias-mitigation.pdf')

In [None]:
# rows whose count lies beyond the x range (25) of the first panel
n_excluded_ = sum(graph_bias_count_df['Count'] > 25)
n_rows_ = len(graph_bias_count_df['Count'])
print(f"First panel excluded {n_excluded_}/{n_rows_} = {n_excluded_ / n_rows_}")

In [None]:
# title-case version of the bias mitigation figure
# bin edges at half-integers so that every integer count has its own bin;
# np.arange excludes the stop value, so stop at max + 1.5 to keep the bin
# that holds the largest count; both histograms share these bins
bins = np.arange(0.5, graph_bias_count_df['Count'].max() + 1.5, 1)
fig, axs = plt.subplots(1, 4, figsize=(15, 5))
# subplots 1 and 2: count histograms before and after mitigation,
# drawn with identical axis ranges so that they can be compared
# each entry: (axes, count table, subplot title)
hist_panels_ = [
    (axs[0], graph_bias_count_df, 'Before Graph-Level Bias Mitigation'),
    (axs[1], mistake_count_df, 'After Graph-Level Bias Mitigation'),
]
for ax_, count_df_, title_ in hist_panels_:
    sns.histplot(
        count_df_,
        x='Count',
        hue='Mistake',
        bins=bins,
        hue_order=['log', 'zero'],
        palette=[ARTICLES_LOG_PROBLEM_COLOR, ARTICLES_ZERO_PROBLEM_COLOR],
        ax=ax_,
    )
    ax_.set_xlim(0.5, 25.5)
    ax_.set_ylim(0, 450)
    ax_.set_title(title_)
    ax_.set_xlabel('Misused Graph Count')
    ax_.set_ylabel('Frequency')

# subplot 3: fraction of rows with a count of at most i, before and after mitigation
graph_bias_counts_ = graph_bias_count_df['Count']
mistake_counts_ = mistake_count_df['Count']
mistake_count_df_time = [
    sum(mistake_counts_ <= i) / len(mistake_counts_)
    for i in range(mistake_counts_.max() + 1)
]
graph_bias_count_df_time = [
    sum(graph_bias_counts_ <= i) / len(graph_bias_counts_)
    for i in range(graph_bias_counts_.max() + 1)
]

# each entry: (counts, cumulative fractions, line color, legend label)
cumulative_curves_ = [
    (graph_bias_counts_, graph_bias_count_df_time, 'tab:blue', 'Before'),
    (mistake_counts_, mistake_count_df_time, 'tab:orange', 'After'),
]
for counts_, fractions_, color_, label_ in cumulative_curves_:
    axs[2].step(range(counts_.max() + 1), fractions_, lw=1.5, color=color_, label=label_)
    # dot at the largest count, where the cumulative fraction reaches 1
    axs[2].plot(counts_.max(), 1, '.', color=color_)
axs[2].set_xlabel('Misused Graph Count')
axs[2].set_ylabel('Cumulative Fraction')
axs[2].set_title('Cumulative Fraction Comparison')
axs[2].set_xlim(0, 80)
axs[2].set_ylim(0, 1.2)
axs[2].axhline(1, color='k', lw=1, ls='--')
axs[2].legend(title='Bias Mitigation', loc='lower right')
# write the largest count of each curve above the plot area
# each entry: (x position in axes coordinates, counts, text color)
max_annotations_ = [
    (0.85, graph_bias_counts_, 'tab:blue'),
    (0.15, mistake_counts_, 'tab:orange'),
]
for x_text_, counts_, color_ in max_annotations_:
    axs[2].text(
        x_text_,
        0.9,
        "$x_{\\max}$" + f" = {counts_.max()}",
        ha='center',
        transform=axs[2].transAxes,
        color=color_,
    )

# subplot 4: per-journal linear fit
# x: number of articles (distinct DOI) with zeroing and log mistakes for each journal
rows_per_article_ = bar_annot_df.groupby(['Journal', 'DOI']).count().iloc[:, 0].reset_index()
x = rows_per_article_.groupby(['Journal']).count().iloc[:, 0].values
# y: number of rows of mistake_count_df for each journal
# NOTE(review): this counts rows, not the sum of the 'Count' column -
# confirm it is the intended "number of misused bar graphs"
# NOTE(review): x and y are paired by position, which assumes that both
# groupbys yield the same journals in the same order - confirm
y = mistake_count_df.groupby('Journal').count().iloc[:, 0].values
# linear fit
fit_ = scipy.stats.linregress(x, y)
slope, intercept, rvalue, pvalue = fit_.slope, fit_.intercept, fit_.rvalue, fit_.pvalue
rsquared = rvalue ** 2
# linear fit points
x_fit = np.arange(0, 250)
y_fit = slope * x_fit + intercept
print(f'y = {slope:.3}x + {intercept:.3}')
print(f'R2 = {rsquared:.3}')
print(f'P value = {pvalue:.3}')

axs[3].plot(x, y, '.', color='tab:orange', alpha=0.75)
axs[3].plot(x_fit, y_fit, color='tab:orange', ls='--')
axs[3].set_xlabel('Number of Articles')
axs[3].set_ylabel('Number of Misused Bar Graphs')
axs[3].set_title('After Graph-Level Bias Mitigation')
axs[3].set_xlim(0, 250)
axs[3].set_ylim(0, 250)
axs[3].set_box_aspect(1)
# fit equation, R^2 and P value written inside the plot area
fit_text_ = f"$y$ = {slope:.3}$x$ + {intercept:.3}\n$R^2$ = {rsquared:.3}\n$P$ = {pvalue:.1e}"
axs[3].text(0.45, 0.2, fit_text_, ha='left', transform=axs[3].transAxes)

# square plotting area for all four subplots
for ax_ in axs[:4]:
    ax_.set_box_aspect(1)
plt.tight_layout()
# fig.savefig('figures/subpanels/ex-fig-9-bias-mitigation.pdf')

In [None]:
# number of annotated articles (distinct DOI) for each journal
rows_per_article_ = bar_annot_df.groupby(['Journal', 'DOI']).count().iloc[:, 0].reset_index()
rows_per_article_.groupby(['Journal']).count().iloc[:, 0].values

## Number of mistakes per article by journal

### Median

In [None]:
# distribution of the per-article mistake count for each journal,
# one subplot per mistake type
fig, axs = plt.subplots(1, 2, figsize=(10, 5), sharex=True, sharey=True)
# each entry: (axes, value of the 'Mistake' column, subplot title)
mistake_panels_ = [
    (axs[0], 'zero', 'Zero'),
    (axs[1], 'log', 'Log'),
]
# draw both subplots first; the axis limits are only changed afterwards
for ax_, mistake_, title_ in mistake_panels_:
    sns.boxplot(
        mistake_count_df[mistake_count_df['Mistake'] == mistake_],
        y='Journal',
        x='Count',
        ax=ax_,
        color='w',
        linecolor='k',
        width=0.5,
    )
for ax_, mistake_, title_ in mistake_panels_:
    ax_.set_title(title_)
for ax_ in axs[:2]:
    ax_.set_box_aspect(1)
    ax_.set_xlim(0)
    ax_.set_xlabel('# of Misused Graphs')
plt.tight_layout()

In [None]:
# median per-article mistake count for each journal,
# one subplot per mistake type
fig, axs = plt.subplots(1, 2, figsize=(10, 5), sharex=True, sharey=True)
# each entry: (axes, value of the 'Mistake' column, subplot title)
mistake_panels_ = [
    (axs[0], 'zero', 'Zero'),
    (axs[1], 'log', 'Log'),
]
for ax_, mistake_, title_ in mistake_panels_:
    is_mistake_ = mistake_count_df['Mistake'] == mistake_
    median_df_ = mistake_count_df.loc[is_mistake_].groupby('Journal').agg({'Count': 'median'}).reset_index()
    sns.barplot(
        median_df_,
        y='Journal',
        x='Count',
        ax=ax_,
    )
for ax_, mistake_, title_ in mistake_panels_:
    ax_.set_title(title_)
for ax_ in axs[:2]:
    ax_.set_box_aspect(1)
    ax_.set_xlim(0, 2)
    ax_.set_xlabel('Median # of Misused Graphs')

### Mean

In [None]:
# mean per-article mistake count for each journal,
# one subplot per mistake type
fig, axs = plt.subplots(1, 2, figsize=(10, 5), sharex=True, sharey=True)
# each entry: (axes, value of the 'Mistake' column, subplot title)
mistake_panels_ = [
    (axs[0], 'zero', 'Zero'),
    (axs[1], 'log', 'Log'),
]
for ax_, mistake_, title_ in mistake_panels_:
    is_mistake_ = mistake_count_df['Mistake'] == mistake_
    mean_df_ = mistake_count_df.loc[is_mistake_].groupby('Journal').agg({'Count': 'mean'}).reset_index()
    sns.barplot(
        mean_df_,
        y='Journal',
        x='Count',
        ax=ax_,
    )
for ax_, mistake_, title_ in mistake_panels_:
    ax_.set_title(title_)
for ax_ in axs[:2]:
    ax_.set_box_aspect(1)
    ax_.set_xlim(0, 2)
    # the bars show the mean, so the axis is labeled as mean (not median)
    ax_.set_xlabel('Mean # of Misused Graphs')

## Correlation with number of authors

### Articles with bar graphs on average have more authors

In [None]:
def label_stats(ax, pvalue, gamma):
    """Write a P value and a gamma statistic onto ``ax``, above a short horizontal line.

    Parameters
    ----------
    ax : axes to annotate; it is modified in place and nothing is returned
    pvalue : P value; shown in scientific notation when below 0.001,
        otherwise with 2 significant digits
    gamma : gamma statistic; shown with 2 significant digits
    """
    pvalue_text = f'{pvalue:.1e}' if pvalue < 0.001 else f'{pvalue:.2}'
    # raw strings keep the LaTeX backslashes literal; in a normal string
    # '\i' and '\g' are invalid escape sequences
    stats_text = r'$\it{P}$ = ' + pvalue_text + '\n' + r'$\gamma$ = ' + f'{gamma:.2}'
    # text is horizontally centered, at 90 % of the axes height
    ax.text(
        0.5,
        0.9,
        stats_text,
        ha='center',
        transform=ax.transAxes,
    )
    # line at 87.5 % of the upper y limit, spanning 35 % to 65 % of the axes width
    ax.axhline(
        0.875 * ax.get_ylim()[1],
        0.35,
        0.65,
        color='k',
        lw=1,
    )

In [None]:
alpha = 0.85
# articles with more authors than this are left out of the violin plots
author_count_viz_max = 60

fig, axs = plt.subplots(1, 3, figsize=(7, 5), sharey=True)
# common settings
for ax_ in axs[:3]:
    ax_.set_ylim(0, 80)
    ax_.set_box_aspect(2.5)

## subplot 1: number of authors of articles with vs. without bar graphs
has_bar_mask_ = articles_df[HAS_BAR_GRAPH_LABEL] == True
no_bar_mask_ = articles_df[HAS_BAR_GRAPH_LABEL] == False
n_outliers_ = sum(articles_df[NUM_AUTHORS_LABEL] > author_count_viz_max)
print(f'n = {sum(has_bar_mask_)} Bar graph')
print(f'n = {sum(no_bar_mask_)} No bar graph')
print(f'Visualization excluded outlier {n_outliers_}/{len(articles_df)} = {n_outliers_/len(articles_df)}')
# outliers are removed from the plot only; the statistics use all articles
plot_df = articles_df[articles_df[NUM_AUTHORS_LABEL] <= author_count_viz_max]

violin = sns.violinplot(
    data=plot_df,
    y=NUM_AUTHORS_LABEL,
    hue=HAS_BAR_GRAPH_LABEL,
    hue_order=[True, False],
    palette=[ARTICLES_WITH_BAR_GRAPH_COLOR, ARTICLES_WITHOUT_BAR_GRAPH_COLOR],
    split=True,
    inner="quart",
    density_norm='area',
    common_norm=True,
    alpha=alpha,
    linecolor='black',
    linewidth=1,
    ax=axs[0],
)

# label statistics
bar_authors_ = articles_df.loc[has_bar_mask_, NUM_AUTHORS_LABEL]
no_bar_authors_ = articles_df.loc[no_bar_mask_, NUM_AUTHORS_LABEL]
stat, pvalue = scipy.stats.mannwhitneyu(bar_authors_, no_bar_authors_)
gamma = util.get_gamma(bar_authors_, no_bar_authors_)
label_stats(axs[0], pvalue, gamma)

# configure legend (label typo 'grpahs' corrected)
sns.move_legend(
    violin,
    loc='lower center',
    bbox_to_anchor=(0.5, 1.02),
    title='',
    labels=['Bar graphs', 'No bar graphs'],
)

## subplot 2: number of authors of articles with vs. without misused bar graphs
# only articles labeled as either misused or not misused are considered
no_misuse_mask_ = articles_df[NO_MISUSE_LABEL] == True
has_misuse_mask_ = articles_df[HAS_MISUSE_LABEL] == True
plot_df = articles_df.loc[no_misuse_mask_ | has_misuse_mask_]
n_outliers_ = sum(plot_df[NUM_AUTHORS_LABEL] > author_count_viz_max)
print(f'n = {sum(plot_df[HAS_MISUSE_LABEL] == True)} Incorrect visualization')
print(f'n = {sum(plot_df[HAS_MISUSE_LABEL] == False)} Correct visualization')
print(f'Visualization excluded outlier {n_outliers_}/{len(plot_df)} = {n_outliers_/len(plot_df)}')
# outliers are removed from the plot only; the statistics use all articles
plot_df = plot_df[plot_df[NUM_AUTHORS_LABEL] <= author_count_viz_max]

violin = sns.violinplot(
    data=plot_df,
    y=NUM_AUTHORS_LABEL,
    hue=HAS_MISUSE_LABEL,
    hue_order=[True, False],
    palette=[ARTICLES_INCORRECT_BAR_GRAPH_COLOR, ARTICLES_CORRECT_BAR_GRAPH_COLOR],
    split=True,
    inner="quart",
    density_norm='area',
    common_norm=True,
    alpha=alpha,
    linecolor='black',
    linewidth=1,
    ax=axs[1],
)

# label statistics
no_misuse_authors_ = articles_df.loc[no_misuse_mask_, NUM_AUTHORS_LABEL]
has_misuse_authors_ = articles_df.loc[has_misuse_mask_, NUM_AUTHORS_LABEL]
stat, pvalue = scipy.stats.mannwhitneyu(no_misuse_authors_, has_misuse_authors_)
gamma = util.get_gamma(no_misuse_authors_, has_misuse_authors_)
label_stats(axs[1], pvalue, gamma)

# configure legend
sns.move_legend(
    violin,
    loc='lower center',
    bbox_to_anchor=(0.5, 1.02),
    title='',
    labels=['Visualization mistake', 'No visualization mistake'],
)

## subplot 3: number of authors of articles with zeroing vs. log problems
zero_mask_ = articles_df[ZERO_PROBLEM_LABEL] == True
log_mask_ = articles_df[LOG_PROBLEM_LABEL] == True
plot_df = articles_df.loc[zero_mask_ | log_mask_]
n_outliers_ = sum(plot_df[NUM_AUTHORS_LABEL] > author_count_viz_max)
# NOTE(review): the printed and plotted 'Log' group is "no zeroing problem",
# while the statistics below use every article with a log problem; the two
# differ for articles that have both problems - confirm this is intended
print(f'n = {sum(plot_df[ZERO_PROBLEM_LABEL] == True)} Zeroing')
print(f'n = {sum(plot_df[ZERO_PROBLEM_LABEL] == False)} Log')
print(f'Visualization excluded outlier {n_outliers_}/{len(plot_df)} = {n_outliers_/len(plot_df)}')
# outliers are removed from the plot only; the statistics use all articles
plot_df = plot_df[plot_df[NUM_AUTHORS_LABEL] <= author_count_viz_max]

violin = sns.violinplot(
    data=plot_df,
    y=NUM_AUTHORS_LABEL,
    hue=ZERO_PROBLEM_LABEL,
    hue_order=[True, False],
    palette=[ARTICLES_ZERO_PROBLEM_COLOR, ARTICLES_LOG_PROBLEM_COLOR],
    split=True,
    inner="quart",
    density_norm='area',
    common_norm=True,
    alpha=alpha,
    linecolor='black',
    linewidth=1,
    ax=axs[2],
)

# label statistics
zero_authors_ = articles_df.loc[zero_mask_, NUM_AUTHORS_LABEL]
log_authors_ = articles_df.loc[log_mask_, NUM_AUTHORS_LABEL]
stat, pvalue = scipy.stats.mannwhitneyu(zero_authors_, log_authors_)
gamma = util.get_gamma(zero_authors_, log_authors_)
label_stats(axs[2], pvalue, gamma)

# configure legend
sns.move_legend(
    violin,
    loc='lower center',
    bbox_to_anchor=(0.5, 1.02),
    title='',
    labels=[ZERO_LABEL, LOG_LABEL],
)

# recolor the first three lines of subplot 1 (quartile lines) in white
for quartile_line_ in axs[0].lines[:3]:
    quartile_line_.set_color('white')

plt.tight_layout()
# fig.savefig('figures/subpanels/ex-fig-6-author-number-correlation.pdf')

In [None]:
alpha = 0.85

fig, axs = plt.subplots(1, 3, figsize=(7, 5), sharey=True)
# common settings
for ax_ in axs[:3]:
    ax_.set_ylim(0, 40)
    ax_.set_box_aspect(2.5)
    
## subplot 1: title word count of articles with vs. without bar graphs
has_bar_mask_ = articles_df[HAS_BAR_GRAPH_LABEL] == True
no_bar_mask_ = articles_df[HAS_BAR_GRAPH_LABEL] == False
print(f'n = {sum(has_bar_mask_)} Bar graph')
print(f'n = {sum(no_bar_mask_)} No bar graph')
plot_df = articles_df.copy()

violin = sns.violinplot(
    data=plot_df,
    y=NUM_WORDS_TITLE_LABEL,
    hue=HAS_BAR_GRAPH_LABEL,
    hue_order=[True, False],
    palette=[ARTICLES_WITH_BAR_GRAPH_COLOR, ARTICLES_WITHOUT_BAR_GRAPH_COLOR],
    split=True,
    inner="quart",
    density_norm='area',
    common_norm=True,
    alpha=alpha,
    linecolor='black',
    linewidth=1,
    ax=axs[0],
)

# label statistics
bar_title_words_ = articles_df.loc[has_bar_mask_, NUM_WORDS_TITLE_LABEL]
no_bar_title_words_ = articles_df.loc[no_bar_mask_, NUM_WORDS_TITLE_LABEL]
stat, pvalue = scipy.stats.mannwhitneyu(bar_title_words_, no_bar_title_words_)
gamma = util.get_gamma(bar_title_words_, no_bar_title_words_)
label_stats(axs[0], pvalue, gamma)

# configure legend (label typo 'grpahs' corrected)
sns.move_legend(
    violin,
    loc='lower center',
    bbox_to_anchor=(0.5, 1.02),
    title='',
    labels=['Bar graphs', 'No bar graphs'],
)

## subplot 2: title word count of articles with vs. without misused bar graphs
# only articles labeled as either misused or not misused are considered
no_misuse_mask_ = articles_df[NO_MISUSE_LABEL] == True
has_misuse_mask_ = articles_df[HAS_MISUSE_LABEL] == True
plot_df = articles_df.loc[no_misuse_mask_ | has_misuse_mask_]
print(f'n = {sum(plot_df[HAS_MISUSE_LABEL] == True)} Incorrect visualization')
print(f'n = {sum(plot_df[HAS_MISUSE_LABEL] == False)} Correct visualization')

violin = sns.violinplot(
    data=plot_df,
    y=NUM_WORDS_TITLE_LABEL,
    hue=HAS_MISUSE_LABEL,
    hue_order=[True, False],
    palette=[ARTICLES_INCORRECT_BAR_GRAPH_COLOR, ARTICLES_CORRECT_BAR_GRAPH_COLOR],
    split=True,
    inner="quart",
    density_norm='area',
    common_norm=True,
    alpha=alpha,
    linecolor='black',
    linewidth=1,
    ax=axs[1],
)

# label statistics
no_misuse_title_words_ = articles_df.loc[no_misuse_mask_, NUM_WORDS_TITLE_LABEL]
has_misuse_title_words_ = articles_df.loc[has_misuse_mask_, NUM_WORDS_TITLE_LABEL]
stat, pvalue = scipy.stats.mannwhitneyu(no_misuse_title_words_, has_misuse_title_words_)
gamma = util.get_gamma(no_misuse_title_words_, has_misuse_title_words_)
label_stats(axs[1], pvalue, gamma)

# configure legend
sns.move_legend(
    violin,
    loc='lower center',
    bbox_to_anchor=(0.5, 1.02),
    title='',
    labels=['Visualization mistake', 'No visualization mistake'],
)

## subplot 3: title word count of articles with zeroing vs. log problems
zero_mask_ = articles_df[ZERO_PROBLEM_LABEL] == True
log_mask_ = articles_df[LOG_PROBLEM_LABEL] == True
plot_df = articles_df.loc[zero_mask_ | log_mask_]
# NOTE(review): the printed and plotted 'Log' group is "no zeroing problem",
# while the statistics below use every article with a log problem; the two
# differ for articles that have both problems - confirm this is intended
print(f'n = {sum(plot_df[ZERO_PROBLEM_LABEL] == True)} Zeroing')
print(f'n = {sum(plot_df[ZERO_PROBLEM_LABEL] == False)} Log')

violin = sns.violinplot(
    data=plot_df,
    y=NUM_WORDS_TITLE_LABEL,
    hue=ZERO_PROBLEM_LABEL,
    hue_order=[True, False],
    palette=[ARTICLES_ZERO_PROBLEM_COLOR, ARTICLES_LOG_PROBLEM_COLOR],
    split=True,
    inner="quart",
    density_norm='area',
    common_norm=True,
    alpha=alpha,
    linecolor='black',
    linewidth=1,
    ax=axs[2],
)

# label statistics
zero_title_words_ = articles_df.loc[zero_mask_, NUM_WORDS_TITLE_LABEL]
log_title_words_ = articles_df.loc[log_mask_, NUM_WORDS_TITLE_LABEL]
stat, pvalue = scipy.stats.mannwhitneyu(zero_title_words_, log_title_words_)
gamma = util.get_gamma(zero_title_words_, log_title_words_)
label_stats(axs[2], pvalue, gamma)

# configure legend
sns.move_legend(
    violin,
    loc='lower center',
    bbox_to_anchor=(0.5, 1.02),
    title='',
    labels=[ZERO_LABEL, LOG_LABEL],
)

# recolor the first three lines of subplot 1 (quartile lines) in white
for quartile_line_ in axs[0].lines[:3]:
    quartile_line_.set_color('white')

plt.tight_layout()