In [None]:
# Install third-party dependencies with pip (shell escape; runs once when the cell is executed).
# NOTE(review): `fightin_words` is imported later in this notebook and figures are exported with
# `plotly.io.write_image`, which presumably needs an image-export backend; neither is installed
# here - confirm they are provided by the environment.
!pip install plotly pandas numpy nbformat scikit-learn

In [None]:
# Paths and parameters
# Directory that every input file below is joined onto
data_dir = 'data'
# Pickled papers table, read with pd.read_pickle in load_papers()
paper_info_filename = 'cvpr_paper_info_and_keywords.pkl'
# Pickled mapping of paper id -> patents, read with pickle.load in load_papers()
paper_extra_info_filename = 'cvpr_papers_to_all_patents'
# Tab-separated file of two fields per line; analyze_sources() uses it to map Iso3166Code values to country names
countries_filename = 'country-codes.txt'
# NOTE(review): not referenced anywhere in the visible part of this notebook
elite_univs_filename = 'elite_universities.txt'
# Codes file parsed by read_lists() (code header followed by its keywords)
codes_filename = "final_keywords.txt"
# One stopword per line; used when ranking title words across decades
stopwords_filename = 'stopwords.txt'
# Names of the codes (headers in the codes file) whose patents count as "of interest"
codes_of_interest = ["final"]

# Data setup

In [None]:
import warnings
# Silence FutureWarning messages for the rest of the notebook
warnings.simplefilter(action='ignore', category=FutureWarning)
from pathlib import Path
# Figures are saved to a `figures` directory that is a sibling of `data_dir`;
# create it if missing (requires `data_dir` from the parameters cell to be defined already)
(figs_dir := Path(data_dir) / '..' / 'figures').mkdir(exist_ok=True)
import pandas as pd
from pandas import DataFrame
import os
from os.path import join
from itertools import chain
import plotly.graph_objects as go
import plotly
import plotly.express as px
import numpy as np
import pickle
from plotly.subplots import make_subplots
# Also selects plotly's "iframe" renderer as the default for fig.show()
import plotly.io as pio; pio.renderers.default = "iframe"

# Seed NumPy's global RNG, which the bootstrap in _compute_error_bars draws from via np.random.choice
np.random.seed(1991)
plotly.offline.init_notebook_mode()

In [None]:
def read_lists(filepath, filter_comments=True):
    '''Read a file containing codes in our special format and return a dictionary of the codes.

    The file is a series of sections separated by blank lines. The first line of a
    section is the code's header (leading/trailing '#' and spaces are removed); every
    following non-empty line is one item (keyword) of that code.

    filepath: str or path -- File to read
    filter_comments: bool -- If True, drop lines that are only a '#' comment, cut
        trailing '# ...' comments off the remaining lines, and delete codes that end
        up with no items

    Returns a dict mapping each code name to its list of items.
    '''
    with open(filepath) as f:
        text = f.read()

    d = {}
    for section in text.split('\n\n'):  # for each code's section
        # Normalize BEFORE splitting so the header and the items come from the same text;
        # otherwise a section preceded by an extra blank line would be stored under key ''
        lines = section.strip().split('\n')
        if lines == ['']:  # nothing but whitespace (e.g. several blank lines in a row)
            continue
        d[lines[0].strip('# ')] = [item for item in lines[1:] if item]  # code to keywords

    if filter_comments:
        cleaned = {}
        for key, items in d.items():
            # Keep the real part of valid lines; indented comment lines count as comments too
            kept = [item.split('#')[0].strip() for item in items if not item.lstrip().startswith('#')]
            kept = [item for item in kept if item]
            if kept:  # delete codes with no items
                cleaned[key] = kept
        d = cleaned
    return d

def print_lists(lists):
    '''Print each header of `lists` followed by its comma-separated items, with a blank line after each entry.'''
    entries = []
    for header in lists:
        items = lists[header]
        entries.append(header + ': ' + ', '.join(items) + '\n')
    print('\n'.join(entries))
    
def load_papers():
    '''Load all papers and info into a Pandas DataFrame, annotated with codes.

    Reads (all relative to the module-level `data_dir`):
      - `paper_info_filename`: pickled papers table
      - `paper_extra_info_filename`: pickled mapping of paper id -> patents
      - `codes_filename`: codes file parsed by `read_lists`

    Returns the papers DataFrame (index named 'PaperId') with these additions:
      - 'patents': list of patent ids with '-' removed ([] when the paper has none)
      - 'Decade': the year rounded down to a multiple of 10
      - '<code>_code': one column per code, the row-wise sum of that code's keyword columns
      - 'n_patents': number of entries in 'patents'
    '''
    papers = (pd.read_pickle(os.path.join(data_dir, paper_info_filename))
              .rename_axis('PaperId')  # Rename index
              .rename(dict(DisplayName='Institution', DisplayName_fields='Field'), axis=1))  # Rename some columns
    # Load papers' patents
    # NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
    # The context manager closes the file handle (it used to be left open).
    with open(os.path.join(data_dir, paper_extra_info_filename), 'rb') as f:
        papers_patents = pickle.load(f)
    papers_patents = DataFrame(papers_patents.items(), columns=['PaperId',  'patents']).set_index('PaperId')  # Convert to Dataframe
    papers_patents.patents = papers_patents.patents.apply(lambda patents: [p.replace('-', '') for p in patents])
    # Combine and clean
    papers = papers.join(papers_patents)
    papers.patents = papers.patents.fillna("").apply(list)  # papers without patents get an empty list
    for col in ['PaperId', 'Latitude', 'Longitude']:  # Clean columns that say the same info multiple times
        papers[col] = papers[col].apply(lambda vals: vals[0])
    papers.Year = papers.Year.astype(int)  # Convert year column from strings to integers
    papers['Decade'] = papers.Year.apply(lambda year: 10 * (year//10))  # Save decade
    # Aggregate keyword columns into code columns
    # (presumably each keyword column holds a list of patent ids, so the sum concatenates them - verify against the data)
    codes = read_lists(os.path.join(data_dir, codes_filename))
    for code, keywords in codes.items():
        papers[f'{code}_code'] = papers[keywords].sum(axis=1)
    # Add whether a paper was ever cited in a patent (either of the following two lines have the same result)
    papers['n_patents'] = papers.patents.apply(len)
    # papers['n_patents'] = papers['keyword_depth_vec'].apply(lambda cites: int(cites[0,:,0].sum()))
    return papers

def filter_paper_years(papers, year_range=range(1990, 2022), years_to_drop=[1990, 1995, 2002]):
    '''Return the rows of the papers DataFrame whose Year is in `year_range` and not in `years_to_drop`.

    With the default arguments the result is limited to the years we view as reliable data.
    '''
    is_requested = papers.Year.isin(year_range)
    is_excluded = papers.Year.isin(years_to_drop)
    return papers[is_requested & ~is_excluded]

def mark_interesting(papers, codes_of_interest):
    '''Annotate the papers DataFrame, in place, with the patents that fall under the codes of interest.

    papers: DataFrame -- Paper info; must already contain the code columns and 'n_patents'
    codes_of_interest: column labels of the code columns to combine

    Adds three columns and returns nothing:
      - 'patents_of_interest': de-duplicated combination of the selected code columns
      - 'n_patents_of_interest': its length
      - 'n_patents_not_of_interest': 'n_patents' minus 'n_patents_of_interest'
    '''
    combined = papers[codes_of_interest].sum(axis=1)
    unique_patents = combined.apply(lambda patents: list(set(patents)))
    papers['patents_of_interest'] = unique_patents
    papers['n_patents_of_interest'] = unique_patents.apply(len)
    papers['n_patents_not_of_interest'] = papers.n_patents - papers['n_patents_of_interest']

def analyze_sources(papers, source):
    '''Given papers DataFrame and desired source type for analysis
    (string, e.g. Institution, Country, Field, Year, or Decade),
    collect and return a DataFrame with one row per source.

    papers: DataFrame -- Must contain the `source` column ('Iso3166Code' when source is
        'Country') plus 'PaperId', 'patents', 'n_patents', 'patents_of_interest' and
        'n_patents_of_interest'
    source: str -- Column to group by; 'Country' is derived from 'Iso3166Code' through the
        countries file in `data_dir`

    Returns a DataFrame holding the source column and, for each of 'papers',
    'papers_patented', 'patents', 'papers_of_interest' and 'patents_of_interest', a list
    column plus an 'n_<name>' count column. Generic fields are dropped for 'Field',
    a few long names are shortened, and for 'Decade' only 1990 and 2010 are kept.
    '''
    def unique_items(lists):
        # Flatten a group's lists and drop duplicates in one pass. Used instead of passing the
        # builtin `sum` to agg(), which relies on a deprecated pandas alias and is quadratic on lists.
        return list({item for items in lists for item in items})

    # Each link between a source and a paper+details
    # (e.g. a paper affiliated with U.S. and China has two otherwise identical rows)
    if source == 'Country':  # Extra step before
        source = 'Iso3166Code'
    links = (papers[[source, 'PaperId', 'patents', 'n_patents', 'patents_of_interest', 'n_patents_of_interest']]
             .explode(source)  # each paper has multiple links
             .drop_duplicates(['PaperId', source])  # remove duplicate links
             .reset_index(drop=True))
    if source == 'Iso3166Code':  # Extra step after
        source = 'Country'
        with open(os.path.join(data_dir, countries_filename)) as f:  # closed on exit (was leaked)
            countries = dict([line.strip().split('\t') for line in f])
        links.insert(0, 'Country', links.Iso3166Code.replace(countries))
    # Summarize the sources' downstream patents
    sources_general_info = (links.groupby(source)
                            .agg(dict(PaperId=list, patents=unique_items))
                            .rename(dict(PaperId='papers'), axis=1))
    sources_patent_info = (links.query('n_patents > 0').groupby(source)
                           .agg(dict(PaperId=list))
                           .rename(dict(PaperId='papers_patented'), axis=1))
    sources_info_of_interest = (links.query('n_patents_of_interest > 0').groupby(source)
                                .agg(dict(PaperId=list, patents_of_interest=unique_items))
                                .rename(dict(PaperId='papers_of_interest'), axis=1))
    sources = sources_general_info.join([sources_patent_info, sources_info_of_interest]).reset_index()
    for col in ['papers', 'papers_patented', 'patents', 'papers_of_interest', 'patents_of_interest']:
        sources[col] = sources[col].fillna("").apply(list)  # sources missing from a sub-table get []
        sources[f'n_{col}'] = sources[col].apply(len)
    # Drop generic sources
    if source == 'Field':
        generic_fields = [
            'Artificial intelligence', 'Computer vision', 'Pattern recognition', 'Mathematics', 'Computer science',
            'Machine learning', 'Deep learning', 'Algorithm', '(', 'Convolutional neural net', 'Pixel', 'Artificial neural networks']
        # Case-insensitive substring match against every generic name
        sources = sources[~sources.Field.apply(lambda field: any(generic_field.lower() in field.lower() for generic_field in generic_fields))]
    # Clean (shorten long names for figure labels)
    # NOTE(review): 'IRIA' below may be a typo for 'INRIA' - confirm before changing, it alters figure labels
    sources = sources.replace('French Institute for Research in Computer Science and Automation', 'IRIA')
    sources = sources.replace('Korea Advanced Institute of Science and Technology', 'Korea Advanced Inst. of Science & Tech.')
    sources = sources.replace('University of Illinois Urbana-Champaign', 'Urbana-Champaign')
    sources = sources.replace('University of California, Berkeley', 'Berkeley')
    sources = sources.replace('Massachusetts Institute of Technology', 'MIT')
    sources = sources.replace('Cognitive neuroscience of visual object recognition', 'CogSci of object recognition')
    if source == 'Institution':
        sources.Institution = sources.Institution.apply(lambda name: name.replace('University', 'Univ.'))
    if source == 'Decade':
        decades_of_interest = [1990, 2010]
        sources = sources.query('Decade in @decades_of_interest')
    return sources


def _compute_error_bars(row, n_tries: int = 1000):
    '''
    Function to compute error bars; use with df.apply()
    Error is estimated by using a bootstrap method.
    Positive and negative samples are drawn with replacement from the original data over `n_tries` iterations.
    The error is the standard deviation of the number of positive samples over the iterations.
    '''
    pos = row.n_papers_of_interest
    neg = row.n_papers_patented - row.n_papers_of_interest
    to_sample = np.concatenate((np.ones(pos, dtype=int), np.zeros(neg, dtype=int)))

    sampled_n_patents_of_interest = []
    sampled_n_patents_not_of_interest = []
    for _ in range(n_tries):
        sampled = np.random.choice(to_sample, size=len(to_sample), replace=True)
        sampled_n_patents_of_interest.append(np.sum(sampled))
        sampled_n_patents_not_of_interest.append(np.sum(1 - sampled))

    row['n_patents_not_of_interest'] = neg
    row['n_patents_of_interest_error'] = np.std(sampled_n_patents_of_interest)
    row['n_patents_not_of_interest_error'] = np.std(sampled_n_patents_not_of_interest)

    return row


def compute_error_bars(papers, n_tries: int = 1000):
    '''
    Apply `_compute_error_bars` to every row of the given DataFrame and return the result.

    papers: DataFrame -- Rows must provide `n_papers_of_interest` and `n_papers_patented`
    n_tries: int -- Number of bootstrap iterations per row
    '''
    return papers.apply(_compute_error_bars, axis=1, n_tries=n_tries)

def code_name(code):
    '''Return the name of the papers column that holds the given code ("<code>_code").'''
    return '{}_code'.format(code)

# Create data

In [None]:
# Load every paper, dropping the bulky columns that the rest of the notebook does not use
papers = load_papers().drop(
    columns=["keyword_depth_vec", "keyword_depth_vec_fields", "PaperId_fields"])
# Work on an independent copy limited to the default (reliable) years
all_papers = filter_paper_years(papers).copy()
# Show the keywords behind each code of interest
codes = read_lists(join(data_dir, codes_filename))
print_lists({code: codes[code] for code in codes_of_interest})
# Annotate all_papers in place with the patents of interest
mark_interesting(all_papers, [code_name(code) for code in codes_of_interest])

In [None]:
# Quick check of how prevalent the norm is.
# We see there is a norm that for entities authoring papers with patents, at least half are used in surveillance. (See paper for more details.)
# Quantify it: among sources with at least one patented paper, count those whose share of
# patented papers used in surveillance patents is above one half (the query is a strict `> .5`).
for source in ['Institution', 'Country', 'Field']:
    sources = analyze_sources(all_papers, source)
    # Despite its name this is a fraction in [0, 1], not a percentage
    sources['percent_of_interest'] = sources.n_papers_of_interest / sources.n_papers_patented
    relevant_sources = sources.query("n_papers_patented > 0")
    n_relevant_sources = len(relevant_sources)
    n_surveillance_sources = len(relevant_sources.query("percent_of_interest > .5"))
    print(f'{100 * n_surveillance_sources / n_relevant_sources:.1f}% ({n_surveillance_sources} out of {n_relevant_sources}) of {source.lower()}s follow this norm.')

In [None]:
# Count distinct patents over all papers (a patent shared by several papers is counted once)
num_patents = len({patent for patents in all_papers['patents'] for patent in patents})
num_patents_of_interest = len(
    {patent for patents in all_papers['patents_of_interest'] for patent in patents})

print(f'Number of papers: {len(all_papers):,}')
print(f'Number of patents: {num_patents:,}')
print(f'Number of patents of interest: {num_patents_of_interest:,}')

# Visualization setup

In [None]:
def visualize_sources_percents(papers, source, filt=None, sort='n_papers', n=None,
                               color='MediumTurquoise', line=False, error_color='darkgrey',
                               background_stack: bool = True, whisk_width=0, whisk_thick=2, condensed_bars=False, **kwargs):
    '''
    Visualize the mapping from sources to percents of interest.

    papers: DataFrame  -- Paper info
    source: str -- Source type of interest (e.g. Institution, Country, Year, etc)
    filt: str -- Pandas-style query that will filter down which sources will be included in the figure
    sort: str -- Property that will sort the sources (descending) before creating the figure
    n: int -- Number of sources to include in figure (applied after filtering and sorting,
        and also when `sort` is not given)
    color: str -- Color of main bars
    line: bool -- Should a line be added as a 50% marker?
    error_color: str -- Color of the error bars
    background_stack: bool -- Should a light grey bar fill the rest of each column up to 100%?
    whisk_width: int -- Width of the error bar caps
    whisk_thick: int -- Thickness of the error bar lines
    condensed_bars: bool -- Should the bar outlines be removed?
    kwargs -- All other args are assumed to be plotly figure parameters that will be passed along

    Returns the plotly Figure. Also prints how many distinct papers are considered and plotted.
    '''
    # Get sources, filter, and sort
    sources = analyze_sources(papers, source)
    # Counting through a set comprehension also works when no source is left
    print(f"We consider {len({paper for ids in sources.papers for paper in ids})} papers.")
    if filt: sources = sources.query(filt)
    if sort: sources = sources.sort_values(sort, ascending=False)
    sources = sources[:n]  # honored even without `sort` (n=None keeps everything)
    print(f"We plot {len({paper for ids in sources.papers for paper in ids})} papers.")

    sources = compute_error_bars(sources)

    # New columns
    sources['Percent used in surveillance patents'] = 100 * sources.n_papers_of_interest / sources.n_papers_patented
    sources['Percent not used in surveillance patents'] = 100 - sources['Percent used in surveillance patents']
    sources['Percent error surveillance patents'] = 100 * sources.n_patents_of_interest_error / sources.n_papers_patented
    sources['Percent error not surveillance patents'] = 100 * sources.n_patents_not_of_interest_error / sources.n_papers_patented

    # Visualize
    fig = go.Figure()
    if background_stack:
        # Grey remainder drawn on top of the main bar (its base is the main bar's height)
        fig.add_trace(
            go.Bar(
                name='Percent not used in surveillance patents',
                x=sources[source],
                y=sources['Percent not used in surveillance patents'],
                marker_color='lightgrey',
                base=sources['Percent used in surveillance patents'],
            )
        )
    fig.add_trace(
        go.Bar(
            name='Percent used in surveillance patents',
            x=sources[source],
            y=sources['Percent used in surveillance patents'],
            marker_color=color,
            error_y={
                'type': 'data',
                'array': sources['Percent error surveillance patents'],
                'symmetric': True,
                'color': error_color,
                'thickness': whisk_thick,
                'width': whisk_width
            },
        )
    )

    if line: fig.add_hline(50, line_color='salmon', line_width=5)
    if condensed_bars:
        fig.update_traces(marker_line_width = 0)
    fig.update_layout(
        yaxis=dict(title='Papers with downstream patents', title_font_size=15, range=(-2, 100), dtick=50, tickfont_size=15, ticksuffix='%', showgrid=False),
        xaxis=dict(dtick=1, tickfont_size=13), legend=dict(title='', xanchor='center', x=.5, yanchor='top', y=1.35, font_size=15),
        template='plotly_white', barmode='stack', font_size=15, paper_bgcolor='rgba(0,0,0,0)', bargap=.03)
    # Caller-supplied layout parameters are applied last so they override the defaults above
    fig.update_layout(**kwargs)
    return fig

def visualize_sources_numbers_vertical(
        papers, source, filt=None, sort='n_papers_of_interest', n=None,
        color='MediumTurquoise', stackbars=True, error_color='darkgrey', show_error: bool = True,
        whisk_width=5, whisk_thick=2, **kwargs):
    '''Visualize vertically the mapping from sources to number of papers of interest

    papers: DataFrame  -- Paper info
    source: str -- Source type of interest (e.g. Institution, Country, Year, etc)
    filt: str -- Pandas-style query that will filter down which sources will be included in the figure
    sort: str -- Property that will sort the sources (descending) before creating the figure
    n: int -- Number of sources to include in figure (applied after filtering and sorting,
        and also when `sort` is not given)
    color: str -- Color of main bars
    stackbars: bool -- Should surveillance and non-surveillance bars be stacked or side-by-side?
    error_color: str -- Color of the error bars on the surveillance bars
    show_error: bool -- Should the error bars be drawn?
    whisk_width: int -- Width of the error bar caps
    whisk_thick: int -- Thickness of the error bar lines
    kwargs -- All other args are assumed to be plotly figure parameters that will be passed along

    Returns the plotly Figure. Also prints how many distinct papers are plotted.
    '''
    sources = analyze_sources(papers, source)

    # Filter and sort sources
    if filt: sources = sources.query(filt)
    if sort: sources = sources.sort_values(sort, ascending=False)
    sources = sources[:n]  # honored even without `sort` (n=None keeps everything)
    # Counting through a set comprehension also works when no source is left
    print(f"We plot {len({paper for ids in sources.papers for paper in ids})} papers.")

    sources = compute_error_bars(sources)

    # New columns
    surv_label, not_surv_label = 'Number used in surveillance patents', 'Number not used in surveillance patents'
    err_surv_label, err_not_surv_label = 'Error surveillance patents', 'Error not surveillance patents'
    sources[surv_label] = sources.n_papers_of_interest
    sources[not_surv_label] = sources.n_papers_patented - sources.n_papers_of_interest
    sources[err_surv_label] = sources.n_patents_of_interest_error
    sources[err_not_surv_label] = sources.n_patents_not_of_interest_error

    # Visualize
    traceorder = 'reversed' if not stackbars else 'normal'
    barmode = 'stack' if stackbars else 'group'

    fig = go.Figure()
    fig.add_trace(
        go.Bar(
            name=not_surv_label,
            x=sources[source],
            y=sources[not_surv_label],
            orientation='v',
            marker_color='darkgrey',
            error_y={
                'type': 'data',
                'array': sources[err_not_surv_label],
                'symmetric': True,
                'visible': show_error,  # the parameter used to be accepted but ignored
                'color': 'grey',
                'thickness': whisk_thick,
                'width': whisk_width
            },
        )
    )
    fig.add_trace(
        go.Bar(
            name=surv_label,
            x=sources[source],
            y=sources[surv_label],
            orientation='v',
            marker_color=color,
            error_y={
                'type': 'data',
                'array': sources[err_surv_label],
                'symmetric': True,
                'visible': show_error,
                'color': error_color,
                'thickness': whisk_thick,
                'width': whisk_width
            },
        )
    )
    fig.update_layout(
        yaxis=dict(title='Number of papers<br>with downstream patents', title_font_size=15, tickfont_size=15, ticksuffix=' '),
        xaxis=dict(tickfont_size=15),
        legend=dict(title='', xanchor='center', x=.5, yanchor='top', y=1.35, font_size=15, traceorder=traceorder),
        template='plotly_white', font_size=15, barmode=barmode, paper_bgcolor='rgba(0,0,0,0)')
    # Caller-supplied layout parameters are applied last so they override the defaults above
    fig.update_layout(**kwargs)
    return fig

def visualize_sources_numbers_horizontal(papers, source, filt=None, sort='n_papers', n=None, color='MediumTurquoise',
                                         error_color='darkgrey', whisk_width=5, whisk_thick=2, insidetext=False, **kwargs):
    '''Visualize horizontally the mapping from sources to number of papers of interest

    papers: DataFrame  -- Paper info
    source: str -- Source type of interest (e.g. Institution, Country, Year, etc)
    filt: str -- Pandas-style query that will filter down which sources will be included in the figure
    sort: str -- Property that will sort the sources before creating the figure; the top `n` by this
        property are kept and then listed in ascending order
    n: int -- Number of sources to include in figure (applied after filtering and sorting,
        and also when `sort` is not given)
    color: str -- Color of main bars
    error_color: str -- Color of the error bars
    whisk_width: int -- Width of the error bar caps
    whisk_thick: int -- Thickness of the error bar lines
    insidetext: bool -- Should the source names be written inside the main bars?
    kwargs -- All other args are assumed to be plotly figure parameters that will be passed along

    Returns the plotly Figure. Also prints how many distinct papers are plotted.
    '''
    sources = analyze_sources(papers, source)

    # Filter and sort sources
    if filt: sources = sources.query(filt)
    if sort:
        sources = sources.sort_values(sort, ascending=False)[:n].sort_values(sort, ascending=True)
    else:
        sources = sources[:n]  # honored even without `sort` (n=None keeps everything)
    # Counting through a set comprehension also works when no source is left
    print(f"We plot {len({paper for ids in sources.papers for paper in ids})} papers.")

    sources = compute_error_bars(sources)

    # New columns
    surv_label, not_surv_label = 'Used in surveillance patents', 'Not used in surveillance patents'
    err_surv_label, err_not_surv_label = 'Error surveillance patents', 'Error not surveillance patents'
    sources[surv_label] = sources.n_papers_of_interest
    sources[not_surv_label] = sources.n_papers_patented - sources.n_papers_of_interest
    sources[err_surv_label] = sources.n_patents_of_interest_error
    sources[err_not_surv_label] = sources.n_patents_not_of_interest_error

    # Visualize
    fig = go.Figure()
    # Surveillance bar starts where the not-surveillance bar ends (explicit base)
    fig.add_trace(
        go.Bar(
            name=surv_label,
            y=sources[source],
            x=sources[surv_label],
            text='    ' + sources[source].apply(str) + '    ',
            orientation='h',
            marker_color=color,
            base=sources[not_surv_label],
        )
    )
    # Only this trace carries error bars
    fig.add_trace(
        go.Bar(
            name=not_surv_label,
            y=sources[source],
            x=sources[not_surv_label],
            orientation='h',
            marker_color='darkgrey',
            error_x={
                'type': 'data',
                'array': sources[err_not_surv_label],
                'symmetric': True,
                'color': error_color,
                'thickness': whisk_thick,
                'width': whisk_width
            },
        )
    )
    if insidetext:
        fig.update_traces(
            textposition='inside', insidetextanchor="end", insidetextfont=dict(size=13, color='black'),
            constraintext='none', cliponaxis=False)
    else:
        fig.update_traces(textposition='none')
    fig.update_traces(marker_line_width = 0)
    fig.update_layout(
        xaxis=dict(title='Number of papers with downstream patents', title_font_size=20, tickfont_size=15, ticksuffix=' '),
        yaxis=dict(title='', dtick=1, tickfont_size=20),
        legend=dict(title='', xanchor='center', yanchor='bottom', x=.5, y=1.05, orientation='v', font_size=18),
        template='plotly_white', barmode='stack',
        title_y=.94, font_size=15, bargap=.12,  paper_bgcolor='rgba(0,0,0,0)', margin=dict(l=0,r=0,t=0,b=0))
    # Caller-supplied layout parameters are applied last so they override the defaults above
    fig.update_layout(**kwargs)
    return fig

# Visualizations

## Across time

In [None]:
# Yearly share of patented papers used in surveillance patents.
# Papers are limited to 1990-2017 (the default years_to_drop still apply).
papers = filter_paper_years(all_papers, range(1990, 2018))
fig = visualize_sources_percents(
    papers, 'Year',
    color='mediumvioletred', error_color='black', condensed_bars=False,
    width=400, height=400,
    xaxis=dict(range=(1990, 2021), dtick=5),
)
plotly.io.write_image(fig, os.path.join(figs_dir, 'years.png'), scale=10)
fig.show()

In [None]:
# Same percentages as the yearly figure, aggregated by decade (shown only, not saved)
papers = filter_paper_years(all_papers, range(1990, 2018))
fig = visualize_sources_percents(
    papers, 'Decade',
    color='mediumvioletred', error_color='black',
    width=400, height=400,
    xaxis=dict(tickvals=[1990, 2010], ticksuffix='s'),
)
fig.show()

In [None]:
# Absolute number of patented papers per decade, surveillance vs. not, as side-by-side bars
papers = filter_paper_years(all_papers, range(1990, 2018))
fig = visualize_sources_numbers_vertical(
    papers, 'Decade',
    sort=None, stackbars=False,
    color='mediumslateblue', error_color='darkslateblue',
    width=400, height=400, yaxis_range=(0, 2010), xaxis_ticksuffix='s', barmode='group',
)
plotly.io.write_image(fig, os.path.join(figs_dir, 'decades.png'), scale=10)
fig.show()

In [None]:
# Top words across the decades

from fightin_words import bayes_compare_language
from collections import Counter

# Get decades' paper titles
sources = analyze_sources(all_papers, 'Decade')
# Build the id -> title lookup once (first title per id) instead of running one
# DataFrame.query per paper, which is quadratic and relied on positional `[0]` indexing
title_by_id = all_papers.drop_duplicates('PaperId').set_index('PaperId')['PaperTitle'].to_dict()
def get_titles(paper_ids):
    '''Return the titles of the given papers, in the same order.'''
    return [title_by_id[paper_id] for paper_id in paper_ids]
sources['titles'] = sources.papers.apply(get_titles)

# Compute log odds ratios and z-scores
# analyze_sources keeps exactly the decades 1990 and 2010, so two title lists are expected here
# (presumably ordered 1990s then 2010s - verify if the decade filter changes)
l1, l2 = list(sources.titles)
fightin_words = bayes_compare_language(l2, l1, ngram=1)
with open(os.path.join(data_dir, stopwords_filename)) as f:  # closed on exit (was leaked)
    stopwords = [word.strip() for word in f]
# A set makes the membership test below constant-time; the words removed are unchanged
words_to_remove = set(stopwords) | {'via', 'using'}
words_to_remove |= {'network', 'neural', 'networks', 'machine', 'learning', 'models', 'convolutional', 'unsupervised', 'supervised'}
fightin_words = list(filter(
    lambda word_score: word_score[0] not in words_to_remove, fightin_words))

# Visualize top words: the first n and last n entries, separated by an empty spacer row
n = 10
top_words = pd.DataFrame(fightin_words[:n] + [('', 0)] + fightin_words[-n:], columns=['word', 'z-score'])
fig = px.bar(top_words, x='z-score', y='word', text='word', width=450, height=500, color='z-score',
    template='plotly_white', color_discrete_sequence=['midnightblue'], color_continuous_scale='tropic')
fig.update_traces(textposition='outside', textfont=dict(size=60), cliponaxis=False,)
fig.update_layout(
    yaxis_visible=False,
    plot_bgcolor='rgba(0,0,0,0)', paper_bgcolor='rgba(0,0,0,0)', margin=dict(l=30,r=30,t=0,b=0),
    xaxis=dict(dtick=5, title='', range=(-15,15), title_font_color='grey', tickfont_color='grey'), coloraxis_showscale=False
)
plotly.io.write_image(fig, os.path.join(figs_dir, f'fightin_words.png'),scale=10)
display(fig)

## Top entities

In [None]:
# Ten institutions with the most papers used in surveillance patents.
# Keyword order is kept as-is: layout overrides are applied in the order they are passed.
fig = visualize_sources_numbers_horizontal(
    all_papers, 'Institution',
    # selection
    sort='n_papers_of_interest', n=10,
    # bars and whiskers
    insidetext=False,
    width=800,
    color='darkcyan',
    height=400,
    xaxis_dtick=100,
    error_color='black', whisk_thick=1,
    yaxis_visible=True,
    whisk_width=0,
    # layout overrides
    font_size=10,
    title=dict(text="Institutions".upper(), font=dict(size=17), xanchor='center', x=.5, xref='paper', y=.99),
    xaxis=dict(side='top', title_font_size=17, title_standoff=5, tickfont_size=13, range=(0, 390)),
    yaxis=dict(title='', title_font_size=20, dtick=1, tickfont_size=17, title_standoff=0),
    showlegend=False,
    legend=dict(xanchor='right', x=.98, yanchor='bottom', y=.1, font_size=20),
    yaxis_automargin=False, xaxis_automargin=False,
    margin=dict(l=300, r=50, t=100, b=50),
)
plotly.io.write_image(fig, os.path.join(figs_dir, 'top_insts.pdf'), scale=10)
fig.show()

In [None]:
# Ten countries with the most papers used in surveillance patents.
# Keyword order is kept as-is: layout overrides are applied in the order they are passed
# (note that both `xaxis_dtick` and `xaxis=dict(dtick=...)` are given).
fig = visualize_sources_numbers_horizontal(
    all_papers, 'Country',
    # selection
    sort='n_papers_of_interest', n=10,
    # bars and whiskers
    insidetext=False,
    width=800,
    color='darkcyan',
    height=400,
    xaxis_dtick=100,
    error_color='black', whisk_thick=1,
    yaxis_visible=True,
    whisk_width=0,
    # layout overrides
    font_size=10,
    title=dict(text="Nations".upper(), font=dict(size=17), xanchor='center', x=.5, xref='paper', y=.99),
    xaxis=dict(side='top', title_font_size=17, dtick=1000, title_standoff=5, tickfont_size=13, range=(0, 3300)),
    yaxis=dict(title='', title_font_size=20, dtick=1, tickfont_size=17, title_standoff=0),
    showlegend=False,
    legend=dict(xanchor='right', x=.98, yanchor='bottom', y=.1, font_size=20),
    yaxis_automargin=False, xaxis_automargin=False,
    margin=dict(l=300, r=50, t=100, b=50),
)
plotly.io.write_image(fig, os.path.join(figs_dir, 'top_countries.pdf'), scale=10)
fig.show()

## Fieldwide dominance

In [None]:
# Share of patented papers used in surveillance patents, for every institution
# with at least 10 patented papers; the line marks 50%
fig = visualize_sources_percents(
    all_papers, 'Institution',
    filt='n_papers_patented >= 10',
    color='dimgray', line=True, condensed_bars=False,
    background_stack=False, error_color='black',
    width=1000, height=400, margin=dict(t=100, b=180),
    xaxis=dict(title='', tickangle=270, tickfont_size=6),
    yaxis_title='Patented papers',
    legend=dict(xanchor='left', x=.03, yanchor='bottom', y=1, font_size=15),
)
plotly.io.write_image(fig, os.path.join(figs_dir, 'inst_percents_labeled.pdf'), scale=10)
fig.show()

In [None]:
# Same view per field: at most 150 fields with at least 10 patented papers
fig = visualize_sources_percents(
    all_papers, 'Field',
    filt='n_papers_patented >= 10', n=150,
    color='dimgray', line=True, condensed_bars=False,
    error_color='black', background_stack=False,
    width=1000, height=250, margin=dict(t=0, b=180),
    xaxis=dict(title='', tickangle=270, tickfont_size=6),
    yaxis_title='Patented papers',
    legend=dict(xanchor='left', x=.03, yanchor='bottom', y=1, font_size=15),
)
plotly.io.write_image(fig, os.path.join(figs_dir, 'field_percents_labeled.pdf'), scale=10)
fig.show()


In [None]:
# Same view per country: at most 150 countries with at least 10 patented papers
fig = visualize_sources_percents(
    all_papers, 'Country',
    filt='n_papers_patented >= 10', n=150,
    color='dimgray', line=True,
    error_color='black', background_stack=False,
    width=400, height=250, margin=dict(t=0, b=0),
    xaxis=dict(title='', tickangle=270, tickfont_size=8),
    yaxis_title='Patented papers',
    legend=dict(xanchor='left', x=.03, yanchor='bottom', y=1, font_size=15),
)
plotly.io.write_image(fig, os.path.join(figs_dir, 'country_percents_labeled.pdf'), scale=10)
fig.show()