In [1]:
import pandas as pd
import plotly.express as px
from fuzzywuzzy import process, fuzz
import numpy as np
from collections import Counter
import re 
import collections
from scipy.stats import spearmanr, pearsonr
from sklearn.linear_model import LinearRegression
import numpy as np


### Merging Files into One Dataframe

Our dataset contains one CSV file per year so we read each file in as a dataframe and then concatenate the dataframe. The final dataset contains 10,529 entries spanning 2016 to 2023.

In [2]:
df_2016 = pd.read_csv('raw_abstracts/2016Export.csv', encoding_errors='ignore')
df_2016['year'] = 2016

df_2017 = pd.read_csv('raw_abstracts/2017Export.csv', encoding_errors='ignore')
df_2017['year'] = 2017

df_2018 = pd.read_csv('raw_abstracts/2018Export.csv', encoding_errors='ignore')
df_2018['year'] = 2018

df_2019 = pd.read_csv('raw_abstracts/2019Export.csv', encoding_errors='ignore')
df_2019['year'] = 2019

df_2020 = pd.read_csv('raw_abstracts/2020Export.csv', encoding_errors='ignore')
df_2020['year'] = 2020

df_2021 = pd.read_csv('raw_abstracts/2021Export.csv', encoding_errors='ignore')
df_2021['year'] = 2021

df_2022 = pd.read_csv('raw_abstracts/2022Export.csv', encoding_errors='ignore')
df_2022['year'] = 2022

df_2023 = pd.read_csv('raw_abstracts/2023Export.csv', encoding_errors='ignore')
df_2023['year'] = 2023

df_2024 = pd.read_csv('raw_abstracts/2024Export.csv', encoding_errors='ignore')
df_2024['year'] = 2024

df = pd.concat([df_2016, df_2017, df_2018, df_2019, df_2020, df_2021, df_2022, df_2023, df_2024], axis=0)


print(len(df))

12294


### Removing Entries with Incomplete Data

Next, we remove any entries that are missing an author block as this will prevent subsequent data cleaning. It looks like only one such entry is affected as the new total is 10,528 entries.

In [3]:
df.dropna(subset=['author_block'], inplace=True)
print(len(df))

12293


In [4]:
df.drop_duplicates(subset=['control_number'], inplace=True)
print(len(df))

12142


### Filtering Authors

We do data cleaning to separate the author portion and institution portion of the author block. We obtain the first and last authors in author block. We also remove middle initials so that each author is in the format F. Lastname (as opposed to F.M. Lastname). This is to make sure that authors are grouped properly even if their middle initial was used variably in the abstract. However, we do lose resolution if two authors with the same first initial and last name are merged (however this appears to be rare). We unfortunately do not have full first and last name information in this data.

In [5]:
df['author_block'] = df['author_block'].str.replace('(&nbsp;)', ' ', regex=True)

## Remove all non-ascii characters from the author_block
df['author_block'] = df['author_block'].str.replace(r'[^\x00-\x7F]+', '', regex=True)

## Remove &nbsp; from the author_block
df['author_block'] = df['author_block'].str.replace('(&nbsp;)', ' ')

## For some reason its not working to remove the &nbsp; from the author_block
## Can we do it another way

## Remove all non-ascii characters from the author_block
df['author_block'] = df['author_block'].str.replace(r'[^\x00-\x7F]+', '', regex=True)

## Remove &nbsp; from the author_block
df['author_block'] = df['author_block'].str.replace('(&nbsp;)', ' ')

df['author_block'] = df['author_block'].str.replace('\D', '')


In [6]:
df_author = df['author_block'].str.split('(<sup>,?[\d*]+(,?\.? ?[\d*]?)*<\/sup>)[\s\r\n]*[;]*[ ]*[\t]*(<sup>,?[\d*]*(,?\.? ?[\d*])*<\/sup>)', 
                                     expand=True)


df_author['authors'] = df_author[0].str.replace('(<sup>,?[0-9]+(,[0-9]+)*,?</sup>)', '')

df_author['author_list'] = df_author.apply(lambda x: x['authors'].split(','), axis=1)


df_author['first_author'] = df_author.apply(lambda x: x['author_list'][0], axis=1)

## Remove html tags from the first author
df_author['first_author'] = df_author['first_author'].str.replace('<[^<]+?>', '', regex=True)

## Remove numbers from the first author
df_author['first_author'] = df_author['first_author'].str.replace('\d+', '', regex=True)

## Remove middle initials from the first author
df_author['first_author'] = df_author['first_author'].str.replace('(. \w. )', '. ', regex=True)

## Extract senior author
df_author['senior_author'] = df_author.apply(lambda x: x['author_list'][-1], axis=1)

## Remove middle initial from senior author
df_author['senior_author'] = df_author['senior_author'].str.replace('(. \w. )', '. ', regex=True)

df_author

Unnamed: 0,0,1,2,3,4,5,authors,author_list,first_author,senior_author
0,"P. T. White<sup>1</sup>, C. Subramanian<sup>1<...","<sup>1,4</sup>",,<sup>5</sup>,,"University Of Michigan,Department Of Biomedica...","P. T. White<sup>1</sup>, C. Subramanian<sup>1<...","[P. T. White<sup>1</sup>, C. Subramanian<sup>...",P. White,M. Cohen
1,"P. Patel<sup>1</sup>, T. Kato<sup>1</sup>, H. ...","<sup>1,2,3</sup>",,<sup>1</sup>,,"University of Toronto,Division Of Thoracic Sur...","P. Patel<sup>1</sup>, T. Kato<sup>1</sup>, H. ...","[P. Patel<sup>1</sup>, T. Kato<sup>1</sup>, ...",P. Patel,K. Yasufuku
2,"R. Jaskula Sztul<sup>1,3</sup>, G. Chen<sup>2<...","<sup>1,3</sup>",,<sup>1</sup>,,"University Of Wisconsin,Surgery,Madison, WI, U...","R. Jaskula Sztul<sup>1,3</sup>, G. Chen<sup>2<...","[R. Jaskula Sztul<sup>1, 3</sup>, G. Chen<sup...",R. Jaskula Sztul,H. Chen
3,"M. M. Hodges<sup>1</sup>, C. Zgheib<sup>1</sup...",<sup>1</sup>,,<sup>1</sup>,,"University Of Colorado Denver,Laboratory For F...","M. M. Hodges<sup>1</sup>, C. Zgheib<sup>1</sup...","[M. M. Hodges<sup>1</sup>, C. Zgheib<sup>1</s...",M. Hodges,K. Liechty
4,"I. J. Lawandy<sup>1</sup>, B. A. Potz<sup>1</s...",<sup>1</sup>,,<sup>1</sup>,,"Brown University,Surgery/Cardiothoracic Surger...","I. J. Lawandy<sup>1</sup>, B. A. Potz<sup>1</s...","[I. J. Lawandy<sup>1</sup>, B. A. Potz<sup>1<...",I. Lawandy,F. Sellke
...,...,...,...,...,...,...,...,...,...,...
1760,"P. W. Wachira<sup>1</sup>, A. Savage<sup>2</su...",<sup>1</sup>,,<sup>1</sup>,,"The University of Alabama at Birmingham (UAB),...","P. W. Wachira<sup>1</sup>, A. Savage<sup>2</su...","[P. W. Wachira<sup>1</sup>, A. Savage<sup>2</...",P. Wachira,A. Gillis
1761,"M. Kheng<sup>1</sup>, A. Manzella<sup>1</sup>,...","<sup>1,2</sup>",,<sup>2</sup>,,"Rutgers Cancer Institute of New Jersey, Endocr...","M. Kheng<sup>1</sup>, A. Manzella<sup>1</sup>,...","[M. Kheng<sup>1</sup>, A. Manzella<sup>1</sup...",M. Kheng,A. Laird
1762,"B. N. Titanji<sup>1,2</sup>, M. Earley<sup>1</...",<sup>1</sup>,,<sup>1</sup>,,"Stanford University, Department Of Surgery, Pa...","B. N. Titanji<sup>1,2</sup>, M. Earley<sup>1</...","[B. N. Titanji<sup>1, 2</sup>, M. Earley<sup>...",B. Titanji,E. Kebebew
1763,"J. Hwang<sup>1,2</sup>, J. Sharpe<sup>2</sup>,...","<sup>2,3</sup>",,<sup>1</sup>,,"Hospital Of The University Of Pennsylvania, Ge...","J. Hwang<sup>1,2</sup>, J. Sharpe<sup>2</sup>,...","[J. Hwang<sup>1, 2</sup>, J. Sharpe<sup>2</su...",J. Hwang,R. Kelz


### Filtering Institutions

For the institution list, we make the assumption that the first institution listed in the block is the main institution that contributed the work. In the string of the first assumption, we look for the words "university", "hospital", "institute", or "medical center" or if none of those words exist, the first expression separated by a comma. 

In [7]:
df_author['institutions'] = df_author[5].str.replace('(<sup>,?[0-9]+(,[0-9]+)*,?</sup>)', '|', n=1)

df_author['institution_list'] = df_author.apply(lambda x: x['institutions'].split('|'), axis=1)
df_author['first_institution'] = df_author.apply(lambda x: x['institution_list'][0].split(',')[:-3], axis=1)

df_author['institution_list']

KEYWORDS = ['university', 'hospital', 'institute', 'medical center']


def smart_institution(list):
    for item in list:
        for word in KEYWORDS:
            if word in item.lower():
                return item
    return list[0]
            

df_author['smart_institution'] = df_author.apply(lambda x: smart_institution(x['institution_list']), axis=1)
df_author['smart_institution2'] = df_author['smart_institution'].str.split(',', expand=True)[0]
df_author['smart_institution2'] = df_author['smart_institution2'].str.title()

df_author

Unnamed: 0,0,1,2,3,4,5,authors,author_list,first_author,senior_author,institutions,institution_list,first_institution,smart_institution,smart_institution2
0,"P. T. White<sup>1</sup>, C. Subramanian<sup>1<...","<sup>1,4</sup>",,<sup>5</sup>,,"University Of Michigan,Department Of Biomedica...","P. T. White<sup>1</sup>, C. Subramanian<sup>1<...","[P. T. White<sup>1</sup>, C. Subramanian<sup>...",P. White,M. Cohen,"University Of Michigan,Department Of Biomedica...","[University Of Michigan,Department Of Biomedic...","[University Of Michigan, Department Of Biomedi...","University Of Michigan,Department Of Biomedica...",University Of Michigan
1,"P. Patel<sup>1</sup>, T. Kato<sup>1</sup>, H. ...","<sup>1,2,3</sup>",,<sup>1</sup>,,"University of Toronto,Division Of Thoracic Sur...","P. Patel<sup>1</sup>, T. Kato<sup>1</sup>, H. ...","[P. Patel<sup>1</sup>, T. Kato<sup>1</sup>, ...",P. Patel,K. Yasufuku,"University of Toronto,Division Of Thoracic Sur...","[University of Toronto,Division Of Thoracic Su...","[University of Toronto, Division Of Thoracic S...","University of Toronto,Division Of Thoracic Sur...",University Of Toronto
2,"R. Jaskula Sztul<sup>1,3</sup>, G. Chen<sup>2<...","<sup>1,3</sup>",,<sup>1</sup>,,"University Of Wisconsin,Surgery,Madison, WI, U...","R. Jaskula Sztul<sup>1,3</sup>, G. Chen<sup>2<...","[R. Jaskula Sztul<sup>1, 3</sup>, G. Chen<sup...",R. Jaskula Sztul,H. Chen,"University Of Wisconsin,Surgery,Madison, WI, U...","[University Of Wisconsin,Surgery,Madison, WI, ...","[University Of Wisconsin, Surgery, Madison, W...","University Of Wisconsin,Surgery,Madison, WI, U...",University Of Wisconsin
3,"M. M. Hodges<sup>1</sup>, C. Zgheib<sup>1</sup...",<sup>1</sup>,,<sup>1</sup>,,"University Of Colorado Denver,Laboratory For F...","M. M. Hodges<sup>1</sup>, C. Zgheib<sup>1</sup...","[M. M. Hodges<sup>1</sup>, C. Zgheib<sup>1</s...",M. Hodges,K. Liechty,"University Of Colorado Denver,Laboratory For F...","[University Of Colorado Denver,Laboratory For ...","[University Of Colorado Denver, Laboratory For...","University Of Colorado Denver,Laboratory For F...",University Of Colorado Denver
4,"I. J. Lawandy<sup>1</sup>, B. A. Potz<sup>1</s...",<sup>1</sup>,,<sup>1</sup>,,"Brown University,Surgery/Cardiothoracic Surger...","I. J. Lawandy<sup>1</sup>, B. A. Potz<sup>1</s...","[I. J. Lawandy<sup>1</sup>, B. A. Potz<sup>1<...",I. Lawandy,F. Sellke,"Brown University,Surgery/Cardiothoracic Surger...","[Brown University,Surgery/Cardiothoracic Surge...","[Brown University, Surgery/Cardiothoracic Surg...","Brown University,Surgery/Cardiothoracic Surger...",Brown University
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1760,"P. W. Wachira<sup>1</sup>, A. Savage<sup>2</su...",<sup>1</sup>,,<sup>1</sup>,,"The University of Alabama at Birmingham (UAB),...","P. W. Wachira<sup>1</sup>, A. Savage<sup>2</su...","[P. W. Wachira<sup>1</sup>, A. Savage<sup>2</...",P. Wachira,A. Gillis,"The University of Alabama at Birmingham (UAB),...",[The University of Alabama at Birmingham (UAB)...,[The University of Alabama at Birmingham (UAB)...,"The University of Alabama at Birmingham (UAB),...",The University Of Alabama At Birmingham (Uab)
1761,"M. Kheng<sup>1</sup>, A. Manzella<sup>1</sup>,...","<sup>1,2</sup>",,<sup>2</sup>,,"Rutgers Cancer Institute of New Jersey, Endocr...","M. Kheng<sup>1</sup>, A. Manzella<sup>1</sup>,...","[M. Kheng<sup>1</sup>, A. Manzella<sup>1</sup...",M. Kheng,A. Laird,"Rutgers Cancer Institute of New Jersey, Endocr...","[Rutgers Cancer Institute of New Jersey, Endoc...","[Rutgers Cancer Institute of New Jersey, Endo...","Rutgers Cancer Institute of New Jersey, Endocr...",Rutgers Cancer Institute Of New Jersey
1762,"B. N. Titanji<sup>1,2</sup>, M. Earley<sup>1</...",<sup>1</sup>,,<sup>1</sup>,,"Stanford University, Department Of Surgery, Pa...","B. N. Titanji<sup>1,2</sup>, M. Earley<sup>1</...","[B. N. Titanji<sup>1, 2</sup>, M. Earley<sup>...",B. Titanji,E. Kebebew,"Stanford University, Department Of Surgery, Pa...","[Stanford University, Department Of Surgery, P...","[Stanford University, Department Of Surgery, ...","Stanford University, Department Of Surgery, Pa...",Stanford University
1763,"J. Hwang<sup>1,2</sup>, J. Sharpe<sup>2</sup>,...","<sup>2,3</sup>",,<sup>1</sup>,,"Hospital Of The University Of Pennsylvania, Ge...","J. Hwang<sup>1,2</sup>, J. Sharpe<sup>2</sup>,...","[J. Hwang<sup>1, 2</sup>, J. Sharpe<sup>2</su...",J. Hwang,R. Kelz,"Hospital Of The University Of Pennsylvania, Ge...","[Hospital Of The University Of Pennsylvania, G...","[Hospital Of The University Of Pennsylvania, ...","Hospital Of The University Of Pennsylvania, Ge...",Hospital Of The University Of Pennsylvania


### Combine Dataframes

Now, we combine the new columns into the existing dataframe.

In [8]:
df2 = pd.concat([df, df_author], axis=1)
# df2.to_csv('data.csv')

In [9]:
len(df2['control_number'].unique())

12142

### Number of Abstracts By Institution

Here, we calculate the number of abstracts by institution for the duration of the study period

In [10]:
df_institutions = df2.groupby(['smart_institution2']).agg(
    count = ('control_number', 'nunique')
)
df_institutions.sort_values(by='count', ascending=False, inplace=True)
df_institutions.reset_index(inplace=True)

df_institutions

Unnamed: 0,smart_institution2,count
0,University Of Alabama At Birmingham,543
1,University Of Michigan,390
2,Baylor College Of Medicine,218
3,Massachusetts General Hospital,192
4,Medical College Of Wisconsin,188
...,...,...
1609,Mount Sinai St. Luke'S Roosevelt General Surge...,1
1610,Morsani College Of Medicine,1
1611,Morristown Medical Center,1
1612,Moores Cancer Center,1


### Correcting Institution Duplicates Using Fuzzy Matching

We need to combine data for institutions that are the same in reality even if the name is written differently. One example of this is "University of Alabama - Birmingham" vs "University of Alabama Birmingham" vs "University of Alabama". We will attempt to resolve this using fuzzy string matching. While not perfect, we can get pretty close with this and at least avoid errors among the most frequent institutions. Another example is "University of California" and "David Geffen School of Medicine". Given no string relation in this case, we will need to resolve these differences manually.


In [11]:
def _normalize_name(name) -> str:
    """
    Normalize the name of a university
    :param str name:
    :rtype: str
    """
    norm = name.lower()
    norm = re.sub(
        r'( +at +)|( *of *)|(university)|(hospital)|(medical center)|(institution)',
        ' ',
        norm,
    )
    return norm.replace("'", "").strip()


def process_unis(uni_list, uni_to_pub):
    """
    Process the list of unis
    :param List[str] uni_list:
    :param Dict[str, int] uni_to_pub:
    :rtype: Dict[str, str]
    """
    # Normalize unis
    uni_list_norm = [(uni, _normalize_name(uni)) for uni in uni_list]

    # Construct empty choices list - we will iterate through the
    # list of universities from highest to lowest number of publications
    # We'll normalize each one and then try to match it to the choices list
    #
    # If the normalized name is not in the choices list (i.e. there is
    # no match), we'll add it so that future universities can be matched to it
    # and map the original name to the normalized name
    #
    # If the normalized name is in the choices list, we'll map the original
    # name to the normalized name
    choices = []
    bad_to_good = collections.OrderedDict()
    for item, normalized_item in uni_list_norm:

        # if normalized name has cardinal directions in it or "washington", skip it
        if re.search(r'(north)|(south)|(east)|(west)|(central)|(washington)|(medical college)|(atlantic)|(commonwealth)|(children)|(illinois)|(atlantic)|(florida state)|(michigan state)|(mayo)|(york hospital)|(university college london)|(loyola)', normalized_item):
            bad_to_good[item] = normalized_item
            choices.append(normalized_item)
            continue

        potential_match = process.extractOne(normalized_item, choices)

        if not potential_match or potential_match[1] < 90:
            bad_to_good[item] = normalized_item
            choices.append(normalized_item)
        else:
            bad_to_good[item] = potential_match[0]

    # Now that we have a mapping of original name to normalized name
    # we can count the number of publications for each normalized name
    norm_counts = collections.defaultdict(int)
    for uni, norm in bad_to_good.items():
        norm_counts[norm] += uni_to_pub[uni]

    # Now we can go through the list of universities again and get the
    # normalized name from the bad_to_good mapping and add the number of
    # publications to the final map.
    #
    # Once we see a normalized name we'll add it to the seen_norms set
    # so that we don't double count and we'll assign the total number
    # of normalized publications to the most common name for that
    # university pre-normalization.
    #
    # Example - if we have Hospital of University of Pennsylvania (100) and
    # University of Pennsylvania Hospital (90), we'll assign the total number
    # of publications to Hospital of University of Pennsylvania (190) and
    # skip University of Pennsylvania Hospital (as they normalize to the same
    # thing)

    final_map = collections.defaultdict(int)
    norm_to_first_good_name = {}
    seen_norms = set()
    for uni in uni_list:
        norm = bad_to_good[uni]
        if norm not in seen_norms:
            norm_to_first_good_name[norm] = uni
            final_map[uni] += norm_counts[norm]
            seen_norms.add(norm)

    # print(norm_to_first_good_name)

    return {bad: norm_to_first_good_name.get(norm, None) for bad, norm in bad_to_good.items()}



l = df_institutions['smart_institution2'].to_list()
d = dict(zip(df_institutions['smart_institution2'], df_institutions['count']))

bad_to_good = process_unis(l, d)

In [12]:
## Create a dataframe from the bad to good mapping
df_bg = pd.DataFrame(bad_to_good.items(), columns=['smart_institution2', 'smart_institution3'])

## Create a new dataframe df_institution_map merging the original df_institutions with the df_bg
df_institution_map = pd.merge(df_institutions, df_bg, on='smart_institution2', how='left')

df_institution_grp = df_institution_map.groupby(['smart_institution3', 'smart_institution2']).agg(
    count = ('count', 'sum')
).reset_index()


## Create a new column with the sum of all count per smart_institution3
df_institution_grp['total_count'] = df_institution_grp.groupby('smart_institution3')['count'].transform('sum')

df_institution_grp

df_institution_grp.sort_values(by=['total_count', 'smart_institution3', 'count'], ascending=False, inplace=True)

# df_institution_grp.to_csv('prelim_map.csv')

In [13]:
## I did some manual updating in prelim_map.csv to fix some of the mappings
## I saved the file as 'updated_map.csv'
## read in the updated_map.csv

df_updated_map = pd.read_csv('updated_map.csv')

## count how many smart_institution4 are not null
len(df_updated_map[df_updated_map['smart_institution4'].notnull()])

## replace smart_institution4 with smart_institution3 if not null
df_updated_map['smart_institution5'] = df_updated_map['smart_institution4'].fillna(df_updated_map['smart_institution3'])

## Create a new dataframe called df_map_final with just smart_institution2 and smart_institution4
df_map_final = df_updated_map[['smart_institution2', 'smart_institution5']]

df_map_final

Unnamed: 0,smart_institution2,smart_institution5
0,University Of Alabama At Birmingham,University Of Alabama At Birmingham
1,University Of Alabama,University Of Alabama At Birmingham
2,The University Of Alabama At Birmingham,University Of Alabama At Birmingham
3,University Of Alabama At Birmingham School Of ...,University Of Alabama At Birmingham
4,University Of Alabama Birmingham,University Of Alabama At Birmingham
...,...,...
1609,Adventhealth Cancer Institute,Adventhealth Cancer Institute
1610,Academic Medical Center,Academic Medical Center
1611,Abilene Christian University,Abilene Christian University
1612,Aberdeen Royal Infirmary,Aberdeen Royal Infirmary


In [14]:
## Merge df_map_final with df2 on smart_institution2
df3 = pd.merge(df2, df_map_final, on='smart_institution2', how='left')

df3.columns

## Drop the following columns:
## session_type, 0, 1, 2, 3, 4, 5, 'authors', 'institutions', 'first_instiution', 'smart_institution', 'smart_institution2'

df3.drop(columns=[0, 1, 2, 3, 4, 5, 'authors', 'institutions', 'first_institution', 'smart_institution', 'smart_institution2'], inplace=True)

df3.to_csv('abstracts_final.csv')

# Start Data Analysis

### Number of Abstracts by Year

First, we calculate the number of abstracts by year

In [15]:
## Group number of abstracts by year
df_years = df3.groupby('year').agg(
    total_count = ('control_number', 'nunique')
)

df_years.reset_index(inplace=True)
df_years
fig = px.bar(df_years, x='year', y='total_count', text_auto='',
labels = dict(year = 'Year', total_count = 'Total Abstracts (2016-2024)'))
# fig.show()

fig.update_layout(
    width=1600,
    height=720,
    font=dict(
        family="Inter",
        size=24))

fig.update_traces(textfont_size=24)
fig.update_yaxes(showticklabels=False)

fig.write_image('fig_num_abstracts.svg')
fig.show()

In [16]:
df3_group


NameError: name 'df3_group' is not defined

In [None]:

abstract_columns = ['abstract_type', 'session_type']

## In df3 abstract type, change 'Outcomes/Clinical' to 'Clinical/Outcomes'
df3['abstract_type'] = df3['abstract_type'].str.replace('Outcomes/Clinical', 'Clinical/Outcomes')

## Change df3 session type to lowercase and back to title case
df3['session_type'] = df3['session_type'].str.lower().str.title()

## Remove any spaces from session type
df3['session_type'] = df3['session_type'].str.replace(' ', '')

df3_melt = pd.melt(df3, id_vars=['year', 'control_number'], value_vars=abstract_columns)

## Exclude session type poster from the melt
df3_melt = df3_melt[df3_melt['value'] != 'Poster']

df3_group = df3_melt.groupby(['year', 'variable', 'value']).agg(
    count = ('control_number', 'nunique')
).reset_index()

df3_group['value'] = pd.Categorical(df3_group['value'], ['Basic Science', 'Clinical/Outcomes', 'Education', 'Quickshot', 'Oral', 'Plenary'], ordered=True)
df3_group.sort_values(by=['value', 'year'], inplace=True)
df3_group

df3_group

fig = px.bar(
    df3_group,
    x='year',
    y='count',
    facet_row = 'variable',
    facet_row_spacing = 0.08,
    color='value',
    barmode='group',
    text_auto=''
)

fig.update_xaxes(matches=None, showticklabels=True)

fig.update_layout(
    width=1600,
    height=720,
    font=dict(
        family="Inter",
        size=24))

fig.update_traces(textfont_size=24)
fig.update_yaxes(showticklabels=False)

## set text position outside
fig.update_traces(textposition='auto', textangle=0)

## Get rid of facet title
fig.for_each_annotation(lambda a: a.update(text=a.text.split("=")[-1]))




fig.show()








## Identify Top 20 Institutions

In [None]:
## Identify the top 20 institutions

## Group df3 by smart_institution5
df_institution_new = df3.groupby('smart_institution5').agg(
    count = ('control_number', 'nunique')
)

# df_institution_new.to_csv('institution_new.csv')

## Use the same code above to plot the number of abstracts per institution

df_institution_new.sort_values(by='count', ascending=False, inplace=True)
df_institution_new.reset_index(inplace=True)

## Create a subset of the top 20 institutions
df_institution_subset = df_institution_new[0:20]

df_institution_subset['smart_institution5'].to_list()

## Renaming Schema

## 'University Of Alabama At Birmingham' to 'U. Alabama'
## 'University Of Michigan' to 'U. Michigan'
## 'Hospital Of The University Of Pennsylvania' to 'Penn'
## 'Johns Hopkins University School Of Medicine' to 'Johns Hopkins'
## 'University Of California - Los Angeles' to 'UCLA'
## 'Baylor College Of Medicine' to 'Baylor'
## 'Stanford University' to 'Stanford'
## 'Massachusetts General Hospital' to 'Mass General'
## 'Indiana University School Of Medicine' to 'Indiana U.'
## 'University Of Pittsburgh' to 'UPMC'
## 'University Of Miami' to 'U. Miami'
## 'Medical College Of Wisconsin' to 'MCW'
## 'University Of Texas Southwestern Medical Center' to 'U.T. Southwestern'
## 'Northwestern University' to 'Northwestern'
## 'Yale University School Of Medicine' to 'Yale'
## 'University Of Colorado Denver' to 'U. Colorado'
## 'University Of Wisconsin' to 'U. Wisconsin'
## "Brigham And Women'S Hospital" to 'BWH'
## 'Ohio State University' to 'Ohio State U.'
## 'University Of Florida' to 'U. Florida'

## Make a new column called clean_institution and rename as above

df_institution_subset['clean_institution'] = df_institution_subset['smart_institution5'].copy()

df_institution_subset['clean_institution'] = df_institution_subset['clean_institution'].str.replace('University Of Alabama At Birmingham', 'U. Alabama')
df_institution_subset['clean_institution'] = df_institution_subset['clean_institution'].str.replace('University Of Michigan', 'U. Michigan')
df_institution_subset['clean_institution'] = df_institution_subset['clean_institution'].str.replace('Hospital Of The University Of Pennsylvania', 'Penn')
df_institution_subset['clean_institution'] = df_institution_subset['clean_institution'].str.replace('Johns Hopkins University School Of Medicine', 'Johns Hopkins')
df_institution_subset['clean_institution'] = df_institution_subset['clean_institution'].str.replace('University Of California - Los Angeles', 'UCLA')
df_institution_subset['clean_institution'] = df_institution_subset['clean_institution'].str.replace('Baylor College Of Medicine', 'Baylor')
df_institution_subset['clean_institution'] = df_institution_subset['clean_institution'].str.replace('Stanford University', 'Stanford')
df_institution_subset['clean_institution'] = df_institution_subset['clean_institution'].str.replace('Massachusetts General Hospital', 'Mass General')
df_institution_subset['clean_institution'] = df_institution_subset['clean_institution'].str.replace('Indiana University School Of Medicine', 'Indiana U.')
df_institution_subset['clean_institution'] = df_institution_subset['clean_institution'].str.replace('University Of Pittsburgh', 'UPMC')
df_institution_subset['clean_institution'] = df_institution_subset['clean_institution'].str.replace('University Of Miami', 'U. Miami')
df_institution_subset['clean_institution'] = df_institution_subset['clean_institution'].str.replace('Medical College Of Wisconsin', 'MCW')
df_institution_subset['clean_institution'] = df_institution_subset['clean_institution'].str.replace('University Of Texas Southwestern Medical Center', 'U.T. Southwestern')
df_institution_subset['clean_institution'] = df_institution_subset['clean_institution'].str.replace('Northwestern University', 'Northwestern')
df_institution_subset['clean_institution'] = df_institution_subset['clean_institution'].str.replace('Yale University School Of Medicine', 'Yale')
df_institution_subset['clean_institution'] = df_institution_subset['clean_institution'].str.replace('University Of Colorado Denver', 'U. Colorado')
df_institution_subset['clean_institution'] = df_institution_subset['clean_institution'].str.replace('University Of Wisconsin', 'U. Wisconsin')
df_institution_subset['clean_institution'] = df_institution_subset['clean_institution'].str.replace('Brigham And Women\'S Hospital', 'BWH')
df_institution_subset['clean_institution'] = df_institution_subset['clean_institution'].str.replace('Ohio State University', 'Ohio State U.')
df_institution_subset['clean_institution'] = df_institution_subset['clean_institution'].str.replace('University Of Florida', 'U. Florida')

## Save top institutions to list
top_institutions = df_institution_subset['smart_institution5'].to_list()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/

## Graph Top 20 Institutions

In [None]:
## Graph top 20 institutions
fig = px.bar(
    df_institution_subset,
    x='clean_institution',
    y='count',
    text_auto=''
)

fig.update_layout(
    width=1600,
    height=720,
    font=dict(
        family="Inter",
        size=24)
)

fig.update_traces(textfont_size=24)
fig.update_xaxes(tickangle=-90)
fig.update_yaxes(showticklabels=False)

fig.update_layout(
    xaxis_title="Institution",
    yaxis_title="Total Abstracts (2016-2023)"
)


# Remove tick labels from x-axis
fig.update_xaxes(showticklabels=False)



fig.write_image('fig_top_institutions.svg')

fig.update_xaxes(showticklabels=True)

fig.show()

## Create Graph for Institution and Year

In [None]:
## Create a new dataframe called df_institution_year

df_institution_year = df3.groupby(['smart_institution5', 'year']).agg(
    count = ('control_number', 'nunique')
)

df_institution_year.reset_index(inplace=True)

## Create a dataframe called df_institution_year_subset with the top 20 institutions
df_institution_year_subset = df_institution_year[df_institution_year['smart_institution5'].isin(top_institutions)]


## Merge in df_institution_year_subset with df_institution_subset to get the clean name
df_institution_year_subset = pd.merge(df_institution_year_subset, df_institution_subset, on='smart_institution5', how='left')

## Rename 'count_x' to 'count' and 'count_y' to 'total_count'
df_institution_year_subset.rename(columns={'count_x': 'count', 'count_y': 'total_count'}, inplace=True)

df_institution_year_subset

## Sort by year (most recent first) and then by count (most to least) but we want all 'smart
df_institution_year_subset.sort_values(by=['total_count', 'year', 'count'], ascending=False, inplace=True)

## Calculate the sum of 'count' for each 'smart_institution5'
df_institution_year_subset['total_count_verify'] = df_institution_year_subset.groupby('smart_institution5')['count'].transform('sum')

## Confirm that the total_count and total_count_verify are the same for all rows
df_institution_year_subset['total_count'].equals(df_institution_year_subset['total_count_verify'])

## Drop the total_count_verify column
df_institution_year_subset.drop(columns=['total_count_verify'], inplace=True)

## Merge in df_year with df_institution_year_subset
df_institution_year_subset = pd.merge(df_institution_year_subset, df_years, on='year', how='left')

## Rename 'total_count_x' to 'total_count' and 'total_count_y' to 'abstracts_per_year'
df_institution_year_subset.rename(columns={'total_count_x': 'total_count', 'total_count_y': 'abstracts_per_year'}, inplace=True)

## Create a new column called percent
df_institution_year_subset['percent'] = df_institution_year_subset['count'] / df_institution_year_subset['abstracts_per_year']

df_institution_year_subset



Unnamed: 0,smart_institution5,year,count,total_count,clean_institution,abstracts_per_year,percent
0,University Of Alabama At Birmingham,2024,77,589,U. Alabama,1746,0.044101
1,University Of Alabama At Birmingham,2023,75,589,U. Alabama,1469,0.051055
2,University Of Alabama At Birmingham,2022,67,589,U. Alabama,848,0.079009
3,University Of Alabama At Birmingham,2021,66,589,U. Alabama,922,0.071584
4,University Of Alabama At Birmingham,2020,94,589,U. Alabama,1742,0.053961
...,...,...,...,...,...,...,...
175,Ohio State University,2020,35,171,Ohio State U.,1742,0.020092
176,Ohio State University,2019,24,171,Ohio State U.,1536,0.015625
177,Ohio State University,2018,28,171,Ohio State U.,1368,0.020468
178,Ohio State University,2017,11,171,Ohio State U.,1386,0.007937


## Graph the Results

In [None]:
## Melt the df_institution_year_subset dataframe and create a new line plot faceting the count and percent

df_institution_year_subset_melt = pd.melt(df_institution_year_subset, id_vars=['year', 'clean_institution'], value_vars=['count', 'percent'])

df_institution_year_subset_melt

fig = px.line(

    df_institution_year_subset_melt,
    x='year',
    y='value',
    color='clean_institution',
    facet_col='variable',
    facet_col_wrap=2,
    facet_col_spacing=0.05,
    labels = dict(year = 'Year', value = 'Total Abstracts')

)

fig.update_layout(
    width=1600,
    height=720,
    font=dict(
        family="Inter",
        size=16)
)

fig.update_traces(textfont_size=24)
fig.update_xaxes(tickangle=0)

fig.update_layout(
    xaxis_title="Year",
    yaxis_title="Total Abstracts"
)

## Non standard y-axes
fig.update_yaxes(matches=None)

## Show y-axes labels for both facets
fig.update_yaxes(showticklabels=True)

fig.update_traces(line=dict(width=5))

fig.write_image('fig_top_institutions_year_melt.svg')

## Get rid of 'variable=' in the facet title
fig.for_each_annotation(lambda a: a.update(text=a.text.split("=")[-1]))



fig.show()

## Show Institution Facets

In [None]:
## Make new line plots for percent with each institution as a facet from df_institution_year_subset

fig = px.line(
    df_institution_year_subset,
    x='year',
    y='percent',
    color='clean_institution',
    facet_col='clean_institution',
    facet_col_wrap=5
)

fig.update_layout(
    width=1600,
    height=720,
    font=dict(
        family="Inter",
        size=16)
)

fig.update_traces(textfont_size=24)


fig.update_traces(line=dict(width=5))



## Remove legend
fig.update_layout(showlegend=False)

## Remove clean_institution= from the facet title
fig.for_each_annotation(lambda a: a.update(text=a.text.split("=")[-1]))



## If facet is 1, 6, 11, or 16, left justify the title
for i in [0, 5, 10, 15]:
    fig.layout.annotations[i].update(x=0, xanchor='left')

for i in [1, 6, 11, 16]:
    fig.layout.annotations[i].update(x=0.202, xanchor='left')

for i in [2, 7, 12, 17]:
    fig.layout.annotations[i].update(x=0.406, xanchor='left')

for i in [3, 8, 13, 18]:
    fig.layout.annotations[i].update(x=0.610, xanchor='left')

for i in [4, 9, 14, 19]:
    fig.layout.annotations[i].update(x=0.814, xanchor='left')

fig.write_image('fig_top_institutions_year_percent_facet.svg')

fig.show()


In [None]:
df_institution_year_subset[df_institution_year_subset['clean_institution'] == 'U. Wisconsin']

Unnamed: 0,smart_institution5,year,count,total_count,clean_institution,abstracts_per_year,percent
54,University Of Wisconsin,2024,11,216,U. Wisconsin,1746,0.0063
55,University Of Wisconsin,2023,12,216,U. Wisconsin,1469,0.008169
56,University Of Wisconsin,2022,16,216,U. Wisconsin,848,0.018868
57,University Of Wisconsin,2021,12,216,U. Wisconsin,922,0.013015
58,University Of Wisconsin,2020,28,216,U. Wisconsin,1742,0.016073
59,University Of Wisconsin,2019,26,216,U. Wisconsin,1536,0.016927
60,University Of Wisconsin,2018,24,216,U. Wisconsin,1368,0.017544
61,University Of Wisconsin,2017,37,216,U. Wisconsin,1386,0.026696
62,University Of Wisconsin,2016,50,216,U. Wisconsin,1125,0.044444


In [None]:
df_institution_year_subset

## Create a subset removing Alabama and Michigan
df_institution_year_subset2 = df_institution_year_subset[~df_institution_year_subset['clean_institution'].isin(['U. Alabama', 'U. Michigan'])]

df_institution_year_subset2

## calculate the mean and standard deviation for percent
print(df_institution_year_subset2['percent'].mean())
print(df_institution_year_subset2['percent'].std())

0.01656716933729753
0.00629519961270328


## Year over Year Changes

In [None]:
df_institution_year_subset

## For each institution, calculate the difference between the current year and the prior year
## For example, for 2024, calculate the difference between 2024 and 2023
## For example, for 2023, calculate the difference between 2023 and 2022
## Create a new column called diff to store this amount

df_institution_year_subset['prev_count'] = df_institution_year_subset['count'].shift(-1).where(df_institution_year_subset['clean_institution'].eq(df_institution_year_subset['clean_institution'].shift(-1)))

df_institution_year_subset['diff'] = df_institution_year_subset['count'] - df_institution_year_subset['prev_count']

df_institution_year_subset['pct_diff'] = df_institution_year_subset['diff'] / df_institution_year_subset['count'].shift(-1) * 100

df_institution_year_subset

## Melt the diff and pct_diff columns into a new data frame

df_institution_year_subset_melt = pd.melt(df_institution_year_subset, id_vars=['year', 'clean_institution'], value_vars=['diff', 'pct_diff'])

## Create a new column combining clean_institution and year
df_institution_year_subset_melt['institution_year'] = df_institution_year_subset_melt['clean_institution'] + ' ' + df_institution_year_subset_melt['year'].astype(str)

## Sort by variable and value
df_institution_year_subset_melt.sort_values(by=['variable', 'value'], ascending=[True, False], inplace=True)

df_institution_year_subset_melt

## Round Value to Int
df_institution_year_subset_melt['value'] = df_institution_year_subset_melt['value'].round(0)

In [None]:
## Graph a bar graph with facets for diff and pct_diff

## Create a subset where value > 0 and year >= 2022
df_institution_year_subset_melt_positive = df_institution_year_subset_melt[(df_institution_year_subset_melt['value'] > 0) &
                                                                           (df_institution_year_subset_melt['year'] > 2022)]
## Show only the top ten diff and pct_diff
df_institution_year_subset_melt_positive = df_institution_year_subset_melt_positive.groupby('variable').head(10)

fig = px.bar(
    df_institution_year_subset_melt_positive,
    x='institution_year',
    y='value',
    facet_col='variable',
    facet_col_wrap=2,
    facet_col_spacing=0.05,
    labels = dict(year = 'Year', value = 'Total Abstracts'),
    text_auto=''

)

fig.update_layout(
    width=1600,
    height=720,
    font=dict(
        family="Inter",
        size=16)
)

fig.update_traces(textfont_size=24)
fig.update_xaxes(tickangle=90)

fig.update_layout(
    xaxis_title="Year",
    yaxis_title="Total Abstracts"
)

## Non standard y-axes
fig.update_yaxes(matches=None)
fig.update_xaxes(matches=None)

## Show y-axes labels for both facets
fig.update_yaxes(showticklabels=True)

## Get rid of 'variable=' in the facet title
fig.for_each_annotation(lambda a: a.update(text=a.text.split("=")[-1]))

## Remove legend
fig.update_layout(showlegend=False)

## Show xaxis labels
fig.update_xaxes(showticklabels=False)

fig.write_image('fig_top_institutions_year_diff_melt.svg')

## Show xaxis labels
fig.update_xaxes(showticklabels=True)

## Text should be rounded to nearest int and horizontal
fig.update_traces(textposition='inside', textfont_size=16)


fig.show()



