In [1]:
import pandas as pd
from scipy.stats import fisher_exact
from scipy.stats import spearmanr
from sklearn.linear_model import LinearRegression
from scipy.stats import fisher_exact
from sklearn.metrics import r2_score


In [2]:
## Read in abstracts_final.csv into new dataframe called df_abstracts
df_abstracts = pd.read_csv('abstracts_final.csv')

## Include only 2024 for this subset analysis
df_abstracts_subset = df_abstracts[df_abstracts['year'] == 2024]

## Rename smart_institution5 as institution
df_abstracts_subset.rename(columns = {'smart_institution5': 'institution'}, inplace = True)

df_abstracts_group = df_abstracts_subset.groupby('institution').agg(
    count = ('control_number', 'nunique')
)

df_abstracts_group = df_abstracts_group.reset_index()

# df_abstracts_group.to_csv('abstracts_by_institution.csv')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_abstracts_subset.rename(columns = {'smart_institution5': 'institution'}, inplace = True)


In [3]:
## Get a dataframe of top 40 institutions in df_abstracts_group by count
df_abstracts_group_top40 = df_abstracts_group.nlargest(40, 'count')

## Add a column called df_abstracts_group_top40['is_top20'] if the institution is in the top 20
df_abstracts_group_top40['is_top10'] = df_abstracts_group_top40['institution'].isin(df_abstracts_group.nlargest(10, 'count')['institution'])
df_abstracts_group_top40['is_top20'] = df_abstracts_group_top40['institution'].isin(df_abstracts_group.nlargest(20, 'count')['institution'])

In [4]:
## Read in leadership.csv into new dataframe called df_leadership
df_leadership = pd.read_csv('leadership.csv')

## Remove any rows where "Role" contains candidate
df_leadership = df_leadership[~df_leadership['Role'].str.lower().str.contains('candidate')]

## Remove any rows where Role == "Member"
df_leadership = df_leadership[~df_leadership['Role'].str.lower().str.contains('member')]

## Remove duplicate Names
df_leadership.drop_duplicates(subset='Name', keep='first', inplace=True)

## Rename all columns to lowercase
df_leadership.columns = df_leadership.columns.str.lower()

## Turn institution into Title Case
df_leadership['institution'] = df_leadership['institution'].str.title()

## change unnamed:0 to society
df_leadership.rename(columns={'unnamed: 0':'society'}, inplace=True)

df_leadership['has_leadership'] = 1

# Dropping duplicates since an institution may have multiple leaders
leadership_representation = df_leadership[['institution', 'has_leadership']].drop_duplicates()

# df_leadership.to_csv('leadership_by_institution.csv')

In [5]:
## Define df_leadership_group counting number of surgeons from each institution
df_leadership_group = df_leadership.groupby('institution').agg(
    count = ('name', 'nunique')
)

df_leadership_group.reset_index(inplace=True)
df_leadership_group.sort_values(by='count', ascending=False, inplace=True)

df_leadership_group

Unnamed: 0,institution,count
34,University Of Texas Md Anderson Cancer Center,5
15,Medical College Of Wisconsin,4
23,University Of Alabama At Birmingham,4
31,University Of North Carolina At Chapel Hill,3
11,Johns Hopkins University School Of Medicine,3
20,Stanford University,2
19,Rutgers Robert Wood Johnson Medical School,2
33,University Of Pittsburgh,2
27,University Of Florida,2
13,Massachusetts General Hospital,2


Now we will merge the two datasets just to make sure there are no errors in institution matching

In [6]:
# Merge this binary variable with the abstracts dataframe
abstracts_with_leadership = df_abstracts_group_top40.merge(leadership_representation, on='institution', how='left')

# Fill NaN values with 0 where there is no leadership representation
abstracts_with_leadership['has_leadership'].fillna(0, inplace=True)
abstracts_with_leadership['is_top10'].fillna(False, inplace=True)
abstracts_with_leadership['is_top20'].fillna(False, inplace=True)

abstracts_with_leadership.sort_values(by='count', ascending=False, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  abstracts_with_leadership['has_leadership'].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  abstracts_with_leadership['is_top10'].fillna(False, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermedia

## Top 10 Odds Ratio

In [7]:

# Create a contingency table for the chi-square test
contingency_table = pd.crosstab(abstracts_with_leadership['has_leadership'], abstracts_with_leadership['is_top10'])

# Extracting the necessary data for the Fisher's exact test (which will also give us the Odds Ratio)
odds_table = contingency_table.values

# Calculate the Odds Ratio and its confidence interval using Fisher's Exact Test
odds_ratio, p_value = fisher_exact(odds_table, alternative='two-sided')

print(f"Odds Ratio: {odds_ratio}")
print(f"p-value: {p_value}")

contingency_table.to_csv('contingency_table.csv')

Odds Ratio: 11.76923076923077
p-value: 0.01263902664581782


## Top 20 Odds Ratio

In [8]:

# Create a contingency table for the chi-square test
contingency_table = pd.crosstab(abstracts_with_leadership['has_leadership'], abstracts_with_leadership['is_top20'])

# Extracting the necessary data for the Fisher's exact test (which will also give us the Odds Ratio)
odds_table = contingency_table.values

# Calculate the Odds Ratio and its confidence interval using Fisher's Exact Test
odds_ratio, p_value = fisher_exact(odds_table, alternative='two-sided')

print(f"Odds Ratio: {odds_ratio}")
print(f"p-value: {p_value}")

Odds Ratio: 5.571428571428571
p-value: 0.024841720377823298


In [9]:
## Merge df_abstracts_group and df_leadership_group
abstracts_with_leadership_corr = df_abstracts_group_top40.merge(df_leadership_group, on='institution', how='left')

## Rename count_x and count_y to count_abstracts and count_leadership
abstracts_with_leadership_corr.rename(columns = {'count_x': 'count_abstracts', 'count_y': 'count_leadership'}, inplace = True)

## fill nas with 0
abstracts_with_leadership_corr['count_leadership'].fillna(0, inplace=True)

abstracts_with_leadership_corr

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  abstracts_with_leadership_corr['count_leadership'].fillna(0, inplace=True)


Unnamed: 0,institution,count_abstracts,is_top10,is_top20,count_leadership
0,University Of Alabama At Birmingham,77,True,True,4.0
1,University Of Michigan,56,True,True,2.0
2,University Of California - Los Angeles,52,True,True,2.0
3,Johns Hopkins University School Of Medicine,38,True,True,3.0
4,Louisiana State University Health Sciences Center,35,True,True,1.0
5,Massachusetts General Hospital,35,True,True,2.0
6,University Of Pittsburgh,34,True,True,2.0
7,University Of Texas Southwestern Medical Center,34,True,True,1.0
8,Indiana University School Of Medicine,33,True,True,0.0
9,Baylor College Of Medicine,31,True,True,1.0


In [10]:
# Calculate Spearman's rank correlation using spearmanr from scipy.stats
spearman_correlation_result = spearmanr(abstracts_with_leadership_corr['count_leadership'], abstracts_with_leadership_corr['count_abstracts'])

spearman_correlation_coefficient = spearman_correlation_result.correlation
spearman_p_value = spearman_correlation_result.pvalue

spearman_result = {
    "Spearman's Rank Correlation Coefficient": spearman_correlation_coefficient,
    "p-value": spearman_p_value
}

spearman_result

{"Spearman's Rank Correlation Coefficient": 0.49584027840738837,
 'p-value': 0.0011396081733157702}