In [2]:
import os, sys, re, ast
import pandas as pd
import numpy as np
import bct

In [4]:
sv_df = pd.read_csv("../SciVal Documents/Outputs/sv_shared_pubs.csv",sep="|")
gs_df = pd.read_csv("../../Example Outputs/shared_pubs.csv",sep="|")

In [5]:
print(sv_df.head())

                                               Title  \
0  Meso-Py: Dual Brain Cortical Calcium Imaging i...   
1  Chronic multiscale resolution of mouse brain n...   
2  Water-Reaching Platform for Longitudinal Asses...   
3  Multiscale imaging informs translational mouse...   
4  Towards a Visualizable, De-identified Syntheti...   

                              Authors  
0  ['Tim H. Murphy', 'Jeffrey LeDue']  
1   ['Tim H. Murphy', 'Lynn Raymond']  
2   ['Tim H. Murphy', 'Lynn Raymond']  
3  ['Tim H. Murphy', 'Jeffrey LeDue']  
4   ['Tim H. Murphy', 'Helge Rhodin']  


In [7]:
def standardize_title(title:str) -> str:
    '''return clean cleaned, lowercase string'''

    title = "".join(
        re.sub(r'[^\w ]', '', #replace anything that is not a letter with ""
               title.replace("<sup>","") #remove tags (mostly in scival data)
               .replace("</sup>","")
               .replace("<inf>","")
               .replace("</inf>","")
               .lower(), #lower all letters
               ) 
            .split(" "))
    return title

In [8]:
#clean titles
sv_df["Standard Titles"] = pd.DataFrame([standardize_title(title) for title in sv_df["Title"].tolist()])
gs_df["Standard Titles"] = pd.DataFrame([standardize_title(title) for title in gs_df["Title"].tolist()])

In [9]:
print(sv_df.head())

                                               Title  \
0  Meso-Py: Dual Brain Cortical Calcium Imaging i...   
1  Chronic multiscale resolution of mouse brain n...   
2  Water-Reaching Platform for Longitudinal Asses...   
3  Multiscale imaging informs translational mouse...   
4  Towards a Visualizable, De-identified Syntheti...   

                              Authors  \
0  ['Tim H. Murphy', 'Jeffrey LeDue']   
1   ['Tim H. Murphy', 'Lynn Raymond']   
2   ['Tim H. Murphy', 'Lynn Raymond']   
3  ['Tim H. Murphy', 'Jeffrey LeDue']   
4   ['Tim H. Murphy', 'Helge Rhodin']   

                                     Standard Titles  
0  mesopydualbraincorticalcalciumimaginginmicedur...  
1  chronicmultiscaleresolutionofmousebrainnetwork...  
2  waterreachingplatformforlongitudinalassessment...  
3  multiscaleimaginginformstranslationalmousemode...  
4  towardsavisualizabledeidentifiedsyntheticbioma...  


In [10]:
#turn into a dictionary of keys with coauthors as values

# s = [ast.literal_eval(coauthors) for coauthors in sv_df["Authors"].tolist()]
# print(s)

# s = [ast.literal_eval(coauthors) for coauthors in sv_df["Authors"].tolist()]

sv_dict = sv_df.set_index('Standard Titles').T.to_dict('list')
gs_dict = gs_df.set_index('Standard Titles').T.to_dict('list')

  gs_dict = gs_df.set_index('Standard Titles').T.to_dict('list')


In [11]:
sv_only = list(set(sv_dict.keys()) - set(gs_dict.keys()))
gs_only = list(set(gs_dict.keys()) - set(sv_dict.keys()))
shared_titles = list(set(sv_dict.keys()).intersection(gs_dict.keys()))
print("total sv publications:", len(sv_dict.keys()))
print("total gs publications:", len(gs_dict.keys()))
print()
print("sv only publications:",len(sv_only))
print("gs only publications:",len(gs_only))
print("shared publications:",len(shared_titles))

total sv publications: 374
total gs publications: 484

sv only publications: 67
gs only publications: 177
shared publications: 307


In [13]:
fill_space = lambda x: [""]*(len(shared_titles)-len(x))
out_df = pd.DataFrame(
    {
        "GS ONLY publications":gs_only 
                                + fill_space(gs_only),
        "GS ONLY title": [gs_dict[title][0] for title in gs_only] 
                                + fill_space(gs_only),
        "GS ONLY coauthors": [gs_dict[title][1] for title in gs_only] 
                                + fill_space(gs_only),
        "SV ONLY publications": sv_only 
                                + fill_space(sv_only),
        "SV ONLY title": [sv_dict[title][0] for title in sv_only] 
                                + fill_space(sv_only),
        "SV ONLY coauthors": [sv_dict[title][1] for title in sv_only] 
                                + fill_space(sv_only),
        "shared publications": shared_titles,
        "shared gs title": [gs_dict[title][0] for title in shared_titles],
        "shared sv title": [sv_dict[title][0] for title in shared_titles],
        "shared gs coauthors": [gs_dict[title][1] for title in shared_titles],
        "shared sv coauthors": [sv_dict[title][1] for title in shared_titles],
    })

In [14]:
out_df.to_csv("SVGS_comparison.csv",index=False)