# In [33]:
import itertools
import sys

import networkx as nx
import pandas as pd

# Inclusive range of publication years to analyse.
year_from = 2008
year_to = 2024

# Read one query-result CSV per year and stack them into a single DataFrame.
yearly_frames = [
    pd.read_csv(f'queryResult-{year}.csv')
    for year in range(year_from, year_to + 1)
]
df = pd.concat(yearly_frames)

# Discard every column whose name contains "unnamed" (case-insensitive).
unnamed_columns = df.columns[df.columns.str.contains('unnamed', case=False)]
df = df.drop(columns=unnamed_columns)

# Build the per-publication summary: keep a fixed set of columns, restrict to
# rows whose subtype is 'ar' (presumably the code for articles -- confirm
# against the data source), drop rows with any missing value, and order by
# citation count, most cited first.
summary_columns = [
    'doi',
    'pubmed_id',
    'title',
    'subtype',
    'affilname',
    'coverDate',
    'publicationName',
    'citedby_count',
]
df_summary = pd.DataFrame(df, columns=summary_columns)
is_article = df_summary['subtype'] == 'ar'
df_summary = (
    df_summary[is_article]
    .dropna()
    .sort_values(by='citedby_count', ascending=False)
)
df_summary.to_csv(f'contribution_{year_from}_{year_to}.csv', sep=',')

# G1: institutional cooperation network
# G2: country cooperation network
# G1 is a multigraph: the same two institutions can be connected by several
# parallel edges, one per (publication, subtype) key.
G1 = nx.MultiGraph()
G2 = nx.Graph()

for _, row in df.iterrows():
    # Nodes are unique by affilname (institution name); each node stores the
    # country it was first seen with.
    # Edges are unique by publication (journal name) and subtype
    # (e.g. Article/Letter/Editorial). The edge weight counts how many rows
    # repeated the same cooperation between the two institutions.
    publication = str(row['publicationName']).lower()
    subtype = str(row['subtypeDescription']).lower()
    edge_id = f'{publication};{subtype}'

    # The affiliation list and its country list are in the same order, so they
    # can be walked in lockstep. str() turns a missing value into the literal
    # 'nan', which then behaves like any other single name.
    affil_list = str(row['affilname']).lower().split(';')
    country_by_affil_list = str(row['affiliation_country']).lower().split(';')

    # Collapse repeated affiliations within the row, keeping the first country
    # seen for each, so one row contributes at most one count per pair.
    country_by_affil = {}
    for affil, country in zip(affil_list, country_by_affil_list):
        country_by_affil.setdefault(affil, country)

    for affil, country in country_by_affil.items():
        if not G1.has_node(affil):
            G1.add_node(affil, country=country)

    # Connect every pair of distinct institutions in the row (a complete
    # subgraph): for X, Y, Z the pairs are (X, Y), (X, Z), (Y, Z).
    # G1 is undirected, so each unordered pair must be visited exactly once;
    # visiting both (X, Y) and (Y, X) would increment the same edge twice and
    # double every weight.
    for affil_from, affil_to in itertools.combinations(country_by_affil, 2):
        if G1.has_edge(affil_from, affil_to, key=edge_id):
            G1[affil_from][affil_to][edge_id]['weight'] += 1
        else:
            G1.add_edge(affil_from, affil_to, key=edge_id, weight=1)

# Exclude edges with a low weight (the same cooperation happened fewer than
# `threshold` times), then drop the nodes left without any edge.
threshold = 5

# G1 is a MultiGraph, so an edge is only identified by (u, v, key). Passing a
# bare (u, v) pair to remove_edges_from() removes an arbitrary parallel edge
# between u and v, which may be a high-weight edge rather than the low-weight
# one that was matched here.
low_weight_edges = [
    (u, v, key)
    for u, v, key, data in G1.edges(keys=True, data=True)
    if data['weight'] < threshold
]
G1.remove_edges_from(low_weight_edges)
G1.remove_nodes_from(list(nx.isolates(G1)))

"""
for index, row in df.iterrows():
    country_list = str(row['affiliation_country']).lower().split(';')
    for country_from in country_list:
        for country_to in country_list:
            if country_from == country_to:
                continue
            if G2.has_edge(country_from, country_to):
                G2[country_from][country_to]['weight'] += 1
            else:
                G2.add_edge(country_from, country_to, weight=1)
"""

# save to .gexf
nx.write_gexf(G1, f'graph_institution_{year_from}_{year_to}.gexf')
# nx.write_gexf(G2, 'graph_country.gexf')