# Library

In [14]:
import os
import pandas as pd
from itertools import combinations
import numpy as np
from scipy.stats import entropy
import networkx as nx
import collections
from collections import Counter
import matplotlib.pyplot as plt
import ast #ast.literal_eval() 
from tqdm.notebook import tqdm
tqdm.pandas()



# DATA IMPORT

In [None]:
# Load the three input tables in one pass; the order of the paths matches
# the order of the names on the left-hand side.
DATA, ref, cit = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data_main.csv',       # main data
        'data_reference.csv',  # reference
        'data_citation.csv',   # citation
    )
)


To repeat the same analysis on authors:
- 1) DATA['AUTORI'] = DATA[DATA['AUTORI']!=0].dropna().progress_apply(lambda x: ast.literal_eval(x))
- 2) Authors = DATA.explode('AUTORI')[['ego_id','AUTORI']]
- Note: you need to retrieve the journal data for each author from OpenAlex, for example:
    
    
    ''' 
    Authors['journal_id'] = ''
    for i in tqdm(AUTORI_UNICI[AUTORI_UNICI['journal_id']==''].index):
        response = requests.get(f'https://api.openalex.org/works?page=1&filter=authorships.author.id:{AUTORI_UNICI["AUTORI_ID"][i]},publication_year:-2019&sort=cited_by_count:desc&per_page=5&mailto=youremail@email.it')
        if response.status_code == 200:
            cu = response.json()
            journal_ids = []
            for x in cu.get('results', []):  # Use .get() to avoid KeyError if 'results' is missing
                primary_location = x.get('primary_location')
                if primary_location is not None:
                    source = primary_location.get('source')
                    if source is not None and 'id' in source:
                        journal_ids.append(source['id'])
            Authors.at[i, 'journal_id'] = journal_ids  # Use .at for setting values by label
        else:
            print(f"Failed to fetch data for index {i} with status code {response.status_code}") '''
            
- Note (2): once the journal_id is available, it is possible to retrieve the concepts of each journal.


# Method

In [None]:
# Balance
def calculate_balance(group):
    """Return the balance (normalised Shannon entropy) of the categories in ``group``.

    The entropy of the observed category distribution is divided by the
    maximum entropy reachable with the same number of distinct categories,
    ``ln(k)``, so the result lies in [0, 1] (up to floating-point rounding).

    Parameters
    ----------
    group : iterable of hashable
        Category labels, one per observation (e.g. the ``journal_concepts``
        values of a single ``ego_id``).

    Returns
    -------
    float
        ``0.0`` when ``group`` holds fewer than two distinct categories
        (including the empty case); otherwise entropy / ln(k).
    """
    counts = Counter(group)
    n_categories = len(counts)
    # With 0 or 1 category the maximum entropy is 0, so the ratio would be
    # 0/0 (NaN) or a ZeroDivisionError. A single category has no balance.
    if n_categories < 2:
        return 0.0
    total = sum(counts.values())
    distribution = [count / total for count in counts.values()]
    # ln(k) is the entropy of the uniform distribution over k categories.
    return entropy(distribution) / np.log(n_categories)

# Calculate Disparity
def calculate_disparity(group):
    """Return the disparity score of the distinct categories found in ``group``.

    Every unordered pair of distinct categories is looked up by label in the
    module-level ``disparity_matrix`` and the values are summed; the sum is
    then divided by ``n * (n - 1)`` where ``n`` is the number of distinct
    categories. Note that the sum runs over unordered pairs while the
    denominator counts ordered pairs, so the result equals half of the mean
    pairwise disparity.

    Returns ``0`` when fewer than two distinct categories are present.
    """
    categories = set(group)
    n_categories = len(categories)
    if n_categories < 2:
        return 0
    pairwise_total = sum(
        disparity_matrix.loc[first, second]
        for first, second in combinations(categories, 2)
    )
    return pairwise_total / (n_categories * (n_categories - 1))

#Calculate Pointwise
def avDistPmi(clist, graph=None):
    """Return the mean PMI-based distance ``1 - pmi`` over the pairs in ``clist``.

    Parameters
    ----------
    clist : iterable of (u, v) pairs
        Node pairs to look up.
    graph : mapping, optional
        Anything supporting ``graph[u][v]['pmi']``. Defaults to the
        module-level graph ``G``.

    Returns
    -------
    float
        Mean of ``1 - graph[u][v]['pmi']`` over the pairs that could be looked
        up, or ``nan`` when no pair could be (empty ``clist`` or no match).
    """
    if graph is None:
        graph = G
    distances = []
    for (u, v) in clist:
        try:
            distances.append(1 - graph[u][v]['pmi'])
        except KeyError:
            # Pair has no edge (or no 'pmi' attribute): skip it. Any other
            # error is a genuine problem and is allowed to propagate.
            continue
    if not distances:
        # np.mean([]) also yields nan but emits a RuntimeWarning; be explicit.
        return np.nan
    return np.mean(distances)

# Flattens a list of lists into a single list

def flattenList(original_list):
    """Concatenate the elements of every sub-iterable, in order, into one new list."""
    flattened = []
    for sublist in original_list:
        flattened.extend(sublist)
    return flattened



# Measures of Interdisciplinarity

_All measures calculated in the following cells refer to the cit dataframe, i.e. the citations received by each ego_id._

## Balance, Disparity, Variety

Below is an example of how to calculate balance, disparity and variety using the **citations** received by each publication at the level of journal concepts lvl0.

To be used, the dataframe must have the following structure:
 - ego_id: the publication for which Balance, Disparity and Variety are to be computed
 - citation_works (or referenced_works): the individual publication that cited the paper ego_id (or, in the case of references, that was cited by it)
 - journal_concepts: in this case the lvl 0 concept associated with the paper; it can also be replaced by individual journal ids or, at a more granular level, by lvl1 journal concepts

In [29]:
cit.head(2) # Note: ego_id is repeated once for every work that cited it (citation_works)

Unnamed: 0,ego_id,citation_works,journal_concepts
0,https://openalex.org/W3105918387,https://openalex.org/W3118344881,Medicine
0,https://openalex.org/W3105918387,https://openalex.org/W4226048267,Medicine


In [None]:
# For each ego_id get all unique pairs of journal_concepts.
# The distinct concepts are sorted before pairing so that every unordered pair
# has a single orientation: iterating a raw set could yield (A, B) for one
# ego_id and (B, A) for another, splitting the pair's count in two.
# key=str keeps the sort well-defined if the column mixes strings and NaN.
co_occurrences = cit.groupby('ego_id')['journal_concepts'].apply(
    lambda x: list(combinations(sorted(set(x), key=str), 2))
)
pairs_df = pd.DataFrame(
    [item for sublist in co_occurrences for item in sublist],
    columns=['Field1', 'Field2'],
)

# Count occurrences of each pair
pair_counts = pairs_df.groupby(['Field1', 'Field2']).size().reset_index(name='counts')

# Inverse of counts as a basic measure of disparity:
# the more often two fields co-occur, the less disparate they are.
pair_counts['disparity'] = 1 / pair_counts['counts']

# Create the Disparity Matrix (fields x fields), initialised to zero
fields = cit['journal_concepts'].unique()
disparity_matrix = pd.DataFrame(np.zeros((len(fields), len(fields))), index=fields, columns=fields)

# Populate the matrix, writing both orientations so it is symmetrical
for row in pair_counts.itertuples(index=False):
    disparity_matrix.loc[row.Field1, row.Field2] = row.disparity
    disparity_matrix.loc[row.Field2, row.Field1] = row.disparity

# The diagonal is left at its initial value of 0: a field has no disparity
# with itself. (It was previously overwritten with the maximum disparity,
# which contradicted that intent.)



# Journal concepts grouped per ego_id, shared by the three measures below
concepts_by_ego = cit.groupby('ego_id')['journal_concepts']

# Balance
balance_df = concepts_by_ego.apply(calculate_balance)
# Disparity
disparity_df = concepts_by_ego.apply(calculate_disparity)
# Variety: number of distinct concepts per ego_id
variety_df = concepts_by_ego.nunique()

# One row per ego_id with its Balance, Disparity and Variety;
# reset_index turns ego_id from the index into a regular column.
measure_citation_concept = pd.DataFrame({
    'Balance': balance_df,
    'Disparity': disparity_df,
    'Variety': variety_df,
}).reset_index(drop=False)

## Pointwise Mutual Information

In [198]:
# Convert journal concepts associated with each ego_id into a set to remove duplicates
dfList = cit.groupby('ego_id')['journal_concepts'].apply(set).reset_index()

# Generate all unique pairs (combinations) of journal concepts for each ego_id.
# The column produced above is named 'journal_concepts' (there is no 'id_journal'
# column in this frame). Concepts are sorted before pairing so that each
# unordered pair gets one orientation; key=str keeps the sort well-defined
# if the values mix strings and NaN.
dfList['combinations'] = [
    list(combinations(sorted(concepts, key=str), 2))
    for concepts in dfList['journal_concepts']
]
allEdges = flattenList(list(dfList['combinations']))

# Count the occurrence of each unique pair; the counts become the edge weights.
dEd = collections.Counter(allEdges)

# Initialize an undirected graph
G = nx.Graph()

# Add edges to the graph, where keys of dEd are the edges
G.add_edges_from(dEd.keys())

# Set the weight of each edge to the total number of co-occurrences of the pair.
# Both orientations are summed because G.edges() may report an edge as (v, u)
# while the Counter key is (u, v); a Counter returns 0 for a missing key.
for (u, v) in G.edges():
    G[u][v]['weight'] = dEd[(u, v)] + dEd[(v, u)]

# Calculate the total weight of all edges in the graph
totW = sum(G[u][v]['weight'] for (u, v) in G.edges())

# Normalize the weight of each edge by the total weight of all edges
for (u, v) in G.edges():
    G[u][v]['wNorm'] = G[u][v]['weight'] / totW

# Weighted degree of each node scaled by 0.5: every edge contributes to two
# nodes, so halving makes the node values sum to 1 (a probability per node).
pk = dict(G.degree(weight='wNorm'))
for node in pk.keys():
    pk[node] = 0.5 * pk[node]

# Calculate the PMI for each edge: log2(p(u,v) / (p(u) p(v))) normalised by
# -log2(p(u,v)), then clipped below at 0.
for (u, v) in G.edges():
    pmi = -(np.log2(G[u][v]['wNorm'] / (pk[u] * pk[v]))) / (np.log2(G[u][v]['wNorm']))
    G[u][v]['pmi'] = max(pmi, 0)  # Assign PMI value to the edge, ensuring it's non-negative

# Calculate the average PMI-based distance for the combinations associated with each ego_id.
dfList['pmi_distance_citation'] = dfList['combinations'].progress_apply(lambda x: avDistPmi(x))
