In [1]:

import pandas as pd
import torch
from torch_geometric.data import Data
import itertools
import json


In [2]:
network_data = pd.read_csv('/home/student/FinalProject/PaperFeedback/Datasets/acm_citation_network_v8_labeled.csv')
# print(network_data.count())

In [17]:
temp = network_data['references'].dropna()

In [20]:
temp.head(2)

42                              5390877920f70186a0d2ce74
203    5390877920f70186a0d2cdc1;5390877f20f70186a0d30...
Name: references, dtype: object

In [5]:
# print(network_data['references'][network_data['references'].notna()].count())
# print(network_data['index'].count() - network_data['references'][network_data['references'].notna()].count())

In [21]:
def build_graph_dataset(data, relation_type='citation'):
    edges = {}
    nodes = set()

    if relation_type == 'citation':
        for _, row in data.iterrows():
            paper_index = row['index']
            refs = str(row['references']).split(';')
            if refs:
                nodes.update([paper_index]+refs)
                for cited_paper in refs:
                    edges[(paper_index, cited_paper)] = 1
            else:
                continue        

    elif relation_type == 'co-citation': 
        for _, row in data.iterrows():
            paper_index = row['index']
            refs = str(row['references']).split(';')
            if refs:
                nodes.update([paper_index]+refs)
                if len(refs) > 1:
                    combs = itertools.combinations(refs, 2)
                    for paper1, paper2 in combs:
                        if (paper1, paper2) in edges.keys():
                            edges[(paper1, paper2)] += 1
                        else:
                            edges[(paper1, paper2)] = 1
            else:
                continue  

    elif relation_type == 'bibliographic_coupling':
        over_threshold = 200
        paper_references = defaultdict(set)
        reference_counts = Counter()

        # The new graph with references that have more than one citation
        nodes = set()  # Include all papers as nodes
        edges = defaultdict(int)  # Dictionary to store directed edges with incremented weights

        # Step 1: Populate `paper_references` dictionary and count references
        for _, row in data.iterrows():
            paper_index = row['index']
            nodes.add(paper_index)
            refs = str(row['references']).split(', ')
            for cited_paper in refs:
                paper_references[cited_paper].add(paper_index)
                reference_counts[cited_paper] += 1

        # Identify references that have more than one citation
        references_with_multiple_cites = {ref for ref, count in reference_counts.items() if count > 1}
        
        # Generate edges for references with more than one citation
        for cited_paper in references_with_multiple_cites:  # Iterate over references with more than one citation
            citing_papers = paper_references.get(cited_paper, set())
            
            # Only process references with fewer citing papers than the threshold
            if len(citing_papers) < over_threshold:
                # Generate directed combinations between citing papers
                for paper1, paper2 in itertools.permutations(citing_papers, 2):
                    edges[(paper1, paper2)] += 1  # Increment the weight by 1 each time

        return nodes, edges

    elif relation_type == 'author_collaboration':
        author_papers = defaultdict(set)  # Store which papers each author wrote
        edges = defaultdict(int)  # Store edges with their weights, initialized to 0
        max_paper_threshold = 80  # Define the max number of papers an author can have before collaboration is considered
        collaboration_threshold = 2  # Define a minimum threshold for how many papers are needed for collaboration
        
        # Step 1: Build the map from authors to papers they wrote
        for _, row in data.iterrows():
            paper_index = row['index']
            authors = str(row['authors']).split(', ')
            for author in authors:
                author_papers[author].add(paper_index)

        # Step 2: Process collaborations only for authors with fewer papers than the threshold
        for author, written_papers in author_papers.items():
            # Proceed only if the author has written fewer than the maximum paper threshold and at least 2 papers
            if len(written_papers) >= collaboration_threshold and len(written_papers) <= max_paper_threshold:
                nodes.update(written_papers)  # Add papers to nodes

                # Generate all combinations of papers written by the same author
                for paper1, paper2 in itertools.combinations(written_papers, 2):
                    if paper1 != paper2:  # Avoid self-loops (paper citing itself)
                        # Add the edge and increment the weight by 1
                        edges[(paper1, paper2)] += 1  # paper1 -> paper2
                        edges[(paper2, paper1)] += 1  # paper2 -> paper1

        return nodes, edges

    else:
        raise Exception('Illegal relation type')           

    
    return data['index'], edges






    
    
    # elif relation_type == 'bibliographic_coupling':
    #     paper_references = {}  # Dictionary to store which papers cite which references
        
    #     # First, build a map from references to the papers that cite them
    #     for _, row in data.iterrows():
    #         paper_index = row['index']
    #         refs = str(row['references']).split(', ')
    #         if refs:
    #             for cited_paper in refs:
    #                 if cited_paper in paper_references.keys():
    #                     paper_references[cited_paper].append([paper_index])
    #                 else:
    #                     paper_references[cited_paper] = [paper_index]
    #         else:
    #             continue        
        
    #     # Now, connect papers that cite the same references
    #     for refs, citing_papers in paper_references.items():
    #         nodes.update(citing_papers)
    #         combs = itertools.combinations(citing_papers, 2)
    #         for paper1, paper2 in combs:
    #             if (paper1, paper2) in edges.keys():
    #                 edges[(paper1, paper2)] += 1
    #             else:
    #                 edges[(paper1, paper2)] = 1




        # elif relation_type == 'author_collaboration':
        # author_papers = {}  # Dictionary to store which papers wrote each authors
        
        # # First, build a map from authors to the papers he wrote
        # for _, row in data.iterrows():
        #     paper_index = row['index']
        #     authors = str(row['authors']).split(', ')
        #     if authors:
        #         for author in authors:
        #             if author in author_papers.keys():
        #                 author_papers[author].append([paper_index])
        #             else:
        #                 author_papers[author] = [paper_index]
        #     else:
        #         continue        
        
        # # Now, connect papers that cite the same references
        # for author, written_papers in author_papers.items():
        #     nodes.update(citing_papers)
        #     combs = itertools.combinations(written_papers, 2)
        #     for paper1, paper2 in combs:
        #         if (paper1, paper2) in edges.keys():
        #             edges[(paper1, paper2)] += 1
        #         else:
        #             edges[(paper1, paper2)] = 1



In [11]:
def save_dict_to_file(dictionary, file_name):
    # Convert tuples to lists for JSON serialization
    dict_with_str_keys = {key[0]+';'+key[1]: value for key, value in dictionary.items()}
    
    # Save the converted dictionary to a file in JSON format
    with open(file_name, 'w') as file:
        json.dump(dict_with_str_keys, file, indent=4)

def save_set_to_file(my_set, file_name):
    # Convert the set to a list for serialization
    list_representation = list(my_set)
    
    # Write the list to a file in JSON format
    with open(file_name, 'w') as file:
        json.dump(list_representation, file, indent=4)

In [27]:
relation_type = 'co-citation'
nodes, edges = build_graph_dataset(data=network_data, relation_type=relation_type)


In [32]:
dict_with_str_keys = {key[0]+';'+key[1]: value for key, value in edges.items()}

In [36]:
with open(f'{relation_type}_edge_index', 'w') as file:
        json.dump(dict_with_str_keys, file, indent=4)

In [37]:
save_set_to_file(nodes ,f'{relation_type}_node_index')

In [3]:
from collections import defaultdict, Counter
import pandas as pd

def analyze_reference_data(data):
    paper_references = defaultdict(set)
    
    # Populate the `paper_references` dictionary
    for _, row in data.iterrows():
        paper_index = row['index']
        refs = str(row['references']).split(', ')
        for cited_paper in refs:
            paper_references[cited_paper].add(paper_index)
        

    # Calculate R and M metrics
    R = len(paper_references)  # Total number of distinct references
    M_values = [len(citing_papers) for citing_papers in paper_references.values()]  # List of citing counts per reference
    M_average = sum(M_values) / R if R > 0 else 0  # Average citing count per reference
    
    # Summary of M distribution
    M_distribution = Counter(M_values)
    max_citations = max(M_values) if M_values else 0
    min_citations = min(M_values) if M_values else 0

    print("Total distinct references (R):", R)
    print("Average number of papers citing each reference (Average M):", M_average)
    print("Max papers citing a single reference:", max_citations)
    print("Min papers citing a single reference:", min_citations)
    print("Distribution of citing counts (M):")
    for count, num_references in sorted(M_distribution.items()):
        print(f"  References cited by {count} papers: {num_references}")

analyze_reference_data(data=network_data)


Total distinct references (R): 884063
Average number of papers citing each reference (Average M): 2.694010494727186
Max papers citing a single reference: 1464772
Min papers citing a single reference: 1
Distribution of citing counts (M):
  References cited by 1 papers: 865905
  References cited by 2 papers: 13973
  References cited by 3 papers: 2102
  References cited by 4 papers: 714
  References cited by 5 papers: 385
  References cited by 6 papers: 254
  References cited by 7 papers: 148
  References cited by 8 papers: 102
  References cited by 9 papers: 71
  References cited by 10 papers: 52
  References cited by 11 papers: 46
  References cited by 12 papers: 35
  References cited by 13 papers: 30
  References cited by 14 papers: 30
  References cited by 15 papers: 23
  References cited by 16 papers: 23
  References cited by 17 papers: 19
  References cited by 18 papers: 12
  References cited by 19 papers: 14
  References cited by 20 papers: 17
  References cited by 21 papers: 12
  

In [17]:
relation_type = 'bibliographic_coupling'
nodes, edges = build_graph_dataset(data=network_data, relation_type=relation_type)

In [19]:
save_dict_to_file(edges, f'{relation_type}_edge_index')
save_set_to_file(nodes ,f'{relation_type}_node_index')

In [14]:
# Assuming 'data' is your dataframe
def check_reference_data(data):
    paper_references = defaultdict(set)
    
    # Step 1: Populate the `paper_references` dictionary
    for _, row in data.iterrows():
        paper_index = row['index']
        refs = str(row['references']).split(', ')
        for cited_paper in refs:
            paper_references[cited_paper].add(paper_index)

    # Step 2: Find a cited paper that is cited by exactly 3 papers
    for cited_paper, citing_papers in paper_references.items():
        if len(citing_papers) == 3:
            print(f"Cited paper: {cited_paper}")
            print(f"Papers citing it: {list(citing_papers)}")
            break  # Only get the first one found, remove if you want to find all


check_reference_data(network_data)

Cited paper: 5390880720f70186a0d789af
Papers citing it: ['53908b6c20f70186a0dbd87a', '5390972920f70186a0dfa5a1', '5390a2e920f70186a0e67686']


In [15]:
import json

# List of papers citing the target paper
citing_papers = ['53908b6c20f70186a0dbd87a', '5390972920f70186a0dfa5a1', '5390a2e920f70186a0e67686']

# Load the edges from the JSON file
with open(f'{relation_type}_edge_index', 'r') as file:
    edges = json.load(file)

# Filter edges where one of the citing papers is the left node (source)
filtered_edges = [
    (left_node, right_node, weight) 
    for edge, weight in edges.items()
    for left_node, right_node in [edge.split(';')]  # Split string keys into left and right nodes
    if left_node in citing_papers
]

# Print the filtered edges
for edge in filtered_edges:
    print(f"Left Node: {edge[0]}, Right Node: {edge[1]}, Weight: {edge[2]}")

Left Node: 53908b6c20f70186a0dbd87a, Right Node: 5390972920f70186a0dfa5a1, Weight: 1
Left Node: 53908b6c20f70186a0dbd87a, Right Node: 5390a2e920f70186a0e67686, Weight: 1
Left Node: 5390972920f70186a0dfa5a1, Right Node: 53908b6c20f70186a0dbd87a, Weight: 1
Left Node: 5390972920f70186a0dfa5a1, Right Node: 5390a2e920f70186a0e67686, Weight: 1
Left Node: 5390a2e920f70186a0e67686, Right Node: 53908b6c20f70186a0dbd87a, Weight: 1
Left Node: 5390a2e920f70186a0e67686, Right Node: 5390972920f70186a0dfa5a1, Weight: 1


In [20]:
from collections import defaultdict, Counter
import pandas as pd

def analyze_author_collaboration(data, paper_threshold=2, max_papers=10):
    author_papers = defaultdict(set)  # Store which papers each author wrote
    collaboration_counts = defaultdict(int)  # Store number of collaborations between authors

    # Step 1: Build the map from authors to papers they wrote
    for _, row in data.iterrows():
        paper_index = row['index']
        authors = str(row['authors']).split(', ')
        for author in authors:
            author_papers[author].add(paper_index)

    # Step 2: Analyze collaboration only for authors with papers within the threshold range
    for author, written_papers in author_papers.items():
        # Filter authors who have at least 'paper_threshold' papers and no more than 'max_papers'
        if len(written_papers) >= paper_threshold and len(written_papers) <= max_papers:
            # Create collaborations (pairs of co-authored papers)
            for paper1, paper2 in itertools.combinations(written_papers, 2):
                # Increment collaboration count (collaboration is bidirectional)
                collaboration_counts[(paper1, paper2)] += 1

    # Step 3: Calculate metrics
    total_collaborations = len(collaboration_counts)  # Total collaborations (edges)
    collaboration_values = list(collaboration_counts.values())  # Collaboration counts per pair
    avg_collaboration_count = sum(collaboration_values) / total_collaborations if total_collaborations > 0 else 0  # Average collaboration count
    
    # Distribution of collaborations
    collaboration_distribution = Counter(collaboration_values)
    max_collaborations = max(collaboration_values) if collaboration_values else 0
    min_collaborations = min(collaboration_values) if collaboration_values else 0

    # Print summary
    print("Total collaborations (edges):", total_collaborations)
    print("Average number of collaborations per paper pair:", avg_collaboration_count)
    print("Max collaborations between two papers:", max_collaborations)
    print("Min collaborations between two papers:", min_collaborations)
    print("Distribution of collaboration counts:")
    for count, num_collaborations in sorted(collaboration_distribution.items()):
        print(f"  Paper pairs with {count} collaborations: {num_collaborations}")

# Example of calling the function with your data
analyze_author_collaboration(data=network_data, paper_threshold=2, max_papers=10)

Total collaborations (edges): 3618456
Average number of collaborations per paper pair: 1.08448548220567
Max collaborations between two papers: 77
Min collaborations between two papers: 1
Distribution of collaboration counts:
  Paper pairs with 1 collaborations: 3376357
  Paper pairs with 2 collaborations: 197784
  Paper pairs with 3 collaborations: 32806
  Paper pairs with 4 collaborations: 7654
  Paper pairs with 5 collaborations: 2246
  Paper pairs with 6 collaborations: 799
  Paper pairs with 7 collaborations: 359
  Paper pairs with 8 collaborations: 201
  Paper pairs with 9 collaborations: 88
  Paper pairs with 10 collaborations: 53
  Paper pairs with 11 collaborations: 31
  Paper pairs with 12 collaborations: 17
  Paper pairs with 13 collaborations: 18
  Paper pairs with 14 collaborations: 15
  Paper pairs with 15 collaborations: 6
  Paper pairs with 16 collaborations: 3
  Paper pairs with 17 collaborations: 1
  Paper pairs with 18 collaborations: 3
  Paper pairs with 19 collabora

In [22]:
relation_type = 'author_collaboration'
nodes, edges = build_graph_dataset(data=network_data, relation_type=relation_type)

In [23]:
save_dict_to_file(edges, f'{relation_type}_edge_index')
save_set_to_file(nodes ,f'{relation_type}_node_index')

In [24]:
from collections import defaultdict

# Assuming 'data' is your dataframe
def check_author_collaboration(data):
    author_papers = defaultdict(set)
    
    # Step 1: Populate the `author_papers` dictionary
    for _, row in data.iterrows():
        paper_index = row['index']
        authors = str(row['authors']).split(', ')
        for author in authors:
            author_papers[author].add(paper_index)

    # Step 2: Find authors who have written exactly 3 papers
    for author, written_papers in author_papers.items():
        if len(written_papers) == 3:  # Author has written exactly 3 papers
            print(f"Author: {author}")
            print(f"Papers written by this author: {list(written_papers)}")
            break  # Stop after finding the first author with exactly 3 papers

# Example call to the function
check_author_collaboration(network_data)

Author: Keith Brian Gallagher
Papers written by this author: ['53908cde20f70186a0dcd63e', '539087a520f70186a0d4811f', '5390879920f70186a0d422ab']


In [25]:
# List of papers written by the target author
papers_written_by_author = ['53908cde20f70186a0dcd63e', '539087a520f70186a0d4811f', '5390879920f70186a0d422ab']

# Load the edges from the JSON file
with open(f'{relation_type}_edge_index', 'r') as file:
    edges = json.load(file)

# Filter edges where one of the papers written by the author is the left node (source)
filtered_edges = [
    (left_node, right_node, weight) 
    for edge, weight in edges.items()
    for left_node, right_node in [edge.split(';')]  # Split string keys into left and right nodes
    if left_node in papers_written_by_author or right_node in papers_written_by_author
]

# Print the filtered edges
for edge in filtered_edges:
    print(f"Left Node: {edge[0]}, Right Node: {edge[1]}, Weight: {edge[2]}")


Left Node: 53908cde20f70186a0dcd63e, Right Node: 539087a520f70186a0d4811f, Weight: 1
Left Node: 53908cde20f70186a0dcd63e, Right Node: 5390879920f70186a0d422ab, Weight: 1
Left Node: 539087a520f70186a0d4811f, Right Node: 5390879920f70186a0d422ab, Weight: 1
Left Node: 539087e120f70186a0d66c5e, Right Node: 539087a520f70186a0d4811f, Weight: 1
Left Node: 5390b61e20f70186a0f14cbf, Right Node: 539087a520f70186a0d4811f, Weight: 1
Left Node: 5390b61e20f70186a0f14bfe, Right Node: 539087a520f70186a0d4811f, Weight: 1
Left Node: 5390b61e20f70186a0f14c1d, Right Node: 539087a520f70186a0d4811f, Weight: 1
Left Node: 5390b5df20f70186a0f0b4a8, Right Node: 539087a520f70186a0d4811f, Weight: 1
Left Node: 539087a520f70186a0d4811f, Right Node: 5390879220f70186a0d3d486, Weight: 1
