In [1]:
# Import libraries
import pickle
import networkx as nx
import preprocessing as ps
import matplotlib.pyplot as plt
import random
from collections import Counter

In [2]:
# Location of the raw DBLP text dump
path = "./data/dblp_data.txt"
# Parse the raw file and save the parsed records as a pickle file; the value
# returned by the helper is kept as `dataset` for the cells below
dataset = ps.parse_and_save_paper_data(path, "./data/dblp_data.pkl")

In [3]:
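# Alternative to re-parsing the raw file: load the previously saved dataset from the pickle file.
# Only unpickle files you created yourself; unpickling untrusted data can execute arbitrary code.
# dataset = pickle.load(open("./data/dblp_data.pkl", "rb"))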

In [4]:
# Display the first parsed record to check which fields a paper entry carries
dataset[0]

{'title': 'OQL[C++]: Extending C++ with an Object Query Capability.',
 'authors': ['José A. Blakeley'],
 'year': 1995,
 'venue': 'Modern Database Systems'}

In [5]:
# Keep only complete records: no field may be an empty string, and every
# required field must be present.
required_fields = ['title', 'authors', 'year', 'venue']
dataset = [
    record
    for record in dataset
    if all(value != '' for value in record.values())
    and all(field in record for field in required_fields)
]

In [6]:
# Restrict the dataset to papers published at one of the conferences below.
venues = [
    'VLDB', 'ICDE', 'ICDT', 'EDBT', 'PODS', 'SIGMOD Conference',
    'ICML', 'ECML', 'COLT', 'UAI',
    'SODA', 'STOC', 'FOCS', 'STACS',
    'KDD', 'ICDM', 'PKDD', 'WWW', 'SDM',
]
dataset = ps.sample_by_venue(dataset, venues)

# Number of papers left after the venue filter
len(dataset)

29051

In [7]:
# Collect the set of author names appearing in the remaining papers.
# NOTE(review): only element 0 of `data['authors']` is read and then split on
# commas, so this assumes the parser stores every name of a paper as one
# comma-separated string in the first element -- TODO confirm against
# ps.parse_and_save_paper_data; any further list elements are ignored here.
authors = set()
for data in dataset:
    if 'authors' in data:
        authors.update(data['authors'][0].split(','))

unique_authors = list(authors)

# Replace the per-paper records with per-author records built by the helper.
dataset = ps.get_author_data(unique_authors, dataset)

# Drop author records that contain an empty string or a None value.
# The previous test `value != '' or None` parsed as `(value != '') or None`,
# which never rejected None; both cases are now checked explicitly.
dataset = [
    data
    for data in dataset
    if all(value is not None and value != '' for value in data.values())
]

# Number of author records kept
len(dataset)

10113

In [8]:
# Display one per-author record to check the fields produced by ps.get_author_data
dataset[40]

{'id': 103,
 'author': 'Angus Macintyre',
 'coauthors': ['Marek Karpinski', 'Eduardo D. Sontag'],
 'venues': ['STOC'],
 'papers': {'Finiteness results for sigmoidal "neural" networks.',
  'Polynomial bounds for VC dimension of sigmoidal neural networks.'},
 'num_of_papers': 2,
 'venue_papers': {'STOC': 2},
 'venue_dates': {'STOC': 1993}}

In [9]:
# Create an undirected, weighted network of authors. Two authors are connected when
#   * one lists the other two or more times among its coauthors, or
#   * both have the same year recorded for the same venue.
# An existing edge is never overwritten, and self-loops are removed at the end.


def _edge_weight(shared, total):
    """Return shared / total as a percentage rounded to two decimals.

    Rounding happens after scaling to a percentage, so the result does not
    pick up floating-point noise such as 7.000000000000001.
    """
    return round(100 * shared / total, 2)


# Initialize an empty graph
G = nx.Graph()

# Precompute the number of papers for each author
num_papers = {author: ps.get_num_papers(dataset, author) for author in unique_authors}

# Index, for every (venue, year) pair, how many records each author has there.
# A record counts for a pair when the venue is listed in its 'venues' and its
# 'venue_dates' maps that venue to that year. Looking the counts up here replaces
# a full scan of the dataset for every new edge.
records_at = {}
for record in dataset:
    record_dates = record['venue_dates']
    for venue in set(record['venues']):
        if venue in record_dates:
            key = (venue, record_dates[venue])
            records_at.setdefault(key, Counter())[record['author']] += 1

# Iterate over the dataset
for data in dataset:
    author = data['author']
    num_of_papers = data['num_of_papers']
    venue_dates = data['venue_dates']

    # Coauthor edges: only link two authors that appear together two or more times
    for coauthor, count in Counter(data['coauthors']).items():
        if count > 1 and not G.has_edge(author, coauthor):
            total = num_of_papers + num_papers[coauthor]
            G.add_edge(author, coauthor, weight=_edge_weight(count, total))

    # Venue edges: link authors that share a (venue, year) pair
    for data2 in dataset:
        author2 = data2['author']
        # Neither check depends on the venue, so test them once per pair
        if author2 == author or G.has_edge(author, author2):
            continue

        venue_dates2 = data2['venue_dates']
        for venue, year in venue_dates.items():
            if venue in venue_dates2 and venue_dates2[venue] == year:
                per_author = records_at.get((venue, year), {})
                common_publications = per_author.get(author, 0) + per_author.get(author2, 0)
                total = num_papers[author] + num_papers[author2]
                G.add_edge(author, author2, weight=_edge_weight(common_publications, total))
                # The edge now exists, so later shared venues cannot change it
                break

# Remove self-loops
G.remove_edges_from(nx.selfloop_edges(G))

In [10]:
# Attach a team label to every node that has an author record, then drop the
# nodes that ended up without one.

# Author name -> author record, for constant-time lookup
database_dict = {}
for record in dataset:
    database_dict[record['author']] = record

for node in G.nodes():
    record = database_dict.get(node)
    if record is None:
        # No author record for this node: leave it unlabeled
        continue
    G.nodes[node]['label'] = ps.assign_labels(record)

# Remove nodes without a label attribute from the graph
nodes_without_label = [node for node, attrs in G.nodes(data=True) if 'label' not in attrs]
G.remove_nodes_from(nodes_without_label)

In [11]:
# Reduce the network to the largest connected component of each label group.

# Group the nodes by label in a single pass. Labels are handled in first-seen
# order, which is reproducible, unlike iterating over a set of labels.
nodes_by_label = {}
for node, label in nx.get_node_attributes(G, 'label').items():
    nodes_by_label.setdefault(label, []).append(node)

# Largest connected component (a set of nodes) of each label's induced subgraph
largest_components = {
    label: max(nx.connected_components(G.subgraph(label_nodes)), key=len)
    for label, label_nodes in nodes_by_label.items()
}

# One single-entry {label: component} dict per label
lcc = [{label: component} for label, component in largest_components.items()]

# Flat list of every node that belongs to one of the kept components
result = [node for component in largest_components.values() for node in component]

# `subgraph` returns a read-only view onto the full graph; copy it so that G is an
# independent, editable graph holding only the kept nodes and their edges.
G = G.subgraph(result).copy()

In [12]:
# Persist the network G to disk as a pickle file
network_path = './networks/network.pkl'
with open(network_path, mode='wb') as network_file:
    pickle.dump(G, network_file)

In [15]:
# Number of edges in the network G
G.number_of_edges()

1503252

In [16]:
# Number of nodes in the network G
G.number_of_nodes()

10108