In [1]:
# Import libraries
import pickle
import pprint as pp
import random
from collections import Counter
from pathlib import Path

import matplotlib.pyplot as plt
import networkx as nx
from sklearn.preprocessing import MinMaxScaler

import preprocessing as ps

In [2]:
# Location of the raw DBLP text dump
path = "./data/dblp_data.txt"
# Parse the raw dump and save the result as a pickle file (it is read back in the next cell)
ps.parse_and_save_paper_data(path, "./data/dblp_data.pkl")

In [3]:
# Load the pickled dataset. The context manager guarantees the file handle is
# closed, even if unpickling fails part-way through.
# NOTE: unpickling can execute arbitrary code; only load pickle files you produced yourself.
with open("./data/dblp_data.pkl", "rb") as pkl_file:
    dataset = pickle.load(pkl_file)
len(dataset)

1632444

In [4]:
# Peek at the first parsed record to see which fields it carries.
# NOTE(review): the recorded output shows 'JosÃ©', which looks like UTF-8 text decoded
# with a single-byte encoding somewhere upstream — confirm the encoding used when parsing.
dataset[0]

{'title': 'OQL[C++]: Extending C++ with an Object Query Capability.',
 'authors': ['JosÃ© A. Blakeley'],
 'year': 1995,
 'venue': 'Modern Database Systems'}

In [5]:
# Keep only complete records: no empty-string values, and all four expected fields present
required_fields = ('title', 'authors', 'year', 'venue')
dataset = [
    record
    for record in dataset
    if not any(value == '' for value in record.values())
    and all(field in record for field in required_fields)
]
len(dataset)

1630752

In [6]:
# Summarise the publication years; a non-positive year is counted as "no year"
min_year = sorted(record['year'] for record in dataset)  # every year value, ascending
max_year = max(min_year)

# Years that are actually set (strictly positive), still in ascending order
years = [year for year in min_year if year > 0]

print("Latest Year:", max_year)
print("First Year:", min(years))
print("Num without a year:", len(min_year) - len(years))

Latest Year: 2011
First Year: 1936
Num without a year: 162810


In [7]:
# Sample by venue: only papers published at one of these conferences are kept
venues = [
    'VLDB', 'ICDE', 'ICDT', 'EDBT', 'PODS', 'SIGMOD Conference',
    'ICML', 'ECML', 'COLT', 'UAI',
    'SODA', 'STOC', 'FOCS', 'STACS',
    'KDD', 'ICDM', 'PKDD', 'WWW', 'SDM',
]
# Venue filter first, then the date filter with both bounds set to 2010
# (presumably a single-year window — confirm in ps.sample_date)
dataset = ps.sample_date(ps.sample_by_venue(dataset, venues), 2010, 2010)
len(dataset)

1371

In [8]:
# Collect the unique author names across the sampled papers.
# NOTE(review): only the first element of each 'authors' list is read and split on ','
# — presumably the parser stores all names as one comma-separated string; confirm in ps.
authors = set()
for data in dataset:
    # Skip records whose 'authors' entry is missing or empty instead of raising IndexError
    if data.get('authors'):
        authors.update(data['authors'][0].split(','))

# Sort so the author order (and anything derived from it downstream) is the same on
# every kernel restart; plain set iteration order is not stable for strings.
unique_authors = sorted(authors)

# Extract the author data (from here on `dataset` holds per-author records, not papers)
dataset = ps.get_author_data(unique_authors, dataset)
# Drop author records that contain an empty-string or None value.
# The previous test `value != '' or None` parsed as `(value != '') or None`,
# so None values were never filtered out.
dataset = [
    data for data in dataset
    if all(value != '' and value is not None for value in data.values())
]
len(dataset)

219

In [9]:
# Pretty-print a single author record as a sanity check of its fields
# (`pp` is imported with the other libraries in the first cell)
pp.pprint(dataset[139])

{'author': 'Alin Deutsch',
 'coauthors': ['Yannis Katsis',
               'Vasilis Vassalos',
               'Yannis Papakonstantinou',
               'Richard Hull',
               'Avinash Vyas',
               'Kevin Keliang Zhao',
               'Emiran Curtmola',
               'Divesh Srivastava',
               'K. K. Ramakrishnan'],
 'id': 2208,
 'num_of_papers': 3,
 'papers': {'Inconsistency resolution in online databases.',
            'Load-balanced query dissemination in privacy-aware online '
            'communities.',
            'Policy-aware sender anonymity in location based services.'},
 'venue_dates': {'ICDE': 2010, 'SIGMOD Conference': 2010},
 'venue_papers': {'ICDE': 2, 'SIGMOD Conference': 1},
 'venues': ['SIGMOD Conference', 'ICDE']}


In [10]:
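# NOTE: everything in this cell is commented out and nothing here executes. It is an
# earlier draft of the graph construction in the following cell, which uses a
# distance-style weight (1 - ratio) instead of the ratio itself.
# # Create a network of authors, add a weighted edge if they have authored the same paper or have published at the same conference.
# # Remove all self loops from the network.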

# # Initialize an empty graph
# G = nx.Graph()

# # Precompute the number of papers for each author
# num_papers = {author: ps.get_num_papers(dataset, author) for author in unique_authors}

# # Iterate over the dataset
# for data in dataset:
#     author = data['author']
#     coauthors = data['coauthors']
#     num_of_papers = data['num_of_papers']
#     venue_dates = data['venue_dates']

#     # Count the occurrences of each coauthor
#     coauthor_counts = Counter(coauthors)

#     # Add edges between the author and coauthors
#     for coauthor, count in coauthor_counts.items():
#         # Only put an edge between two nodes if they have two or more papers together
#         if count > 1:
#             # Calculate the weight of the edge
#             weight = round(count / (num_of_papers + num_papers[coauthor]), 4) * 100
#             if not G.has_edge(author, coauthor):
#                 G.add_edge(author, coauthor, weight=weight)

#     # # Add edges based on venue and year
#     # for data2 in dataset:
#     #     author2 = data2['author']
#     #     venue_dates2 = data2['venue_dates']

#     #     # Check if they published at the same venue in the same year
#     #     for venue, year in venue_dates.items():
#     #         if venue in venue_dates2 and venue_dates2[venue] == year:
#     #             if author != author2:
#     #                 # Calculate the weight of the edge
#     #                 common_publications = sum(1 for data in dataset if data['author'] in {author, author2} and venue in data['venues'] and data['venue_dates'][venue] == year)
#     #                 weight = round(common_publications / (num_papers[author] + num_papers[author2]), 4) * 50
                    
#     #                 if G.has_edge(author, author2):
#     #                     # If the edge exists, add the weight to the existing weight
#     #                     G[author][author2]['weight'] += weight
#     #                 else:
#     #                     # If the edge doesn't exist, create a new edge with the weight
#     #                     G.add_edge(author, author2, weight=weight)

# # Remove self-loops
# G.remove_edges_from(nx.selfloop_edges(G))

In [11]:
# Build the author network in two passes:
#   1. co-authorship edges, weighted by a dissimilarity score (lower = closer);
#   2. same-venue edges, all sharing one fixed weight derived from pass 1.
# Self-loops are removed at the end.

# Initialize an empty graph
G = nx.Graph()

# Precompute the number of papers for each author
num_papers = {author: ps.get_num_papers(dataset, author) for author in unique_authors}

# Pass 1: co-authorship edges
for data in dataset:
    author = data['author']
    num_of_papers = data['num_of_papers']

    # Count the occurrences of each coauthor in this author's coauthor list
    coauthor_counts = Counter(data['coauthors'])

    for coauthor, count in coauthor_counts.items():
        # Only connect a pair whose coauthor count exceeds one, and never overwrite
        # an edge already added from the other endpoint.
        if count > 1 and not G.has_edge(author, coauthor):
            # Dissimilarity on a 0-100 scale: 1 - shared / (papers_a + papers_b).
            # NOTE(review): this was labelled "Jaccard distance", but the denominator is
            # the sum of both paper counts rather than the size of their union
            # (papers_a + papers_b - shared) — confirm which one is intended.
            weight = round(1 - (count / (num_of_papers + num_papers[coauthor])), 4) * 100
            G.add_edge(author, coauthor, weight=weight)

# The venue edge weight is derived from the smallest co-authorship weight, so fail
# with a clear message when pass 1 produced no edges (bare min() would raise an
# opaque "empty sequence" ValueError).
coauthor_weights = [attrs['weight'] for _, _, attrs in G.edges(data=True)]
if not coauthor_weights:
    raise ValueError("No co-authorship edges were created; cannot derive the venue edge weight")
min_weight = min(coauthor_weights)

# Every venue edge gets the same weight: one tenth of the smallest co-authorship weight
venue_weight = round(min_weight / 10, 4)

# Map each venue to the set of authors who published there
venue_to_authors = {}
for data in dataset:
    for venue in data['venues']:
        venue_to_authors.setdefault(venue, set()).add(data['author'])

# Pass 2: connect every not-yet-connected pair of authors sharing a venue.
# Members are sorted so node/edge insertion order does not depend on set ordering.
for venue_authors in venue_to_authors.values():
    members = sorted(venue_authors)
    for i, author1 in enumerate(members):
        for author2 in members[i + 1:]:
            if not G.has_edge(author1, author2):
                G.add_edge(author1, author2, weight=venue_weight)

# Remove self-loops
G.remove_edges_from(nx.selfloop_edges(G))

In [12]:
# Smallest edge weight in the finished graph
edge_weights = (attrs['weight'] for _, _, attrs in G.edges(data=True))
min_weight = min(edge_weights)
print(min_weight)

33.33


In [13]:
# Tag every author node with a team label, then drop the nodes that got none.
G = G.copy()

# Index the author records by name for fast lookup
database_dict = {}
for item in dataset:
    database_dict[item['author']] = item

# Label each node that has a matching author record
for node in G.nodes():
    if node in database_dict:
        G.nodes[node]['label'] = ps.assign_labels(database_dict[node])

# Remove the nodes that ended up without a 'label' attribute
nodes_without_label = [node for node, attrs in G.nodes(data=True) if 'label' not in attrs]
G.remove_nodes_from(nodes_without_label)

In [14]:
# Number of nodes left in the labelled graph
len(G)

218

In [15]:
# G.nodes['Xiaoyong Du']

In [16]:
# Keep only the largest connected component of the graph
largest_component = max(nx.connected_components(G), key=len)
# G.subgraph() alone returns a read-only view that stays tied to G; .copy() turns it
# into an independent graph, which is what should be inspected and pickled afterwards.
subgraph = G.subgraph(largest_component).copy()

In [17]:
# Number of nodes in the largest connected component
len(subgraph)

218

In [18]:
# Tally how many nodes of the component carry each label
node_labels = nx.get_node_attributes(subgraph, 'label')
label_counts = dict(Counter(node_labels.values()))
label_counts

{'DB': 115, 'AI': 21, 'DM': 49, 'T': 33}

In [20]:
# Save the largest-component subgraph to a pickle file
network_path = Path('./networks/min_network.pkl')
# Create the output folder on a fresh checkout so the write does not fail with FileNotFoundError
network_path.parent.mkdir(parents=True, exist_ok=True)
with network_path.open('wb') as file:
    pickle.dump(subgraph, file)