In [1]:
import pandas as pd
import networkx as nx
import random
from datetime import datetime, timedelta
import numpy as np
import time
import concurrent.futures

In [2]:
'''
Lazega Lawyers
'''
start_time = time.time()
file_path = 'lazega_lawyers_edges.txt'
# The edge file is read as a headerless three-column table; the third column
# is kept on each edge as the 'type' attribute.
edges_df = pd.read_csv(file_path, header=None, names=['source', 'target', 'type'])
G = nx.from_pandas_edgelist(edges_df, source='source', target='target', edge_attr='type')
# Load the node attributes (update the file path as needed)
node_attributes_file = 'lazega_lawyers_node_attributes.txt'
node_attr_df = pd.read_csv(node_attributes_file)
# Columns copied verbatim from the attribute table onto each graph node,
# in the order they should appear in the exported node table.
lawyer_attribute_columns = [
    'Lawyer ID',
    'Seniority',
    'Status',
    'Gender',
    'Office',
    'Years with Firm',
    'Age',
    'Practice Area',
    'Law School',
]
for _, row in node_attr_df.iterrows():
    node_id = row['Lawyer ID']
    # The graph is built from the edge list only, so a lawyer without any
    # edge is not a node yet and G.nodes[node_id] would raise KeyError.
    # add_node leaves an already-present node (and its attributes) untouched.
    G.add_node(node_id)
    for column in lawyer_attribute_columns:
        G.nodes[node_id][column] = row[column]
    
def add_missing_node_attributes(G):
    """Attach synthetic per-lawyer attributes to every node of ``G`` in place.

    Each node receives, in this order:
      * 'Name'            - ``Lawyer_<node id>``
      * 'Hours Worked'    - Gaussian draw (mean 2000, std 200) truncated to int
      * 'Fees Brought In' - Gaussian draw (mean 500000, std 100000) truncated to int
      * 'Attitudes'       - ``Attitude_<k>`` with k drawn from 1..10 inclusive

    Returns None; the graph's node attribute dicts are mutated.
    """
    for lawyer in G.nodes():
        # The dict literal is evaluated top to bottom, so the random draws
        # happen in the same order as the keys are listed.
        G.nodes[lawyer].update({
            'Name': f'Lawyer_{lawyer}',
            'Hours Worked': int(np.random.normal(2000, 200)),
            'Fees Brought In': int(np.random.normal(500000, 100000)),
            'Attitudes': f'Attitude_{random.randint(1, 10)}',
        })

def generate_edge_attributes(G):
    """Fill every edge of ``G`` with synthetic collaboration attributes, in place.

    For each edge this sets:
      * 'Type of Relationship'     - the edge's existing 'type' value, or 1 if absent
      * 'Collaboration Frequency'  - one of 'Daily', 'Weekly', 'Monthly'
      * 'Number of Cases Together' - Gaussian draw (mean 10, std 5), int, never below 0
      * 'Communication Frequency'  - one of 'Daily', 'Weekly', 'Monthly'
      * 'Shared Clients'           - Gaussian draw (mean 5, std 2), int, never below 0
      * 'Last Interaction Date'    - 1 to 365 days before the time of the call

    Args:
        G: graph whose ``edges(data=True)`` yields ``(u, v, attribute_dict)``.

    Returns:
        None; the edge attribute dicts are mutated.
    """
    frequencies = ['Daily', 'Weekly', 'Monthly']

    def _count(mean, std):
        # A Gaussian draw can land below zero, which is meaningless for a
        # count, so clamp at 0 after truncating to int.
        return max(0, int(np.random.normal(mean, std)))

    # One reference time for the whole graph so all dates share the same "now".
    now = datetime.now()
    for _, _, data in G.edges(data=True):
        data['Type of Relationship'] = data.get('type', 1)  # Default to 1 if 'type' is missing
        data['Collaboration Frequency'] = random.choice(frequencies)
        data['Number of Cases Together'] = _count(10, 5)
        data['Communication Frequency'] = random.choice(frequencies)
        data['Shared Clients'] = _count(5, 2)
        data['Last Interaction Date'] = now - timedelta(days=random.randint(1, 365))

# Enrich the graph in place: synthetic node attributes first, then edges.
add_missing_node_attributes(G)
generate_edge_attributes(G)

# One flat record per lawyer; the ID column is listed first and any
# same-named attribute stored on the node supplies its value.
node_data = [{'Lawyer ID': lawyer, **attrs} for lawyer, attrs in G.nodes.data()]
node_df = pd.DataFrame(node_data)
node_df.to_csv('lazega_lawyers_node_attributes_complete.txt', index=False)

# One flat record per edge, identified by its (from, to) endpoint pair.
edge_data = [{'Edge (From, To)': (src, dst), **attrs} for src, dst, attrs in G.edges.data()]
edge_df = pd.DataFrame(edge_data)
edge_df.to_csv('lazega_lawyers_edge_attributes.txt', index=False)

# Report wall-clock time since this cell's start_time was taken.
end_time = time.time()
time_taken = end_time - start_time
print(f"Time taken: {time_taken:.2f} seconds")

Time taken: 0.03 seconds


In [3]:
'''
Facebook
'''
start_time = time.time()
file_path = 'facebook.txt'  # Update with the actual file path
# The edge file is read as a headerless two-column table, and the graph is
# built from those same two columns.
edge_columns = ['source', 'target']
edges_df = pd.read_csv(file_path, names=edge_columns, header=None)
G = nx.from_pandas_edgelist(edges_df, *edge_columns)
def generate_node_attributes_batch(nodes, graph=None):
    """Write a synthetic user profile onto each node in ``nodes``, in place.

    Args:
        nodes: iterable of node IDs that already exist in the target graph.
        graph: graph to annotate; when omitted the module-level ``G`` is used,
            which is how the thread-pool call sites invoke this function.

    Returns:
        None; the node attribute dicts of the target graph are mutated.

    Notes:
        'Age' is a Gaussian draw (mean 30, std 10) floored at ``min_age`` so it
        can never be zero or negative, and 'Birthday' is derived from that age
        so the two attributes agree with each other.
    """
    target = G if graph is None else graph

    locations = ['New York', 'Los Angeles', 'Chicago', 'Houston', 'Phoenix']
    educations = ['High School', 'Bachelor', 'Master', 'PhD']
    workplaces = ['Google', 'Facebook', 'Amazon', 'Microsoft', 'Apple']
    relationship_statuses = ['Single', 'In a Relationship', 'Married', 'Complicated']
    interests = ['Music', 'Movies', 'Sports', 'Reading', 'Traveling']
    languages = ['English', 'Spanish', 'French', 'German', 'Chinese']  # NOTE(review): defined but not used by any attribute below

    # Lower bound for the Gaussian age draw; 13 is an assumed minimum
    # plausible account-holder age - adjust as needed.
    min_age = 13
    # One reference time per batch so every date is relative to the same "now".
    now = datetime.now()

    for node in nodes:
        age = max(min_age, int(np.random.normal(30, 10)))  # Gaussian distribution, floored
        attrs = target.nodes[node]
        attrs['User ID'] = node
        attrs['Name'] = f'user_{node}'
        attrs['Age'] = age
        attrs['Gender'] = random.choice(['Male', 'Female'])
        attrs['Location'] = random.choice(locations)
        attrs['Hometown'] = random.choice(locations)
        attrs['Education'] = random.choice(educations)
        attrs['Work'] = random.choice(workplaces)
        attrs['Relationship Status'] = random.choice(relationship_statuses)
        attrs['Interests'] = random.sample(interests, 2)
        attrs['Last Active Date'] = now - timedelta(days=random.randint(1, 365))
        attrs['Account Creation Date'] = now - timedelta(days=random.randint(365, 3650))
        attrs['Email'] = f'user{node}@example.com'
        attrs['Phone Number'] = f'555-{random.randint(1000, 9999)}'
        # `age` whole 365-day years plus a random part of the next year, so
        # (now - Birthday) in 365-day years equals 'Age'.
        attrs['Birthday'] = now - timedelta(days=365 * age + random.randint(0, 364))
        attrs['Favorite Quotes'] = f'Quote by user {node}'

def generate_edge_attributes_batch(edges, graph=None):
    """Write synthetic friendship attributes onto each edge in ``edges``, in place.

    Args:
        edges: iterable of ``(u, v)`` pairs that already exist in the target graph.
        graph: graph to annotate; when omitted the module-level ``G`` is used,
            which is how the thread-pool call sites invoke this function.

    Returns:
        None; the edge attribute dicts of the target graph are mutated.
    """
    target = G if graph is None else graph

    def _count(mean, std):
        # A Gaussian draw can land below zero, which is meaningless for a
        # count, so clamp at 0 after truncating to int.
        return max(0, int(np.random.normal(mean, std)))

    # One reference time per batch so every date is relative to the same "now".
    now = datetime.now()
    for u, v in edges:
        attrs = target.edges[u, v]
        attrs['Friendship'] = 'yes'
        attrs['Messages Exchanged'] = _count(250, 100)
        attrs['Comments Exchanged'] = _count(50, 20)
        attrs['Events Attended Together'] = _count(5, 2)
        attrs['Interaction Frequency'] = random.choice(['Daily', 'Weekly', 'Monthly'])
        attrs['Last Interaction Date'] = now - timedelta(days=random.randint(1, 365))

nodes_list = list(G.nodes())
edges_list = list(G.edges())
num_workers = 8  # Adjust this number based on your system's capabilities
# Strided list slices give num_workers near-equal batches while keeping every
# node ID and (u, v) pair as the original Python object; np.array_split would
# first coerce the lists into NumPy arrays with a single dtype.
node_batches = [nodes_list[i::num_workers] for i in range(num_workers)]
edge_batches = [edges_list[i::num_workers] for i in range(num_workers)]
# NOTE: this restarts the timer, so the reported time covers attribute
# generation and export only, not the file load above.
start_time = time.time()
with concurrent.futures.ThreadPoolExecutor(max_workers=num_workers) as executor:
    node_futures = [executor.submit(generate_node_attributes_batch, batch) for batch in node_batches]
    edge_futures = [executor.submit(generate_edge_attributes_batch, batch) for batch in edge_batches]

    # result() re-raises any exception raised inside a worker. Waiting alone
    # would leave the error stored on the future and the export below would
    # silently write incomplete attributes.
    for future in concurrent.futures.as_completed(node_futures + edge_futures):
        future.result()
# Flatten the annotated graph into one record per node / per edge and export.
node_data = [{'User ID': node, **data} for node, data in G.nodes(data=True)]
edge_data = [{'Edge (From, To)': (u, v), **data} for u, v, data in G.edges(data=True)]
node_df = pd.DataFrame(node_data)
edge_df = pd.DataFrame(edge_data)
node_df.to_csv('facebook_node_attributes.txt', index=False)
edge_df.to_csv('facebook_edge_attributes.txt', index=False)
end_time = time.time()
time_taken = end_time - start_time
print(f"Time taken: {time_taken:.2f} seconds")

Time taken: 2.18 seconds


In [4]:
'''
European Deezer
'''
start_time = time.time()
file_path = 'EuropeanDeezer.txt'  # Update with the actual file path
# The edge file is read as a headerless two-column table, and the graph is
# built from those same two columns.
edge_columns = ['source', 'target']
edges_df = pd.read_csv(file_path, names=edge_columns, header=None)
G = nx.from_pandas_edgelist(edges_df, *edge_columns)
def generate_node_attributes(G):
    """Write a synthetic streaming-service user profile onto every node of ``G``.

    Args:
        G: graph whose ``nodes()`` yields node IDs and whose ``nodes[node]``
            is the mutable attribute dict of that node.

    Returns:
        None; the node attribute dicts are mutated.

    Notes:
        * Count-like attributes are Gaussian draws truncated to int and
          clamped at 0, so they are never negative.
        * 'Last Login Date' is drawn from the window between registration and
          now (at most 365 days back), so it never precedes 'Registration Date'.
    """
    countries = ['France', 'Germany', 'Spain', 'Italy', 'UK']
    genres = ['Rock', 'Pop', 'Jazz', 'Classical', 'Hip-Hop']
    moods = ['Love', 'Workout', 'Chill', 'Sad', 'Focus', 'Party']
    subscription_types = ['Free', 'Premium', 'Duo', 'Family', 'HiFi']

    def _count(mean, std):
        # Clamp at 0: a negative Gaussian draw is meaningless for a count.
        return max(0, int(np.random.normal(mean, std)))

    # One reference time so both dates of a user are relative to the same "now".
    now = datetime.now()
    for node in G.nodes():
        registered_days_ago = random.randint(1, 3650)
        # A user cannot have logged in before registering.
        last_login_days_ago = random.randint(1, min(365, registered_days_ago))

        attrs = G.nodes[node]
        attrs['User ID'] = node
        attrs['Username'] = f'user_{node}'
        attrs['Country'] = random.choice(countries)
        attrs['Subscription Type'] = random.choice(subscription_types)
        attrs['Registration Date'] = now - timedelta(days=registered_days_ago)
        attrs['Last Login Date'] = now - timedelta(days=last_login_days_ago)
        attrs['Playcount'] = _count(5000, 2000)
        attrs['Top Artists'] = random.sample(range(1, 100), 5)  # Example artist IDs
        attrs['Top Genres'] = random.sample(genres, 3)
        attrs['Top Tracks'] = random.sample(range(1, 1000), 5)  # Example track IDs
        attrs['Daily Playlists'] = _count(3, 1)
        attrs['Editorial Playlists'] = _count(3, 1)
        attrs['Flow Preferences'] = random.sample(moods, random.randint(1, 3))
        attrs['Podcasts Listened'] = _count(25, 10)
        attrs['Radio Stations Listened'] = _count(10, 5)

def generate_edge_attributes(G):
    """Write synthetic friendship attributes onto every edge of ``G``, in place.

    Args:
        G: graph whose ``edges(data=True)`` yields ``(u, v, attribute_dict)``.

    Returns:
        None; the edge attribute dicts are mutated.

    Notes:
        Count-like attributes are Gaussian draws truncated to int and clamped
        at 0, so they are never negative.
    """
    def _count(mean, std):
        # Clamp at 0: a negative Gaussian draw is meaningless for a count.
        return max(0, int(np.random.normal(mean, std)))

    for _, _, attrs in G.edges(data=True):
        attrs['Friendship'] = 'yes'
        attrs['Interaction Frequency'] = random.choice(['Daily', 'Weekly', 'Monthly'])
        attrs['Messages Exchanged'] = _count(50, 20)
        attrs['Common Artists'] = _count(5, 2)
        attrs['Common Genres'] = _count(3, 1)
        attrs['Shared Playlists'] = _count(2, 1)
        attrs['Recommendations Sent'] = _count(10, 5)

# Enrich the graph in place: synthetic node attributes first, then edges.
generate_node_attributes(G)
generate_edge_attributes(G)

# One flat record per user; the ID column is listed first and any same-named
# attribute stored on the node supplies its value.
node_data = [{'User ID': user, **attrs} for user, attrs in G.nodes.data()]
# One flat record per edge, identified by its (from, to) endpoint pair.
edge_data = [{'Edge (From, To)': (src, dst), **attrs} for src, dst, attrs in G.edges.data()]

node_df = pd.DataFrame(node_data)
edge_df = pd.DataFrame(edge_data)
node_df.to_csv('EuropeanDeezer_node_attributes.txt', index=False)
edge_df.to_csv('EuropeanDeezer_edge_attributes.txt', index=False)

# Report wall-clock time since this cell's start_time was taken.
end_time = time.time()
time_taken = end_time - start_time
print(f"Time taken: {time_taken:.2f} seconds")

Time taken: 3.58 seconds


In [5]:
'''
Twitter
'''
start_time = time.time()

# --- Node table: append randomly drawn profile columns ---
node_df = pd.read_csv('twitter_attribute.txt', encoding='ISO-8859-1')
visibility_options = [True, False]
followers_range = range(1, 10000)  # Example range for followers count
gender_options = ['Male', 'Female']
age_range = range(20, 60)
num_nodes = len(node_df)
# Columns are filled one after another, one independent draw per row.
for column, choices in (
    ('Visibility', visibility_options),
    ('Followers', followers_range),
    ('Gender', gender_options),
    ('Age', age_range),
):
    node_df[column] = [random.choice(choices) for _ in range(num_nodes)]
# Keep only the columns whose name does not start with 'Unnamed'.
is_unnamed = node_df.columns.str.contains('^Unnamed')
node_df = node_df.loc[:, ~is_unnamed]
node_df.to_csv('twitter_addAttrNode.txt', index=False)

# --- Edge table: append randomly drawn relationship columns ---
edge_df = pd.read_csv('twitter_edgelist.txt', encoding='ISO-8859-1', names=['Source', 'Target'])
status_options = ['Family', 'Friend', 'Blocked', 'Muted']
disclose_options = ['Public', 'Secret']
num_edges = len(edge_df)
for column, choices in (
    ('Status', status_options),
    ('Disclose', disclose_options),
):
    edge_df[column] = [random.choice(choices) for _ in range(num_edges)]
edge_df.to_csv('twitter_addAttrEdge.txt', index=False)

# Report wall-clock time for the whole cell.
end_time = time.time()
time_taken = end_time - start_time
print(f"Time taken: {time_taken:.2f} seconds")

Time taken: 0.19 seconds


In [6]:
'''
Lastfm
'''
start_time = time.time()
file_path = 'lastfm_asia_edges.txt'
# The edge file is read as a headerless two-column table, and the graph is
# built from those same two columns.
edge_columns = ['source', 'target']
edges_df = pd.read_csv(file_path, names=edge_columns, header=None)
G = nx.from_pandas_edgelist(edges_df, *edge_columns)

def generate_node_attributes(G):
    """Write a synthetic music-service user profile onto every node of ``G``.

    Args:
        G: graph whose ``nodes()`` yields node IDs and whose ``nodes[node]``
            is the mutable attribute dict of that node.

    Returns:
        None; the node attribute dicts are mutated.

    Notes:
        Count-like attributes ('Tracks Played', 'Private Messages Sent',
        'Shoutbox Messages') are Gaussian draws truncated to int and clamped
        at 0, so they are never negative.
    """
    def _count(mean, std):
        # Clamp at 0: a negative Gaussian draw is meaningless for a count.
        return max(0, int(np.random.normal(mean, std)))

    # One reference time so every registration date is relative to the same "now".
    now = datetime.now()
    for node in G.nodes():
        attrs = G.nodes[node]
        attrs['User ID'] = node
        attrs['Username'] = f'user_{node}'
        attrs['Avatar'] = f'https://example.com/avatar/{node}.png'
        attrs['Registration Date'] = now - timedelta(days=random.randint(1, 3650))
        attrs['Tracks Played'] = _count(5000, 2000)
        attrs['Top Artists'] = random.sample(range(1, 100), 5)  # Example artist IDs
        attrs['Top Tracks'] = random.sample(range(1, 1000), 5)  # Example track IDs
        attrs['Recently Played Tracks'] = random.sample(range(1, 1000), 10)  # Example track IDs
        attrs['Taste-o-Meter Score'] = np.random.uniform(1, 10)  # Uniform distribution
        attrs['Pro User'] = random.choice(['yes', 'no'])
        attrs['Private Messages Sent'] = _count(50, 20)
        attrs['Shoutbox Messages'] = _count(25, 10)

def generate_edge_attributes(G):
    """Write synthetic friendship attributes onto every edge of ``G``, in place.

    Args:
        G: graph whose ``edges(data=True)`` yields ``(u, v, attribute_dict)``.

    Returns:
        None; the edge attribute dicts are mutated.

    Notes:
        Count-like attributes are Gaussian draws truncated to int and clamped
        at 0, so they are never negative.
    """
    def _count(mean, std):
        # Clamp at 0: a negative Gaussian draw is meaningless for a count.
        return max(0, int(np.random.normal(mean, std)))

    for _, _, attrs in G.edges(data=True):
        attrs['Friendship'] = 'yes'
        attrs['Interaction Frequency'] = random.choice(['Daily', 'Weekly', 'Monthly'])
        attrs['Messages Exchanged'] = _count(50, 20)
        attrs['Common Artists'] = _count(5, 2)
        attrs['Common Tracks'] = _count(5, 2)
        attrs['Shared Playlists'] = _count(2, 1)
        attrs['Recommendations Sent'] = _count(10, 5)

# Enrich the graph in place: synthetic node attributes first, then edges.
generate_node_attributes(G)
generate_edge_attributes(G)

# One flat record per user; the ID column is listed first and any same-named
# attribute stored on the node supplies its value.
node_data = [{'User ID': user, **attrs} for user, attrs in G.nodes.data()]
# One flat record per edge, identified by its (from, to) endpoint pair.
edge_data = [{'Edge (From, To)': (src, dst), **attrs} for src, dst, attrs in G.edges.data()]

node_df = pd.DataFrame(node_data)
edge_df = pd.DataFrame(edge_data)
node_df.to_csv('lastfm_asia_node_attributes.txt', index=False)
edge_df.to_csv('lastfm_asia_edge_attributes.txt', index=False)

# Report wall-clock time since this cell's start_time was taken.
end_time = time.time()
time_taken = end_time - start_time
print(f"Time taken: {time_taken:.2f} seconds")

Time taken: 1.14 seconds
