In [1]:
import pandas as pd
import numpy as np
import os

# Resolve the project root. The original author's folder stays the default so
# existing runs behave exactly as before; set MACS_30700_PROJECT_DIR to run the
# notebook on another machine without editing this cell.
PROJECT_DIR = os.environ.get(
    "MACS_30700_PROJECT_DIR",
    '/Users/lijiayu/Desktop/MACS_30700_Final/',
)
os.chdir(PROJECT_DIR)

# load eurobarometer dataset (path is relative to PROJECT_DIR)
eu_data = pd.read_csv("dataset/EU/eu.csv")

# extract the relative frequency of each category for every
# sociodemographic attribute (index = category label, value = share)
gender_dist = eu_data["gender"].value_counts(normalize=True)
age_dist = eu_data["age_group"].value_counts(normalize=True)
education_dist = eu_data["education_level"].value_counts(normalize=True)
marital_dist = eu_data["marital_status"].value_counts(normalize=True)
children_dist = eu_data["children_status"].value_counts(normalize=True)

# validate probability distributions: each one must sum to 1 before it can be
# passed as the `p` argument of np.random.choice in a later cell
distributions_to_check = {
    "Gender": gender_dist,
    "Age": age_dist,
    "Education level": education_dist,
    "Marital status": marital_dist,
    "Children status": children_dist,
}
for label, dist in distributions_to_check.items():
    assert np.isclose(dist.sum(), 1), f"{label} distribution does not sum to 1!"

print("Eurobarometer distributions validated.")


Eurobarometer distributions validated.


In [2]:
# read the edge list; the "source" and "target" columns hold the two endpoints
facebook_edges = pd.read_csv("dataset/facebook/facebook_preprocessed.csv")

# every user that appears on either end of an edge, de-duplicated
facebook_nodes = set(facebook_edges["source"]) | set(facebook_edges["target"])

# one row per user, ordered by id so later processing is deterministic
sorted_user_ids = sorted(facebook_nodes)
facebook_nodes_df = pd.DataFrame({"user_id": sorted_user_ids})

# report how many distinct users were found
print(f"Total unique nodes extracted from Facebook SNAP: {len(facebook_nodes)}")


Total unique nodes extracted from Facebook SNAP: 4039


In [3]:
# seed NumPy's global RNG so the sampled attributes are reproducible
np.random.seed(42)

# Draw one category per node for each attribute, weighted by its Eurobarometer
# share. The order of this list matters: every draw advances the same global
# RNG, so reordering it would change the values each column receives.
n_nodes = len(facebook_nodes_df)
attribute_distributions = [
    ("gender", gender_dist),
    ("age_group", age_dist),
    ("education_level", education_dist),
    ("marital_status", marital_dist),
    ("children_status", children_dist),
]
for column, dist in attribute_distributions:
    facebook_nodes_df[column] = np.random.choice(dist.index, size=n_nodes, p=dist.values)

# preview the first few assigned rows
print(facebook_nodes_df.head())


   user_id  gender age_group education_level marital_status children_status
0        0  Female     26-44          Middle         Single              No
1        1    Male       65+            High         Single              No
2        2    Male     45-64          Middle         Single              No
3        3    Male     26-44          Middle        Married              No
4        4  Female     26-44          Middle        Married              No


In [4]:
# Post-hoc consistency rules: attributes were sampled independently, so some
# age / education / marital combinations need to be overridden.
is_15_to_18 = facebook_nodes_df["age_group"].eq("15-18")
is_19_to_25 = facebook_nodes_df["age_group"].eq("19-25")

# education: 15-18 year olds are always "Low" ...
facebook_nodes_df.loc[is_15_to_18, "education_level"] = "Low"

# ... and 19-25 year olds sampled as "High" are capped at "Middle"
has_high_education = facebook_nodes_df["education_level"].eq("High")
facebook_nodes_df.loc[is_19_to_25 & has_high_education, "education_level"] = "Middle"

# marital status: 15-18 year olds are always "Single"
facebook_nodes_df.loc[is_15_to_18, "marital_status"] = "Single"

In [6]:
# Attach node attributes to both endpoints of every edge.
# The first merge adds the source user's attributes under their plain names;
# the second merge adds the target user's attributes, and because the names now
# collide, pandas suffixes the existing ones with "_source" and the new ones
# with "_target".
facebook_edges_enriched = (
    facebook_edges
    .merge(facebook_nodes_df, how="left", left_on="source", right_on="user_id")
    .merge(
        facebook_nodes_df,
        how="left",
        left_on="target",
        right_on="user_id",
        suffixes=("_source", "_target"),
    )
)

# list the resulting columns to confirm the suffixing worked
print("Columns in facebook_edges_enriched after merging:")
print(facebook_edges_enriched.columns)

Columns in facebook_edges_enriched after merging:
Index(['source', 'target', 'user_id_source', 'gender_source',
       'age_group_source', 'education_level_source', 'marital_status_source',
       'children_status_source', 'user_id_target', 'gender_target',
       'age_group_target', 'education_level_target', 'marital_status_target',
       'children_status_target'],
      dtype='object')


In [7]:
# Drop the join-key copies left behind by the merges; they duplicate the
# "source" and "target" columns. Rebinding the name (instead of inplace=True)
# together with errors="ignore" keeps this cell idempotent: re-running it after
# the columns are already gone is a no-op rather than a KeyError.
facebook_edges_enriched = facebook_edges_enriched.drop(
    columns=["user_id_source", "user_id_target"],
    errors="ignore",
)

# validate merged dataset
display(facebook_edges_enriched.head())

Unnamed: 0,source,target,gender_source,age_group_source,education_level_source,marital_status_source,children_status_source,gender_target,age_group_target,education_level_target,marital_status_target,children_status_target
0,0,1,Female,26-44,Middle,Single,No,Male,65+,High,Single,No
1,0,2,Female,26-44,Middle,Single,No,Male,45-64,Middle,Single,No
2,0,3,Female,26-44,Middle,Single,No,Male,26-44,Middle,Married,No
3,0,4,Female,26-44,Middle,Single,No,Female,26-44,Middle,Married,No
4,0,5,Female,26-44,Middle,Single,No,Female,26-44,Middle,Divorced or Widowed,No


In [8]:
# Check that the edge structure is intact: the left merges must neither drop
# nor duplicate edges, so the enriched table must have exactly one row per
# row of the original edge list. Fail loudly instead of relying on the reader
# to compare the printed number by eye.
assert len(facebook_edges_enriched) == len(facebook_edges), (
    f"Edge count changed during merge: {len(facebook_edges)} original edges, "
    f"{len(facebook_edges_enriched)} after enrichment"
)

print(f"Total edges in facebook_edges_enriched: {facebook_edges_enriched.shape[0]}")
print("Sample of edges with assigned attributes:")
display(facebook_edges_enriched.head())

Total edges in facebook_edges_enriched: 88234
Sample of edges with assigned attributes:


Unnamed: 0,source,target,gender_source,age_group_source,education_level_source,marital_status_source,children_status_source,gender_target,age_group_target,education_level_target,marital_status_target,children_status_target
0,0,1,Female,26-44,Middle,Single,No,Male,65+,High,Single,No
1,0,2,Female,26-44,Middle,Single,No,Male,45-64,Middle,Single,No
2,0,3,Female,26-44,Middle,Single,No,Male,26-44,Middle,Married,No
3,0,4,Female,26-44,Middle,Single,No,Female,26-44,Middle,Married,No
4,0,5,Female,26-44,Middle,Single,No,Female,26-44,Middle,Divorced or Widowed,No


In [10]:
# save final dataset containing both edges and assigned attributes
# (the path is relative to the current working directory)
# create the output folder first so the export also works on a fresh checkout
# where "model/" does not exist yet; exist_ok makes re-runs harmless
os.makedirs("model", exist_ok=True)
facebook_edges_enriched.to_csv("model/model_1.csv", index=False)

print(" Export completed: model/model_1.csv")

 Export completed: model/model_1.csv
