In [1]:
import json
import math
from collections import Counter, defaultdict
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Plot styling. seaborn's theme call rewrites matplotlib rcParams, so the
# figure-size / title-size overrides are applied after it.
sns.set_theme(style='whitegrid')
plt.rcParams['figure.figsize'] = (9, 5)
plt.rcParams['axes.titlesize'] = 14

# Widen the pandas display limits used when previewing frames
# (up to 50 columns, up to 160 characters per cell).
pd.set_option('display.max_columns', 50)
pd.set_option('display.max_colwidth', 160)

# Dataset root, given relative to the current working directory. Stop here
# with a clear message if it is missing instead of failing in a later cell.
DATA = Path('TwiBot-22')
if not DATA.exists():
    raise FileNotFoundError(f"Expected TwiBot-22 under {Path.cwd() / 'TwiBot-22'}")

print('Data directory:', DATA.resolve())


Data directory: /sciclone/home/hwhyman/Graph_learning/TwiBot-22


# TwiBot-22 at a glance

This notebook explores the user, graph, and tweet data provided in TwiBot-22. The goal is to surface label balance, structural properties, and text characteristics to inform downstream graph-learning choices.


In [None]:
def _peek_csv(filename, column, lead="", top=None):
    """Load one CSV from ``DATA``, show its head and the counts of ``column``.

    Prints ``<lead><filename>:`` first, then reads the file, displays the first
    rows, and prints ``value_counts()`` of ``column`` (limited to the first
    ``top`` entries when ``top`` is given). Returns the loaded DataFrame.
    """
    print(f"{lead}{filename}:")
    frame = pd.read_csv(DATA / filename)
    display(frame.head())
    counts = frame[column].value_counts()
    print(counts if top is None else counts.head(top))
    return frame


# Same three previews, in the same order; later sections are separated from
# the previous one by a blank line.
label_df = _peek_csv("label.csv", "label")
split_df = _peek_csv("split.csv", "split", lead="\n")
edge_df = _peek_csv("edge.csv", "relation", lead="\n", top=20)



label.csv:


Unnamed: 0,id,label
0,u1217628182611927040,human
1,u2664730894,human
2,u1266703520205549568,human
3,u1089159225148882949,human
4,u36741729,bot


label
human    860057
bot      139943
Name: count, dtype: int64

split.csv:


Unnamed: 0,id,split
0,u2664730894,train
1,u1089159225148882949,train
2,u36741729,train
3,u1679822588,train
4,u1519144464,train


split
train    700000
val      200000
test     100000
Name: count, dtype: int64

edge.csv:


In [None]:
import json
from itertools import islice

def json_head(path, n=5):
    """Print a short preview of the JSON document stored at ``path``.

    The whole file is parsed with ``json.load``, so memory use grows with the
    file size even though only ``n`` entries are shown.

    Args:
        path: ``pathlib.Path``-like object pointing at the JSON file; its
            ``.name`` is printed as a header line.
        n: Number of leading entries to show. Must be non-negative.

    Raises:
        ValueError: If ``n`` is negative.

    What is shown depends on the top-level JSON value:
      * list  -- the first ``n`` elements are displayed, then ``total: <len>``.
      * dict  -- the first ``n`` items are shown as ``id: <key>`` followed by
        the displayed value, then ``total keys: <len>``.
      * other -- only the Python type of the parsed value is printed.
    """
    if n < 0:
        raise ValueError(f"n must be non-negative, got {n}")

    print(path.name + ":")
    # Pin the encoding: without it ``open`` falls back to the platform locale,
    # which can fail or mis-decode non-ASCII text in UTF-8 JSON files.
    with open(path, "r", encoding="utf-8") as f:
        obj = json.load(f)

    if isinstance(obj, list):
        for row in obj[:n]:
            display(row)
        print(f"total: {len(obj)}")
    elif isinstance(obj, dict):
        # Dict of id -> object: walk only the first n items instead of
        # copying every (key, value) pair into a list first.
        for k, v in islice(obj.items(), n):
            print("id:", k)
            display(v)
        print(f"total keys: {len(obj)}")
    else:
        print(type(obj))

# Preview whichever of the expected JSON files are present in the data
# directory; report the ones that are missing and move on.
json_files = ("user.json", "list.json", "hashtag.json", "tweet_0.json")
for json_path in (DATA / fname for fname in json_files):
    if not json_path.exists():
        print(json_path.name, "not found")
        continue
    json_head(json_path, n=3)



In [None]:
# NOTE(review): `data` is not assigned in any cell visible in this notebook
# (the dataset path constant defined above is the upper-case `DATA`), so on a
# fresh kernel this line would raise NameError unless `data` is defined
# elsewhere -- confirm whether this cell is leftover scratch work.
data 