In [1]:
# Use the %pip magic so the package is installed into the environment of the
# running kernel; `!pip` runs whichever pip is first on the shell PATH.
# The recorded run below resolved pandas 2.3.3 -- pin it if exact reproducibility matters.
%pip install pandas

Collecting pandas
  Downloading pandas-2.3.3-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (91 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m91.2/91.2 kB[0m [31m530.4 kB/s[0m eta [36m0:00:00[0m kB/s[0m eta [36m0:00:01[0m
[?25hCollecting numpy>=1.26.0 (from pandas)
  Downloading numpy-2.3.3-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (62 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.1/62.1 kB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
Collecting pytz>=2020.1 (from pandas)
  Using cached pytz-2025.2-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas)
  Using cached tzdata-2025.2-py2.py3-none-any.whl.metadata (1.4 kB)
Downloading pandas-2.3.3-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (12.4 MB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.4/12.4 MB[0m [31m2.8 MB/s[0m eta [36m0:00:00

#### Load Dataset

In [11]:
import pandas as pd

def load_minimal_fakenewsnet(fake_path, real_path):
    """Read the fake and real news CSVs and return them as one labelled frame.

    Args:
        fake_path: Path (or file-like object) of the CSV holding fake articles.
        real_path: Path (or file-like object) of the CSV holding real articles.

    Returns:
        A single DataFrame with the fake rows first, then the real rows, a
        fresh 0..n-1 index, and a ``label`` column (0 = fake, 1 = real).
    """
    labelled_frames = []
    # Order matters: fake rows are read (and concatenated) before real rows.
    for csv_path, class_label in ((fake_path, 0), (real_path, 1)):
        frame = pd.read_csv(csv_path, header=0)
        frame['label'] = class_label
        labelled_frames.append(frame)

    return pd.concat(labelled_frames, ignore_index=True)


In [14]:
# Locations of the two PolitiFact CSVs, relative to the notebook's working directory.
FAKE_CSV_PATH = 'dataset/politifact_fake.csv'
REAL_CSV_PATH = 'dataset/politifact_real.csv'

df = load_minimal_fakenewsnet(FAKE_CSV_PATH, REAL_CSV_PATH)
df.head()

Unnamed: 0,id,news_url,title,tweet_ids,label
0,politifact15014,speedtalk.com/forum/viewtopic.php?t=51650,BREAKING: First NFL Team Declares Bankruptcy O...,937349434668498944\t937379378006282240\t937380...,0
1,politifact15156,politics2020.info/index.php/2018/03/13/court-o...,Court Orders Obama To Pay $400 Million In Rest...,972666281441878016\t972678396575559680\t972827...,0
2,politifact14745,www.nscdscamps.org/blog/category/parenting/467...,UPDATE: Second Roy Moore Accuser Works For Mic...,929405740732870656\t929439450400264192\t929439...,0
3,politifact14355,https://howafrica.com/oscar-pistorius-attempts...,Oscar Pistorius Attempts To Commit Suicide,886941526458347521\t887011300278194176\t887023...,0
4,politifact15371,http://washingtonsources.org/trump-votes-for-d...,Trump Votes For Death Penalty For Being Gay,915205698212040704\t915242076681506816\t915249...,0


In [15]:
def extract_tweets(tweet_str):
    """Turn a raw ``tweet_ids`` cell into a list of tweet-ID strings.

    Args:
        tweet_str: Usually a whitespace/tab-separated string of tweet IDs.
            A list or tuple (i.e. an already-parsed cell) is also accepted.
            Anything else, such as NaN for a missing cell, counts as "no tweets".

    Returns:
        A new list of tweet-ID strings; empty when there are no tweets.
    """
    if isinstance(tweet_str, str):
        # split() with no separator already ignores leading/trailing whitespace.
        return tweet_str.split()
    if isinstance(tweet_str, (list, tuple)):
        # Already parsed: hand back a copy so applying this function a second
        # time (e.g. re-running the cell) does not wipe the IDs.
        return list(tweet_str)
    return []


In [16]:
# Parse every raw tweet_ids cell into a list of ID strings.
# This overwrites the original tab-separated strings in place, so reload df to recover them.
tweet_id_lists = df['tweet_ids'].apply(extract_tweets)
df['tweet_ids'] = tweet_id_lists
df.head()

Unnamed: 0,id,news_url,title,tweet_ids,label
0,politifact15014,speedtalk.com/forum/viewtopic.php?t=51650,BREAKING: First NFL Team Declares Bankruptcy O...,"[937349434668498944, 937379378006282240, 93738...",0
1,politifact15156,politics2020.info/index.php/2018/03/13/court-o...,Court Orders Obama To Pay $400 Million In Rest...,"[972666281441878016, 972678396575559680, 97282...",0
2,politifact14745,www.nscdscamps.org/blog/category/parenting/467...,UPDATE: Second Roy Moore Accuser Works For Mic...,"[929405740732870656, 929439450400264192, 92943...",0
3,politifact14355,https://howafrica.com/oscar-pistorius-attempts...,Oscar Pistorius Attempts To Commit Suicide,"[886941526458347521, 887011300278194176, 88702...",0
4,politifact15371,http://washingtonsources.org/trump-votes-for-d...,Trump Votes For Death Penalty For Being Gay,"[915205698212040704, 915242076681506816, 91524...",0


#### Create Nodes

In [22]:
# News nodes keep the row order of df, so position i here corresponds to row i of df.
news_nodes = list(df['id'])

# Deduplicate tweet IDs across all articles, then sort them. Iterating a set of
# strings can yield a different order in every kernel session (string hashing is
# randomised per process), which would silently change each tweet's node index
# from run to run; sorting makes the numbering reproducible.
tweet_nodes = sorted({t for lst in df['tweet_ids'] for t in lst})

print("No. of news nodes:", len(news_nodes))
print("No. of tweet nodes:", len(tweet_nodes))
print("Total No. of nodes:", len(news_nodes) + len(tweet_nodes))

No. of news nodes: 1056
No. of tweet nodes: 558937
Total No. of nodes: 559993


#### Create mappings (for indexing):

In [23]:
# News nodes take indices 0 .. len(news_nodes) - 1; tweet nodes are numbered
# directly after them so both kinds share one contiguous index space.
news_count = len(news_nodes)
news2idx = dict(zip(news_nodes, range(news_count)))
tweet2idx = dict(zip(tweet_nodes, range(news_count, news_count + len(tweet_nodes))))

#### Build Edges (Tweet → News)

In [32]:
edges_src = []  # tweet node index of each edge
edges_dst = []  # news node index of each edge

# One edge per (tweet, article) pair, emitted article by article in df row order.
for news_id, tweet_list in zip(df['id'], df['tweet_ids']):
    news_idx = news2idx[news_id]
    tweet_indices = [tweet2idx[t] for t in tweet_list if t in tweet2idx]
    edges_src.extend(tweet_indices)
    edges_dst.extend([news_idx] * len(tweet_indices))


In [33]:
# Make the graph undirected: for every tweet -> news edge also store news -> tweet.
# The reversed lists are built from the current forward lists, so this cell
# doubles the edge lists each time it is executed.
edges_src_rev, edges_dst_rev = list(edges_dst), list(edges_src)

edges_src = [*edges_src, *edges_src_rev]
edges_dst = [*edges_dst, *edges_dst_rev]

In [34]:
# Sanity check: source and destination lists should have the same length.
dst_count, src_count = len(edges_dst), len(edges_src)
print(dst_count, src_count)

1167032 1167032


In [29]:
# Use the %pip magic so the packages are installed into the environment of the
# running kernel; `!pip` runs whichever pip is first on the shell PATH.
# The recorded run below resolved torch-geometric 2.6.1 -- pin it if exact reproducibility matters.
%pip install torch torch-geometric

Collecting torch-geometric
  Using cached torch_geometric-2.6.1-py3-none-any.whl.metadata (63 kB)
Collecting aiohttp (from torch-geometric)
  Downloading aiohttp-3.13.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl.metadata (8.1 kB)
Collecting pyparsing (from torch-geometric)
  Downloading pyparsing-3.2.5-py3-none-any.whl.metadata (5.0 kB)
Collecting requests (from torch-geometric)
  Using cached requests-2.32.5-py3-none-any.whl.metadata (4.9 kB)
Collecting tqdm (from torch-geometric)
  Using cached tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Collecting aiohappyeyeballs>=2.5.0 (from aiohttp->torch-geometric)
  Using cached aiohappyeyeballs-2.6.1-py3-none-any.whl.metadata (5.9 kB)
Collecting aiosignal>=1.4.0 (from aiohttp->torch-geometric)
  Using cached aiosignal-1.4.0-py3-none-any.whl.metadata (3.7 kB)
Collecting attrs>=17.3.0 (from aiohttp->torch-geometric)
  Downloading attrs-25.4.0-py3-none-any.whl.metadata (10 kB)
Collecting frozenlist>=1.1.1 (

In [35]:
import torch
from torch_geometric.data import Data

# Row 0 of edge_index holds the source node of each edge, row 1 the destination.
edge_pairs = [edges_src, edges_dst]
total_node_count = len(news_nodes) + len(tweet_nodes)

edge_index = torch.tensor(edge_pairs, dtype=torch.long)
# One label per row of df, i.e. per news article only; tweet nodes have no
# entry in `labels`, so it is shorter than the node count set below.
labels = torch.tensor(df['label'].values, dtype=torch.long)

data = Data(edge_index=edge_index)
# Set explicitly because the graph has no feature matrix yet to infer the size from.
data.num_nodes = total_node_count
data.y = labels


In [37]:
# Use the %pip magic so the packages are installed into the environment of the
# running kernel; `!pip` runs whichever pip is first on the shell PATH.
# The recorded run below resolved scikit-learn 1.7.2 -- pin it if exact reproducibility matters.
%pip install numpy scikit-learn

Collecting scikit-learn
  Downloading scikit_learn-1.7.2-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (11 kB)
Collecting scipy>=1.8.0 (from scikit-learn)
  Downloading scipy-1.16.2-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (62 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.0/62.0 kB[0m [31m631.3 kB/s[0m eta [36m0:00:00[0m31m?[0m eta [36m-:--:--[0m
[?25hCollecting joblib>=1.2.0 (from scikit-learn)
  Downloading joblib-1.5.2-py3-none-any.whl.metadata (5.6 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Using cached threadpoolctl-3.6.0-py3-none-any.whl.metadata (13 kB)
Downloading scikit_learn-1.7.2-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (9.5 MB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.5/9.5 MB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m[36m0:00:01[0m
[?25hDownloading joblib-1.5.2-py3-none-any.whl (308 kB)


In [39]:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

# TF-IDF over article titles gives the news-node features.
vectorizer = TfidfVectorizer(max_features=512)
# A missing title (NaN) would make the vectorizer raise, so treat it as empty text.
# Cast to float32 up front: that is the dtype data.x ends up with anyway.
X_news = vectorizer.fit_transform(df['title'].fillna('')).toarray().astype(np.float32)

# Tweets have no text here, so they get all-zero placeholder features.
# The width is taken from X_news rather than repeating 512: max_features is only
# an upper bound, and a smaller vocabulary would otherwise break the vstack.
# float32 halves the memory of this (num_tweets x width) placeholder.
X_tweets = np.zeros((len(tweet_nodes), X_news.shape[1]), dtype=np.float32)

# News rows first, then tweet rows -- the same order as the node indices
# (news nodes are numbered from 0, tweet nodes follow after them).
X_all = np.vstack((X_news, X_tweets))

data.x = torch.tensor(X_all, dtype=torch.float)
