In [4]:
!pip install pandas



#### Load Dataset

In [5]:
import pandas as pd

def load_minimal_fakenewsnet(fake_path, real_path):
    """Read the fake and real news CSVs and stack them into one labelled frame.

    Parameters
    ----------
    fake_path : str or path-like
        CSV file whose rows are tagged with ``label = 0``.
    real_path : str or path-like
        CSV file whose rows are tagged with ``label = 1``.

    Returns
    -------
    pandas.DataFrame
        Rows of ``fake_path`` followed by rows of ``real_path``, with a
        ``label`` column added and a fresh 0..n-1 index.
    """
    labelled_frames = [
        pd.read_csv(csv_path, header=0).assign(label=label_value)
        for csv_path, label_value in ((fake_path, 0), (real_path, 1))
    ]
    return pd.concat(labelled_frames, ignore_index=True)


In [6]:
# Load both PolitiFact CSVs into one labelled frame and preview the first rows.
df = load_minimal_fakenewsnet(
    fake_path='dataset/politifact_fake.csv',
    real_path='dataset/politifact_real.csv',
)
df.head()

Unnamed: 0,id,news_url,title,tweet_ids,label
0,politifact15014,speedtalk.com/forum/viewtopic.php?t=51650,BREAKING: First NFL Team Declares Bankruptcy O...,937349434668498944\t937379378006282240\t937380...,0
1,politifact15156,politics2020.info/index.php/2018/03/13/court-o...,Court Orders Obama To Pay $400 Million In Rest...,972666281441878016\t972678396575559680\t972827...,0
2,politifact14745,www.nscdscamps.org/blog/category/parenting/467...,UPDATE: Second Roy Moore Accuser Works For Mic...,929405740732870656\t929439450400264192\t929439...,0
3,politifact14355,https://howafrica.com/oscar-pistorius-attempts...,Oscar Pistorius Attempts To Commit Suicide,886941526458347521\t887011300278194176\t887023...,0
4,politifact15371,http://washingtonsources.org/trump-votes-for-d...,Trump Votes For Death Penalty For Being Gay,915205698212040704\t915242076681506816\t915249...,0


In [7]:
def extract_tweets(tweet_str):
    """Turn a raw ``tweet_ids`` cell into a list of tweet-ID strings.

    Parameters
    ----------
    tweet_str : str, list, tuple, or anything else
        A whitespace-separated string of tweet IDs, an already-parsed
        list/tuple of IDs, or a missing value (e.g. NaN / None).

    Returns
    -------
    list
        The individual IDs. A string is split on any run of whitespace;
        a list/tuple is returned as a new list with the same items;
        every other input yields an empty list.
    """
    if isinstance(tweet_str, str):
        # split() with no argument already ignores leading/trailing whitespace.
        return tweet_str.split()
    if isinstance(tweet_str, (list, tuple)):
        # Already parsed: pass the IDs through so that applying this function
        # a second time to the same column does not wipe every row to [].
        return list(tweet_str)
    return []


In [8]:
# Replace each raw tweet-ID string with its parsed list, then preview the frame.
parsed_tweet_ids = df['tweet_ids'].apply(extract_tweets)
df['tweet_ids'] = parsed_tweet_ids
df.head()

Unnamed: 0,id,news_url,title,tweet_ids,label
0,politifact15014,speedtalk.com/forum/viewtopic.php?t=51650,BREAKING: First NFL Team Declares Bankruptcy O...,"[937349434668498944, 937379378006282240, 93738...",0
1,politifact15156,politics2020.info/index.php/2018/03/13/court-o...,Court Orders Obama To Pay $400 Million In Rest...,"[972666281441878016, 972678396575559680, 97282...",0
2,politifact14745,www.nscdscamps.org/blog/category/parenting/467...,UPDATE: Second Roy Moore Accuser Works For Mic...,"[929405740732870656, 929439450400264192, 92943...",0
3,politifact14355,https://howafrica.com/oscar-pistorius-attempts...,Oscar Pistorius Attempts To Commit Suicide,"[886941526458347521, 887011300278194176, 88702...",0
4,politifact15371,http://washingtonsources.org/trump-votes-for-d...,Trump Votes For Death Penalty For Being Gay,"[915205698212040704, 915242076681506816, 91524...",0


#### Preprocess News Titles

In [9]:
!pip install spacy
!python -m spacy download en_core_web_sm


Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m20.0 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [10]:
import spacy
import pandas as pd

# Name of the spaCy English pipeline used for title preprocessing.
SPACY_MODEL = "en_core_web_sm"
nlp = spacy.load(SPACY_MODEL)

def preprocess_title_spacy(title):
    """
    Normalise a news title into a space-separated string of lemmas.

    The title is lowercased and run through the module-level spaCy pipeline
    ``nlp``. A token's lemma is kept only when the token is not a stop word,
    is not punctuation, is purely alphabetic, and the lemma is longer than
    two characters.

    Parameters
    ----------
    title : any
        The raw title. Anything that is not a ``str`` yields ``""``.

    Returns
    -------
    str
        The surviving lemmas joined by single spaces.
    """
    if isinstance(title, str):
        kept_lemmas = []
        for tok in nlp(title.lower()):
            # Skip stop words, punctuation, and anything not purely alphabetic.
            if tok.is_stop or tok.is_punct or not tok.is_alpha:
                continue
            lemma = tok.lemma_
            if len(lemma) > 2:
                kept_lemmas.append(lemma)
        return ' '.join(kept_lemmas)

    return ""


In [11]:
# Add the cleaned title column and print it next to the raw title.
title_columns = ['title', 'preprocessed_title']
df["preprocessed_title"] = df["title"].apply(preprocess_title_spacy)
print(df[title_columns])

                                                  title  \
0     BREAKING: First NFL Team Declares Bankruptcy O...   
1     Court Orders Obama To Pay $400 Million In Rest...   
2     UPDATE: Second Roy Moore Accuser Works For Mic...   
3            Oscar Pistorius Attempts To Commit Suicide   
4           Trump Votes For Death Penalty For Being Gay   
...                                                 ...   
1051  Flake: “Religious tests should have no place i...   
1052                           Change We Can Believe In   
1053  deputy director of national health statistics ...   
1054  Romneys ProLife Conversion Myth or Reality Jun...   
1055                             Interest Group Ratings   

                                     preprocessed_title  
0       breaking nfl team declare bankruptcy kneel thug  
1             court order obama pay million restitution  
2     update second roy moore accuser work michelle ...  
3                oscar pistorius attempt commit suicide  
4

#### Create Nodes

In [12]:
# One graph node per news row, in DataFrame row order.
news_nodes = list(df['id'])
# De-duplicate tweet IDs with dict.fromkeys instead of a set: it keeps
# first-seen order, so node indices are the same on every kernel restart.
# Iteration order of a set of strings varies with per-process hash randomization.
tweet_nodes = list(dict.fromkeys(t for lst in df['tweet_ids'] for t in lst))
print("No. of news nodes:", len(news_nodes))
print("No. of tweet nodes:", len(tweet_nodes))
print("Total No. of nodes:", len(news_nodes) + len(tweet_nodes))

No. of news nodes: 1056
No. of tweet nodes: 558937
Total No. of nodes: 559993


#### Create Node-Index Mappings

In [13]:
# News nodes take indices 0..len(news_nodes)-1; tweet nodes continue from there.
news2idx = dict(zip(news_nodes, range(len(news_nodes))))
tweet2idx = {
    tweet_id: node_idx
    for node_idx, tweet_id in enumerate(tweet_nodes, start=len(news_nodes))
}

#### Build Edges (Tweet → News)

In [14]:
# Parallel lists: edges_src[k] is a tweet node index, edges_dst[k] the news
# node index that tweet points to.
edges_src = []
edges_dst = []

for news_id, tweet_list in zip(df['id'], df['tweet_ids']):
    news_idx = news2idx[news_id]
    # Only tweets present in the mapping produce an edge.
    tweet_indices = [tweet2idx[tweet_id] for tweet_id in tweet_list if tweet_id in tweet2idx]
    edges_src.extend(tweet_indices)
    edges_dst.extend([news_idx] * len(tweet_indices))


In [15]:
# Append the reverse (news -> tweet) direction of every edge so the graph is
# undirected: the reversed sources are the old destinations and vice versa.
edges_src_rev, edges_dst_rev = list(edges_dst), list(edges_src)

edges_src = [*edges_src, *edges_src_rev]
edges_dst = [*edges_dst, *edges_dst_rev]

In [16]:
# Print the lengths of the destination and source edge lists side by side.
print(len(edges_dst), len(edges_src))

1167032 1167032


In [17]:
!pip install torch torch-geometric



In [18]:
import torch
from torch_geometric.data import Data

# Row 0 of edge_index holds source node indices, row 1 destination indices.
edge_index = torch.tensor([edges_src, edges_dst], dtype=torch.long)
# One label per DataFrame row.
# NOTE(review): y has len(df) entries while num_nodes also counts tweet
# nodes — confirm downstream code only reads labels for the news nodes.
labels = torch.tensor(df['label'].to_numpy(), dtype=torch.long)
total_nodes = len(news_nodes) + len(tweet_nodes)

data = Data(edge_index=edge_index, y=labels)
data.num_nodes = total_nodes


In [19]:
!pip install numpy scikit-learn



In [20]:
from sklearn.feature_extraction.text import TfidfVectorizer

import numpy as np

# Upper bound on the TF-IDF vocabulary size used for title features.
N_TFIDF_FEATURES = 512

vectorizer = TfidfVectorizer(max_features=N_TFIDF_FEATURES)
# Cast to float32 up front: the final tensor is float32 anyway, and this
# avoids building the large stacked matrix in float64 first.
X_news = vectorizer.fit_transform(df['preprocessed_title']).toarray().astype(np.float32)

# max_features is only an upper bound, so take the real width from X_news
# instead of hard-coding it; otherwise vstack fails on a smaller vocabulary.
feature_dim = X_news.shape[1]
X_tweets = np.zeros((len(tweet_nodes), feature_dim), dtype=np.float32)  # placeholder
X_all = np.vstack((X_news, X_tweets))

# Rows follow node index order: news nodes first, then tweet nodes.
data.x = torch.tensor(X_all, dtype=torch.float)
