In [2]:
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
import torch 
import numpy as np 

In [3]:
def load_cora_data(cora_content_path, cora_cites_path):
    """Load Cora Dataset"""
    column_names = ['id'] + [f'word_{i}' for i in range(1433)] + ['class_label'] # number of columns in cora.content
    cora_content = pd.read_csv(cora_content_path, sep='\s+', header=None, names=column_names)
    cora_cites = pd.read_csv(cora_cites_path, sep='\t', header=None, names=['citing_paper', 'cited_paper'])
    
    return cora_content, cora_cites


def create_citation_graph(cora_cites):
    """Creates Citation Graph"""
    G = nx.Graph()
    edges = list(zip(cora_cites['citing_paper'], cora_cites['cited_paper']))
    G.add_edges_from(edges)
    
    return G


def visualize_graph(G):
    """Viusalize Citation Graph"""
    plt.figure(figsize=(10, 10))
    pos = nx.spring_layout(G, seed=42) 
    nx.draw(G, pos, node_size=10, node_color='blue', edge_color='gray', with_labels=False)
    plt.show()

In [4]:
cora_content_path = 'cora\cora.content'
cora_cites_path = 'cora\cora.cites'

cora_content, cora_cites = load_cora_data(cora_content_path=cora_content_path, cora_cites_path=cora_cites_path)

print("Columns of cora.content:")
print(cora_content.columns)

print("\nColumns of cora.cites:")
print(cora_cites.columns)


Columns of cora.content:
Index(['id', 'word_0', 'word_1', 'word_2', 'word_3', 'word_4', 'word_5',
       'word_6', 'word_7', 'word_8',
       ...
       'word_1424', 'word_1425', 'word_1426', 'word_1427', 'word_1428',
       'word_1429', 'word_1430', 'word_1431', 'word_1432', 'class_label'],
      dtype='object', length=1435)

Columns of cora.cites:
Index(['citing_paper', 'cited_paper'], dtype='object')


In [5]:
# Examine a few samples from CORA Content

def sample_cora_content(cora_content, num_rows=5, num_features=10):
    sample_columns = ['id'] + [f'word_{i}' for i in range(num_features)] + ['class_label']
    sample_data = cora_content[sample_columns].head(num_rows)
    print("sample:")
    print(sample_data)

sample_cora_content(cora_content, num_rows=5, num_features=10)

sample:
        id  word_0  word_1  word_2  word_3  word_4  word_5  word_6  word_7  \
0    31336       0       0       0       0       0       0       0       0   
1  1061127       0       0       0       0       0       0       0       0   
2  1106406       0       0       0       0       0       0       0       0   
3    13195       0       0       0       0       0       0       0       0   
4    37879       0       0       0       0       0       0       0       0   

   word_8  word_9             class_label  
0       0       0         Neural_Networks  
1       0       0           Rule_Learning  
2       0       0  Reinforcement_Learning  
3       0       0  Reinforcement_Learning  
4       0       0   Probabilistic_Methods  


#### Format of data: 
Source: Citing Paper | 
Target: Cited Paper


### Examining the number of Nodes and Edges 

In [6]:
G = create_citation_graph(cora_cites=cora_cites)

print(f"Number nodes: {G.number_of_nodes()}")
print(f"Number edges: {G.number_of_edges()}")
print(f"Ave degree: {sum(dict(G.degree()).values()) / G.number_of_nodes():.2f}")


Number nodes: 2708
Number edges: 5278
Ave degree: 3.90


### Checking proportion of classes

In [7]:
def get_labels(cora_content):
    labels = cora_content['class_label']
    print(len(labels))
    
    return labels

def check_class_proportion(labels):
    if isinstance(labels, torch.Tensor):
        labels = labels.numpy()
    
    unique_labels, counts = np.unique(labels, return_counts=True)

    total_samples = len(labels)
    proportions = counts / total_samples

    print("Proportion in CORA Dataset:")
    for label, count, proportion in zip(unique_labels, counts, proportions):
        print(f"{label}: {count} samples, {proportion:.4f} proportion")
    
    return unique_labels, counts, proportions

labels = get_labels(cora_content)
check_class_proportion(labels)

2708
Proportion in CORA Dataset:
Case_Based: 298 samples, 0.1100 proportion
Genetic_Algorithms: 418 samples, 0.1544 proportion
Neural_Networks: 818 samples, 0.3021 proportion
Probabilistic_Methods: 426 samples, 0.1573 proportion
Reinforcement_Learning: 217 samples, 0.0801 proportion
Rule_Learning: 180 samples, 0.0665 proportion
Theory: 351 samples, 0.1296 proportion


(array(['Case_Based', 'Genetic_Algorithms', 'Neural_Networks',
        'Probabilistic_Methods', 'Reinforcement_Learning', 'Rule_Learning',
        'Theory'], dtype=object),
 array([298, 418, 818, 426, 217, 180, 351], dtype=int64),
 array([0.11004431, 0.15435746, 0.30206795, 0.15731167, 0.08013294,
        0.06646972, 0.12961595]))

### Visualizing Graph 

In [9]:
# visualize_graph(G)

In [14]:
def calculate_homophily_score(cora_content, cora_cites):
    cora_content = cora_content[['id', 'class_label']]
    
    # Merge citing paper's class
    cora_cites = cora_cites.merge(cora_content, left_on='citing_paper', right_on='id', how='left')
    cora_cites.rename(columns={'class_label': 'citing_class'}, inplace=True)
    
    # Merge cited paper's class
    cora_cites = cora_cites.merge(cora_content, left_on='cited_paper', right_on='id', how='left')
    cora_cites.rename(columns={'class_label': 'cited_class'}, inplace=True)
    
    # Count same-class connections
    same_class_links = (cora_cites['citing_class'] == cora_cites['cited_class']).sum()
    total_links = len(cora_cites)
    
    homophily_score = same_class_links / total_links
    
    print(f"Total Links: {total_links}")
    print(f"Same Class Links: {same_class_links}")
    print(f"Homophily Score: {homophily_score:.4f}")
    
    return homophily_score

homophily_score = calculate_homophily_score(cora_content, cora_cites)


Total Links: 5429
Same Class Links: 4418
Homophily Score: 0.8138
