In [11]:
import pandas as pd
import numpy as np

# 读取论文数据，假设列名为 ['title', 'abstract', 'category', 'year']
papers = pd.read_csv('data/papers.csv')
print("Papers DataFrame Shape:", papers.shape)

# 读取特征向量数据，假设有128个特征
feats = pd.read_csv('data/feats.csv').values.astype(np.float32)
column_names1 = [f'feature_{i}' for i in range(1, 129)]

# 将 NumPy 数组转换为 DataFrame 并设置列名
feats = pd.DataFrame(feats, columns=column_names1)
print("Feature Vector Shape:", feats.shape)

# 将修改后的特征向量 DataFrame 保存回原文件
feats.to_csv('data/feats.csv', index=False, header=True)

# 读取引用关系数据，假设列名为 ['citing_paper_id', 'cited_paper_id']
edges = pd.read_csv('data/edge.csv')
column_names2 = ['citing_paper_id', 'cited_paper_id']
edges.columns = column_names2
print("Edges DataFrame Shape:", edges.shape)

# 将修改后的引用关系 DataFrame 保存回原文件
edges.to_csv('data/edge.csv', index=False, header=True)

Papers DataFrame Shape: (169343, 4)
Feature Vector Shape: (169342, 128)
Edges DataFrame Shape: (1166242, 2)


In [12]:
# 检查缺失值
print("Missing values in Papers DataFrame:")
print(papers.isnull().sum())

# 删除含有缺失值的行
papers_cleaned = papers.dropna()
print("Cleaned Papers DataFrame Shape:", papers_cleaned.shape)

# 检查重复项
duplicates_papers = papers_cleaned.duplicated().sum()
print("Number of duplicate papers:", duplicates_papers)

# 删除重复数据
papers_cleaned = papers_cleaned.drop_duplicates()
print("Papers DataFrame after duplicates removal:", papers_cleaned.shape)

Missing values in Papers DataFrame:
title       0
abstract    0
category    0
year        0
dtype: int64
Cleaned Papers DataFrame Shape: (169343, 4)
Number of duplicate papers: 2
Papers DataFrame after duplicates removal: (169341, 4)


In [13]:
# 检查特征向量的维度
if feats.ndim != 2:
    raise ValueError("Feature vectors should be 2-dimensional (samples x features).")

# 查看特征向量的统计信息
feats_df = pd.DataFrame(feats)
print("Features Descriptive Statistics:")
print(feats_df.describe())

# 检查异常值，可以设定阈值，比如超过3个标准差
outliers = np.any(np.abs(feats) > 3, axis=1)
print("Number of outlier feature vectors: ", np.sum(outliers))

# 去除异常值
feats_cleaned = feats[~outliers]
print("Scaled Features Shape after outlier removal:", feats_cleaned.shape)

Features Descriptive Statistics:
           feature_1      feature_2      feature_3      feature_4  \
count  169342.000000  169342.000000  169342.000000  169342.000000   
mean       -0.095499       0.018615      -0.200258      -0.050083   
std         0.104758       0.107905       0.114009       0.117657   
min        -0.753942      -0.711038      -1.239961      -0.826783   
25%        -0.161587      -0.050625      -0.270516      -0.127759   
50%        -0.095250       0.016597      -0.197415      -0.054509   
75%        -0.029251       0.085527      -0.126237       0.023044   
max         0.698438       0.709360       0.610937       0.878995   

           feature_5      feature_6      feature_7      feature_8  \
count  169342.000000  169342.000000  169342.000000  169342.000000   
mean        0.049300      -0.085222      -0.369258      -0.111833   
std         0.106438       0.107369       0.114723       0.130831   
min        -0.899320      -1.006146      -1.050989      -1.061832   


In [14]:
import pandas as pd

# 假设您已经读取了 CSV 文件
df = pd.read_csv('data/feats.csv')  # 替换为您的文件路径

# 显示列名
print(df.columns)

Index(['feature_1', 'feature_2', 'feature_3', 'feature_4', 'feature_5',
       'feature_6', 'feature_7', 'feature_8', 'feature_9', 'feature_10',
       ...
       'feature_119', 'feature_120', 'feature_121', 'feature_122',
       'feature_123', 'feature_124', 'feature_125', 'feature_126',
       'feature_127', 'feature_128'],
      dtype='object', length=128)


In [15]:
import pandas as pd

# 假设 edges 是你的引用数据 DataFrame
# 这里可以用一个例子 DataFrame 初始化 edges
# edges = pd.DataFrame({'citing_paper_id': [1, 2, 0, 4], 'cited_paper_id': [3, 0, 5, 6]})

# 1. 检查缺失值
print("Missing values in Edges DataFrame:")
print(edges.isnull().sum())

# 2. 检查重复项
duplicates_edges = edges.duplicated().sum()
print("Number of duplicate citation edges:", duplicates_edges)

# 3. 删除重复项
edges_cleaned = edges.drop_duplicates()
print("Cleaned Edges DataFrame Shape after removing duplicates:", edges_cleaned.shape)

# 4. 删除引用和被引用其中之一为 0 的行
initial_shape = edges_cleaned.shape
edges_cleaned = edges_cleaned[(edges_cleaned['citing_paper_id'] != 0) & 
                              (edges_cleaned['cited_paper_id'] != 0)]
removed_zero_edges = initial_shape[0] - edges_cleaned.shape[0]
print("Number of edges removed with 0 IDs:", removed_zero_edges)

print("Cleaned Edges DataFrame Shape after removing edges with 0 IDs:", edges_cleaned.shape)

# 5. 保存处理后的 edges 到 CSV 文件
edges_cleaned.to_csv('data/edge.csv', index=False)

Missing values in Edges DataFrame:
citing_paper_id    0
cited_paper_id     0
dtype: int64
Number of duplicate citation edges: 0
Cleaned Edges DataFrame Shape after removing duplicates: (1166242, 2)
Number of edges removed with 0 IDs: 291
Cleaned Edges DataFrame Shape after removing edges with 0 IDs: (1165951, 2)
