In [1]:
import pandas as pd
import os
import sys

print("=== SciSciNet Validation Notebook ===")
print(f"Python version: {sys.version}")
print(f"Pandas version: {pd.__version__}\n")

DATA_DIR = '../data'

=== SciSciNet Validation Notebook ===
Python version: 3.12.9 (main, Feb 12 2025, 14:39:53) [GCC 6.3.0 20170516]
Pandas version: 2.3.3



In [2]:
required_files = [
    'SciSciNet_Affiliations.tsv',
    'SciSciNet_Fields.tsv',
    'SciSciNet_Papers.tsv',
    'SciSciNet_PaperDetails.tsv',
    'SciSciNet_PaperAuthorAffiliations.tsv',
    'SciSciNet_PaperFields.tsv',
    'SciSciNet_PaperReferences.tsv'
]

print("=== Dataset File Validation ===\n")
print(f"Data directory: {DATA_DIR}\n")

missing_files = []
existing_files = []

for file in required_files:
    file_path = os.path.join(DATA_DIR, file)
    if os.path.exists(file_path):
        size_mb = os.path.getsize(file_path) / (1024**2)
        size_gb = size_mb / 1024
        if size_gb >= 1:
            print(f"[OK] {file:45} {size_gb:>8.2f} GB")
        else:
            print(f"[OK] {file:45} {size_mb:>8.2f} MB")
        existing_files.append(file)
    else:
        print(f"[MISSING] {file}")
        missing_files.append(file)

if missing_files:
    print(f"\nMissing {len(missing_files)} required files")
    validation_passed = False
else:
    print(f"\nAll {len(required_files)} required files found")
    validation_passed = True

=== Dataset File Validation ===

Data directory: ../data

[OK] SciSciNet_Affiliations.tsv                        3.26 MB
[OK] SciSciNet_Fields.tsv                              0.01 MB
[OK] SciSciNet_Papers.tsv                             16.46 GB
[OK] SciSciNet_PaperDetails.tsv                       28.72 GB
[OK] SciSciNet_PaperAuthorAffiliations.tsv            11.68 GB
[OK] SciSciNet_PaperFields.tsv                        11.62 GB
[OK] SciSciNet_PaperReferences.tsv                    32.41 GB

All 7 required files found


In [3]:
print("\n=== Loading Affiliations Dataset ===\n")

affiliations_file = os.path.join(DATA_DIR, 'SciSciNet_Affiliations.tsv')
df_affiliations = pd.read_csv(affiliations_file, sep='\t')

print(f"Total affiliations: {len(df_affiliations):,}")
print(f"Columns: {list(df_affiliations.columns)}")
print(f"\nFirst 5 rows:")
print(df_affiliations.head())


=== Loading Affiliations Dataset ===

Total affiliations: 26,998
Columns: ['AffiliationID', 'Affiliation_Name', 'GridID', 'Official_Page', 'ISO3166Code', 'Latitude', 'Longitude', 'H-index', 'Productivity', 'Average_C10', 'Average_LogC10']

First 5 rows:
   AffiliationID                             Affiliation_Name         GridID  \
0       20455151                                  Air Liquide  grid.476009.c   
1       24386293     Hellenic National Meteorological Service            NaN   
2       32956416              Catholic University of the West  grid.448708.7   
3       35926432                       Mackay Medical College  grid.452449.a   
4       37448385  Chinese People's Public Security University            NaN   

                                       Official_Page ISO3166Code   Latitude  \
0  https://web.archive.org/web/20100205175402/htt...          GB  52.503593   
1         http://www.hnms.gr/hnms/english/index_html          GR  37.976140   
2                          

In [4]:
print("\n=== Searching for Virginia Tech (Main Campus) ===\n")

name_column = 'Affiliation_Name'
id_column = 'AffiliationID'

# Search for main Virginia Tech campus only
vt_main = df_affiliations[
    (df_affiliations[id_column] == 859038795)
]

if len(vt_main) > 0:
    print(f"FOUND: Virginia Tech main campus\n")
    print("="*120)
    display_cols = [id_column, name_column, 'ISO3166Code', 'Official_Page']
    available_cols = [col for col in display_cols if col in vt_main.columns]
    print(vt_main[available_cols].to_string(index=False))
    print("="*120)
    
    vt_affiliation_ids = [859038795]
    print(f"\nVirginia Tech Affiliation ID: {vt_affiliation_ids}")
    
    affiliation_id_col = id_column
    affiliation_name_col = name_column
    all_vt_results = vt_main
    
    validation_passed = validation_passed and True
else:
    print("\nWARNING: Virginia Tech main campus not found")
    validation_passed = False


=== Searching for Virginia Tech (Main Campus) ===

FOUND: Virginia Tech main campus

 AffiliationID Affiliation_Name ISO3166Code      Official_Page
     859038795    Virginia Tech          US http://www.vt.edu/

Virginia Tech Affiliation ID: [859038795]


In [5]:
print("\n=== Loading Fields Dataset ===\n")

fields_file = os.path.join(DATA_DIR, 'SciSciNet_Fields.tsv')
df_fields = pd.read_csv(fields_file, sep='\t')

print(f"Total fields: {len(df_fields):,}")
print(f"Columns: {list(df_fields.columns)}")
print(f"\nFirst 10 rows:")
print(df_fields.head(10))


=== Loading Fields Dataset ===

Total fields: 311
Columns: ['FieldID', 'Field_Name', 'Field_Type']

First 10 rows:
     FieldID                         Field_Name Field_Type
0    3079626            Quantum electrodynamics        Sub
1   37914503               Mathematical physics        Sub
2  159047783                           Virology        Sub
3   70410870                Clinical psychology        Sub
4  187212893                         Pediatrics        Sub
5   61434518                    General surgery        Sub
6   73484699                        Criminology        Sub
7  200601418            Reliability engineering        Sub
8   95457728                            History        Top
9  107826830  Environmental resource management        Sub


In [7]:
print("\n=== Defining All Computer Science Related Fields ===\n")

# Comprehensive list of CS-related field IDs
cs_related_field_ids = [
    # Core Computer Science
    41008148,    # Computer science (Top)
    113775141,   # Computer engineering
    80444323,    # Theoretical computer science
    459310,      # Computational science
    
    # AI & Machine Learning
    154945302,   # Artificial intelligence
    119857082,   # Machine learning
    178980831,   # Pattern recognition
    28490314,    # Speech recognition
    31972630,    # Computer vision
    204321447,   # Natural language processing
    
    # Data & Information
    124101348,   # Data mining
    2522767166,  # Data science
    77088390,    # Database
    23123220,    # Information retrieval
    56739046,    # Knowledge management
    
    # Software & Programming
    115903868,   # Software engineering
    199360897,   # Programming language
    111919701,   # Operating system
    11413529,    # Algorithm
    
    # Systems & Architecture
    118524514,   # Computer architecture
    9390403,     # Computer hardware
    120314980,   # Distributed computing
    173608175,   # Parallel computing
    79403827,    # Real-time computing
    149635348,   # Embedded system
    
    # Networks & Security
    31258907,    # Computer network
    38652104,    # Computer security
    108827166,   # Internet privacy
    76155785,    # Telecommunications
    136764020,   # World Wide Web
    
    # Graphics & Multimedia
    121684516,   # Computer graphics (images)
    49774154,    # Multimedia
    44154836,    # Simulation
    
    # Human-Computer Interaction
    107457646,   # Human-computer interaction
    188147891,   # Cognitive science
    
    # Computational Methods (Interdisciplinary)
    70721500,    # Computational biology
    60644358,    # Bioinformatics
    147597530,   # Computational chemistry
    30475298,    # Computational physics
]

print(f"Defined {len(cs_related_field_ids)} CS-related field IDs")

# Identify columns
field_name_column = None
field_id_column = None

for col in df_fields.columns:
    col_lower = col.lower()
    if 'name' in col_lower and field_name_column is None:
        field_name_column = col
    if 'id' in col_lower and 'field' in col_lower and field_id_column is None:
        field_id_column = col

print(f"Using name column: '{field_name_column}'")
print(f"Using ID column: '{field_id_column}'")

# Filter for CS-related fields
cs_fields = df_fields[df_fields[field_id_column].isin(cs_related_field_ids)]

if len(cs_fields) > 0:
    print(f"\nFOUND: {len(cs_fields)} Computer Science related fields\n")
    print("="*120)
    print(cs_fields[[field_id_column, field_name_column]].to_string(index=False))
    print("="*120)
    
    cs_field_ids = [int(x) for x in cs_fields[field_id_column].values]
    print(f"\nTotal CS Field IDs: {len(cs_field_ids)}")
    
    validation_passed = validation_passed and True
else:
    print("WARNING: No Computer Science fields found")
    validation_passed = False


=== Defining All Computer Science Related Fields ===

Defined 39 CS-related field IDs
Using name column: 'Field_Name'
Using ID column: 'FieldID'

FOUND: 39 Computer Science related fields

   FieldID                   Field_Name
 113775141         Computer engineering
 124101348                  Data mining
  56739046         Knowledge management
 149635348              Embedded system
 107457646   Human–computer interaction
  11413529                    Algorithm
  28490314           Speech recognition
 111919701             Operating system
  31972630              Computer vision
  77088390                     Database
 108827166             Internet privacy
  76155785           Telecommunications
   9390403            Computer hardware
 154945302      Artificial intelligence
  49774154                   Multimedia
    459310        Computational science
 199360897         Programming language
  38652104            Computer security
  79403827          Real-time computing
  41008148

In [8]:
print("\n=== Sampling Virginia Tech Papers ===\n")

if 'vt_affiliation_ids' not in locals() or len(vt_affiliation_ids) == 0:
    print("Skipping - No Virginia Tech affiliations found")
else:
    paa_file = os.path.join(DATA_DIR, 'SciSciNet_PaperAuthorAffiliations.tsv')
    
    sample_peek = pd.read_csv(paa_file, sep='\t', nrows=1)
    print(f"PaperAuthorAffiliations columns: {list(sample_peek.columns)}\n")
    
    print("Reading sample (first 2M rows)...")
    df_paa_sample = pd.read_csv(paa_file, sep='\t', nrows=2000000)
    
    print(f"Sample size: {len(df_paa_sample):,} rows")
    
    # Identify columns
    paa_affil_col = None
    paa_paper_col = None
    paa_author_col = None
    
    for col in df_paa_sample.columns:
        col_lower = col.lower()
        if 'affiliation' in col_lower and 'id' in col_lower:
            paa_affil_col = col
        if 'paper' in col_lower and 'id' in col_lower:
            paa_paper_col = col
        if 'author' in col_lower and 'id' in col_lower:
            paa_author_col = col
    
    print(f"Using columns: paper='{paa_paper_col}', author='{paa_author_col}', affiliation='{paa_affil_col}'")
    
    vt_papers_sample = df_paa_sample[df_paa_sample[paa_affil_col].isin(vt_affiliation_ids)]
    
    if len(vt_papers_sample) > 0:
        print(f"\nFOUND: {len(vt_papers_sample):,} VT paper-author records")
        print(f"Unique papers: {vt_papers_sample[paa_paper_col].nunique():,}")
        print(f"Unique authors: {vt_papers_sample[paa_author_col].nunique():,}")
        
        sample_paper_ids = vt_papers_sample[paa_paper_col].unique()[:10].tolist()
        print(f"\nSample Paper IDs: {sample_paper_ids[:5]}...")
        
        paper_id_col = paa_paper_col
        author_id_col = paa_author_col
        
        validation_passed = validation_passed and True
    else:
        print("WARNING: No VT papers found in sample")
        validation_passed = False


=== Sampling Virginia Tech Papers ===

PaperAuthorAffiliations columns: ['PaperID', 'AuthorID', 'AffiliationID', 'AuthorSequenceNumber']

Reading sample (first 2M rows)...
Sample size: 2,000,000 rows
Using columns: paper='PaperID', author='AuthorID', affiliation='AffiliationID'

FOUND: 733 VT paper-author records
Unique papers: 413
Unique authors: 608

Sample Paper IDs: [62584, 224206, 365369, 380602, 648144]...


In [9]:
print("\n=== Fetching Sample Paper Details ===\n")

if 'sample_paper_ids' in locals() and len(sample_paper_ids) > 0:
    papers_file = os.path.join(DATA_DIR, 'SciSciNet_Papers.tsv')
    
    print("Reading Papers.tsv sample (first 5M rows)...")
    df_papers_sample = pd.read_csv(papers_file, sep='\t', nrows=5000000)
    
    print(f"Sample size: {len(df_papers_sample):,} rows")
    print(f"Columns: {list(df_papers_sample.columns)}")
    
    sample_papers = df_papers_sample[df_papers_sample[paper_id_col].isin(sample_paper_ids)]
    
    if len(sample_papers) > 0:
        print(f"\nFound {len(sample_papers)} sample VT papers\n")
        print("="*120)
        
        title_col = None
        for col in ['PaperTitle', 'OriginalTitle', 'Title']:
            if col in sample_papers.columns:
                title_col = col
                break
        
        year_col = None
        for col in ['Year', 'PublicationYear']:
            if col in sample_papers.columns:
                year_col = col
                break
        
        cols_to_show = [paper_id_col]
        if year_col:
            cols_to_show.append(year_col)
        if title_col:
            cols_to_show.append(title_col)
        
        print(sample_papers[cols_to_show].to_string(index=False, max_colwidth=80))
        print("="*120)
        
        if year_col:
            vt_with_years = vt_papers_sample.merge(
                df_papers_sample[[paper_id_col, year_col]], 
                on=paper_id_col, 
                how='left'
            )
            year_counts = vt_with_years[year_col].value_counts().sort_index()
            print(f"\nYear distribution (last 15 years):")
            print(year_counts.tail(15))
    else:
        print("Sample papers not found in first 5M rows")
else:
    print("Skipping - No sample paper IDs available")


=== Fetching Sample Paper Details ===

Reading Papers.tsv sample (first 5M rows)...
Sample size: 5,000,000 rows
Columns: ['PaperID', 'DOI', 'DocType', 'Year', 'Date', 'JournalID', 'ConferenceSeriesID', 'Citation_Count', 'C10', 'Reference_Count', 'C5', 'Team_Size', 'Institution_Count', 'Disruption', 'Atyp_10pct_Z', 'Atyp_Pairs', 'Atyp_Median_Z', 'SB_B', 'SB_T', 'Patent_Count', 'Newsfeed_Count', 'Tweet_Count', 'NCT_Count', 'NIH_Count', 'NSF_Count', 'WSB_mu', 'WSB_sigma', 'WSB_Cinf']

Found 1 sample VT papers

 PaperID   Year
  365369 2002.0

Year distribution (last 15 years):
Year
1973.0    2
1975.0    2
1979.0    1
1991.0    2
1993.0    4
1995.0    3
1997.0    1
2002.0    1
2003.0    1
2004.0    1
2006.0    5
2007.0    5
2008.0    1
2010.0    1
2013.0    3
Name: count, dtype: int64


In [10]:
print("\n=== Checking for CS Papers at VT ===\n")

if 'sample_paper_ids' in locals() and 'cs_field_ids' in locals():
    paper_fields_file = os.path.join(DATA_DIR, 'SciSciNet_PaperFields.tsv')
    
    print("Reading PaperFields.tsv sample (first 5M rows)...")
    df_paper_fields_sample = pd.read_csv(paper_fields_file, sep='\t', nrows=5000000)
    
    print(f"Sample size: {len(df_paper_fields_sample):,} rows")
    print(f"Columns: {list(df_paper_fields_sample.columns)}")
    
    pf_paper_col = None
    pf_field_col = None
    
    for col in df_paper_fields_sample.columns:
        col_lower = col.lower()
        if 'paper' in col_lower and 'id' in col_lower:
            pf_paper_col = col
        if 'field' in col_lower and 'id' in col_lower:
            pf_field_col = col
    
    print(f"Using columns: paper='{pf_paper_col}', field='{pf_field_col}'")
    
    vt_cs_sample = df_paper_fields_sample[
        (df_paper_fields_sample[pf_paper_col].isin(sample_paper_ids)) &
        (df_paper_fields_sample[pf_field_col].isin(cs_field_ids))
    ]
    
    if len(vt_cs_sample) > 0:
        print(f"\nFound {len(vt_cs_sample)} VT CS paper-field records")
        print(f"Unique CS papers: {vt_cs_sample[pf_paper_col].nunique()}")
        
        # Show which CS fields are represented
        cs_fields_in_sample = vt_cs_sample[pf_field_col].unique()
        matching_fields = cs_fields[cs_fields[field_id_column].isin(cs_fields_in_sample)]
        print(f"\nCS fields found in sample:")
        print(matching_fields[[field_id_column, field_name_column]].to_string(index=False))
    else:
        print("No VT CS papers found in this sample")
else:
    print("Skipping - Missing prerequisites")


=== Checking for CS Papers at VT ===

Reading PaperFields.tsv sample (first 5M rows)...
Sample size: 5,000,000 rows
Columns: ['PaperID', 'FieldID', 'Hit_1pct', 'Hit_5pct', 'Hit_10pct', 'C_f']
Using columns: paper='PaperID', field='FieldID'

Found 2 VT CS paper-field records
Unique CS papers: 2

CS fields found in sample:
 FieldID       Field_Name
41008148 Computer science


In [11]:
print("\n=== Data Scope Estimation ===\n")

if 'vt_papers_sample' in locals() and len(vt_papers_sample) > 0:
    sample_size = 2000000
    vt_count_in_sample = len(vt_papers_sample)
    estimated_ratio = vt_count_in_sample / sample_size
    
    estimated_total_vt_records = int(estimated_ratio * 413_000_000)
    estimated_vt_papers = int(vt_papers_sample[paper_id_col].nunique() * (413_000_000 / sample_size))
    
    print(f"VT papers ratio in sample: {estimated_ratio*100:.4f}%")
    print(f"Estimated total VT paper-author records: ~{estimated_total_vt_records:,}")
    print(f"Estimated unique VT papers: ~{estimated_vt_papers:,}")
    
    if 'vt_cs_sample' in locals() and len(vt_cs_sample) > 0:
        cs_ratio = vt_cs_sample[pf_paper_col].nunique() / vt_papers_sample[paper_id_col].nunique()
        estimated_cs_papers = int(estimated_vt_papers * cs_ratio)
        print(f"Estimated VT CS papers: ~{estimated_cs_papers:,}")
    
    if 'year_counts' in locals():
        recent_years = year_counts[year_counts.index >= 2015]
        print(f"\nPapers from 2015-present in sample: {recent_years.sum()}")
        print(f"Papers from 2020-present in sample: {year_counts[year_counts.index >= 2020].sum()}")
else:
    print("Cannot estimate - no sample data available")


=== Data Scope Estimation ===

VT papers ratio in sample: 0.0367%
Estimated total VT paper-author records: ~151,364
Estimated unique VT papers: ~85,284
Estimated VT CS papers: ~412

Papers from 2015-present in sample: 0
Papers from 2020-present in sample: 0


In [12]:
print("\n" + "="*120)
print("VALIDATION SUMMARY")
print("="*120)

if validation_passed:
    print("All validation checks PASSED")
    print(f"Virginia Tech: Main campus only (ID: {vt_affiliation_ids[0]})")
    print(f"Computer Science fields: {len(cs_fields)} fields")
    if 'vt_papers_sample' in locals():
        print(f"Sample VT papers found: {len(vt_papers_sample):,} records")
    if 'vt_cs_sample' in locals() and len(vt_cs_sample) > 0:
        print(f"Sample VT CS papers found: {vt_cs_sample[pf_paper_col].nunique()} unique papers")
    print("Dataset is valid and ready for processing")
    
    config_content = f"""# Auto-generated from 01_validation.ipynb

# Virginia Tech Configuration (Main Campus Only)
VT_AFFILIATION_IDS = {vt_affiliation_ids}
AFFILIATION_ID_COL = '{affiliation_id_col}'
AFFILIATION_NAME_COL = '{affiliation_name_col}'

# Computer Science Configuration (All CS-Related Fields)
CS_FIELD_IDS = {cs_field_ids}
FIELD_ID_COL = '{field_id_column}'
FIELD_NAME_COL = '{field_name_column}'

# Column mappings
PAPER_ID_COL = '{paper_id_col if 'paper_id_col' in locals() else 'PaperID'}'
AUTHOR_ID_COL = '{author_id_col if 'author_id_col' in locals() else 'AuthorID'}'
PAA_AFFIL_COL = '{paa_affil_col if 'paa_affil_col' in locals() else 'AffiliationID'}'

# Data directory
DATA_DIR = '{DATA_DIR}'

# Display names
VT_AFFILIATION_NAMES = {list(all_vt_results[affiliation_name_col].values)}
CS_FIELD_NAMES = {list(cs_fields[field_name_column].values)}
"""
    
    config_file = 'validation_config.py'
    with open(config_file, 'w') as f:
        f.write(config_content)
    
    print(f"\nConfiguration saved to: {config_file}")
    print(f"Total CS-related fields included: {len(cs_field_ids)}")
    print("\nReady to proceed with 02_preprocessing.ipynb")
    
else:
    print("Validation FAILED")
    print("Please review errors above")

print("="*120)


VALIDATION SUMMARY
All validation checks PASSED
Virginia Tech: Main campus only (ID: 859038795)
Computer Science fields: 39 fields
Sample VT papers found: 733 records
Sample VT CS papers found: 2 unique papers
Dataset is valid and ready for processing

Configuration saved to: validation_config.py
Total CS-related fields included: 39

Ready to proceed with 02_preprocessing.ipynb
