# Loading the raw dataset 

In [None]:
import pandas as pd
import os
# Location of the raw arXiv metadata snapshot; later code reads it one JSON record per line.
# "your-project-path" is a placeholder that must be replaced before running.
raw_arxiv_json_path="your-project-path/arxiv-metadata-oai-snapshot.json"
# Directory that holds the project's dataset files.
dataset_dir = 'your-project-path/datasets'
# The base OpenAlex dataset (CSV) that the arXiv records are unified with.
raw_pyalex_df_path = os.path.join(dataset_dir, 'pyalex_df.csv')
# Loaded eagerly at cell execution time; `df` is passed to unify_datasets() further down.
df = pd.read_csv(raw_pyalex_df_path)


In [None]:
import json
import os
from tqdm import tqdm
import re

# Keywords to search for. contains_keywords() lowercases both sides and matches
# whole words only, so 'jew' does not match 'jewel'.
# NOTE(review): the hyphenated forms 'anti-semitic' and 'anti-zionist' are absent
# although their sibling spellings are listed -- confirm whether that is intentional.
keywords = ['antisemitism', 'antisemitic','anti-semitism', 'antizionist', 'antizionism', 'anti-zionism', 'jew', 'jews','jewish','holocaust', 'nazi', 'nazism']
# Fields to check for keywords (a record matches if ANY of these fields matches)
fields_to_check = ['title', 'abstract']
# Fields to keep in the output; fields missing from a record are written as ''
fields_to_keep = ['id', 'title', 'abstract', 'authors', 'authors_parsed', 'doi']


def contains_keywords(text, keywords_list):
    """Tell whether any keyword occurs in *text* as a whole word.

    Matching is case-insensitive: both the text and each keyword are
    lowercased, and every keyword is regex-escaped and wrapped in ``\\b``
    word boundaries so that substrings of longer words do not count.

    Args:
        text: The value to search. Anything falsy or not a ``str`` yields False.
        keywords_list: Iterable of keyword strings.

    Returns:
        True if at least one keyword matches, otherwise False.
    """
    if not (text and isinstance(text, str)):
        return False

    haystack = text.lower()
    return any(
        re.search(r'\b' + re.escape(term.lower()) + r'\b', haystack)
        for term in keywords_list
    )
    
def extract_year_from_date(date_str):
    """Return the year portion of a ``yyyy-mm-dd`` date string.

    The result is simply the text in front of the first ``-``; it is not
    validated, so a string without a dash is returned unchanged.

    Args:
        date_str: The date text. Anything falsy or not a ``str`` yields None.

    Returns:
        The leading segment as a string, or None for unusable input.
    """
    if not (date_str and isinstance(date_str, str)):
        return None

    leading_part, _, _ = date_str.partition('-')
    return leading_part

def process_json_file(json_path, output_path, chunk_size=1000):
    """Stream a JSON Lines file and save the records that mention a keyword.

    Each input line is parsed as one JSON record. A record is kept when any
    of the module-level ``fields_to_check`` contains one of the module-level
    ``keywords`` (see ``contains_keywords``). Kept records are reduced to
    ``fields_to_keep`` plus a ``publication_year`` taken from ``update_date``
    and written to ``output_path`` through ``write_chunk_to_file``.

    Args:
        json_path: Path of the input file, one JSON record per line (UTF-8).
        output_path: Path of the JSON Lines file to write. It is always
            created or truncated, even when no record matches, so a file
            left over from an earlier run is never mistaken for the result.
        chunk_size: Number of matches buffered in memory before a flush.

    Returns:
        The number of matching records.

    Raises:
        OSError: If the input cannot be read or the output cannot be written.
    """
    # Count lines for progress bar
    print("Counting lines in file...")
    # Explicit UTF-8: the platform default encoding may not decode the snapshot.
    with open(json_path, 'r', encoding='utf-8') as f:
        total_lines = sum(1 for _ in f)

    filtered_records = []
    records_processed = 0
    matches_found = 0
    # False until the first flush; decides between truncating and appending.
    output_started = False

    print("Processing and filtering JSON file...")
    with open(json_path, 'r', encoding='utf-8') as f:
        # enumerate gives the true line number even after unparsable lines.
        for line_num, line in enumerate(tqdm(f, total=total_lines), 1):
            try:
                # Parse JSON record
                record = json.loads(line.strip())
                records_processed += 1

                # Check if any field contains keywords
                match_found = any(
                    field in record and contains_keywords(record.get(field), keywords)
                    for field in fields_to_check
                )

                # If match found, keep only the specified fields
                if match_found:
                    filtered_record = {field: record.get(field, '') for field in fields_to_keep}

                    # Extract year from update_date and add as publication_year
                    publication_year = extract_year_from_date(record.get('update_date', ''))
                    if publication_year:
                        filtered_record['publication_year'] = publication_year

                    filtered_records.append(filtered_record)
                    matches_found += 1

            except json.JSONDecodeError:
                print(f"Warning: Error parsing JSON at line {line_num}")
                continue
            except Exception as e:
                # Best effort: one malformed record must not abort the whole run.
                print(f"Error processing record: {e}")
                continue

            # Write chunks to disk to save memory. This sits outside the try
            # block so that a failed write surfaces instead of being swallowed.
            if len(filtered_records) >= chunk_size:
                write_chunk_to_file(filtered_records, output_path, append=output_started)
                output_started = True
                filtered_records = []

    # Write any remaining records; with zero matches this still truncates the file.
    if filtered_records or not output_started:
        write_chunk_to_file(filtered_records, output_path, append=output_started)

    print(f"Processing complete. {records_processed} records processed, {matches_found} matches found.")
    print(f"Filtered data saved to {output_path}")
    return matches_found

def write_chunk_to_file(records, output_path, append=False):
    """Serialize *records* to *output_path*, one JSON document per line.

    Args:
        records: Iterable of JSON-serializable objects.
        output_path: Destination file path.
        append: When True the lines are added to the end of the file;
            otherwise the file is created or overwritten.
    """
    with open(output_path, 'a' if append else 'w') as sink:
        sink.writelines(json.dumps(item) + '\n' for item in records)

# Execute the processing: filter the raw arXiv snapshot down to keyword matches.
# The relative path means the file is written to the current working directory.
output_file = 'filtered_arxiv_data.jsonl'
process_json_file(raw_arxiv_json_path, output_file)

# Verification helper: read the filtered data back and show a few entries
def peek_at_results(output_path, num_samples=3):
    """Print a short preview of the first records in a JSON Lines file.

    For each previewed record the title and authors (first 100 characters
    each), the DOI and the publication year are printed.

    Args:
        output_path: Path of the JSON Lines file to read.
        num_samples: Maximum number of records to print.
    """
    print(f"\nSample records from {output_path}:")
    with open(output_path, 'r') as source:
        for position, raw_line in enumerate(source):
            # Stop as soon as the requested number of records has been shown.
            if position >= num_samples:
                break
            entry = json.loads(raw_line)
            print(f"Record {position+1}:")
            print(f"  Title: {entry.get('title', '')[:100]}...")
            print(f"  Authors: {entry.get('authors', '')[:100]}...")
            print(f"  DOI: {entry.get('doi', 'N/A')}")
            print(f"  Publication Year: {entry.get('publication_year', 'N/A')}")
            print()

# Peek at the results: print up to three records (the default num_samples)
# from the file written by process_json_file() as a quick sanity check.
peek_at_results(output_file)

# Unify datasets

In [None]:
import json
import pandas as pd
from tqdm import tqdm

# Resolver / scheme prefixes stripped before two DOIs are compared, so that
# "https://doi.org/10.1/x" and "10.1/x" are recognised as the same work.
_DOI_PREFIXES = (
    'https://doi.org/',
    'http://doi.org/',
    'https://dx.doi.org/',
    'http://dx.doi.org/',
    'doi:',
)


def _normalize_doi(doi):
    """Return a lowercase, prefix-free DOI for comparisons ('' if unusable)."""
    if not isinstance(doi, str):
        return ''
    cleaned = doi.strip().lower()
    for prefix in _DOI_PREFIXES:
        if cleaned.startswith(prefix):
            return cleaned[len(prefix):]
    return cleaned


def _normalize_title(title):
    """Return a lowercase title with whitespace runs collapsed ('' if unusable)."""
    if not isinstance(title, str):
        return ''
    # Collapsing whitespace lets line-wrapped titles match their one-line form.
    return ' '.join(title.split()).lower()


def _is_blank(value):
    """Return True for None, pandas NA, float NaN and the empty string."""
    if value is None or value is pd.NA:
        return True
    if isinstance(value, float):
        return value != value  # NaN is the only float unequal to itself
    return value == ''


def _parse_year(value):
    """Return *value* as an int if it is an int or an all-digit string, else None."""
    if isinstance(value, int):
        return value
    if isinstance(value, str) and value.isdigit():
        return int(value)
    return None


def _merge_arxiv_into_row(frame, index, abstract, authors, authors_str):
    """Fill a missing abstract and append not-yet-listed authors on one row."""
    current_abstract = frame.at[index, 'Abstract']
    abstract_missing = (_is_blank(current_abstract)
                        or current_abstract == 'No abstract available')
    if abstract and abstract_missing:
        frame.at[index, 'Abstract'] = abstract

    if not authors_str:
        return

    current_authors = frame.at[index, 'Authors']
    if _is_blank(current_authors):
        # Covers NaN cells as well as '', which a plain truthiness test does not.
        frame.at[index, 'Authors'] = authors_str
    elif isinstance(current_authors, str):
        known = [name.strip() for name in current_authors.split(',')]
        unseen = [name for name in authors if name not in known]
        if unseen:
            frame.at[index, 'Authors'] = current_authors + ", " + ", ".join(unseen)


def unify_datasets(json_path, df, output_path='unified_data.csv'):
    """
    Unify the arXiv JSON dataset with the PyAlex dataframe

    Every line of ``json_path`` is matched against ``df`` first by DOI and
    then by title. A match fills in a missing abstract and appends authors
    that are not listed yet; a DOI match whose titles differ is only
    reported. Unmatched records are appended as new 'preprint' rows.
    DOIs are compared without resolver prefixes and titles without regard
    to case or whitespace runs.

    Args:
        json_path: Path to the filtered arXiv JSON file
        df: PyAlex dataframe; must provide the columns 'DOI', 'Title',
            'Abstract' and 'Authors'. It is not modified.
        output_path: Path to save the unified dataframe

    Returns:
        The unified dataframe (also written to ``output_path`` as CSV).
    """
    # Work on a copy, and reset the index so that row labels equal row
    # positions: new rows are registered by position, and the concat below
    # renumbers the index anyway.
    unified_df = df.copy().reset_index(drop=True)

    doi_lookup = {}
    title_lookup = {}

    print("Creating lookup dictionaries...")
    for index, (doi, title) in enumerate(zip(unified_df['DOI'], unified_df['Title'])):
        doi_key = _normalize_doi(doi)
        if doi_key:
            doi_lookup[doi_key] = index

        title_key = _normalize_title(title)
        if title_key:
            title_lookup[title_key] = index

    # Track changes
    updated_records = 0
    new_records = 0
    mismatched_titles = 0

    # Process the JSON file line by line
    print("Processing JSON file and unifying with dataframe...")
    with open(json_path, 'r', encoding='utf-8') as f:
        for line_num, line in enumerate(tqdm(f), 1):
            try:
                record = json.loads(line.strip())

                # Extract needed fields; JSON null is treated like a missing value
                raw_doi = record.get('doi')
                arxiv_doi = raw_doi.lower() if isinstance(raw_doi, str) else ''
                doi_key = _normalize_doi(raw_doi)
                arxiv_title = record.get('title') or ''
                title_key = _normalize_title(arxiv_title)
                arxiv_abstract = record.get('abstract', '')
                arxiv_year = _parse_year(record.get('publication_year', ''))
                arxiv_authors_raw = record.get('authors', '')

                # Process authors - convert to list of strings
                if isinstance(arxiv_authors_raw, list):
                    arxiv_authors = arxiv_authors_raw
                elif isinstance(arxiv_authors_raw, str):
                    arxiv_authors = [a.strip() for a in arxiv_authors_raw.split(',')]
                else:
                    arxiv_authors = []

                # Join with comma+space for standard format
                arxiv_authors_str = ", ".join(arxiv_authors)

                # Case 1: Check if DOI exists and is in the dataframe
                if doi_key and doi_key in doi_lookup:
                    index = doi_lookup[doi_key]
                    existing_title = unified_df.at[index, 'Title']

                    if _normalize_title(existing_title) != title_key:
                        # Same DOI but different title: report only, change nothing
                        print(f"Line {line_num} - Title mismatch for DOI {arxiv_doi}:")
                        print(f"  PyAlex: {existing_title}")
                        print(f"  arXiv: {arxiv_title}")
                        mismatched_titles += 1
                    else:
                        _merge_arxiv_into_row(unified_df, index, arxiv_abstract,
                                              arxiv_authors, arxiv_authors_str)
                        updated_records += 1

                # Case 2: Check if title exists in the dataframe
                elif title_key and title_key in title_lookup:
                    index = title_lookup[title_key]
                    _merge_arxiv_into_row(unified_df, index, arxiv_abstract,
                                          arxiv_authors, arxiv_authors_str)
                    updated_records += 1

                # Case 3: New record - doesn't exist in dataframe
                else:
                    new_record = {
                        'Title': arxiv_title,
                        'DOI': arxiv_doi,
                        'OpenAlex ID': '',
                        'Publication Year': arxiv_year,
                        'Type': 'preprint',  # Default type for arXiv papers
                        'Citation Count': 0,  # Default citation count
                        'Abstract': arxiv_abstract,
                        'Journal': '',
                        'Publisher': 'arXiv',
                        'Authors': arxiv_authors_str,
                        'Institutions': [],
                        'Countries': [],
                        'Concepts': [],
                        'Sub-fields': [],
                        'Topics': [],
                        'Domains': [],
                        'Fields': [],
                        'concept_dict': {},
                        # Decade derived from the publication year
                        'decade': (arxiv_year // 10) * 10 if arxiv_year is not None else None,
                    }

                    # NOTE: appending row by row re-copies the frame each time.
                    # It is kept because later lines may have to update a row
                    # added here, and batching could alter the column dtypes.
                    unified_df = pd.concat([unified_df, pd.DataFrame([new_record])], ignore_index=True)

                    # Register the new row so later duplicates merge into it
                    new_index = len(unified_df) - 1
                    if doi_key:
                        doi_lookup[doi_key] = new_index
                    if title_key:
                        title_lookup[title_key] = new_index

                    new_records += 1

            except json.JSONDecodeError:
                print(f"Warning: Error parsing JSON at line {line_num}")
                continue
            except Exception as e:
                # Best effort: report the bad line and carry on with the rest
                print(f"Error processing line {line_num}: {e}")
                continue

    # Save the unified dataframe
    print(f"Saving unified dataset to {output_path}...")
    unified_df.to_csv(output_path, index=False)

    print("Unification complete:")
    print(f"  - Updated records: {updated_records}")
    print(f"  - New records added: {new_records}")
    print(f"  - Title mismatches found: {mismatched_titles}")
    print(f"  - Final dataframe size: {len(unified_df)}")

    return unified_df

# Example usage: merge the filtered arXiv records into the OpenAlex dataframe `df`
# loaded earlier and save the result as 'unified_dataset_2.csv' in the working directory.
unified_df = unify_datasets('filtered_arxiv_data.jsonl', df, 'unified_dataset_2.csv')