In [1]:
import zipfile

# Specify the path to your .zip file
zip_file_path = "./2401.zip"

# List to store tar file names
tar_files = []

# Open the .zip file and extract tar file names
with zipfile.ZipFile(zip_file_path, 'r') as zip_file:
    tar_files = [f for f in zip_file.namelist() if f.endswith('.tar.gz')]

# Print the extracted tar file names
print("Extracted .tar.gz file names:")
for tar_name in tar_files:
    print(tar_name)

# Optionally, save the list of tar file names to a file for later use
output_txt_path = "extracted_tar_files.txt"
with open(output_txt_path, 'w') as f:
    for tar_name in tar_files:
        f.write(tar_name + '\n')

print(f"\nTotal .tar.gz files found: {len(tar_files)}")
print(f"List saved to {output_txt_path}")


Extracted .tar.gz file names:
2401/2401.00006v1.tar.gz
2401/2401.00006v2.tar.gz
2401/2401.00009v1.tar.gz
2401/2401.00016v1.tar.gz
2401/2401.00023v1.tar.gz
2401/2401.00025v1.tar.gz
2401/2401.00028v1.tar.gz
2401/2401.00028v2.tar.gz
2401/2401.00029v1.tar.gz
2401/2401.00031v1.tar.gz
2401/2401.00039v1.tar.gz
2401/2401.00050v1.tar.gz
2401/2401.00069v1.tar.gz
2401/2401.00087v1.tar.gz
2401/2401.00105v1.tar.gz
2401/2401.00109v1.tar.gz
2401/2401.00110v1.tar.gz
2401/2401.00110v2.tar.gz
2401/2401.00110v3.tar.gz
2401/2401.00124v1.tar.gz
2401/2401.00129v1.tar.gz
2401/2401.00129v2.tar.gz
2401/2401.00131v1.tar.gz
2401/2401.00132v1.tar.gz
2401/2401.00144v1.tar.gz
2401/2401.00153v1.tar.gz
2401/2401.00155v1.tar.gz
2401/2401.00157v1.tar.gz
2401/2401.00165v1.tar.gz
2401/2401.00169v1.tar.gz
2401/2401.00204v1.tar.gz
2401/2401.00213v1.tar.gz
2401/2401.00215v1.tar.gz
2401/2401.00216v1.tar.gz
2401/2401.00219v1.tar.gz
2401/2401.00221v1.tar.gz
2401/2401.00232v1.tar.gz
2401/2401.00236v1.tar.gz
2401/2401.00250v1.ta

In [10]:
import pandas as pd
import zipfile

# Path to the CSV file and the .zip file
csv_file_path = "./2401_scopus_1128.csv"
zip_file_path = "./2401.zip"

# Load the CSV file into a DataFrame
scopus_df = pd.read_csv(csv_file_path)

# Ensure all paper IDs have the correct formatting with leading zeros preserved
scopus_paper_ids = set(scopus_df['paper_id'].apply(lambda x: f"{x:.5f}" if isinstance(x, (float, int)) else str(x).strip()).unique())

# Debug print: show a few sample IDs from the CSV after conversion to strings
print("Sample paper IDs from CSV (formatted):")
print(list(scopus_paper_ids)[:10])

# Extract the unique paper IDs from the .zip file tar file names
zip_paper_ids = set()
with zipfile.ZipFile(zip_file_path, 'r') as zip_file:
    tar_files = [f for f in zip_file.namelist() if f.endswith('.tar.gz')]
    for tar_name in tar_files:
        # Remove any '._' prefix from the file name and extract only the paper_id part
        paper_id = tar_name.split('/')[-1].split('v')[0].replace('._', '').strip()
        zip_paper_ids.add(paper_id)

# Debug print: show a few sample IDs from the ZIP file
print("\nSample paper IDs from ZIP (cleaned):")
print(list(zip_paper_ids)[:10])

# Compute the comparison
only_in_scopus = scopus_paper_ids - zip_paper_ids
only_in_zip = zip_paper_ids - scopus_paper_ids
in_both = scopus_paper_ids & zip_paper_ids

# Debug prints for counts
print("\nNumber of unique paper IDs in Scopus CSV:", len(scopus_paper_ids))
print("Number of unique paper IDs in ZIP file:", len(zip_paper_ids))
print("Number of paper IDs present in both:", len(in_both))

# Print some IDs present only in CSV and only in ZIP for validation
print("\nSample paper IDs only in Scopus CSV:")
print(list(only_in_scopus)[:10])

print("\nSample paper IDs only in ZIP file:")
print(list(only_in_zip)[:10])

# Overview of results
results = {
    'Only in Scopus CSV': len(only_in_scopus),
    'Only in Zip File': len(only_in_zip),
    'In Both': len(in_both)
}

results_df = pd.DataFrame.from_dict(results, orient='index', columns=['Count'])

# Save unique paper IDs for reference
only_in_scopus_output_path = "only_in_scopus.csv"
only_in_zip_output_path = "only_in_zip.csv"
in_both_output_path = "in_both.csv"

pd.DataFrame({'Paper ID': list(only_in_scopus)}).to_csv(only_in_scopus_output_path, index=False)
pd.DataFrame({'Paper ID': list(only_in_zip)}).to_csv(only_in_zip_output_path, index=False)
pd.DataFrame({'Paper ID': list(in_both)}).to_csv(in_both_output_path, index=False)

print("\nComparison complete. Results saved to CSV files.")
print(results_df)

Sample paper IDs from CSV (formatted):
['2401.00276', '2401.00878', '2401.00060', '2401.08582', '2401.06166', '2401.00656', '2401.08624', '2401.00872', '2401.08645', '2401.00074']

Sample paper IDs from ZIP (cleaned):
['2401.04854', '2401.08224', '2401.06506', '2401.01621', '2401.01512', '2401.09242', '2401.10063', '2401.15255', '2401.10488', '2401.15204']

Number of unique paper IDs in Scopus CSV: 1128
Number of unique paper IDs in ZIP file: 2171
Number of paper IDs present in both: 117

Sample paper IDs only in Scopus CSV:
['2401.00276', '2401.00060', '2401.00878', '2401.08582', '2401.06166', '2401.00656', '2401.08624', '2401.00872', '2401.10898', '2401.08645']

Sample paper IDs only in ZIP file:
['2401.04854', '2401.08224', '2401.06506', '2401.01621', '2401.01512', '2401.09242', '2401.10063', '2401.15255', '2401.10488', '2401.15204']

Comparison complete. Results saved to CSV files.
                    Count
Only in Scopus CSV   1011
Only in Zip File     2054
In Both               1

In [9]:
import pandas as pd
import zipfile

# Path to the CSV file and the .zip file
csv_file_path = "./2201.00_scopus_931.csv"
zip_file_path = "./2201_00_all.zip"

# Load the CSV file into a DataFrame
scopus_df = pd.read_csv(csv_file_path)

# Ensure all paper IDs have the correct formatting with leading zeros preserved
scopus_paper_ids = set(scopus_df['paper_id'].apply(lambda x: f"{x:.5f}" if isinstance(x, (float, int)) else str(x).strip()).unique())

# Debug print: show a few sample IDs from the CSV after conversion to strings
print("Sample paper IDs from CSV (formatted):")
print(list(scopus_paper_ids)[:10])

# Extract the unique paper IDs from the .zip file tar file names
zip_paper_ids = set()
with zipfile.ZipFile(zip_file_path, 'r') as zip_file:
    tar_files = [f for f in zip_file.namelist() if f.endswith('.tar.gz')]
    for tar_name in tar_files:
        # Remove any '._' prefix from the file name and extract only the paper_id part
        paper_id = tar_name.split('/')[-1].split('v')[0].replace('._', '').strip()
        zip_paper_ids.add(paper_id)

# Debug print: show a few sample IDs from the ZIP file
print("\nSample paper IDs from ZIP (cleaned):")
print(list(zip_paper_ids)[:10])

# Compute the comparison
only_in_scopus = scopus_paper_ids - zip_paper_ids
only_in_zip = zip_paper_ids - scopus_paper_ids
in_both = scopus_paper_ids & zip_paper_ids

# Debug prints for counts
print("\nNumber of unique paper IDs in Scopus CSV:", len(scopus_paper_ids))
print("Number of unique paper IDs in ZIP file:", len(zip_paper_ids))
print("Number of paper IDs present in both:", len(in_both))

# Print some IDs present only in CSV and only in ZIP for validation
print("\nSample paper IDs only in Scopus CSV:")
print(list(only_in_scopus)[:10])

print("\nSample paper IDs only in ZIP file:")
print(list(only_in_zip)[:10])

# Overview of results
results = {
    'Only in Scopus CSV': len(only_in_scopus),
    'Only in Zip File': len(only_in_zip),
    'In Both': len(in_both)
}

results_df = pd.DataFrame.from_dict(results, orient='index', columns=['Count'])

# Save unique paper IDs for reference
only_in_scopus_output_path = "only_in_scopus.csv"
only_in_zip_output_path = "only_in_zip.csv"
in_both_output_path = "in_both.csv"

pd.DataFrame({'Paper ID': list(only_in_scopus)}).to_csv(only_in_scopus_output_path, index=False)
pd.DataFrame({'Paper ID': list(only_in_zip)}).to_csv(only_in_zip_output_path, index=False)
pd.DataFrame({'Paper ID': list(in_both)}).to_csv(in_both_output_path, index=False)

print("\nComparison complete. Results saved to CSV files.")
print(results_df)

Sample paper IDs from CSV (formatted):
['2201.00072', '2201.00926', '2201.00570', '2201.00403', '2201.00971', '2201.00860', '2201.00689', '2201.00073', '2201.00581', '2201.00002']

Sample paper IDs from ZIP (cleaned):
['2201.00072', '2201.00804', '2201.00570', '2201.00417', '2201.00689', '2201.00625', '2201.00876', '2201.00805', '2201.00749', '2201.00073']

Number of unique paper IDs in Scopus CSV: 931
Number of unique paper IDs in ZIP file: 293
Number of paper IDs present in both: 282

Sample paper IDs only in Scopus CSV:
['2201.00926', '2201.00971', '2201.00860', '2201.00403', '2201.00581', '2201.00002', '2201.00727', '2201.00794', '2201.00726', '2201.00554']

Sample paper IDs only in ZIP file:
['2201.00149', '2201.00965', '2201.00490', '2201.00354', '2201.00263', '2201.00724', '2201.00789', '2201.00188', '2201.00398', '2201.00145']

Comparison complete. Results saved to CSV files.
                    Count
Only in Scopus CSV    649
Only in Zip File       11
In Both               282

In [12]:
import pandas as pd
import zipfile

# Path to the CSV file and the .zip file
csv_file_path = "./2312_scopus_16493.csv"
zip_file_path = "./2312.zip"

print("------ 2312 results: -------")
# Load the CSV file into a DataFrame
scopus_df = pd.read_csv(csv_file_path)

# Ensure all paper IDs have the correct formatting with leading zeros preserved
scopus_paper_ids = set(scopus_df['paper_id'].apply(lambda x: f"{x:.5f}" if isinstance(x, (float, int)) else str(x).strip()).unique())

# Debug print: show a few sample IDs from the CSV after conversion to strings
print("Sample paper IDs from CSV (formatted):")
print(list(scopus_paper_ids)[:10])

# Extract the unique paper IDs from the .zip file tar file names
zip_paper_ids = set()
with zipfile.ZipFile(zip_file_path, 'r') as zip_file:
    tar_files = [f for f in zip_file.namelist() if f.endswith('.tar.gz')]
    for tar_name in tar_files:
        # Remove any '._' prefix from the file name and extract only the paper_id part
        paper_id = tar_name.split('/')[-1].split('v')[0].replace('._', '').strip()
        zip_paper_ids.add(paper_id)

# Debug print: show a few sample IDs from the ZIP file
print("\nSample paper IDs from ZIP (cleaned):")
print(list(zip_paper_ids)[:10])

# Compute the comparison
only_in_scopus = scopus_paper_ids - zip_paper_ids
only_in_zip = zip_paper_ids - scopus_paper_ids
in_both = scopus_paper_ids & zip_paper_ids

# Debug prints for counts
print("\nNumber of unique paper IDs in Scopus CSV:", len(scopus_paper_ids))
print("Number of unique paper IDs in ZIP file:", len(zip_paper_ids))
print("Number of paper IDs present in both:", len(in_both))

# Print some IDs present only in CSV and only in ZIP for validation
print("\nSample paper IDs only in Scopus CSV:")
print(list(only_in_scopus)[:10])

print("\nSample paper IDs only in ZIP file:")
print(list(only_in_zip)[:10])

# Overview of results
results = {
    'Only in Scopus CSV': len(only_in_scopus),
    'Only in Zip File': len(only_in_zip),
    'In Both': len(in_both)
}

results_df = pd.DataFrame.from_dict(results, orient='index', columns=['Count'])

# Save unique paper IDs for reference
only_in_scopus_output_path = "only_in_scopus.csv"
only_in_zip_output_path = "only_in_zip.csv"
in_both_output_path = "in_both.csv"

pd.DataFrame({'Paper ID': list(only_in_scopus)}).to_csv(only_in_scopus_output_path, index=False)
pd.DataFrame({'Paper ID': list(only_in_zip)}).to_csv(only_in_zip_output_path, index=False)
pd.DataFrame({'Paper ID': list(in_both)}).to_csv(in_both_output_path, index=False)

print("\nComparison complete. Results saved to CSV files.")
print(results_df)

------ 2312 results: -------
Sample paper IDs from CSV (formatted):
['2312.01370', '2312.05836', '2312.06818', '2312.17512', '2312.13059', '2312.06572', '2312.09129', '2312.09172', '2312.03374', '2312.05893']

Sample paper IDs from ZIP (cleaned):
['2312.14745', '2312.04321', '2312.09306', '2312.07913', '2312.05893', '2312.06572', '2312.08040', '2312.15942', '2312.02463', '2312.15154']

Number of unique paper IDs in Scopus CSV: 16493
Number of unique paper IDs in ZIP file: 2651
Number of paper IDs present in both: 2471

Sample paper IDs only in Scopus CSV:
['2312.01370', '2312.05836', '2312.17512', '2312.03374', '2312.13563', '2312.10475', '2312.01882', '2312.04183', '2312.12995', '2312.01661']

Sample paper IDs only in ZIP file:
['2312.14745', '2312.10010', '2312.00737', '2312.04781', '2312.08688', '2312.01988', '2312.08533', '2312.03408', '2312.11976', '2312.01106']

Comparison complete. Results saved to CSV files.
                    Count
Only in Scopus CSV  14022
Only in Zip File  