In [1]:
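## Read visualization/assets/data/<period>/rank<period>.csv and visualization/assets/data/<period>/rel<period>.csv
## and clean the data so that every id in rank appears as a source or target in rel, and every source/target in rel exists in rank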

# Attempt to fix pandas import issue
# try:
# # Reinstall pandas
# import sys
# import subprocess
# print("Attempting to fix pandas...")
# subprocess.check_call([sys.executable, "-m", "pip", "uninstall", "-y", "pandas"])
# subprocess.check_call([sys.executable, "-m", "pip", "install", "--force-reinstall", "pandas"])
# print("Reinstalled pandas!")
# except Exception as e:
# print(f"Failed to fix pandas: {e}")

import csv
import os
from pathlib import Path

try:
    import pandas as pd
    print("Successfully imported pandas and other libraries")
except ImportError as e:
    # Leave `pd` unbound on purpose: later code detects the fallback mode by
    # checking whether the name `pd` exists.
    print(f"Unable to import pandas: {e}")
    print("Attempt to use fallback method for CSV files...")


# Fallback helpers that mimic the small part of pandas this notebook needs.
# They only depend on the standard library, so they are defined regardless of
# whether pandas imported successfully.
def read_csv_manually(filepath):
    """Read a CSV file into a list of row dictionaries.

    Args:
        filepath: Path of the CSV file; the first row is taken as the header.

    Returns:
        ``{'data': [row_dict, ...], 'headers': [column_name, ...]}`` where
        every value is a string. An empty file yields empty lists for both.
    """
    # newline='' lets the csv module handle line endings itself, so a CRLF
    # embedded in a quoted field is preserved; 'utf-8-sig' strips a leading BOM
    # so the first column name is not prefixed with '\ufeff'.
    with open(filepath, 'r', newline='', encoding='utf-8-sig') as f:
        reader = csv.reader(f)
        # Default of [] avoids StopIteration on a completely empty file.
        headers = next(reader, [])
        # csv.reader yields [] for blank lines; skip them (as pandas does)
        # instead of producing records that lack every column.
        data = [dict(zip(headers, row)) for row in reader if row]
    return {'data': data, 'headers': headers}


def write_csv_manually(data, headers, filepath):
    """Write row dictionaries to a CSV file, overwriting any existing file.

    Args:
        data: Iterable of dictionaries, one per output row.
        headers: Column names; written as the first row and used to order the
            values of every record. Keys missing from a record are written as
            empty strings.
        filepath: Destination path.
    """
    with open(filepath, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(headers)
        writer.writerows([item.get(h, '') for h in headers] for item in data)

# Periods to process: the ten monthly snapshots 202401 .. 202410.
time_periods = [f"2024{str(i).zfill(2)}" for i in range(1, 11)]

# True when the name `pd` is bound (pandas imported); otherwise the csv-based
# fallback helpers are used.
pandas_available = 'pd' in globals()

# Root directory holding one sub-directory per period (loop-invariant).
base_path = Path("../visualization/assets/data")


def clean_frames(rank_df, rel_df):
    """Make a rank/rel DataFrame pair mutually consistent.

    Edges are filtered first and nodes second. Doing it in this order matters:
    dropping an edge that points at an unknown ID can leave its other endpoint
    without any edge, and that node must then be dropped as well.

    Args:
        rank_df: DataFrame with an ``id`` column (one row per node).
        rel_df: DataFrame with ``source`` and ``target`` columns (one row per edge).

    Returns:
        ``(rank_df, rel_df, orphan_ids, unknown_ids)`` where ``orphan_ids`` is
        the set of rank IDs removed because no remaining edge references them
        and ``unknown_ids`` is the set of edge endpoints absent from rank.
        IDs are compared (and reported) as strings.
    """
    # NOTE(review): IDs are compared via astype(str); if one column is parsed
    # as float (e.g. because of missing values) "7.0" will not match "7".
    rank_id = rank_df['id'].astype(str)
    source = rel_df['source'].astype(str)
    target = rel_df['target'].astype(str)

    known_ids = set(rank_id)
    unknown_ids = (set(source) | set(target)) - known_ids
    if unknown_ids:
        keep_edge = ~(source.isin(unknown_ids) | target.isin(unknown_ids))
        rel_df = rel_df[keep_edge]
        source, target = source[keep_edge], target[keep_edge]

    # Computed from the edges that survived, not from the original edge list.
    orphan_ids = known_ids - (set(source) | set(target))
    if orphan_ids:
        rank_df = rank_df[~rank_id.isin(orphan_ids)]

    return rank_df, rel_df, orphan_ids, unknown_ids


def clean_records(rank_items, rel_items):
    """Same contract as ``clean_frames`` but for lists of row dictionaries.

    Args:
        rank_items: List of dicts with an ``id`` key.
        rel_items: List of dicts with ``source`` and ``target`` keys.

    Returns:
        ``(rank_items, rel_items, orphan_ids, unknown_ids)``; see ``clean_frames``.
    """
    known_ids = {str(item['id']) for item in rank_items}
    endpoints = {str(item[key]) for item in rel_items for key in ('source', 'target')}
    unknown_ids = endpoints - known_ids
    if unknown_ids:
        rel_items = [
            item for item in rel_items
            if str(item['source']) not in unknown_ids
            and str(item['target']) not in unknown_ids
        ]

    # Recompute endpoints from the surviving edges before looking for orphans.
    connected_ids = {str(item[key]) for item in rel_items for key in ('source', 'target')}
    orphan_ids = known_ids - connected_ids
    if orphan_ids:
        rank_items = [item for item in rank_items if str(item['id']) not in orphan_ids]

    return rank_items, rel_items, orphan_ids, unknown_ids


def report_cleanup(node_count, edge_count, orphan_ids, unknown_ids):
    """Print the per-period summary of what the cleaning step removed."""
    if orphan_ids:
        print(f" {len(orphan_ids)} rankrelID")
    if unknown_ids:
        print(f" {len(unknown_ids)} relrankID")
    print(f"Processed data: {node_count} nodes, {edge_count} relations")
    print(f"Removed {len(orphan_ids)} nodesand containing {len(unknown_ids)} IDs")


for period in time_periods:
    print(f"\nProcessing period: {period}")

    # Build file paths
    rank_file = base_path / period / f"rank{period}.csv"
    rel_file = base_path / period / f"rel{period}.csv"

    # Both files are required; skip the period when either one is missing.
    if not rank_file.exists() or not rel_file.exists():
        print(f"File does not exist, skipping period {period}")
        continue

    # Best effort per period: a failure is reported and the loop moves on.
    try:
        if pandas_available:
            rank_df = pd.read_csv(rank_file)
            rel_df = pd.read_csv(rel_file)
            print(f"Raw data: {len(rank_df)} nodes, {len(rel_df)} relations")

            rank_df, rel_df, orphan_ids, unknown_ids = clean_frames(rank_df, rel_df)
            report_cleanup(len(rank_df), len(rel_df), orphan_ids, unknown_ids)

            # The input files are overwritten in place with the cleaned data.
            rank_df.to_csv(rank_file, index=False)
            rel_df.to_csv(rel_file, index=False)
        else:
            rank_data = read_csv_manually(rank_file)
            rel_data = read_csv_manually(rel_file)
            rank_items = rank_data['data']
            rel_items = rel_data['data']
            print(f"Raw data: {len(rank_items)} nodes, {len(rel_items)} relations")

            rank_items, rel_items, orphan_ids, unknown_ids = clean_records(rank_items, rel_items)
            report_cleanup(len(rank_items), len(rel_items), orphan_ids, unknown_ids)

            # The input files are overwritten in place with the cleaned data.
            write_csv_manually(rank_items, rank_data['headers'], rank_file)
            write_csv_manually(rel_items, rel_data['headers'], rel_file)
        print(f"Save processed datapath")
    except Exception as e:
        print(f"Processing period {period} error occurred: {str(e)}")

print("\nAll periods processed")


Successfully imported pandas and other libraries

Processing period: 202401
Raw data: 58668 nodes, 547798 relations
Processed data: 58668 nodes, 547798 relations
Removed 0 nodesand containing 0 IDs
Save processed datapath

Processing period: 202402
Raw data: 58447 nodes, 522426 relations
Processed data: 58447 nodes, 522426 relations
Removed 0 nodesand containing 0 IDs
Save processed datapath

Processing period: 202403
Raw data: 58742 nodes, 543897 relations
Processed data: 58742 nodes, 543897 relations
Removed 0 nodesand containing 0 IDs
Save processed datapath

Processing period: 202404
Raw data: 58902 nodes, 527637 relations
Processed data: 58902 nodes, 527637 relations
Removed 0 nodesand containing 0 IDs
Save processed datapath

Processing period: 202405
Raw data: 58733 nodes, 650737 relations
Processed data: 58733 nodes, 650737 relations
Removed 0 nodesand containing 0 IDs
Save processed datapath

Processing period: 202406
Raw data: 58575 nodes, 656538 relations
Processed data: 585