In [None]:
import chardet

# Guess the CSV's text encoding by handing chardet a sample of its raw bytes.
file_path = "/content/new_dataset.csv"

with open(file_path, "rb") as raw_file:
    # Only the first 100000 bytes are read and passed to the detector.
    sample = raw_file.read(100000)

result = chardet.detect(sample)
print("Detected encoding:", result["encoding"])


Detected encoding: UTF-8-SIG


In [None]:
! pip install pycountry

Collecting pycountry
  Downloading pycountry-24.6.1-py3-none-any.whl.metadata (12 kB)
Downloading pycountry-24.6.1-py3-none-any.whl (6.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m30.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pycountry
Successfully installed pycountry-24.6.1


In [None]:
import csv
import ast

# File locations: the CSV that is read, and the edge/node CSVs that are written.
INPUT_FILE = 'new_dataset.csv'
EDGES_FILE = 'edges_undirected.csv'
NODES_FILE = 'nodes_undirected.csv'

# Maps a sorted (author, author) tuple to the accumulated collaboration count.
pair_collab_count: dict = dict()

# Maps an author identifier to its attribute record
# ("Display Name", "Affiliations", "Country Codes", "Works Count", "Cited By Count").
nodes: dict = dict()

def update_node_attributes(author, display_name, affiliations_str, country_str, works_count, cited_by_count):
    """Create or update the attribute record for ``author`` in the module-level ``nodes`` dict.

    Args:
        author: Key under which the record is stored in ``nodes``.
        display_name: Human-readable name; stripped before storing. It is stored
            only while the record's "Display Name" is still empty, so a node that
            was first created as an empty placeholder is filled in later, and an
            already-set name is never overwritten.
        affiliations_str: ``;``-separated affiliations; each non-empty, stripped
            item is added to the record's "Affiliations" set.
        country_str: ``;``-separated codes; each non-empty, stripped item is
            added to the record's "Country Codes" set.
        works_count: Value convertible with ``int()``; replaces "Works Count".
        cited_by_count: Value convertible with ``int()``; replaces "Cited By Count".

    Returns:
        None. The function mutates ``nodes`` in place.
    """
    record = nodes.setdefault(author, {
        "Display Name": "",
        "Affiliations": set(),
        "Country Codes": set(),
        "Works Count": 0,
        "Cited By Count": 0
    })

    # Fill the display name whenever it is still blank, not only when the record
    # is first created: the record may already exist with an empty name.
    cleaned_name = display_name.strip() if display_name else ""
    if cleaned_name and not record["Display Name"]:
        record["Display Name"] = cleaned_name

    # Both multi-valued fields share the same "split on ';', strip, drop empties" rule.
    for field, raw_value in (("Affiliations", affiliations_str), ("Country Codes", country_str)):
        if raw_value:
            record[field].update(
                item.strip() for item in raw_value.split(';') if item.strip()
            )

    for field, raw_value in (("Works Count", works_count), ("Cited By Count", cited_by_count)):
        try:
            record[field] = int(raw_value)
        except (ValueError, TypeError):
            # Best effort: a blank or non-numeric value leaves the previous count untouched.
            pass

# Read INPUT_FILE row by row. Each row with a non-empty "Author Queried" updates
# that author's node record, then its "Coauthor Collaboration Details" cell is
# parsed as a Python literal dict of {coauthor name: collection of collaborations}
# and folded into pair_collab_count, keyed by the sorted (author, coauthor) tuple.
#
# To avoid double counting a pair that is reported from both sides, the count
# reported by the member that sorts first is the one accumulated into
# pair_collab_count. Counts reported only by the other member are kept aside here
# and used as a fallback after the loop, so a pair reported from just that side
# still produces an edge instead of being dropped.
reverse_collab_count = {}

# newline='' lets the csv module handle line endings itself, including newlines
# embedded in quoted fields.
with open(INPUT_FILE, 'r', encoding='utf-8-sig', newline='') as f:
    reader = csv.DictReader(f)

    for row in reader:
        # DictReader fills cells missing from a short row with None, so guard
        # with `or ""` before calling .strip().
        author_queried = (row.get("Author Queried") or "").strip()
        if not author_queried:
            continue

        display_name = row.get("Display Name (Best Match)", "")
        affiliations = row.get("Affiliations", "")
        country_codes = row.get("Country Codes", "")
        works_count = row.get("Works Count", "")
        cited_by_count = row.get("Cited By Count", "")

        update_node_attributes(author_queried, display_name, affiliations, country_codes, works_count, cited_by_count)

        coauthors_str = (row.get("Coauthor Collaboration Details") or "").strip()
        if not coauthors_str or coauthors_str == "{}":
            continue

        try:
            # literal_eval only accepts Python literals; it does not execute code.
            coauthors_dict = ast.literal_eval(coauthors_str)
        except Exception as e:
            print(f"Error parsing coauthor details for {author_queried}: {e}")
            continue

        if not coauthors_dict:
            continue

        if not isinstance(coauthors_dict, dict):
            # A valid literal that is not a mapping (e.g. a list) has no .items().
            print(
                f"Error parsing coauthor details for {author_queried}: "
                f"expected a dict, got {type(coauthors_dict).__name__}"
            )
            continue

        for coauthor_name, collab_list in coauthors_dict.items():
            coauthor_name = coauthor_name.strip()
            if not coauthor_name or not collab_list:
                continue

            # Make sure every coauthor has a node, even if it never appears as
            # a queried author; its attributes stay empty until then.
            if coauthor_name not in nodes:
                nodes[coauthor_name] = {
                    "Display Name": "",
                    "Affiliations": set(),
                    "Country Codes": set(),
                    "Works Count": 0,
                    "Cited By Count": 0
                }

            pair_key = tuple(sorted([author_queried, coauthor_name]))
            count = len(collab_list)
            if author_queried == pair_key[0]:
                pair_collab_count[pair_key] = pair_collab_count.get(pair_key, 0) + count
            else:
                reverse_collab_count[pair_key] = reverse_collab_count.get(pair_key, 0) + count

# Pairs already counted from the first-sorting member keep that count unchanged;
# only pairs that would otherwise have no edge take the other side's count.
for pair_key, count in reverse_collab_count.items():
    pair_collab_count.setdefault(pair_key, count)

# Write the edge list: a header row, then one row per author pair with its count.
with open(EDGES_FILE, 'w', newline='', encoding='utf-8-sig') as edges_out:
    edge_writer = csv.writer(edges_out)
    edge_writer.writerow(["Source", "Target", "Weight"])
    edge_writer.writerows(
        [source, target, weight]
        for (source, target), weight in pair_collab_count.items()
    )

# Write the node table: a header row, then one row per entry in `nodes`.
with open(NODES_FILE, 'w', newline='', encoding='utf-8-sig') as nodes_out:
    node_writer = csv.writer(nodes_out)
    header = [
        "Id", "Label", "Display Name", "Affiliations",
        "Country Codes", "Works Count", "Cited By Count"
    ]
    node_writer.writerow(header)

    for node_id, record in nodes.items():
        # The set-valued fields are flattened to a sorted, "; "-joined string.
        joined = {
            field: "; ".join(sorted(record[field]))
            for field in ("Affiliations", "Country Codes")
        }
        node_writer.writerow([
            node_id,  # Id
            node_id,  # Label repeats the Id
            record["Display Name"],
            joined["Affiliations"],
            joined["Country Codes"],
            record["Works Count"],
            record["Cited By Count"],
        ])
