In [None]:
!pip install chardet




In [None]:
import chardet

# Path of the raw CSV whose encoding is being probed.
file_path = "/content/CPSC 572 Project Raw Dataset(Sheet1).csv"

# chardet guesses the encoding from a leading sample of raw bytes
# (at most 100000 of them), so the file is opened in binary mode.
with open(file_path, "rb") as f:
    result = chardet.detect(f.read(100000))

print("Detected encoding:", result["encoding"])


Detected encoding: Windows-1252


In [None]:
import requests
import pandas as pd
import chardet
import time

def get_coauthors(author_id, per_page=200, timeout=30):
    """
    Fetch the display names of everyone who shares a work with an author.

    Walks the OpenAlex ``/works`` listing filtered by ``author_id`` using
    cursor paging and collects the display name of every listed author whose
    ID differs from ``author_id``.
    Note: Currently not used in the main workflow.

    Args:
        author_id: OpenAlex author ID used in the works filter.
        per_page: Number of works requested per page.
        timeout: Seconds to wait for each HTTP response before giving up.

    Returns:
        A list of unique co-author display names (order is not defined).
        If a request fails, the error is printed and the names collected
        up to that point are returned.
    """
    endpoint = "https://api.openalex.org/works"
    coauthors = set()
    # OpenAlex only includes `meta.next_cursor` in a response when the request
    # itself carries a cursor, so paging must be started explicitly with "*".
    cursor = "*"
    while cursor:
        params = {
            "filter": f"authorships.author.id:{author_id}",
            "per-page": per_page,
            "cursor": cursor,
        }
        try:
            response = requests.get(endpoint, params=params, timeout=timeout)
            response.raise_for_status()
            # Decoding stays inside the try so a malformed body is reported
            # like any other failed request instead of crashing the loop.
            data = response.json()
        except (requests.exceptions.RequestException, ValueError) as e:
            print(f"Error fetching co-authors for {author_id}: {e}")
            break

        for work in data.get("results", []):
            for authorship in work.get("authorships", []):
                author = authorship.get("author") or {}
                if author and author.get("id") != author_id:
                    # A present-but-null display name falls back to the placeholder.
                    coauthors.add(author.get("display_name") or "Unknown Co-author")

        cursor = (data.get("meta") or {}).get("next_cursor")
        if cursor:
            # Pause between pages to stay polite towards the API.
            time.sleep(1)

    return list(coauthors)

def detect_encoding(file_path):
    """
    Guess the text encoding of a file so its CSV content can be read reliably.

    Only the first 100000 bytes of the file are handed to chardet; the
    "encoding" entry of chardet's result is returned unchanged.
    """
    with open(file_path, "rb") as handle:
        sample = handle.read(100000)
    detection = chardet.detect(sample)
    return detection["encoding"]

def get_author_details(author_id, timeout=30):
    """
    Fetch detailed metadata about an author from OpenAlex.

    Args:
        author_id: OpenAlex author ID appended to the ``/authors/`` endpoint.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        The decoded JSON body of the response, or an empty dict when
        ``author_id`` is empty or the request/decoding fails (the error is
        printed in that case).
    """
    if not author_id:
        # Without an ID the URL would address the author *list* endpoint,
        # whose payload is not an author record.
        return {}

    url = f"https://api.openalex.org/authors/{author_id}"
    try:
        # A timeout keeps one stalled connection from hanging the whole run.
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        return response.json()
    except (requests.exceptions.RequestException, ValueError) as e:
        print(f"Error fetching author details for {author_id}: {e}")
        return {}

def search_openalex_authors(name, timeout=30):
    """
    Search authors by name.
    Return the list of matching authors from the OpenAlex query.

    Args:
        name: Author name to look up via the ``display_name.search`` filter.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        The "results" list of the response, or an empty list when the name
        is blank or the request/decoding fails (the error is printed).

    NOTE(review): the recorded run output lists several names with middle
    initials as unmatched; presumably the search needs every token to match
    the stored display name -- confirm before relying on match rates.
    """
    if not name:
        return []

    # In OpenAlex filter syntax "," separates filters and "|" means OR, so
    # either character inside a name would change the meaning of the query.
    query = " ".join(str(name).replace(",", " ").replace("|", " ").split())
    if not query:
        return []

    try:
        # Passing the filter through `params` lets requests percent-encode
        # characters such as "&", "#" or "+" that would corrupt a hand-built URL.
        response = requests.get(
            "https://api.openalex.org/authors",
            params={"filter": f"display_name.search:{query}"},
            timeout=timeout,
        )
        response.raise_for_status()
        data = response.json()
        return data.get("results", [])
    except (requests.exceptions.RequestException, ValueError) as e:
        print(f"Error fetching author {name}: {e}")
        return []

def get_author_works(author_id, per_page=200, timeout=30):
    """
    Fetch works for a given author.
    This function uses cursor paging to retrieve all pages of results.

    Args:
        author_id: OpenAlex author ID used in the works filter.
        per_page: Number of works requested per page.
        timeout: Seconds to wait for each HTTP response before giving up.

    Returns:
        A list with the "results" entries of every page fetched. If a
        request fails, the error is printed and the works gathered up to
        that point are returned.
    """
    endpoint = "https://api.openalex.org/works"
    works = []
    # OpenAlex only includes `meta.next_cursor` in a response when the request
    # itself carries a cursor; without the initial "*" only one page is read.
    cursor = "*"
    while cursor:
        params = {
            "filter": f"authorships.author.id:{author_id}",
            "per-page": per_page,
            "cursor": cursor,
        }
        try:
            response = requests.get(endpoint, params=params, timeout=timeout)
            response.raise_for_status()
            # Decoding stays inside the try so a malformed body is reported
            # like any other failed request instead of crashing the loop.
            data = response.json()
        except (requests.exceptions.RequestException, ValueError) as e:
            print(f"Error fetching works for {author_id}: {e}")
            break

        works.extend(data.get("results", []))
        cursor = (data.get("meta") or {}).get("next_cursor")
        if cursor:
            # Pause between pages to stay polite towards the API.
            time.sleep(1)
    return works

def get_affiliations_and_countries(author_dict):
    """
    Collect institution names and country codes from an author record.

    Reads the record's 'affiliations' entry; when it is a list, each item's
    'institution' mapping contributes one name and one country code. A
    missing key yields the "Unknown ..." placeholder, while a present but
    falsy value yields an empty string.

    Returns:
        A pair ``(affiliations, countries)`` of equally long lists; both are
        empty when 'affiliations' is absent or not a list.
    """
    entries = author_dict.get("affiliations", [])
    if not isinstance(entries, list):
        return [], []

    institutions = [entry.get("institution", {}) for entry in entries]
    affiliations = [
        inst.get("display_name", "Unknown Institution") or "" for inst in institutions
    ]
    countries = [
        inst.get("country_code", "Unknown Country") or "" for inst in institutions
    ]
    return affiliations, countries

def analyze_coauthors_and_positions(author_id, works_data, valid_author_names):
    """
    Summarise where the target author sits in each work's author list and
    which allowed co-authors appear alongside them.

    For every work whose authorships include ``author_id``, the author's
    index is classified as first (index 0), last (final index) or middle.
    Every other author with a truthy ID whose display name is in
    ``valid_author_names`` gets one tuple per shared work:
    ``(work_id, target_index, coauthor_index, number_of_authors)``.

    Note: co-authors are filtered by display name because the CSV only
    supplies names; matching by a unique identifier would be more reliable
    if one were available.

    Returns:
        A dict with the keys "total_works", "first_author_count",
        "last_author_count", "middle_author_count" and "coauthor_details"
        (display name -> list of the tuples described above).
    """
    works_with_target = 0
    position_tally = {"first": 0, "middle": 0, "last": 0}
    collaborations = {}

    for work in works_data:
        work_id = work.get("id", "")
        authorships = work.get("authorships", [])
        if not authorships:
            continue

        # (id, display name) for every listed author, in listing order.
        roster = []
        for entry in authorships:
            person = entry.get("author", {})
            roster.append((person.get("id"), person.get("display_name", "Unknown")))

        hits = [pos for pos, (person_id, _) in enumerate(roster) if person_id == author_id]
        if not hits:
            continue

        # If the target is listed more than once, the last occurrence wins.
        target_pos = hits[-1]
        roster_size = len(roster)
        works_with_target += 1

        # A sole author has index 0 and therefore counts as first author.
        if target_pos == 0:
            slot = "first"
        elif target_pos == roster_size - 1:
            slot = "last"
        else:
            slot = "middle"
        position_tally[slot] += 1

        for pos, (person_id, person_name) in enumerate(roster):
            if not person_id or person_id == author_id:
                continue
            if person_name not in valid_author_names:
                continue
            collaborations.setdefault(person_name, []).append(
                (work_id, target_pos, pos, roster_size)
            )

    return {
        "total_works": works_with_target,
        "first_author_count": position_tally["first"],
        "last_author_count": position_tally["last"],
        "middle_author_count": position_tally["middle"],
        "coauthor_details": collaborations
    }

def process_authors_from_csv(input_csv, output_csv):
    """
    Reads author names from the first column of 'input_csv', fetches detailed data
    from OpenAlex, filters co-authors to only those in the same CSV's list of authors,
    and writes enriched data to 'output_csv'.

    For each non-blank name the first search hit is taken as the best match.
    Names without any hit still produce a row, with every field except
    "Author Queried" left empty.

    Args:
        input_csv: Path of the CSV whose first column holds author names.
        output_csv: Path the enriched CSV is written to (UTF-8 with BOM).

    Returns:
        The enriched pandas DataFrame that was written to 'output_csv', so
        callers can inspect the result without re-reading the file.

    Note: co-authors are matched by comparing OpenAlex display names with
    the names in the CSV, so a co-author whose display name is spelled
    differently from the CSV entry is not recorded.
    """
    # Fixed column order; also guarantees a header row when there are no names.
    columns = [
        "Author Queried",
        "OpenAlex ID",
        "Display Name (Best Match)",
        "Works Count",
        "Cited By Count",
        "Affiliations",
        "Country Codes",
        "Authorship Summary",
        "Coauthor Collaboration Details",
    ]

    encoding = detect_encoding(input_csv)
    df = pd.read_csv(input_csv, encoding=encoding)

    # Clean the name column once: drop missing cells, trim, drop blanks.
    names = df.iloc[:, 0].dropna().astype(str).str.strip()
    names = names[names != ""].tolist()
    valid_author_names = set(names)

    results = []

    for name in names:
        authors_found = search_openalex_authors(name)
        if not authors_found:
            print(f"No match found on OpenAlex for '{name}'.")
            row = dict.fromkeys(columns, "")
            row["Author Queried"] = name
            results.append(row)
            continue

        best_match = authors_found[0]
        target_author_id = best_match.get("id", "")
        display_name = best_match.get("display_name", "")

        # If the details request fails it yields {}, so fall back to the
        # counts carried by the search hit instead of silently reporting 0.
        author_details = get_author_details(target_author_id)
        works_count = author_details.get(
            "works_count", best_match.get("works_count", 0)
        )
        cited_by_count = author_details.get(
            "cited_by_count", best_match.get("cited_by_count", 0)
        )

        works_data = get_author_works(target_author_id)
        affiliations, countries = get_affiliations_and_countries(best_match)
        authorship_stats = analyze_coauthors_and_positions(
            author_id=target_author_id,
            works_data=works_data,
            valid_author_names=valid_author_names
        )

        authorship_summary = (
            f"Total works: {authorship_stats['total_works']}; "
            f"First-author: {authorship_stats['first_author_count']}; "
            f"Middle-author: {authorship_stats['middle_author_count']}; "
            f"Last-author: {authorship_stats['last_author_count']}"
        )

        # Expand the positional tuples into labelled dicts for the output cell.
        coauthor_collab_info = {}
        for coauthor, details in authorship_stats["coauthor_details"].items():
            coauthor_collab_info[coauthor] = [
                {
                    "work_id": d[0],
                    "target_author_position": d[1],
                    "coauthor_position": d[2],
                    "total_authors_in_work": d[3]
                }
                for d in details
            ]

        results.append({
            "Author Queried": name,
            "OpenAlex ID": target_author_id,
            "Display Name (Best Match)": display_name,
            "Works Count": works_count,
            "Cited By Count": cited_by_count,
            "Affiliations": "; ".join(filter(None, affiliations)),
            "Country Codes": "; ".join(filter(None, countries)),
            "Authorship Summary": authorship_summary,
            # Stored as the Python repr of the dict (not JSON).
            "Coauthor Collaboration Details": str(coauthor_collab_info)
        })

    output_df = pd.DataFrame(results, columns=columns)
    output_df.to_csv(output_csv, index=False, encoding='utf-8-sig')
    print(f"Enriched results saved to '{output_csv}'.")
    return output_df

# Raw author list; author names are read from its first column.
# NOTE(review): "/content/..." looks like a Colab upload path -- confirm the
# file is present there before running.
input_csv = "/content/CPSC 572 Project Raw Dataset(Sheet1).csv"
# Destination of the enriched dataset (relative to the working directory).
output_csv = "new_dataset.csv"

# Runs the full pipeline: OpenAlex is queried for every author name in the
# CSV, and names without a match are reported on stdout.
process_authors_from_csv(input_csv, output_csv)


No match found on OpenAlex for 'Jean E. Sammet'.
No match found on OpenAlex for 'Manuela M. Veloso'.
No match found on OpenAlex for 'Kathleen Antonelli'.
No match found on OpenAlex for 'Rohini Kesavan Srihari'.
No match found on OpenAlex for 'Daniela L. Rus'.
No match found on OpenAlex for 'Frances Spence'.
No match found on OpenAlex for 'Julie Beth Lovins'.
No match found on OpenAlex for 'Cecilia R. Aragon'.
No match found on OpenAlex for 'Ellen Fetter'.
No match found on OpenAlex for 'Tracy Chou'.
No match found on OpenAlex for 'Leslie P. Kaelbling'.
No match found on OpenAlex for 'Jennifer Tour Chayes'.
No match found on OpenAlex for 'Helen Chan Wolf'.
No match found on OpenAlex for 'Milly Koss'.
No match found on OpenAlex for 'Veronika Megler'.
No match found on OpenAlex for 'Asuman Özda?lar'.
No match found on OpenAlex for 'Maja Panti?'.
No match found on OpenAlex for 'V?ra K?rková'.
No match found on OpenAlex for 'Borka Jerman Blaži?'.
No match found on OpenAlex for 'Nell B. Dale

In [None]:
# `results` is a local variable of process_authors_from_csv(), so building a
# DataFrame from it here raises NameError on a fresh kernel. The function has
# already written the enriched rows to `output_csv`; load that file back so
# `output_df` is available for inspection in later cells.
output_df = pd.read_csv(output_csv, encoding='utf-8-sig')
print(f"Results saved to {output_csv}")

Results saved to output.csv
