In [None]:
import pandas as pd
import json
from collections import Counter


def combine_dfs(df1, df2, df3, df4, df5, df6) -> pd.DataFrame:
    """
    Stacks six DataFrames into a single one.

    Rows in which every value is missing are removed from each input first;
    the remaining rows are concatenated in argument order and the index is
    renumbered from 0.

    Parameters:
        df1, df2, df3, df4, df5, df6 (pd.DataFrame): The frames to combine.

    Returns:
        pd.DataFrame: The concatenated frame with a fresh integer index.
    """
    rows_with_data = [
        frame.dropna(how="all") for frame in (df1, df2, df3, df4, df5, df6)
    ]
    return pd.concat(rows_with_data, ignore_index=True)


def get_authors_country_counts(df: pd.DataFrame) -> pd.DataFrame:
    """
    Counts how often each country other than 'BR' appears among the authors listed
    in the 'authorships' column and ranks the countries by that count.

    Every country code found in an author's 'countries' list counts once, so a
    publication contributes one occurrence per (author, country) pair.

    Rows whose 'authorships' value is missing, is not valid JSON, or does not
    decode to a list are skipped. Authors that are not JSON objects, or whose
    'countries' entry is absent or null, contribute nothing.

    Parameters:
        df (pd.DataFrame): The input DataFrame containing an 'authorships' column
                           of JSON strings, each encoding a list of author objects.

    Returns:
        pd.DataFrame: One row per country, sorted by 'count' in descending order
                      (ties keep first-seen order), with four columns:
                      'authors_country', 'count', 'percentage' and 'position'.
                      'percentage' is the country's share of all counted
                      occurrences; 'position' is the 1-based rank.
                      The frame is empty when no country was counted.
    """
    country_counter = Counter()

    # Only one column is needed, so iterate over it directly instead of over rows.
    for raw_authorships in df["authorships"]:
        try:
            authors_list = json.loads(raw_authorships)
        except (TypeError, ValueError):
            # TypeError: missing cell (NaN/None) rather than a string.
            # ValueError: malformed JSON (json.JSONDecodeError is a subclass).
            continue
        if not isinstance(authors_list, list):
            continue

        for author in authors_list:
            if not isinstance(author, dict):
                continue
            # `or []` also covers an explicit JSON null for 'countries'.
            countries = author.get("countries") or []
            # Exclude 'BR' as instructed
            country_counter.update(
                country for country in countries if country != "BR"
            )

    # most_common() sorts by count (descending) and keeps first-seen order for ties,
    # so the ranking is deterministic.
    ranked = country_counter.most_common()
    result_df = pd.DataFrame(ranked, columns=["authors_country", "count"]).astype(
        {"count": "int64"}
    )

    # Share of each country in the total number of counted occurrences.
    total_count = result_df["count"].sum()
    result_df["percentage"] = result_df["count"] / total_count * 100

    result_df["position"] = range(1, len(result_df) + 1)

    return result_df


def filter_subfielf_publications(df: pd.DataFrame, subfiled: str) -> pd.DataFrame:
    """
    Filters the DataFrame to include only publications whose subfield
    display_name equals `subfiled`.

    The 'subfield' column is expected to contain JSON strings, each encoding an
    object with a 'display_name' key. Rows whose 'subfield' value is missing,
    is not valid JSON, or does not decode to a JSON object never match.

    Parameters:
        df (pd.DataFrame): The input DataFrame containing a 'subfield' column.
        subfiled (str): The subfield display name to keep,
                        e.g. 'Artificial Intelligence'.

    Returns:
        pd.DataFrame: The matching rows of `df`, with the original index labels
                      and columns.
    """

    def _has_requested_subfield(raw_subfield) -> bool:
        # Decide for a single cell whether it names the requested subfield.
        try:
            subfield = json.loads(raw_subfield)
        except (TypeError, ValueError):
            # TypeError: missing cell (NaN/None); ValueError: malformed JSON.
            return False
        return isinstance(subfield, dict) and subfield.get("display_name") == subfiled

    # Build the mask with an explicit bool dtype so that an empty input still
    # yields a boolean row filter rather than an object-dtype Series.
    mask = pd.Series(
        [_has_requested_subfield(value) for value in df["subfield"]],
        index=df.index,
        dtype=bool,
    )
    return df[mask]


def filter_publications_by_citation_count(
    df: pd.DataFrame, num_citations: int
) -> pd.DataFrame:
    """
    Keeps the publications whose 'cited_by_count' is strictly greater than
    `num_citations`.

    Parameters:
        df (pd.DataFrame): The input DataFrame containing a 'cited_by_count' column.
        num_citations (int): The threshold; rows at or below it are discarded.

    Returns:
        pd.DataFrame: The rows of `df` that exceed the threshold.
    """
    exceeds_threshold = df["cited_by_count"].gt(num_citations)
    return df.loc[exceeds_threshold]


def author_stats_per_subfield(df: pd.DataFrame) -> pd.DataFrame:
    """
    Computes average, median, and standard deviation of the number of authors per publication
    for each subfield and for the entire dataset.

    Assumptions:
      - 'authorships' is a JSON string representing a list of authors.
      - 'subfield' is a JSON string containing a 'display_name' key.

    Handling of unusable cells:
      - A publication whose 'authorships' value is missing, malformed, or not a
        JSON list has no author count and is left out of every statistic.
      - A publication whose 'subfield' value is missing, malformed, or not a
        JSON object belongs to no subfield group, but still counts towards
        the 'All Subfields' row.

    The input DataFrame is not modified.

    Returns:
      A DataFrame with columns:
        'subfield'      : The subfield name (or 'All Subfields' for overall stats).
        'avg_authors'   : The average number of authors.
        'median_authors': The median number of authors.
        'stdv_authors'  : The standard deviation of the number of authors
                          (NaN for a group with a single publication).
        'position'      : 1-based row number in the returned order: subfields
                          sorted by name, followed by 'All Subfields'.
    """

    def _decode(raw_value):
        # Decode one JSON cell; missing (TypeError) or malformed (ValueError)
        # values become None instead of aborting the whole computation.
        try:
            return json.loads(raw_value)
        except (TypeError, ValueError):
            return None

    authorships = [_decode(value) for value in df["authorships"]]
    subfields = [_decode(value) for value in df["subfield"]]

    # Work on a small derived frame so the caller's DataFrame is never touched.
    per_publication = pd.DataFrame(
        {
            "subfield_name": pd.Series(
                [
                    entry.get("display_name") if isinstance(entry, dict) else None
                    for entry in subfields
                ],
                index=df.index,
                dtype="object",
            ),
            # NaN marks "author count unknown"; pandas skips NaN in mean/median/std.
            "num_authors": pd.Series(
                [
                    len(entry) if isinstance(entry, list) else float("nan")
                    for entry in authorships
                ],
                index=df.index,
                dtype="float64",
            ),
        }
    )

    # Per-subfield statistics; groupby sorts the subfield names and drops
    # publications without a subfield name.
    stats_df = (
        per_publication.groupby("subfield_name")["num_authors"]
        .agg(["mean", "median", "std"])
        .reset_index()
        .rename(
            columns={
                "subfield_name": "subfield",
                "mean": "avg_authors",
                "median": "median_authors",
                "std": "stdv_authors",
            }
        )
    )

    # Overall statistics across every publication with a known author count.
    num_authors = per_publication["num_authors"]
    overall_df = pd.DataFrame(
        {
            "subfield": ["All Subfields"],
            "avg_authors": [num_authors.mean()],
            "median_authors": [num_authors.median()],
            "stdv_authors": [num_authors.std()],
        }
    )

    # Append the overall statistics to the per-subfield statistics
    result_df = pd.concat([stats_df, overall_df], ignore_index=True)
    result_df["position"] = result_df.index + 1

    return result_df

In [None]:
# Common prefix of the per-year OpenAlex exports; the year and extension are appended.
DATA_PATH = "../data/csv/openalex/br_publications_"

# Read the six yearly files in chronological order, one DataFrame per year.
df_2019, df_2020, df_2021, df_2022, df_2023, df_2024 = (
    pd.read_csv(DATA_PATH + file_suffix)
    for file_suffix in (
        "2019.csv",
        "2020.csv",
        "2021.csv",
        "2022.csv",
        "2023.csv",
        "2024.csv",
    )
)

# Merge the yearly frames into one and save the combined dataset without the row index.
br_publications_df = combine_dfs(df_2019, df_2020, df_2021, df_2022, df_2023, df_2024)
br_publications_df.to_csv("../data/csv/openalex/br_publications.csv", index=False)

In [48]:
# Count, over all combined publications, how often each non-'BR' country appears among the authors.
countries_collabs_df = get_authors_country_counts(br_publications_df)

In [54]:
# Save only the first ten rows; the frame is sorted by count (descending), so these are the most frequent countries.
countries_collabs_df.head(10).to_csv("../data/csv/countries_collabs.csv", index=False)

In [50]:
# Keep only the publications whose subfield display_name is "Artificial Intelligence".
ai_publications_df = filter_subfielf_publications(
    br_publications_df, "Artificial Intelligence"
)

In [57]:
# Country codes of the ten most frequent non-'BR' author countries within the AI subset.
get_authors_country_counts(ai_publications_df).head(10)["authors_country"]

0    US
1    GB
2    ES
3    FR
4    PT
5    IT
6    DE
7    CA
8    CN
9    AU
Name: authors_country, dtype: object

In [52]:
# Author-count statistics per subfield (plus an "All Subfields" row), ordered by average team size.
# NOTE(review): 'position' is assigned inside author_stats_per_subfield before this sort,
# so after sorting it no longer matches the row order and is not a rank by avg_authors.
authors_metrics_df = author_stats_per_subfield(br_publications_df).sort_values(
    by="avg_authors", ascending=False
)
authors_metrics_df

Unnamed: 0,subfield,avg_authors,median_authors,stdv_authors,position
6,Hardware and Architecture,4.280702,4.0,2.021686,7
5,Computer Vision and Pattern Recognition,4.242988,4.0,3.337106,6
9,Signal Processing,4.151786,4.0,2.25373,10
8,Information Systems,4.069209,3.0,2.74458,9
3,Computer Networks and Communications,4.044111,4.0,2.739825,4
1,Computational Theory and Mathematics,4.036394,3.0,4.81167,2
11,All Subfields,4.028716,3.0,3.072116,12
0,Artificial Intelligence,3.947218,3.0,3.532199,1
4,Computer Science Applications,3.741397,3.0,2.306885,5
7,Human-Computer Interaction,3.694374,3.0,2.513284,8


In [53]:
# Persist the sorted author statistics; index=False leaves the row index out of the file.
authors_metrics_df.to_csv("../data/csv/openalex/authors_metrics.csv", index=False)