# Graph data analysis with NetworkX and pydash

In [1]:
%pip install networkx requests pandas pydash pydantic

Note: you may need to restart the kernel to use updated packages.


In [2]:
import networkx as nx
import pandas as pd
from pydash import py_
import requests

In [3]:
API_URL = "https://api.nobelprize.org/2.1"

# The limit is set well above the expected number of laureates so that a single
# request is presumably enough -- TODO confirm against the API's paging metadata.
# A timeout keeps the cell from hanging forever if the service is unreachable.
r = requests.get(f"{API_URL}/laureates", params={"limit": 2000}, timeout=30)

# Fail loudly on an HTTP error instead of with a confusing JSON/KeyError below.
r.raise_for_status()

laureates = r.json()["laureates"]

## Defining node models

In [4]:
from enum import (
    Enum,
    auto,
)

class NodeType(Enum):
    """Kinds of node stored in the graph.

    The numeric values carry no meaning of their own; they are the same
    consecutive integers (starting at 1) that ``auto()`` would assign.
    """

    LAUREATE_PERSON = 1
    LAUREATE_ORGANIZATION = 2
    PRIZE = 3
    INSTITUTION = 4

In [5]:
import abc
from typing import (
    ClassVar,
    Optional,
)

from pydantic import (
    BaseModel,
    Field,
)


class Node(BaseModel, abc.ABC):
    """Common base for every model used as a graph node.

    Subclasses set ``node_type`` as a class-level constant and inherit the
    ``id`` field, which is also what the hash is derived from.
    """

    # Class-level tag (not a model field) identifying the kind of node.
    node_type: ClassVar[NodeType]
    id: str

    def __hash__(self) -> int:
        """Hash on ``id`` so instances can be used as graph nodes / dict keys."""
        return hash(self.id)

In [6]:
class LaureatePerson(Node):
    """A laureate that is an individual person."""

    node_type = NodeType.LAUREATE_PERSON
    name: str
    gender: str
    birth_date: Optional[str] = None
    death_date: Optional[str] = None
    birth_location: Optional[str] = None
    birth_country: Optional[str] = None

    @classmethod
    def from_dict(cls, data: dict) -> "LaureatePerson":
        """Build a person node from one laureate record.

        Values are read with pydash deep paths, so a missing branch yields
        ``None`` rather than raising. The ``id`` is the kebab-cased English
        full name.
        """

        def read(path: str):
            # Deep lookup into the raw record; None when the path is absent.
            return py_.get(data, path)

        display_name = read("fullName.en")

        return cls(
            id=py_.kebab_case(display_name),
            name=display_name,
            gender=read("gender"),
            birth_date=read("birth.date"),
            death_date=read("death.date"),
            birth_location=read("birth.place.locationString.en"),
            birth_country=read("birth.place.countryNow.en"),
        )


class LaureateOrganization(Node):
    """A laureate that is an organization rather than a person."""

    node_type = NodeType.LAUREATE_ORGANIZATION
    name: str

    @classmethod
    def from_dict(cls, data: dict) -> "LaureateOrganization":
        """Build an organization node from one laureate record.

        The ``id`` is the kebab-cased English organization name.
        """
        # Lazy pydash chain rooted at the English organization name.
        english_name = py_(data).get("orgName.en")

        return cls(
            id=english_name.kebab_case().value(),
            name=english_name.value(),
        )

In [7]:
class Prize(Node):
    """One prize entry as listed under a laureate record."""

    node_type = NodeType.PRIZE
    award_year: str
    category_slug: str
    sort_order: int
    portion: str
    motivation: str
    prize_status: str
    date_awarded: Optional[str] = None

    @classmethod
    def from_dict(cls, data: dict) -> "Prize":
        """Build a prize node from one ``nobelPrizes`` entry.

        The ``id`` combines award year, kebab-cased English category and the
        entry's sort order.
        """

        def read(path: str):
            # Deep lookup into the raw entry; None when the path is absent.
            return py_.get(data, path)

        year = read("awardYear")
        slug = py_.kebab_case(read("category.en"))
        order = read("sortOrder")

        return cls(
            id=f"{year}-{slug}-{order}",
            award_year=year,
            category_slug=slug,
            sort_order=order,
            portion=read("portion"),
            motivation=read("motivation.en"),
            prize_status=read("prizeStatus"),
            date_awarded=read("dateAwarded"),
        )

In [8]:
class Institution(Node):
    """An institution a laureate was affiliated with for a given prize."""

    node_type = NodeType.INSTITUTION
    # `from_dict` has always passed `name=...`, but the field was not declared,
    # so the value was silently discarded (pydantic ignores unknown keyword
    # arguments by default). Optional, so records without a name still load.
    name: Optional[str] = None
    location: Optional[str] = None
    country: Optional[str] = None

    @classmethod
    def from_dict(cls, data: dict) -> "Institution":
        """Build an institution node from one ``affiliations`` entry.

        Values are read with pydash deep paths, so a missing branch yields
        ``None`` rather than raising. The ``id`` is the kebab-cased English
        name.
        """
        inst = py_(data)
        name = inst.get("name.en")

        return cls(
            id=name.kebab_case().value(),
            name=name.value(),
            location=inst.get("locationString.en").value(),
            country=inst.get("countryNow.en").value(),
        )

## Loading the dataset into a graph structure

In [9]:
# Undirected graph whose nodes are the laureate, prize and institution model
# instances added by the loading cell below.
G = nx.Graph()

In [10]:
# Populate the graph: one node per laureate, an edge from each of their prizes
# to the laureate, and an edge from each prize to every affiliated institution.
for laureate in laureates:
    # Records carrying a `fullName` are treated as persons; everything else is
    # loaded as an organization.
    if laureate.get("fullName") is not None:
        laureate_node = LaureatePerson.from_dict(laureate)
    else:
        laureate_node = LaureateOrganization.from_dict(laureate)

    G.add_node(laureate_node)

    # `or []` keeps the loop from raising TypeError when the key is missing or
    # null, matching the defensive lookup already used for affiliations.
    for prize in laureate.get("nobelPrizes") or []:
        prize_node = Prize.from_dict(prize)

        G.add_edge(prize_node, laureate_node)

        for affiliation in prize.get("affiliations") or []:
            institution_node = Institution.from_dict(affiliation)
            G.add_edge(prize_node, institution_node)

## Querying the graph structure

### Female laureates, deceased

In [11]:
# Women among the person laureates with a recorded death date,
# most recent death first.
(
    py_(G.nodes.keys())
    .filter({"node_type": NodeType.LAUREATE_PERSON, "gender": "female"})
    .filter(lambda person: bool(person.death_date))
    .sort_by(lambda person: person.death_date, reverse=True)
    .map(lambda person: person.model_dump())
    .thru(pd.DataFrame)
    .value()
)

Unnamed: 0,id,name,gender,birth_date,death_date,birth_location,birth_country
0,elizabeth-williams,Elizabeth Williams,female,1943-05-22,2020-03-17,"Belfast, Northern Ireland",Northern Ireland
1,toni-morrison,Toni Morrison,female,1931-02-18,2019-08-05,"Lorain, OH, USA",USA
2,nadine-gordimer,Nadine Gordimer,female,1923-11-20,2014-07-13,"Springs, South Africa",South Africa
3,doris-lessing,Doris Lessing,female,1919-10-22,2013-11-17,"Kermanshah, Persia (now Iran)",Iran
4,rita-levi-montalcini,Rita Levi-Montalcini,female,1909-04-22,2012-12-30,"Turin, Italy",Italy
5,elinor-ostrom,Elinor Ostrom,female,1933-08-07,2012-06-12,"Los Angeles, CA, USA",USA
6,wislawa-szymborska,Wislawa Szymborska,female,1923-07-02,2012-02-01,"Bnin (now Kórnik), Poland",Poland
7,wangari-muta-maathai,Wangari Muta Maathai,female,1940-04-01,2011-09-25,"Nyeri, Kenya",Kenya
8,rosalyn-yalow,Rosalyn Yalow,female,1921-07-19,2011-05-30,"New York, NY, USA",USA
9,gertrude-b-elion,Gertrude B. Elion,female,1918-01-23,1999-02-21,"New York, NY, USA",USA


### Female laureates, living

In [12]:
# Women among the person laureates without a recorded death date.
(
    py_(G.nodes.keys())
    .filter({"node_type": NodeType.LAUREATE_PERSON, "gender": "female"})
    .filter(lambda person: not person.death_date)
    .map(lambda person: person.model_dump())
    .thru(pd.DataFrame)
    .value()
)

Unnamed: 0,id,name,gender,birth_date,death_date,birth_location,birth_country
0,ada-e-yonath,Ada E. Yonath,female,1939-06-22,,"Jerusalem, British Mandate of Palestine (now I...",Israel
1,alice-munro,Alice Munro,female,1931-07-10,,"Wingham, Canada",Canada
2,andrea-ghez,Andrea Ghez,female,1965-06-16,,"New York, NY, USA",USA
3,annie-ernaux,Annie Ernaux,female,1940-09-01,,"Lillebonne, France",France
4,aung-san-suu-kyi,Aung San Suu Kyi,female,1945-06-19,,"Rangoon, Burma (now Yangon, Myanmar)",Myanmar
5,carol-w-greider,Carol W. Greider,female,1961-04-15,,"San Diego, CA, USA",USA
6,carolyn-r-bertozzi,Carolyn R. Bertozzi,female,1966-10-10,,"Boston, MA, USA",USA
7,christiane-nusslein-volhard,Christiane Nüsslein-Volhard,female,1942-10-20,,"Magdeburg, Germany",Germany
8,donna-strickland,Donna Strickland,female,1959-05-27,,"Guelph, Canada",Canada
9,elfriede-jelinek,Elfriede Jelinek,female,1946-10-20,,"Mürzzuschlag, Austria",Austria


### Person laureates who have been awarded multiple times

In [13]:
def _is_prize_or_laureate(node) -> bool:
    """Return True for prize and laureate nodes (person or organization)."""
    return node.node_type in (
        NodeType.PRIZE,
        NodeType.LAUREATE_PERSON,
        NodeType.LAUREATE_ORGANIZATION,
    )


# Read-only view of G restricted to prize and laureate nodes, so a laureate's
# degree in this view counts only their prize edges.
v_prize_laureate = nx.subgraph_view(G, filter_node=_is_prize_or_laureate)

In [14]:
# Person laureates connected to more than one prize in the prize/laureate view.
(
    py_(G.nodes.keys())
    .filter({"node_type": NodeType.LAUREATE_PERSON})
    .filter(lambda person: v_prize_laureate.degree(person) > 1)
    .map(lambda person: person.model_dump())
    .thru(pd.DataFrame)
    .value()
)

Unnamed: 0,id,name,gender,birth_date,death_date,birth_location,birth_country
0,frederick-sanger,Frederick Sanger,male,1918-08-13,2013-11-19,"Rendcombe, United Kingdom",United Kingdom
1,john-bardeen,John Bardeen,male,1908-05-23,1991-01-30,"Madison, WI, USA",USA
2,k-barry-sharpless,K. Barry Sharpless,male,1941-04-28,,"Philadelphia, PA, USA",USA
3,linus-carl-pauling,Linus Carl Pauling,male,1901-02-28,1994-08-19,"Portland, OR, USA",USA
4,marie-curie-nee-sklodowska,"Marie Curie, née Sklodowska",female,1867-11-07,1934-07-04,"Warsaw, Russian Empire (now Poland)",Poland


In [15]:
import itertools as it

import matplotlib.pyplot as plt

In [16]:
# Pick an arbitrary example node (the seventh one in the graph's node order)
# to explore its neighbourhood in the next cell.
n = list(G)[6]
n

LaureatePerson(id='aaron-ciechanover', name='Aaron Ciechanover', gender='male', birth_date='1947-10-01', death_date=None, birth_location='Haifa, British Protectorate of Palestine (now Israel)', birth_country='Israel')

In [17]:
# Shortest paths from `n` to everything at most one hop away. The first entry
# is skipped; in the output shown for this cell that leaves only the prize.
paths_within_one_hop = nx.single_source_shortest_path(G, n, cutoff=1)

# Note: despite its name, `edges` holds the nodes along the path to `node`.
for node, edges in it.islice(paths_within_one_hop.items(), 1, None):
    print(node.id)
    print([step.id for step in edges])

2004-chemistry-1
['aaron-ciechanover', '2004-chemistry-1']


In [19]:
def _join_with_key_node(related_nodes, key_node):
    """Return one merged row per related node for a single mapping entry.

    Each row is the related node's dump overlaid with the key node's dump, so
    for keys present in both (such as ``id``) the key node's value wins.
    """
    key_fields = key_node.model_dump()
    return [related.model_dump() | key_fields for related in related_nodes]


# Reusable (late-bound) pydash pipeline: takes a mapping of node -> related
# nodes and produces a single DataFrame with one row per (node, related) pair.
to_joined_dataframe = (
    py_()
    .map_values(_join_with_key_node)
    .values()
    .flatten()
    .thru(pd.DataFrame)
)

In [20]:
# Multi-prize person laureates joined with each of their prizes.
(
    py_(G.nodes.keys())
    .filter({"node_type": NodeType.LAUREATE_PERSON})
    .filter(lambda n: v_prize_laureate.degree(n) > 1)
    # Map every remaining laureate to its neighbours in the prize/laureate
    # view. Built in one pass instead of merging a new dict per element.
    .thru(lambda persons: {n: list(v_prize_laureate.adj[n]) for n in persons})
    .thru(to_joined_dataframe)
    .value()
)

Unnamed: 0,id,award_year,category_slug,sort_order,portion,motivation,prize_status,date_awarded,name,gender,birth_date,death_date,birth_location,birth_country
0,frederick-sanger,1958,chemistry,1,1,"for his work on the structure of proteins, esp...",received,1958-10-28,Frederick Sanger,male,1918-08-13,2013-11-19,"Rendcombe, United Kingdom",United Kingdom
1,frederick-sanger,1980,chemistry,3,1/4,for their contributions concerning the determi...,received,1980-10-14,Frederick Sanger,male,1918-08-13,2013-11-19,"Rendcombe, United Kingdom",United Kingdom
2,john-bardeen,1956,physics,2,1/3,for their researches on semiconductors and the...,received,1956-11-01,John Bardeen,male,1908-05-23,1991-01-30,"Madison, WI, USA",USA
3,john-bardeen,1972,physics,1,1/3,for their jointly developed theory of supercon...,received,1972-10-20,John Bardeen,male,1908-05-23,1991-01-30,"Madison, WI, USA",USA
4,k-barry-sharpless,2001,chemistry,3,1/2,for his work on chirally catalysed oxidation r...,received,2001-10-10,K. Barry Sharpless,male,1941-04-28,,"Philadelphia, PA, USA",USA
5,k-barry-sharpless,2022,chemistry,3,1/3,for the development of click chemistry and bio...,received,2022-10-05,K. Barry Sharpless,male,1941-04-28,,"Philadelphia, PA, USA",USA
6,linus-carl-pauling,1962,peace,1,1,for his fight against the nuclear arms race be...,received,1963-10-10,Linus Carl Pauling,male,1901-02-28,1994-08-19,"Portland, OR, USA",USA
7,linus-carl-pauling,1954,chemistry,1,1,for his research into the nature of the chemic...,received,1954-11-03,Linus Carl Pauling,male,1901-02-28,1994-08-19,"Portland, OR, USA",USA
8,marie-curie-nee-sklodowska,1903,physics,3,1/4,in recognition of the extraordinary services t...,received,1903-11-12,"Marie Curie, née Sklodowska",female,1867-11-07,1934-07-04,"Warsaw, Russian Empire (now Poland)",Poland
9,marie-curie-nee-sklodowska,1911,chemistry,1,1,in recognition of her services to the advancem...,received,1911-11-07,"Marie Curie, née Sklodowska",female,1867-11-07,1934-07-04,"Warsaw, Russian Empire (now Poland)",Poland
