In [1]:
import pandas as pd

In [2]:
print(pd.__version__)

2.0.0


In [3]:
a_dict = {
    "school": "ABC primary school",
    "location": "London",
    "ranking": 2,
}
df = pd.json_normalize(a_dict)

df

Unnamed: 0,school,location,ranking
0,ABC primary school,London,2


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1 entries, 0 to 0
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   school    1 non-null      object
 1   location  1 non-null      object
 2   ranking   1 non-null      int64 
dtypes: int64(1), object(2)
memory usage: 156.0+ bytes


## Data as List of dictionaries

In [5]:
json_list = [
    {"class": "Year 1", "student number": 20, "room": "Yellow"},
    {"class": "Year 2", "student number": 25, "room": "Blue"},
]
df2 = pd.json_normalize(json_list)

df2

Unnamed: 0,class,student number,room
0,Year 1,20,Yellow
1,Year 2,25,Blue


In [6]:
json_list = [
    {"class": "Year 1", "num_of_students": 20, "room": "Yellow"},
    {"class": "Year 2", "room": "Blue"},  # no num_of_students
]
df3 = pd.json_normalize(json_list)

df3

Unnamed: 0,class,num_of_students,room
0,Year 1,20.0,Yellow
1,Year 2,,Blue


## When data is dictionary

In [7]:
json_obj = {
    "school": "ABC primary school",
    "location": "London",
    "ranking": 2,
    "info": {
        "president": "John Kasich",
        "contacts": {
            "email": {"admission": "admission@abc.com", "general": "info@abc.com"},
            "tel": "123456789",
        },
    },
}

df4 = pd.json_normalize(json_obj)

df4

Unnamed: 0,school,location,ranking,info.president,info.contacts.email.admission,info.contacts.email.general,info.contacts.tel
0,ABC primary school,London,2,John Kasich,admission@abc.com,info@abc.com,123456789


In [8]:
df5 = pd.json_normalize(json_obj, max_level=1)

df5

Unnamed: 0,school,location,ranking,info.president,info.contacts
0,ABC primary school,London,2,John Kasich,"{'email': {'admission': 'admission@abc.com', '..."


## When data is list of dictionaries

In [9]:
json_list = [
    {
        "class": "Year 1",
        "student count": 20,
        "room": "Yellow",
        "info": {"teachers": {"math": "Rick Scott", "physics": "Elon Mask"}},
    },
    {
        "class": "Year 2",
        "student count": 25,
        "room": "Blue",
        "info": {"teachers": {"math": "Alan Turing", "physics": "Albert Einstein"}},
    },
]

In [10]:
df5 = pd.json_normalize(json_list)

df5

Unnamed: 0,class,student count,room,info.teachers.math,info.teachers.physics
0,Year 1,20,Yellow,Rick Scott,Elon Mask
1,Year 2,25,Blue,Alan Turing,Albert Einstein


In [11]:
pd.json_normalize(json_list, max_level=1)

Unnamed: 0,class,student count,room,info.teachers
0,Year 1,20,Yellow,"{'math': 'Rick Scott', 'physics': 'Elon Mask'}"
1,Year 2,25,Blue,"{'math': 'Alan Turing', 'physics': 'Albert Ein..."


## Flatten JSON with nested lists

In [12]:
json_obj = {
    "school": "ABC primary school",
    "location": "London",
    "ranking": 2,
    "info": {
        "president": "John Kasich",
        "contacts": {
            "email": {"admission": "admission@abc.com", "general": "info@abc.com"},
            "tel": "123456789",
        },
    },
    "students": [{"name": "Tom"}, {"name": "James"}, {"name": "Jacqueline"}],
}

In [13]:
pd.json_normalize(json_obj)

Unnamed: 0,school,location,ranking,students,info.president,info.contacts.email.admission,info.contacts.email.general,info.contacts.tel
0,ABC primary school,London,2,"[{'name': 'Tom'}, {'name': 'James'}, {'name': ...",John Kasich,admission@abc.com,info@abc.com,123456789


In [14]:
# Flatten students
pd.json_normalize(json_obj, record_path=["students"])

Unnamed: 0,name
0,Tom
1,James
2,Jacqueline


In [15]:
pd.json_normalize(
    json_obj,
    record_path=["students"],
    meta=["school", ["info", "contacts", "tel"]],
)

Unnamed: 0,name,school,info.contacts.tel
0,Tom,ABC primary school,123456789
1,James,ABC primary school,123456789
2,Jacqueline,ABC primary school,123456789


In [16]:
json_list = [
    {
        "class": "Year 1",
        "student count": 20,
        "room": "Yellow",
        "info": {"teachers": {"math": "Rick Scott", "physics": "Elon Mask"}},
        "students": [
            {"name": "Tom", "sex": "M", "grades": {"math": 66, "physics": 77}},
            {"name": "James", "sex": "M", "grades": {"math": 80, "physics": 78}},
        ],
    },
    {
        "class": "Year 2",
        "student count": 25,
        "room": "Blue",
        "info": {"teachers": {"math": "Alan Turing", "physics": "Albert Einstein"}},
        "students": [
            {"name": "Tony", "sex": "M"},
            {"name": "Jacqueline", "sex": "F"},
        ],
    },
]
pd.json_normalize(json_list)

Unnamed: 0,class,student count,room,students,info.teachers.math,info.teachers.physics
0,Year 1,20,Yellow,"[{'name': 'Tom', 'sex': 'M', 'grades': {'math'...",Rick Scott,Elon Mask
1,Year 2,25,Blue,"[{'name': 'Tony', 'sex': 'M'}, {'name': 'Jacqu...",Alan Turing,Albert Einstein


In [17]:
pd.json_normalize(json_list, record_path=["students"])

Unnamed: 0,name,sex,grades.math,grades.physics
0,Tom,M,66.0,77.0
1,James,M,80.0,78.0
2,Tony,M,,
3,Jacqueline,F,,


In [18]:
pd.json_normalize(
    json_list,
    record_path=["students"],
    meta=["class", "room", ["info", "teachers", "math"]],
)

Unnamed: 0,name,sex,grades.math,grades.physics,class,room,info.teachers.math
0,Tom,M,66.0,77.0,Year 1,Yellow,Rick Scott
1,James,M,80.0,78.0,Year 1,Yellow,Rick Scott
2,Tony,M,,,Year 2,Blue,Alan Turing
3,Jacqueline,F,,,Year 2,Blue,Alan Turing


## Errors during flattenning

In [19]:
data = [
    {
        "class": "Year 1",
        "student count": 20,
        "room": "Yellow",
        "info": {
            "teachers": {
                "math": "Rick Scott",
                "physics": "Elon Mask",
            }
        },
        "students": [
            {"name": "Tom", "sex": "M"},
            {"name": "James", "sex": "M"},
        ],
    },
    {
        "class": "Year 2",
        "student count": 25,
        "room": "Blue",
        "info": {
            "teachers": {
                # no math teacher
                "physics": "Albert Einstein"
            }
        },
        "students": [
            {"name": "Tony", "sex": "M"},
            {"name": "Jacqueline", "sex": "F"},
        ],
    },
]

In [20]:
try:
    pd.json_normalize(
        data,
        record_path=["students"],
        meta=["class", "room", ["info", "teachers", "math"]],
    )
except Exception as ex:
    print(repr(ex))

KeyError("Key 'math' not found. To replace missing values of 'math' with np.nan, pass in errors='ignore'")


In [21]:
pd.json_normalize(
    data,
    record_path=["students"],
    meta=["class", "room", ["info", "teachers", "math"]],
    errors="ignore",
)

Unnamed: 0,name,sex,class,room,info.teachers.math
0,Tom,M,Year 1,Yellow,Rick Scott
1,James,M,Year 1,Yellow,Rick Scott
2,Tony,M,Year 2,Blue,
3,Jacqueline,F,Year 2,Blue,


## Custom Separator using the sep argument

In [22]:
pd.json_normalize(
    data,
    record_path=["students"],
    meta=["class", "room", ["info", "teachers", "math"]],
    errors="ignore",
    sep="->",
)

Unnamed: 0,name,sex,class,room,info->teachers->math
0,Tom,M,Year 1,Yellow,Rick Scott
1,James,M,Year 1,Yellow,Rick Scott
2,Tony,M,Year 2,Blue,
3,Jacqueline,F,Year 2,Blue,


## Adding prefix for meta and record data

In [23]:
pd.json_normalize(
    data,
    record_path=["students"],
    meta=["class"],
    meta_prefix="meta-",
    record_prefix="student-",
)

Unnamed: 0,student-name,student-sex,meta-class
0,Tom,M,Year 1
1,James,M,Year 1
2,Tony,M,Year 2
3,Jacqueline,F,Year 2


## working with remote data

In [24]:
import requests
import json

URL = "http://raw.githubusercontent.com/BindiChen/machine-learning/master/data-analysis/027-pandas-convert-json/data/simple.json"
data = json.loads(requests.get(URL).text)

# Flattening JSON data
pd.json_normalize(data)

Unnamed: 0,id,name,math,physics,chemistry
0,A001,Tom,60,66,61
1,A002,James,89,76,51
2,A003,Jenny,79,90,78


In [25]:
from pprint import pprint

json_obj = {
    {"a": 1, "b": 2, "c": []},
    {
        "a": 11,
        "b": 22,
        "c": [
            {"apple": 11, "banana": 22},
            {"apple": 111, "banana": 222},
        ],
    },
}

pprint(json_obj)

TypeError: unhashable type: 'dict'

In [26]:
pd.json_normalize(json_obj)

Unnamed: 0,school,location,ranking,students,info.president,info.contacts.email.admission,info.contacts.email.general,info.contacts.tel
0,ABC primary school,London,2,"[{'name': 'Tom'}, {'name': 'James'}, {'name': ...",John Kasich,admission@abc.com,info@abc.com,123456789
