# Nationality data shape analysis notebook

## Define default variables
Overwritten by widget values if the notebook is viewed in Mercury

In [292]:

# Default nationality for the "Nationality:selected" section; per the note
# above this cell, Mercury replaces it with the widget value when the notebook
# is viewed as a web app.
selected_nationality = "American"
# Echo the active value so the rendered notebook records which selection was used.
print(f"selected_nationality={selected_nationality}")


selected_nationality=American


# Linked Art - Data Shape - Persons in Exhibitions
This notebook shows the *shape* of data for persons associated with exhibitions, included in the selected Linked Art dataset.

## Purpose
Understanding the shape of data helps to identify features of the data that can be presented in a useable, browsable sectioned display.


It is proposed that a characteristic of the data that results in an even distribution of records will correspond to a useable, browsable breakdown of the data that can be presented in a sectioned display.

## How-to

### Visual Studio Code - Jupyter plugin
https://marketplace.visualstudio.com/items?itemName=ms-toolsai.jupyter 

If you have the Jupyter notebook extension, you can run the notebooks in Visual Studio Code editor:
- open and run the `variables.ipynb` notebook
- open and run the `persons.ipynb` notebook

### Jupyter Notebook
https://jupyter.org/ 


To run the Jupyter notebook in the browser, use the following command:
- `jupyter notebook`
- open and run the `variables.ipynb` notebook
- open and run the `persons.ipynb` notebook

### Mercury Web application
https://mljar.com/mercury/ 

To run this notebook as a web app using Mercury, use the following commands in the root directory:

- `mercury run variables.ipynb`
- `mercury run persons.ipynb`

- Go to the URL provided by Mercury, e.g. http://127.0.0.1:8000/ 
- Run the Variables notebook
- Open the Persons notebook, and click the green `Run` button in the left-hand column

## Get stored variables
Get stored variables - see variables.ipynb

In [293]:

# Restore variables that were persisted with IPython's %store magic.
# Each `%store -r NAME` fails to define NAME if it was never stored, so the
# notebook that stores them has to be run first.

# Not referenced in the cells visible in this notebook - presumably kept for
# parity with sibling notebooks; confirm before removing.
%store -r summary_data_dir_activity
%store -r activity_all_file


# Directory and filename of the all-persons summary JSON that is loaded below.
%store -r summary_data_dir_person
%store -r persons_all_file

# The remaining names are not referenced in the visible cells either.
%store -r events_nonmoma

%store -r json_suffix

%store -r linked_data_filepath_group

%store -r datavis_venue_exhibitions 

In [294]:
import os
import json

try:
    import plotly.express as px
except:
    %pip install plotly.express
    import plotly.express as px


In [295]:


def create_histogram(data, x_label, y_label, histogram_title, entity_type, selected_field, ignore_list, histogram_type, sort_type):
    """Build a bar chart counting entities per (optionally derived) field value.

    Parameters
    ----------
    data : dict
        Mapping that holds the list of entity dicts under ``entity_type``.
    x_label, y_label : str
        Axis labels for the chart.
    histogram_title : str
        Chart title.
    entity_type : str
        Key in ``data`` whose list of entities is counted.
    selected_field : str
        Entity key whose value is counted. Entities without the key, or with a
        ``None`` value, are skipped.
    ignore_list : list
        Bucket labels to leave out of the chart (compared after derivation).
    histogram_type : str
        How the raw value is turned into a bucket label:
        ``"surname"``     - first character of the first space-separated token, upper-cased;
        ``"born"``        - text before the first ``-``;
        ``"born_decade"`` - first three characters of that text followed by ``"0s"``;
        anything else     - the raw value is used unchanged.
    sort_type : int
        Index into each ``(label, count)`` pair used as sort key:
        ``0`` orders bars by label, ``1`` by count.

    Returns
    -------
    The figure object returned by ``px.bar``.
    """
    counts = {}
    # iterate through selected entity type
    for entity in data.get(entity_type):
        if selected_field not in entity:
            continue
        value = entity.get(selected_field)
        if value is None:
            continue

        # Derived buckets work on text; str() lets numeric values (e.g. an
        # integer year) be bucketed instead of failing on .split().
        if histogram_type == "surname":
            value = str(value).split(" ")[0][:1].upper()
        elif histogram_type == "born":
            value = str(value).split("-")[0]
        elif histogram_type == "born_decade":
            value = str(value).split("-")[0][:3] + "0s"

        if value in ignore_list:
            continue

        counts[value] = counts.get(value, 0) + 1

    # sort the dictionary by frequency or label - determined by var sort_type
    counts = dict(sorted(counts.items(), key=lambda item: item[sort_type]))

    # Materialise the dict views as lists so the plotting call receives plain sequences.
    fig = px.bar(
        x=list(counts.keys()),
        y=list(counts.values()),
        labels={"x": x_label, "y": y_label},
        title=histogram_title,
    )
    return fig

def create_summary_datafile(data, entity_type, selected_field, ignore_list, histogram_type, sort_type):
    """Group entities by a (optionally derived) field value and count each group.

    Parameters
    ----------
    data : dict
        Mapping that holds the list of entity dicts under ``entity_type``.
    entity_type : str
        Key in ``data`` to read, and key under which the grouped entities are
        returned.
    selected_field : str
        Entity key whose value defines the group. Entities without the key, or
        with a ``None`` value, are skipped.
    ignore_list : list
        Group labels to leave out (compared after derivation).
    histogram_type : str
        ``"surname"``     - first character of the first space-separated token, upper-cased;
        ``"born"``        - text before the first ``-``;
        ``"born_decade"`` - first three characters of that text followed by ``"0s"``;
        anything else     - the raw value is used unchanged.
    sort_type : int
        Index into each ``(label, count)`` pair used as sort key:
        ``0`` orders groups by label, ``1`` by count.

    Returns
    -------
    dict
        ``{"count": {label: n, ...}, entity_type: {label: [entity, ...], ...}}``
        with both inner dicts in the same order.
    """
    grouped = {}
    counts = {}
    # iterate through selected entity type
    for entity in data.get(entity_type):
        if selected_field not in entity:
            continue
        value = entity.get(selected_field)
        if value is None:
            continue

        # Derived labels work on text; str() lets numeric values (e.g. an
        # integer year) be grouped instead of failing on .split().
        if histogram_type == "surname":
            value = str(value).split(" ")[0][:1].upper()
        elif histogram_type == "born":
            value = str(value).split("-")[0]
        elif histogram_type == "born_decade":
            value = str(value).split("-")[0][:3] + "0s"

        if value in ignore_list:
            continue

        counts[value] = counts.get(value, 0) + 1
        grouped.setdefault(value, []).append(entity)

    counts = dict(sorted(counts.items(), key=lambda item: item[sort_type]))

    # Reuse the order of the sorted counts for the groups. Sorting the groups
    # by item[sort_type] directly would compare lists of dicts when
    # sort_type == 1, which raises TypeError.
    grouped = {label: grouped[label] for label in counts}

    return {"count": counts, entity_type: grouped}


def getNonAmericanData(data, selected_field):
    """Return the persons whose ``selected_field`` holds a usable non-American value.

    A person is kept when the field is present, is not ``None``, and is not one
    of ``"Male"``, ``"Female"``, ``"American"`` or the empty string. The
    original order of ``data["persons"]`` is preserved.
    """
    excluded_values = ["Male", "Female", "American", ""]

    def _is_kept(person):
        # Persons lacking the field are dropped outright.
        if selected_field not in person:
            return False
        value = person.get(selected_field)
        return not (value in excluded_values or value is None)

    return [person for person in data.get("persons") if _is_kept(person)]

def getSelectedNationalityData(data, selected_field, selected_value):
    """Return the persons whose ``selected_field`` equals ``selected_value``.

    Persons are skipped when the field is missing, ``None``, the empty string,
    or one of the placeholder values ``"Male"`` / ``"Female"`` - so selecting
    one of those values always yields an empty list. The original order of
    ``data["persons"]`` is preserved.

    Parameters
    ----------
    data : dict
        Mapping that holds the list of person dicts under ``"persons"``.
    selected_field : str
        Person key to compare.
    selected_value : str
        Value the field must equal for the person to be returned.
    """
    data_selected = []
    entity_type = "persons"
    ignore_list = ["Male", "Female", ""]

    # iterate through selected entity type
    for entity in data.get(entity_type):
        if selected_field not in entity:
            continue
        value = entity.get(selected_field)
        # Test for None with `is` rather than through a module-level `null`
        # name, so the function does not need any global to be defined first.
        if value is None or value in ignore_list:
            continue
        if value != selected_value:
            continue

        data_selected.append(entity)

    return data_selected



In [296]:
## read person data file
file_dir = summary_data_dir_person
file_entity = persons_all_file

# Reset first so a failed load cannot leave records from an earlier run behind.
data = {}

# open summary data file containing all person records
# The encoding is given explicitly so decoding does not depend on the
# platform's default locale encoding.
with open(os.path.join(file_dir, file_entity), 'r', encoding='utf-8') as json_file:
    # get data
    data = json.load(json_file)

# Report the record count once the file has been closed.
print(str(len(data.get("persons"))) + " persons in data file - " + persons_all_file)


23684 persons in data file - persons_all.json


## Nationality:all 
excluding empty values

In [297]:
# Chart settings for the count of persons per nationality.
x_label = "Nationality"
y_label = "Number of people"
histogram_title = "Nationality:all"
# Key of the record list inside `data`; later cells read this global as well.
entity_type = "persons"

selected_field = "nationality"

# Values left out of the chart: the empty string plus "Male"/"Female"
# (presumably gender values that ended up in the nationality field - confirm
# against the source data).
ignore_list = ["", "Male", "Female"]

# histogram_type "" keeps the raw field value; sort_type 1 orders bars by count.
histogram = create_histogram(data, x_label, y_label, histogram_title, entity_type, selected_field, ignore_list,"", 1)
histogram.show()

### Nationality:all --> Surname

In [298]:
# get data for selected nationality
# (here: every person that has a non-empty nationality value)

data_selected = []

entity_type = "persons"
selected_field = "nationality"

# NOTE(review): `null` and `data_dict` are not used in this cell; `null` is a
# notebook-global alias for None that code elsewhere in the notebook may read,
# so confirm before removing either name.
null = None
ignore_list = ["Male", "Female"]
data_dict = {}
# iterate through selected entity type
for entity in data.get(entity_type):
    if selected_field in entity:
        property = entity.get(selected_field)
        # Only "" and the ignore_list values are dropped; a None nationality
        # is not filtered here and is appended.
        if property == "" or property in ignore_list:
            continue
        data_selected.append(entity)


# Chart settings for the breakdown by first letter of the name field.
x_label = "Surname"
y_label = "Number of people"
histogram_title = f"Nationality:all --> Surname"


selected_field = "name"
histogram_type = "surname"
ignore_list = [""]

# Skip the chart when no person passed the filter.
if len(data_selected) > 0:

    # sort_type 0 orders the bars by label.
    histogram = create_histogram({"persons": data_selected}, x_label, y_label, histogram_title, entity_type, selected_field, ignore_list,histogram_type,0)
    histogram.show()

### Nationality:selected - Surname

In [299]:
# Keep only the persons whose nationality equals the current selection.
data_selected = getSelectedNationalityData(data, "nationality", selected_nationality)

# Chart settings: surname-initial breakdown, bars ordered by label.
entity_type = "persons"
selected_field = "name"
histogram_type = "surname"
ignore_list = [""]
histogram_title = f"Nationality:{selected_nationality} --> Surname"
x_label = "Surname"
y_label = "Number of people"

# Nothing is drawn when the selection matches nobody.
if data_selected:
    histogram = create_histogram(
        {"persons": data_selected},
        x_label,
        y_label,
        histogram_title,
        entity_type,
        selected_field,
        ignore_list,
        histogram_type,
        0,
    )
    histogram.show()

## Nationality:American

### Nationality:American -> Surname

In [300]:
# Persons whose nationality is exactly "American".
selected_nationality = "American"
data_selected = getSelectedNationalityData(data, "nationality", selected_nationality)

# Set here so the cell does not rely on a value left behind by an earlier cell.
entity_type = "persons"
x_label = "Surname"
y_label = "Number of people"
histogram_title = "Nationality:American --> Surname"

selected_field = "name"
histogram_type = "surname"
ignore_list = [""]

# Skip the chart when nobody matched; sort_type 0 orders the bars by label.
if len(data_selected) > 0:

    histogram = create_histogram({"persons": data_selected}, x_label, y_label, histogram_title, entity_type, selected_field, ignore_list, histogram_type, 0)
    histogram.show()

### Nationality:American --> Birth year

In [307]:


# Persons whose nationality is exactly "American".
selected_nationality = "American"
data_selected = getSelectedNationalityData(data, "nationality", selected_nationality)


# Set here so the cell does not rely on a value left behind by an earlier cell.
entity_type = "persons"
# The bars are birth years, so the x axis is labelled accordingly.
x_label = "Birth year"
y_label = "Number of people"
histogram_title = "Nationality:American / Birth year"


selected_field = "born"
histogram_type = "born"
ignore_list = []

# Skip chart and export when nobody matched; sort_type 0 orders by year label.
if len(data_selected) > 0:

    histogram = create_histogram({"persons": data_selected}, x_label, y_label, histogram_title, entity_type, selected_field, ignore_list, histogram_type, 0)
    histogram.show()

    # Same grouping as the chart, exported with the matching person records.
    summary_datafile = create_summary_datafile({"persons": data_selected}, entity_type, selected_field, ignore_list, histogram_type, 0)

    # Explicit encoding keeps the written file independent of the platform default.
    with open(os.path.join(summary_data_dir_person, 'persons_american_birthyear.json'), 'w', encoding='utf-8') as file:
        file.write(json.dumps(summary_datafile, indent=2))

### Nationality:American --> Birth decade

In [302]:

# Persons whose nationality is exactly "American".
selected_nationality = "American"
data_selected = getSelectedNationalityData(data, "nationality", selected_nationality)

# Set here so the cell does not rely on a value left behind by an earlier cell.
entity_type = "persons"
x_label = "Decade"
y_label = "Number of people"
histogram_title = "Nationality:American --> Birth decade"


selected_field = "born"
histogram_type = "born_decade"
# "0s" is the decade label produced from an empty birth value, so it is excluded.
ignore_list = ["", "0s"]

# Skip the chart when nobody matched; sort_type 0 orders the bars by label.
if len(data_selected) > 0:

    histogram = create_histogram({"persons": data_selected}, x_label, y_label, histogram_title, entity_type, selected_field, ignore_list, histogram_type, 0)
    histogram.show()

### Nationality:non-American 
excluding 'American' and empty values

In [303]:


histogram_title = "Nationality:non-American"
# Same exclusions as the all-nationalities chart, plus "American".
ignore_list = ["", "Male", "Female", "American"]

x_label = "Nationality"
y_label = "Number of people"

# Set here so the cell does not rely on a value left behind by an earlier cell.
entity_type = "persons"
selected_field = "nationality"


# histogram_type "" keeps the raw field value; sort_type 1 orders bars by count.
histogram = create_histogram(data, x_label, y_label, histogram_title, entity_type, selected_field, ignore_list, "", 1)
histogram.show()



### Nationality:non-American --> Surname

In [304]:
# Persons with a usable nationality other than "American".
data_selected = getNonAmericanData(data, "nationality")

# Set here so the cell does not rely on a value left behind by an earlier cell.
entity_type = "persons"
x_label = "Surname"
y_label = "Number of people"
histogram_title = "Nationality:non-American --> Surname"

selected_field = "name"
histogram_type = "surname"
ignore_list = [""]

# Skip the chart when nobody matched; sort_type 0 orders the bars by label.
if len(data_selected) > 0:

    histogram = create_histogram({"persons": data_selected}, x_label, y_label, histogram_title, entity_type, selected_field, ignore_list, histogram_type, 0)
    histogram.show()

### Nationality:non-American --> Birth year

In [308]:
# Persons with a usable nationality other than "American".
data_selected = getNonAmericanData(data, "nationality")


entity_type = "persons"
# The bars are birth years, so the x axis is labelled accordingly.
x_label = "Birth year"
y_label = "Number of people"
histogram_title = "Nationality:non-American --> Birth year"

selected_field = "born"
histogram_type = "born"
ignore_list = []

# Skip chart and export when nobody matched; sort_type 0 orders by year label.
if len(data_selected) > 0:

    histogram = create_histogram({"persons": data_selected}, x_label, y_label, histogram_title, entity_type, selected_field, ignore_list, histogram_type, 0)
    histogram.show()

    # Same grouping as the chart, exported with the matching person records.
    summary_datafile = create_summary_datafile({"persons": data_selected}, entity_type, selected_field, ignore_list, histogram_type, 0)

    # Explicit encoding keeps the written file independent of the platform default.
    with open(os.path.join(summary_data_dir_person, 'persons_nonus_birthyear.json'), 'w', encoding='utf-8') as file:
        file.write(json.dumps(summary_datafile, indent=2))

### Nationality:non-American --> Birth decade

In [306]:

# Persons with a usable nationality other than "American".
data_selected = getNonAmericanData(data, "nationality")

# Set here so the cell does not rely on a value left behind by an earlier cell.
entity_type = "persons"
# The bars are birth decades, so the x axis is labelled accordingly.
x_label = "Decade"
y_label = "Number of people"
histogram_title = "Nationality:non-American --> Birth decade"

selected_field = "born"
histogram_type = "born_decade"
# "0s" is the decade label produced from an empty birth value, so it is excluded.
ignore_list = ["", "0s"]

# Skip the chart when nobody matched; sort_type 0 orders the bars by label.
if len(data_selected) > 0:

    histogram = create_histogram({"persons": data_selected}, x_label, y_label, histogram_title, entity_type, selected_field, ignore_list, histogram_type, 0)
    histogram.show()