# Data visualization project - WikiViz 
<hr style="border:3px solid gray">

# Imports 

In [10]:
import plotly.express as px
import pandas as pd
from dash import Dash, dcc, html, Input, Output, ctx
import plotly.graph_objects as go
import dash
import json
import numpy as np
import geopandas as gpd

# Loading data and preprocessing

In [7]:
# Loading the dataset
df = pd.read_csv("Data/AgesDatasetV4.csv")

# Loading geojson of world
with open('custom2.geo.json', 'r', encoding='utf-8') as f:
    geojson = json.load(f)

In [8]:
category_mapping = {
    'Arts and Entertainment': ['Artist', 'Novelist', 'Pianist', 'Film producer', 'Cinematographer', 'Designer', 'Playwright', 'Author'],
    'Politics and Public Service': ['Politician', 'Military personnel', 'Flying ace', 'Judge', 'Jurist', 'Police officer', 'Minister'],
    'Science and Academia': ['Researcher', 'Astronomer', 'Biologist', 'Academic', 'Anthropologist', 'Geographer', 'Psychologist', 'Philosopher'],
    'Religion': ['Religious figure', 'Rabbi'],
    'Sports': ['Athlete', 'Rower', 'Fencer', 'Amateur wrestler', 'Sailor'],
    'Business and Commerce': ['Businessperson', 'Entrepreneur', 'Banker', 'Merchant', 'Publisher'],
    'Healthcare': ['Physician', 'Surgeon', 'Psychiatrist'],
    'Law and Justice': ['Lawyer'],
    'Media and Communication': ['Journalist', 'Translator'],
    'Education': ['Teacher', 'Librarian'],
    'Engineering and Architecture': ['Architect', 'Engineer'],
    'Agriculture': ['Farmer'],
    'Others': ['Aristocrat', 'Inventor', 'Explorer', 'Unspecified'],
    # 'Unspecified': ['Unspecified']
}

# categorizing occupations
def categorize_occupations(occupation):
    for category, occupations_list in category_mapping.items():
        if occupation in occupations_list:
            return category
    return 'Others'  # if occupation doesn't match any category, assign it to 'Others' - maybe need better name, as some of the 'Others' fall under the rest of the categories (fx. athlete)

# Add a new column 'Occupation categories' based on categorization
df['Occupation categories'] = df['Occupation'].apply(categorize_occupations)

In [11]:
unique_country = df['AssociatedModernCountry'].unique()
unique_country.sort()

# Load the Natural Earth dataset
world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))

# Create a dictionary mapping continents to lists of countries
continents_countries = {}
for continent in world['continent'].unique():
    countries_in_continent = world[world['continent'] == continent]['name'].tolist()
    continents_countries[continent] = countries_in_continent
continents_countries = {continent: sorted(countries) for continent, countries in continents_countries.items()}

# add to list together
country_list = sorted(world['continent'].unique().tolist()) + unique_country.tolist() 

#### Occupation ####
# get unique Occupaion list
unique_occupation = df['Occupation'].unique()
unique_occupation.sort()

#### Gender ####
# get gender list
gender_list = sorted(df['Gender'].unique().tolist())
# add 'all' to the gender_list
gender_list.append('All')

  world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))


# Dash setup

In [12]:
# Create the Dash app
app = dash.Dash(__name__)

# Define the app layout
app.layout = html.Div([
    html.Div([
        html.Div([
            html.Div(id='total-observations-tile', style={'padding': '5px', 'border-right': '1px solid lightgray'}),
            html.Div(id='selected-years-tile', style={'padding': '5px', 'border-right': '1px solid lightgray'})
        ], style={'display': 'flex', 'justify-content': 'space-between', 'align-items': 'center', 'width': '25%'})
    ], style={'background-color': 'lightgray', 'height': '50px', 'display': 'flex', 'align-items': 'center'}),
    html.Div([
        dcc.Graph(id='choropleth', style={'width': '55%', 'display': 'inline-block'}),
        dcc.Graph(id='sunburst', style={'width': '45%', 'display': 'inline-block'})
    ]),
    html.Div([
        html.Button('1917-1921', id='button-1', n_clicks=0),
        html.Button('1939-1945', id='button-2', n_clicks=0),
        html.Button('1980-1985', id='button-3', n_clicks=0),
        dcc.RangeSlider(
            id='year-slider',
            min=-1000,
            max=2021,
            value=[-1000, 2021],
            marks={str(year): str(year) for year in range(-1000, 2021, 200)},
            step=None
        ),
        dcc.Graph(id='linechart')  # New line plot graph component
    ], style={'width': '100%', 'margin': 'auto'})
])


filters =html.Div([
    html.H2("Filters"),
    html.Hr(),
    #dbc.Button(id='gender-button', n_clicks=0, children='Both'),
    # add dropdown gender instead of button
    html.P("Select gender"),
    dcc.Dropdown(
        options=
        [{'label':gender,'value': gender} for gender in gender_list],
        multi = True,
        id = 'dropdown-gender',
        placeholder="Select options...",
        style={'width': '100%'}
    ),
    html.Div(id='output-div-gender'),
    #html.Div(id='current-state-div', style={'margin-top': '10px'}),
    html.Hr(),
    html.P("Select the countries you want to compare"),
    dcc.Dropdown(
        # add continents from continents_countries and unique countries to the dropdown
        options=
            [{'label': continent, 'value': continent} for continent in country_list],
        multi=True,
        id='dropdown-checklist',
        placeholder="Select options...",
        style={'width': '100%'}
    ),
    html.Div(id='output-country-div'),
    html.Hr(),
    html.P("Select the occupation you want to compare"),
    # occupation dropdown
    dcc.Dropdown(
        options=
            [{'label': occupation, 'value': occupation} for occupation in unique_occupation],
        multi=True,
        id='dropdown-checklist-occupation',
        placeholder="Select options...",
        style={'width': '100%'}
        ),
    html.Div(id='output-div-occupation'),
    html.Hr(),
                        
    ], style={'width': '100%', 'margin': 'auto'})

# global variables and function for changing them

In [14]:
country_selection = set()
gender_selection = set()
occupation_selection = set()
timerange_selection = []

base_countries = set("Denmark","Turkey","Norway")
base_genders = set("male", "female")
base_occupations = set("Worker","worker2", "worker3")

def set_attribute(attribute, values):
    global country_selection
    global occupation_selection 
    global gender_selection
    
    if values == "all": 
        if attribute == "country": 
            country_selection = base_countries
        elif attribute == "occupation": 
            occupation_selection = base_occupations
        elif attribute == "gender": 
            gender_selection = base_genders 
    
    attribute_selection = []
    if attribute == "country": 
            attribute_selection = country_selection
    elif attribute == "occupation": 
            attribute_selection = occupation_selection 
    elif attribute == "gender": 
            gender_selection = gender_selection 
    
    # TODO: are you sure the variable we wants to use is countries?
    elif type(countries) == str: 
        attribute_selection.add(values)
    else: 
        for value in values: 
            attribute_selection.add(value)
    
def remove_attribute(attribute, values):  
    global country_selection
    global occupation_selection 
    global gender_selection
    
    attribute_selection = []
    if attribute == "country": 
        attribute_selection = country_selection
    elif attribute == "occupation": 
        attribute_selection = occupation_selection 
    elif attribute == "gender": 
        gender_selection = gender_selection 
    
    # TODO: are you sure the variable we wants to use is countries?
    elif type(countries) == str: 
        attribute_selection.add(values)
    else: 
        for value in values: 
            attribute_selection.add(value)


IndentationError: unindent does not match any outer indentation level (<tokenize>, line 26)

<hr style="border:3px solid gray">

# Choropleth functions 

In [15]:
map_df = df["AssociatedModernCountry"].value_counts().rename_axis('Country').reset_index(name='Counts')

country_lookup = {feature['properties']['admin']: feature 
                   for feature in geojson['features']}

selections = set(country_lookup.keys())
selections_default_size = len(selections)

def select_all():
    global selections
    selections = set(country_lookup.keys())

def deselect_all():
    global selections 
    selections = set() 

def get_highlights(selections, geojson=geojson, country_lookup= country_lookup):
    geojson_highlights = dict()
    for k in geojson.keys():
        if k != 'features':
            geojson_highlights[k] = geojson[k]
        else:
            geojson_highlights[k] = [country_lookup[selection] for selection in selections]        
    return geojson_highlights

def get_figure(selections, normalization_mode = "None"):
    
    color_scale = "viridis"
    
    # Base choropleth layer --------------#
    fig = px.choropleth_mapbox(map_df, geojson=geojson, 
                               color="Counts",                               
                               locations="Country", 
                               featureidkey="properties.admin",
                               color_continuous_scale=color_scale,
                               opacity=0.25)

    # Second layer - Highlights ----------#
    if len(selections) > 0:
        # highlights contain the geojson information for only 
        # the selected countries
        highlights = get_highlights(selections)

        fig.add_trace(
            px.choropleth_mapbox(map_df, geojson=highlights, 
                                 color="Counts",
                                 locations="Country", 
                                 featureidkey="properties.admin",
                                 color_continuous_scale=color_scale,                                 
                                 opacity=1).data[0]
        )

    #------------------------------------#
    fig.update_layout(mapbox_style="carto-positron", 
                      mapbox_zoom=9,
                      mapbox_center={"lat": 45.5517, "lon": -73.7073},
                      margin={"r":0,"t":0,"l":0,"b":0},
                      uirevision='constant')
    
    return fig

@app.callback(
    Output('choropleth', 'figure'),
    [Input('choropleth', 'clickData'),
     Input('sunburst', 'clickData')])

def update_figure(clickData, value):    
    if clickData is not None:            
        location = clickData['points'][0]['location']
        
        if len(selections) == selections_default_size: 
            deselect_all()
            selections.add(location)
        
        elif location not in selections:
            selections.add(location)
          
        else:
            selections.remove(location)
            if len(selections) == 0: 
                select_all()
    print(selections)
    return get_figure(selections, value)

# Sunburst functions

In [16]:
selected_countries = []
@app.callback(
    Output('sunburst', 'figure'),
    [Input('choropleth', 'clickData'), 
     Input('year-slider', 'value')]
)
def update_sunburst(clickData, year_range):  # maybe you're missing a filtered_df somewhere with the slicer?
    global selected_countries
    # filter based on year range
    filtered_df = df[(df['Birth year'] <= year_range[1]) & (df['Death year'] >= year_range[0])]
    
    if clickData is not None:
        clicked_country = clickData['points'][0]['location']
    
        if clicked_country not in selected_countries:
            selected_countries.append(clicked_country)
        else:
            selected_countries.remove(clicked_country)
            
            if len(selected_countries) == 0:
                title = 'Occupation Distribution by Category in the World'
                occupation_count = filtered_df.groupby(['Occupation categories', 'Occupation']).size().reset_index(name='count')
                fig = px.sunburst(occupation_count,
                                 path = ['Occupation categories', 'Occupation'], 
                                  values = 'count',
                                 title = title,
                                 width = 500,
                                 height = 500)
                # fig.update_layout(title = 'Occupation Sunburst Plot')
                return fig
                
        print(selected_countries, " + ", len(selected_countries))

        # Filter data for the clicked country
        filtered_df = filtered_df[(filtered_df['AssociatedModernCountry'].isin(selected_countries))]


        occupation_count = filtered_df.groupby(['Occupation categories', 'Occupation']).size().reset_index(name='count')
        
        if len(selected_countries) > 0:
            title = 'Occupation Distribution by Category in ' + ', '.join(selected_countries)
        else:
            title = 'Occupation Distribution by Category in the World'
            
        fig = px.sunburst(occupation_count,
                         path = ['Occupation categories', 'Occupation'], 
                          values = 'count',
                         title = title,
                         width = 500,
                         height = 500)
        # fig.update_layout(title = 'Occupation Sunburst Plot')
        return fig
    else:
        if len(selected_countries) == 0:
            title = 'Occupation Distribution by Category in the World'
            occupation_count = filtered_df.groupby(['Occupation categories', 'Occupation']).size().reset_index(name='count')
            fig = px.sunburst(occupation_count,
                             path = ['Occupation categories', 'Occupation'], 
                              values = 'count',
                             title = title,
                             width = 500,
                             height = 500)
            # fig.update_layout(title = 'Occupation Sunburst Plot')
            return fig
        else: 
            filtered_df = filtered_df[(filtered_df['AssociatedModernCountry'].isin(selected_countries))]
            occupation_count = filtered_df.groupby(['Occupation categories', 'Occupation']).size().reset_index(name='count')

            if len(selected_countries) > 0:
                title = 'Occupation Distribution by Category in ' + ', '.join(selected_countries)
            else:
                title = 'Occupation Distribution by Category in the World'

            fig = px.sunburst(occupation_count,
                             path = ['Occupation categories', 'Occupation'], 
                              values = 'count',
                             title = title,
                             width = 500,
                             height = 500)
            # fig.update_layout(title = 'Occupation Sunburst Plot')
            return fig

# Linechart functions

In [17]:
@app.callback(
    Output('linechart', 'figure'),
    [Input('year-slider', 'value')]
)
def update_line_plot(year_range):
    # count deaths per year
    df['Century'] = (df['Birth year'] // 100) * 100
    average_age_per_century = df.groupby('Century')['Age of death'].mean()

    fig = go.Figure()

    # line plot for average age per century
    fig.add_trace(go.Scatter(x=average_age_per_century.index, y=average_age_per_century.values,
                             mode='lines+markers', name='Average Age'))

    fig.update_layout(title='Average Age of Death Across Centuries',
                      xaxis_title='Century',
                      yaxis_title='Average Age of Death',
                      xaxis=dict(showgrid=True, gridwidth=1, gridcolor='LightPink'),
                      yaxis=dict(showgrid=True, gridwidth=1, gridcolor='LightPink'),
                      plot_bgcolor='white')

    # range slider and selectors
    fig.update_layout(xaxis=dict(
        rangeselector=dict(
            buttons=list([
                dict(step='all', label='All Centuries'),
            ])
        ),
        rangeslider=dict(visible=True)
    ))

    return fig 
    

# Filter button/dropdown/slider functions 

In [18]:
@app.callback(
    Output('year-slider', 'value'),
    [Input('button-1', 'n_clicks'),
     Input('button-2', 'n_clicks'),
     Input('button-3', 'n_clicks')]
)
def update_slider(button_1, button_2, button_3):
    clicked_button_id = dash.callback_context.triggered[0]['prop_id'].split('.')[0]
    if clicked_button_id == 'button-1':
        return [1917, 1921]
    elif clicked_button_id == 'button-2':
        return [1939, 1945]
    elif clicked_button_id == 'button-3':
        return [1980, 1985]
    else:
        return [df['Birth year'].min(), df['Birth year'].max()]
    
@app.callback(
    Output('output-country-div', 'children'),
    #Input('dropdown-checklist', 'value')
)
def update_country_list(selected_options):
    if selected_options:
        # check if selected options has continents
        continents = [option for option in selected_options if option in continents_countries.keys()]
        # check if selected options has countries
        countries = [option for option in selected_options if option in continents_countries.values()]
        # if continents are selected, add all countries in the continent to the selected options
        if continents:
            for continent in continents:
                selected_options.extend(continents_countries[continent])
        # if countries are selected, add all continents of the countries to the selected options
        if countries:
            for country in countries:
                selected_options.extend(world[world['name'] == country]['continent'].tolist())
        # remove duplicates in selected options
        selected_options = list(set(selected_options))
        # sort selected options
        selected_options.sort()
        #return f'Selected options: {", ".join(selected_options)}'
        return selected_options
    else:
        return 'No options selected'

# callback for occupation dropdown checklist

@app.callback(
    Output('output-div-occupation', 'children'),
    Input('dropdown-checklist-occupation', 'value')
)
def update_occupation_list(selected_options):
    if selected_options:
        # remove duplicates in selected options
        selected_options = list(set(selected_options))
        # sort selected options
        selected_options.sort()
        #return f'Selected options: {", ".join(selected_options)}'
        return selected_options
    else:
        return 'No options selected'


# Tiles 

In [19]:
@app.callback(
    Output('total-observations-tile', 'children'),
    [
        Input('sunburst', 'clickData'),
        Input('choropleth', 'clickData'),
        Input('year-slider', 'value')
    ]
)
def update_total_observations(click_sunburst, click_map, year_range):
    # Filter the DataFrame based on the interactions
    filtered_df = df[(df['Birth year'] <= year_range[1]) & (df['Death year'] >= year_range[0])]
    if click_sunburst is not None:
        clicked_occupation = click_sunburst['points'][0]['label']
        if (click_sunburst['points'][0]['label'] == click_sunburst['points'][0]['id']) == False:  
            # print(clickData)
            filtered_df = filtered_df[filtered_df['Occupation'] == clicked_occupation]  # filtering data based on clicked occupation

    # Recalculate total observation count based on the filtered DataFrame
    total_observations = filtered_df.shape[0]  # Count the number of rows (observations) in the DataFrame

    return f'Total Observations: {total_observations}'

@app.callback(
    Output('selected-years-tile', 'children'),
    [Input('year-slider', 'value')]
)
def update_selected_years(year_range):
    return f'Selected Years: {year_range[0]} - {year_range[1]}'

<hr style="border:3px solid gray">

# Running the Dash app

In [20]:
app.run(debug=True)

# Still To-do 
* change box-whisker adaption colormap
- [x] find a way to keep colors of sunburst catagories constant 
* change slider to have yearly increments
* [x] add tiles with statistics  
* add titles/describtions to charts
* (Find a way to get describitions of people in the selection) - maybe
