In [1]:
# std
import json

# third party
import re
import dash 
import seaborn as sns
import folium
import pandas as pd
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import geopandas as gpd
import numpy as np
from dash import dcc, html
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from folium.plugins import HeatMap
from keplergl import KeplerGl

# project

In [2]:
# Read the tree inventory CSV into a DataFrame.
# low_memory=False makes pandas parse the file in one pass rather than in chunks.
df = pd.read_csv(
    'data/trees.csv',
    low_memory=False,
)

In [3]:
# What does the data look like?
print("Data overview:")

# First few rows, rendered as a rich table
display(df.head())

# DataFrame.info() prints its summary itself and returns None, so it is called
# directly: wrapping it in display() renders a stray "None" after the summary.
df.info()

Data overview:


Unnamed: 0,objectid,borough,maintainer,gla_tree_name,tree_name,taxon_name,common_name,age,age_group,height_m,spread_m,canopy_spread_group,diameter_at_breast_height_cm,dbh_group,longitude,latitude,condition,load_date,updated
0,1,Kingston upon Thames,Royal Borough of Kingston upon Thames,Other,,Abies grandis,Grand fir,,Early mature (16-30),10 to 15m,,00 to 05m,,21 to 40cm,-0.291147,51.361893,Reasonable,20210318,20210715
1,2,Kingston upon Thames,Royal Borough of Kingston upon Thames,Other,,Abies grandis,Grand fir,,Early mature (16-30),10 to 15m,,00 to 05m,,21 to 40cm,-0.291122,51.361914,Reasonable,20210318,20210715
2,3,Kingston upon Thames,Royal Borough of Kingston upon Thames,Other,,Abies grandis,Grand fir,,Early mature (16-30),05 to 10m,,05 to 10m,,21 to 40cm,-0.290943,51.387016,Reasonable,20210318,20210715
3,4,Kingston upon Thames,Royal Borough of Kingston upon Thames,Other,,Abies grandis,Grand fir,,Mature (31-80),10 to 15m,,05 to 10m,,41 to 70cm,-0.288572,51.387405,Reasonable,20210318,20210715
4,5,Kingston upon Thames,Royal Borough of Kingston upon Thames,Other,,Abies grandis,Grand fir,,Mature (31-80),10 to 15m,,05 to 10m,,41 to 70cm,-0.285025,51.388872,Reasonable,20210318,20210715


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 817150 entries, 0 to 817149
Data columns (total 19 columns):
 #   Column                        Non-Null Count   Dtype  
---  ------                        --------------   -----  
 0   objectid                      817150 non-null  int64  
 1   borough                       817150 non-null  object 
 2   maintainer                    817150 non-null  object 
 3   gla_tree_name                 817150 non-null  object 
 4   tree_name                     815916 non-null  object 
 5   taxon_name                    766107 non-null  object 
 6   common_name                   682231 non-null  object 
 7   age                           211161 non-null  object 
 8   age_group                     229336 non-null  object 
 9   height_m                      103218 non-null  object 
 10  spread_m                      100408 non-null  float64
 11  canopy_spread_group           18175 non-null   object 
 12  diameter_at_breast_height_cm  208152 non-nul

None

In [4]:
# 'height_m', 'spread_m' and 'diameter_at_breast_height_cm' may need conversion to numeric

# Work on a copy so the frame that was loaded from disk is left as it is
df_cleaned = df.copy()

# Cast each measurement column to a numeric dtype; any value that cannot be
# parsed as a number is coerced to NaN instead of raising
for measurement_col in ('height_m', 'spread_m', 'diameter_at_breast_height_cm'):
    df_cleaned[measurement_col] = pd.to_numeric(df_cleaned[measurement_col], errors='coerce')

# Count the missing values in every column
print("\nMissing data in the dataset:")
missing_per_column = df_cleaned.isna().sum()
print(missing_per_column)

# The counts show lots of missing values in columns like 'tree_name', 'height_m', etc.
# Ideally these would be dropped at this point


Missing data in the dataset:
objectid                             0
borough                              0
maintainer                           0
gla_tree_name                        0
tree_name                         1234
taxon_name                       51043
common_name                     134919
age                             605989
age_group                       587814
height_m                        732107
spread_m                        716742
canopy_spread_group             798975
diameter_at_breast_height_cm    608998
dbh_group                       798975
longitude                            0
latitude                             0
condition                       798975
load_date                            0
updated                              0
dtype: int64


In [5]:
# Keep only the columns that have no null values at all.
# Note: this starts again from the raw `df`, so the numeric conversions applied to
# the earlier `df_cleaned` are not carried over into the result.
df_cleaned = df.dropna(axis='columns')


In [6]:
# Confirm that no missing values remain in the cleaned frame
print("\nMissing values check after first clean:")
remaining_nulls = df_cleaned.isna().sum()
print(remaining_nulls)


Missing values check after first clean:
objectid         0
borough          0
maintainer       0
gla_tree_name    0
longitude        0
latitude         0
load_date        0
updated          0
dtype: int64


In [None]:
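# NOTE: this whole cell is commented out. It is the go.Treemap variant of the
# sunburst dashboard built in the following cell (same labels/parents/values data).
# # Initialize Dash app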
# app = dash.Dash(__name__)

# # Nested pie chart (treemap) data
# total_tree_type_counts_nested = df_cleaned[df_cleaned['gla_tree_name'] != 'Other']['gla_tree_name'].value_counts()
# top_7_tree_types_nested = total_tree_type_counts_nested.index[:7]

# # Prepare data for the Treemap
# labels = []  # Labels for the tree types and boroughs
# parents = []  # Parent labels
# values = []  # The number of trees

# # Loop through each tree type and get the top 3 boroughs (or more if needed)
# for tree_type in top_7_tree_types_nested:
#     # Filter dataset for this tree type
#     df_filtered_nested = df_cleaned[df_cleaned['gla_tree_name'] == tree_type]
    
#     # Get the top 3 boroughs for this tree type
#     top_3_boroughs_nested = df_filtered_nested['borough'].value_counts().head(3)
    
#     # Add tree type
#     labels.append(tree_type)
#     parents.append("")
#     values.append(df_filtered_nested.shape[0])
    
#     # Add boroughs under the tree type, but append the tree type to make them unique
#     for borough, count in top_3_boroughs_nested.items():
#         labels.append(f"{borough} ({tree_type})")  # Append the tree type to make each borough unique
#         parents.append(tree_type)
#         values.append(count)

# # Create a treemap
# treemap_fig = go.Figure(go.Treemap(
#     labels=labels,
#     parents=parents,
#     values=values,
#     branchvalues="total",
#     hoverinfo="label+value+percent parent"
# ))

# # Define layout for the Dash app
# app.layout = html.Div([
#     html.H1('London Trees Dashboard', style={'textAlign': 'center'}),

#     # Treemap chart
#     html.Div([
#         dcc.Graph(
#             id='treemap-chart',
#             figure=treemap_fig.update_layout(
#                 margin=dict(t=30, l=0, r=0, b=0),
#                 title='Top 7 Tree Types and Top 3 Boroughs per Type'
#             )
#         )
#     ], style={'width': '100%', 'display': 'inline-block'})
# ])

# # Run the app
# if __name__ == '__main__':
#     app.run_server(debug=True)

In [8]:
# Build a one-page Dash app showing a sunburst of the most common tree types
# and, for each of them, the boroughs where that type is most frequent.

# Initialize Dash app
app = dash.Dash(__name__)

# Nested pie chart (sunburst) data: rank tree types by frequency, leaving out
# the 'Other' bucket, and keep the 7 most common ones
total_tree_type_counts_nested = df_cleaned[df_cleaned['gla_tree_name'] != 'Other']['gla_tree_name'].value_counts()
top_7_tree_types_nested = total_tree_type_counts_nested.index[:7]

# Parallel lists describing the hierarchy: entry i is one segment
labels = []   # Name of each segment
parents = []  # Parent label of each segment ("" marks a root segment)
values = []   # Number of trees in each segment

# Loop through each tree type and get its top 3 boroughs
for tree_type in top_7_tree_types_nested:
    # Filter dataset for this tree type
    df_filtered_nested = df_cleaned[df_cleaned['gla_tree_name'] == tree_type]

    # Get the top 3 boroughs for this tree type
    top_3_boroughs_nested = df_filtered_nested['borough'].value_counts().head(3)

    # Root segment: the tree type itself, sized by its total number of trees
    labels.append(tree_type)
    parents.append("")
    values.append(df_filtered_nested.shape[0])

    # Child segments: one per borough. The tree type is appended to the label
    # because the same borough can appear under several tree types and the
    # labels must be unique.
    for borough, count in top_3_boroughs_nested.items():
        labels.append(f"{borough} ({tree_type})")
        parents.append(tree_type)
        values.append(count)

# Create the Sunburst chart. branchvalues="total" means a parent's value is its
# own total rather than the sum of its children, so the part of each tree type
# outside its top 3 boroughs is still counted in the parent's value.
sunburst_fig = go.Figure(go.Sunburst(
    labels=labels,
    parents=parents,
    values=values,
    branchvalues="total",
    hoverinfo="label+value+percent parent"
))

# Style the figure before wiring it into the layout (update_layout mutates the
# figure in place and returns it)
sunburst_fig.update_layout(
    margin=dict(t=30, l=0, r=0, b=0),
    title='Top 7 Tree Types and Top 3 Boroughs per Type'
)

# Define layout for the Dash app
app.layout = html.Div([
    html.H1('London Trees Dashboard', style={'textAlign': 'center'}),

    # Sunburst chart
    html.Div([
        dcc.Graph(
            id='sunburst-chart',
            figure=sunburst_fig
        )
    ], style={'width': '100%', 'display': 'inline-block'})
])

# Run the app
if __name__ == '__main__':
    # `run_server` was superseded by `run`, and Dash 3.x no longer accepts
    # `run_server`. Prefer `run`; fall back only on installs that predate it.
    start_server = getattr(app, 'run', None) or app.run_server
    start_server(debug=True)