In [1]:
# pip install pandas matplotlib plotly numpy dash

In [2]:
# pip install --upgrade nbformat

In [3]:
# Smoke test: draw a box plot of plotly's bundled "tips" sample data to check
# that plotly express imports and that figures display in this notebook.
import plotly.express as px

df = px.data.tips()
fig = px.box(data_frame=df, y="total_bill")
fig.show()

In [4]:
# import dash components
from dash import Dash, dcc, html, Input, Output

In [5]:
import pandas as pd
from os import path

In [6]:
# Folder that contains the agent-data CSV.
# NOTE(review): absolute, machine-specific path -- adjust when running elsewhere.
DATA_DIR = '/Users/josephjungermann/Documents/the_real_deal/20250903_LAPlotlyViz/data'

# Read the whole CSV into one DataFrame. low_memory=False makes pandas parse the
# file in a single pass instead of chunks, which avoids mixed-type column inference.
agents_csv_path = path.join(DATA_DIR, '2025_LosAngeles_County_Agent_Data.csv')
agents_df = pd.read_csv(filepath_or_buffer=agents_csv_path, low_memory=False)

# Summary of columns, non-null counts and dtypes.
agents_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 47178 entries, 0 to 47177
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   UniqueID          47178 non-null  object 
 1   mlsId             47178 non-null  object 
 2   address           47178 non-null  object 
 3   city              47097 non-null  object 
 4   price_amount      47178 non-null  object 
 5   Side              47178 non-null  object 
 6   Final_Agent       46777 non-null  object 
 7   url               45077 non-null  object 
 8   TRD_note          2137 non-null   object 
 9   brokerage_name_1  47012 non-null  object 
 10  brokerage_name_2  17290 non-null  object 
 11  zip               45077 non-null  float64
 12  agent_name        45077 non-null  object 
 13  agent_no.         45077 non-null  object 
 14  beds              45064 non-null  float64
 15  baths             45034 non-null  float64
 16  sqft_amount       44829 non-null  object

In [7]:
# Turn price_amount into an integer column:
#   1. view every value as text,
#   2. strip "$" signs and "," thousands separators,
#   3. parse as float first (so text with a decimal part still parses),
#      then cast to int, which drops any fractional part.
price_text = agents_df["price_amount"].astype(str)
price_digits = price_text.str.replace(r"[\$,]", "", regex=True)
agents_df["price_amount"] = price_digits.astype(float).astype(int)

In [8]:
# Exclude rows whose Final_Agent is "SUBSCRIBER NON" so that value cannot
# show up in the top-10 ranking computed later.
is_subscriber_non = agents_df["Final_Agent"].eq("SUBSCRIBER NON")
agents_df = agents_df[~is_subscriber_non]

## Plotly Graph

In [None]:
import plotly.express as px

# Total sale volume per agent/team, then the ten largest by that total.
agents_grouped = (
    agents_df
    .groupby("Final_Agent", as_index=False)["price_amount"]
    .sum()
)
top10_agents = (
    agents_grouped
    .sort_values("price_amount", ascending=False)
    .head(10)["Final_Agent"]
)

# Keep every individual sale row that belongs to one of those ten agents/teams.
in_top10 = agents_df["Final_Agent"].isin(top10_agents)
df_top10 = agents_df[in_top10]

# One box per agent/team. points="all" draws every sale as a point;
# change to "suspectedoutliers" if you want to show outliers only.
fig = px.box(
    data_frame=df_top10,
    x="Final_Agent",
    y="price_amount",
    points="all",
    title="Distribution of LA County Resi Sales for Top 10 Agents/Teams",
)

# Centered title, fixed height, slanted agent labels and a log-scaled price axis.
fig.update_layout(
    title=dict(x=0.5, xanchor="center", yanchor="top"),
    title_font={"size": 20, "family": "Arial", "color": "black"},
    xaxis_title="Agent/Team",
    xaxis_tickangle=-45,
    yaxis_title="Sale Price ($)",
    yaxis_type="log",
    height=700,
    autosize=True,
    showlegend=False,
)

# Label the log axis at $1M, $10M and $100M.
fig.update_yaxes(
    ticks="outside",
    showgrid=True,
    tickvals=[1e6, 1e7, 1e8],
    ticktext=["$1M", "$10M", "$100M"],
)

fig.show()

# Save chart as interactive HTML (plotly.js is loaded from the CDN rather than embedded).
fig.write_html("top10_agents.html", include_plotlyjs="cdn", full_html=True)

## Dash App

In [None]:
import dash
from dash import dcc, html
from dash.dependencies import Input, Output
import plotly.express as px
# import pandas as pd

#------------
# Initialize Dash App
#------------
app = dash.Dash(__name__)
app.title = "Top 10 LA Agents Sales Distribution"

#------------
# Layout
#------------
# Agent/team names in the order they first appear in df_top10; this is the
# dropdown's initial selection (everything selected).
agents_in_data_order = list(df_top10["Final_Agent"].unique())

# The same names, alphabetically, as the dropdown's option list.
dropdown_options = [
    {'label': name, 'value': name}
    for name in sorted(agents_in_data_order)
]

agent_dropdown = dcc.Dropdown(
    id='agent-dropdown',
    options=dropdown_options,
    value=agents_in_data_order,
    multi=True,
)

# Page structure: heading, labelled multi-select dropdown, then the graph that
# the callback fills in.
app.layout = html.Div(
    children=[
        html.H1("Distribution of Home Sale Prices by Top 10 Agents"),
        html.Label("Select Agents:"),
        agent_dropdown,
        dcc.Graph(id='boxplot'),
    ]
)

#------------
# Callback for the graph
#------------
@app.callback(
    Output('boxplot', 'figure'),
    Input('agent-dropdown', 'value')
)
def update_boxplot(selected_agents):
    """Rebuild the box plot for the agents/teams selected in the dropdown.

    Parameters
    ----------
    selected_agents : list of str, str, or None
        Current ``value`` of the ``agent-dropdown`` component. A list is the
        expected case (the dropdown is declared with ``multi=True``); ``None``
        and a single string are accepted defensively.

    Returns
    -------
    The plotly express box-plot figure, used as the ``figure`` property of the
    ``boxplot`` graph.
    """
    # Series.isin() raises TypeError for None or a bare string, so normalise
    # the selection to a list before filtering.
    if selected_agents is None:
        selected_agents = []
    elif isinstance(selected_agents, str):
        selected_agents = [selected_agents]

    # Filter the dataframe
    filtered_df = df_top10[df_top10["Final_Agent"].isin(selected_agents)]

    # Base boxplot style
    fig = px.box(
        filtered_df,
        x="Final_Agent",
        y="price_amount",
        points="all",
        title="Distribution of LA County Resi Sales for Top 10 Agents/Teams"
    )

    # Size, axis titles and log price scale. This used to be two consecutive
    # update_layout calls that repeated the same settings; one call gives the
    # same final layout.
    fig.update_layout(
        width=1200,
        height=700,
        xaxis_title="Agent/Team",
        yaxis_title="Sale Price ($)",
        xaxis_tickangle=-45,
        yaxis_type="log",
        showlegend=False
    )

    # Label the log axis at $1M, $10M and $100M.
    fig.update_yaxes(
        tickvals=[1e6, 1e7, 1e8],
        ticktext=["$1M", "$10M", "$100M"],
        showgrid=True,
        ticks="outside"
    )

    return fig

#------------
# Run the App
#------------

# Start the Dash server (with debug=True) only when this code is executed as
# the main module, so importing it from elsewhere does not launch the app.
if __name__ == '__main__':
    app.run(debug=True)

In [21]:
# Inspect the top-10 subset (row count, columns, dtypes, non-null counts) as a
# first step toward a cleaned-up copy of the dataset that can be loaded by a
# .py file for the web application and display.
df_top10.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1018 entries, 0 to 47177
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   UniqueID          1018 non-null   object 
 1   mlsId             1018 non-null   object 
 2   address           1018 non-null   object 
 3   city              1005 non-null   object 
 4   price_amount      1018 non-null   int64  
 5   Side              1018 non-null   object 
 6   Final_Agent       1018 non-null   object 
 7   url               917 non-null    object 
 8   TRD_note          124 non-null    object 
 9   brokerage_name_1  959 non-null    object 
 10  brokerage_name_2  665 non-null    object 
 11  zip               917 non-null    float64
 12  agent_name        917 non-null    object 
 13  agent_no.         917 non-null    object 
 14  beds              917 non-null    float64
 15  baths             916 non-null    float64
 16  sqft_amount       882 non-null    object 
 17 

In [23]:
# Columns to strip from the cleaned copy of df_top10.
drop_columns = [
    "UniqueID", "mlsId", "url", "TRD_note",
    "brokerage_name_1", "brokerage_name_2",
    "zip", "agent_name", "agent_no.",
    "beds", "baths", "sqft_amount",
    "latitude", "longitude", "propertyId", "listingId",
]

# drop() returns a new frame, so df_top10 itself is left untouched.
df_top10_cleaned = df_top10.drop(labels=drop_columns, axis="columns")

# Confirm which columns remain.
df_top10_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1018 entries, 0 to 47177
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   address       1018 non-null   object
 1   city          1005 non-null   object
 2   price_amount  1018 non-null   int64 
 3   Side          1018 non-null   object
 4   Final_Agent   1018 non-null   object
dtypes: int64(1), object(4)
memory usage: 47.7+ KB


In [24]:
# Write the cleaned top-10 table to <project>/output/df_top10_cleaned.csv.
# The output folder is derived as the sibling of DATA_DIR (<project>/data), so
# the machine-specific project root is spelled out in one place only. With the
# DATA_DIR defined above this resolves to the same file as before.
# normpath() drops any trailing slash so dirname() really returns the parent.
# The output folder must already exist; to_csv does not create it.
OUTPUT_DIR = path.join(path.dirname(path.normpath(DATA_DIR)), "output")
df_top10_cleaned.to_csv(path.join(OUTPUT_DIR, "df_top10_cleaned.csv"), index=False)