# Testing Altair

In [1]:
import altair as alt
import pandas as pd

# Simple example dataset: five points, each tagged with one of two categories
data = pd.DataFrame({
    'x': [1, 2, 3, 4, 5],
    'y': [10, 15, 30, 25, 35],
    'category': ['A', 'B', 'A', 'B', 'A']
})

# Basic Altair scatter plot.
# No fixed colour is set on the mark: the colour of every point comes from the
# `category` encoding below, which takes precedence over a mark-level colour,
# so a constant such as color='blue' would never be visible.
basic_plot = (
    alt.Chart(data)
    .mark_circle(size=100)
    .encode(
        x='x:Q',
        y='y:Q',
        color='category:N',
        tooltip=['x', 'y', 'category']
    )
    .properties(
        title='Simple Altair Scatter Plot',
        width=400,
        height=300
    )
    .interactive()  # enable pan/zoom on the axes
)

# Show the plot
basic_plot.show()

In [2]:
# Load cc9_health_cities_bra.csv into a DataFrame.
# The path is relative to the directory the notebook is run from.
health_cities_csv = 'PortfolioData/cc9_health_cities_bra.csv'
db_health_cities = pd.read_csv(health_cities_csv)

In [None]:
# Columns that must end up numeric; some may be loaded as text using a decimal comma
columns_to_convert = ['n_med', 'tx_med', 'n_obitos_csap', 'desp_tot_saude_pc_mun', 'pop', 'tx_mort_csap']


def _decimal_comma_to_numeric(column):
    """Return `column` as a numeric Series, treating ',' as the decimal separator.

    Columns that are already numeric are returned unchanged. This keeps the
    cell safe to re-run and avoids the AttributeError that the `.str` accessor
    raises on non-string data. Values that cannot be parsed become NaN.
    """
    if pd.api.types.is_numeric_dtype(column):
        return column
    # astype(str) ensures every element is text, so numbers that sit in an
    # object column are parsed instead of being turned into NaN by `.str`.
    as_text = column.astype(str).str.replace(',', '.', regex=False)
    return pd.to_numeric(as_text, errors='coerce')


# Replace commas with dots and convert to numeric
db_health_cities[columns_to_convert] = db_health_cities[columns_to_convert].apply(_decimal_comma_to_numeric)

# Count missing values per column (includes entries that failed to parse above)
db_health_cities.isna().sum()

In [8]:
# Rank municipalities by population (largest first), keep at most the first
# 5000 rows, and renumber the index from 0.
db_health_cities = (
    db_health_cities
    .sort_values(by='pop', ascending=False)
    .head(5000)
    .reset_index(drop=True)
)

# Sanity check: the shape is (5000, N) when the source has at least 5000 rows
print(db_health_cities.shape)
print(db_health_cities.head())


(5000, 9)
   codmun         nomemun    n_med    tx_med  n_obitos_csap  \
0  355030       São Paulo  69126.0  6.036151        17928.0   
1  330455  Rio de Janeiro  31510.0  5.073075        15982.0   
2  230440       Fortaleza  10057.0  4.140885         3271.0   
3  292740        Salvador  12977.0  5.367547         4240.0   
4  310620  Belo Horizonte  21655.0  9.351949         4464.0   

   desp_tot_saude_pc_mun         pop  tx_mort_csap pop_category  
0                1639.41  11451999.0    156.549088         10M+  
1                1084.24   6211223.0    257.308424       5M–10M  
2                1199.32   2428708.0    134.680661        1M–5M  
3                 859.81   2417678.0    175.374884        1M–5M  
4                2283.95   2315560.0    192.782739        1M–5M  


In [21]:
import altair as alt
import pandas as pd
import numpy as np

# Discard rows lacking any value the chart and the regression rely on,
# then renumber the index from 0.
required_columns = ['tx_mort_csap', 'desp_tot_saude_pc_mun', 'pop', 'nomemun']
db_health_cities = db_health_cities.dropna(subset=required_columns)
db_health_cities = db_health_cities.reset_index(drop=True)

# Assign each municipality to a population band.
# Each rule pairs a boolean mask with its label; bands are half-open
# (lower bound included, upper bound excluded). Rows matching no rule
# receive 'Unknown'.
population = db_health_cities['pop']
population_bands = [
    (population < 500_000, 'Less than 500K'),
    ((population >= 500_000) & (population < 1_000_000), '500K–1M'),
    ((population >= 1_000_000) & (population < 5_000_000), '1M–5M'),
    (population >= 5_000_000, '5M+'),
]
band_masks = [mask for mask, _ in population_bands]
band_labels = [label for _, label in population_bands]
db_health_cities['pop_category'] = np.select(band_masks, band_labels, default='Unknown')

# Fit a degree-1 polynomial (straight line) of expenditure against tx_mort_csap
x_col, y_col = 'tx_mort_csap', 'desp_tot_saude_pc_mun'
regression_model = np.polyfit(db_health_cities[x_col], db_health_cities[y_col], deg=1)
regression_line = np.poly1d(regression_model)

# Evaluate the fitted line at 100 evenly spaced points across the observed x range
x_grid = np.linspace(
    db_health_cities[x_col].min(),
    db_health_cities[x_col].max(),
    num=100,
)
regression_df = pd.DataFrame({
    x_col: x_grid,
    y_col: regression_line(x_grid),
})

# Bubble plot: one circle per municipality.
# x = tx_mort_csap, y = per-capita health expenditure, bubble area = population,
# colour = population band.
# NOTE: tx_mort_csap is a rate, not a count. In the rows printed earlier it
# equals n_obitos_csap / pop * 100,000, so labels say "Mortality Rate".
bubble_plot = (
    alt.Chart(db_health_cities)
    .mark_circle(opacity=0.6)
    .encode(
        x=alt.X(
            'tx_mort_csap:Q',
            title='Primary Care-Sensitive Mortality Rate (per 100,000 inhabitants)'
        ),
        y=alt.Y('desp_tot_saude_pc_mun:Q', title='Health Expenditure (R$ per Capita)'),
        size=alt.Size(
            'pop:Q',
            scale=alt.Scale(range=[10, 1000]),  # Min/max bubble area
            legend=None  # Hide the size legend; the colour legend already conveys the band
        ),
        color=alt.Color(
            'pop_category:N',
            title='Population Range',
            scale=alt.Scale(
                # Explicit domain fixes the legend order and the colour of each band
                domain=['Less than 500K', '500K–1M', '1M–5M', '5M+'],
                range=['#48cae4', '#ff7f0e', '#d62728', '#6f2dbd']
            )
        ),
        # Order marks by population, smallest first.
        # NOTE(review): with ascending order the largest bubbles are presumably
        # drawn last (on top); switch to 'descending' if small municipalities
        # end up hidden. Confirm the intended stacking.
        order=alt.Order(
            'pop:Q',
            sort='ascending'
        ),
        tooltip=[
            alt.Tooltip('nomemun:N', title='Municipality'),
            alt.Tooltip('tx_mort_csap:Q', title='Mortality Rate (per 100,000)', format='.1f'),
            alt.Tooltip('desp_tot_saude_pc_mun:Q', title='Health Expenditure (R$ per Capita)', format=',.2f'),
            alt.Tooltip('pop:Q', title='Population', format=',.0f'),
            alt.Tooltip('pop_category:N', title='Population Range')
        ]
    )
    .properties(
        title='Relationship Between Primary Care-Sensitive Mortality Rate and Municipal Health Expenditure',
        width=600,
        height=400
    )
    .interactive()  # enable pan/zoom on the axes
)

# Dashed black line tracing the fitted regression over the same two fields
regression_plot = alt.Chart(regression_df).mark_line(
    color='black',
    strokeDash=[5, 5],
).encode(
    x='tx_mort_csap:Q',
    y='desp_tot_saude_pc_mun:Q',
)

# Overlay the regression line on the bubble plot (bubbles first, line second)
final_plot = alt.layer(bubble_plot, regression_plot)

# Render the combined chart
final_plot.show()
