In [1]:
import pandas as pd
import os
import altair as alt
import numpy as np
import plotnine as pt
import geopandas as gpd
import plotly

In [2]:
aser = pd.read_csv('/Users/trishapunamiya/Desktop/LSE/Data Viz/Project/Raw Data/ASER_State_Data.csv') 

In [3]:
# filtering to 2024
columns_2024 = [col for col in aser.columns if '2024' in col]
aser_2024 = aser[['State'] + columns_2024]

In [4]:
# Enable data transformer for larger datasets
alt.data_transformers.enable('default', max_rows=None)

# Prepare data
df = aser_2024
df = df[df['State'] != 'All India']

# Get 2024 columns
reading_cols = [c for c in df.columns if '2024' in c and 'read' in c.lower()]
math_cols = [c for c in df.columns if '2024' in c and ('division' in c or 'subtraction' in c)]

# Process reading
df_r = df[['State'] + reading_cols].copy()
df_r.columns = ['State', 'Std III', 'Std V', 'Std VIII']
df_r = df_r.sort_values('Std V', ascending=False)

# Process math  
df_m = df[['State'] + math_cols].copy()
df_m.columns = ['State', 'Std III', 'Std V', 'Std VIII']
df_m['State'] = pd.Categorical(df_m['State'], categories=df_r['State'], ordered=True)

# Melt
reading_long = df_r.melt(id_vars='State', var_name='Standard', value_name='Value')
reading_long['Type'] = 'Reading'

math_long = df_m.melt(id_vars='State', var_name='Standard', value_name='Value')
math_long['Type'] = 'Math'

# Combine
combined = pd.concat([reading_long, math_long])

# Create heatmap with your color scheme
heatmap = alt.Chart(combined).mark_rect(stroke='white', strokeWidth=0.2).encode(
    x=alt.X('Standard:N', title=None, axis=alt.Axis(labelFontSize=12, labelAngle=-45)), # make x-axis labels slanted
    y=alt.Y('State:N', title=None, sort=df_r['State'].tolist(), axis=alt.Axis(labelFontSize=10)),
    color=alt.Color('Value:Q',
                    scale=alt.Scale(
                        domain=[0, 25, 50, 75, 100],
                        range=['#ba7798', '#c9909e', '#d8a9a4', '#9fceb7', '#53826b']
                    ),
                    legend=alt.Legend(title='Proficiency %', orient='right', titleFontSize=12)),
    tooltip=[
        alt.Tooltip('State:N', title='State'),
        alt.Tooltip('Standard:N', title='Standard'),
        alt.Tooltip('Value:Q', format='.1f', title='Proficiency %'),
        alt.Tooltip('Type:N', title='Subject')
    ]
).properties(
    width=300,
    height=350
)



# Combine and facet
chart = (heatmap ).facet(
    column=alt.Column('Type:N', 
                      title=None,
                      header=alt.Header(labelFontSize=16, labelFontWeight='bold'))
).properties(
    title={
        "text": "ASER 2024: Learning Outcomes Across Indian States",
        "subtitle": "Reading: % who can read Std II text | Math: Std III = Subtraction, Std V & VIII = Division",
        "fontSize": 18,
        "fontWeight": "bold",
        "anchor": "middle"
    }
).configure_view(
    strokeWidth=0
).configure_title(
    fontSize=18,
    anchor='middle',
    subtitleFontSize=12,
    subtitleColor='gray'
)


chart

In [5]:
import pandas as pd
import altair as alt

df = aser
df = df[df['State'] != 'All India']

region_map = {
    'Andhra Pradesh': 'South', 'Arunachal Pradesh': 'Northeast',
    'Assam': 'Northeast', 'Bihar': 'North', 'Chhattisgarh': 'Central',
    'Gujarat': 'West', 'Haryana': 'North', 'Himachal Pradesh': 'North',
    'Jammu and Kashmir': 'North', 'Jharkhand': 'East', 'Karnataka': 'South',
    'Kerala': 'South', 'Madhya Pradesh': 'Central', 'Maharashtra': 'West',
    'Meghalaya': 'Northeast', 'Mizoram': 'Northeast', 'Nagaland': 'Northeast',
    'Odisha': 'East', 'Punjab': 'North', 'Rajasthan': 'North',
    'Sikkim': 'Northeast', 'Tamil Nadu': 'South', 'Telangana': 'South',
    'Tripura': 'Northeast', 'Uttar Pradesh': 'North', 'Uttarakhand': 'North',
    'West Bengal': 'East'
}

df['Region'] = df['State'].map(region_map)

enrollment_col = [c for c in df.columns if '2024' in c and 'enrolled in govt schools' in c][0]
reading_col = [c for c in df.columns if '2024' in c and 'Std V' in c and ('division' in c or 'subtraction' in c)] [0]
not_in_school_col = [c for c in df.columns if '2024' in c and 'not enrolled in school' in c][0]

bubble_data = pd.DataFrame({
    'State': df['State'],
    'Region': df['Region'],
    'Enrollment': df[enrollment_col],
    'Reading': df[reading_col],
    'Not_in_School': df[not_in_school_col]
})

region_colors = {
    'North': '#77ba99',
    'South': '#ba7798',
    'East': '#9d8bb3',
    'West': '#b39d8b',
    'Central': '#8bb3a6',
    'Northeast': '#b3a68b'
}

chart = alt.Chart(bubble_data).mark_circle(opacity=0.7).encode(
    x=alt.X('Enrollment:Q', 
            title='% Enrolled in Govt Schools', 
            scale=alt.Scale(domain=[35, 95])),
    y=alt.Y('Reading:Q', 
            title='% Math Proficiency (Std V)', 
            scale=alt.Scale(domain=[0, 75])),
    size=alt.Size('Not_in_School:Q', 
                  title='% Not in School', 
                  scale=alt.Scale(range=[100, 1000])),
    color=alt.Color('Region:N', 
                    scale=alt.Scale(domain=list(region_colors.keys()), 
                                   range=list(region_colors.values()))),
    tooltip=[
        alt.Tooltip('State:N', title='State'),
        alt.Tooltip('Region:N', title='Region'),
        alt.Tooltip('Enrollment:Q', format='.1f', title='Enrollment %'),
        alt.Tooltip('Reading:Q', format='.1f', title='Reading %'),
        alt.Tooltip('Not_in_School:Q', format='.1f', title='Not in School %')
    ]
).properties(
    width=700,
    height=500,
    title='Quality-Access Gap in Indian Education (2024)'
)

chart

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [6]:
# Read data
df = aser_2024
df = df[df['State'] != 'All India']

# Define regions
region_map = {
    'Andhra Pradesh': 'South', 'Arunachal Pradesh': 'Northeast',
    'Assam': 'Northeast', 'Bihar': 'North', 'Chhattisgarh': 'Central',
    'Gujarat': 'West', 'Haryana': 'North', 'Himachal Pradesh': 'North',
    'Jammu and Kashmir': 'North', 'Jharkhand': 'East', 'Karnataka': 'South',
    'Kerala': 'South', 'Madhya Pradesh': 'Central', 'Maharashtra': 'West',
    'Meghalaya': 'Northeast', 'Mizoram': 'Northeast', 'Nagaland': 'Northeast',
    'Odisha': 'East', 'Punjab': 'North', 'Rajasthan': 'North',
    'Sikkim': 'Northeast', 'Tamil Nadu': 'South', 'Telangana': 'South',
    'Tripura': 'Northeast', 'Uttar Pradesh': 'North', 'Uttarakhand': 'North',
    'West Bengal': 'East'
}

df['Region'] = df['State'].map(region_map)

# Get 2024 columns for all standards
# Reading
std3_reading_col = [c for c in df.columns if '2024' in c and 'Std III' in c and 'read' in c][0]
std5_reading_col = [c for c in df.columns if '2024' in c and 'Std V' in c and 'read' in c][0]
std8_reading_col = [c for c in df.columns if '2024' in c and 'Std VIII' in c and 'read' in c][0]

# Math
std3_math_col = [c for c in df.columns if '2024' in c and 'Std III' in c and 'subtraction' in c][0]
std5_math_col = [c for c in df.columns if '2024' in c and 'Std V' in c and 'division' in c][0]
std8_math_col = [c for c in df.columns if '2024' in c and 'Std VIII' in c and 'division' in c][0]

# Enrollment
enrollment_col = [c for c in df.columns if '2024' in c and 'enrolled in govt schools' in c][0]

# Create dataframe with all standards
comparison_data = pd.DataFrame({
    'State': df['State'],
    'Region': df['Region'],
    'Enrollment': df[enrollment_col],
    'Std_III_Reading': df[std3_reading_col],
    'Std_III_Math': df[std3_math_col],
    'Std_V_Reading': df[std5_reading_col],
    'Std_V_Math': df[std5_math_col],
    'Std_VIII_Reading': df[std8_reading_col],
    'Std_VIII_Math': df[std8_math_col]
})

# Reshape for each standard
std3_data = comparison_data[['State', 'Region', 'Enrollment', 'Std_III_Reading', 'Std_III_Math']].copy()
std3_data.columns = ['State', 'Region', 'Enrollment', 'Reading', 'Math']
std3_data['Standard'] = 'Std III'

std5_data = comparison_data[['State', 'Region', 'Enrollment', 'Std_V_Reading', 'Std_V_Math']].copy()
std5_data.columns = ['State', 'Region', 'Enrollment', 'Reading', 'Math']
std5_data['Standard'] = 'Std V'

std8_data = comparison_data[['State', 'Region', 'Enrollment', 'Std_VIII_Reading', 'Std_VIII_Math']].copy()
std8_data.columns = ['State', 'Region', 'Enrollment', 'Reading', 'Math']
std8_data['Standard'] = 'Std VIII'

# Combine all standards
all_data = pd.concat([std3_data, std5_data, std8_data])

# Your color palette
region_colors = {
    'North': '#77ba99',
    'South': '#ba7798',
    'East': '#9d8bb3',
    'West': '#b39d8b',
    'Central': '#8bb3a6',
    'Northeast': '#b3a68b'
}

# Create dropdown selection
dropdown = alt.binding_select(
    options=['Std III', 'Std V', 'Std VIII'],
    name='Standard: '
)
selection = alt.selection_point(
    fields=['Standard'],
    bind=dropdown,
    value='Std V'  # Default to Std V
)

# Create bubble chart
bubble_chart = alt.Chart(all_data).mark_circle(
    opacity=0.7,
    stroke='white',
    strokeWidth=1
).encode(
    x=alt.X('Reading:Q',
            title='% Reading Proficiency',
            scale=alt.Scale(domain=[0, 100]),
            axis=alt.Axis(titleFontSize=13, labelFontSize=11)),
    y=alt.Y('Math:Q',
            title='% Math Proficiency',
            scale=alt.Scale(domain=[0, 100]),
            axis=alt.Axis(titleFontSize=13, labelFontSize=11)),
    size=alt.Size('Enrollment:Q',
                  title='% Enrollment',
                  scale=alt.Scale(range=[100, 1000]),
                  legend=alt.Legend(titleFontSize=11, labelFontSize=10)),
    color=alt.Color('Region:N',
                    scale=alt.Scale(
                        domain=list(region_colors.keys()),
                        range=list(region_colors.values())
                    ),
                    legend=alt.Legend(titleFontSize=12, labelFontSize=11)),
    tooltip=[
        alt.Tooltip('State:N', title='State'),
        alt.Tooltip('Region:N', title='Region'),
        alt.Tooltip('Standard:N', title='Standard'),
        alt.Tooltip('Reading:Q', title='Reading %', format='.1f'),
        alt.Tooltip('Math:Q', title='Math %', format='.1f'),
        alt.Tooltip('Enrollment:Q', title='Enrollment %', format='.1f')
    ]
).add_params(
    selection
).transform_filter(
    selection
)

# Add diagonal reference line (where reading = math)
diagonal_data = pd.DataFrame({'x': [0, 100], 'y': [0, 100]})
diagonal = alt.Chart(diagonal_data).mark_line(
    strokeDash=[5, 5],
    color='gray',
    opacity=0.5
).encode(
    x='x:Q',
    y='y:Q'
)

# Combine
final_chart = (diagonal + bubble_chart).properties(
    width=700,
    height=600,
    title={
        "text": "Reading vs Math Proficiency Across Indian States (2024)",
        "subtitle": "Points above diagonal = stronger math skills | Points below = stronger reading skills | Select standard from dropdown",
        "fontSize": 18,
        "fontWeight": "bold",
        "anchor": "start",
        "subtitleFontSize": 13,
        "subtitleColor": "gray"
    }
).configure_view(
    strokeWidth=0
).configure_axis(
    gridColor='#e0e0e0'
)



final_chart

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [8]:
import pandas as pd
import altair as alt
from scipy import stats

# Read data
df = aser_2024
df = df[df['State'] != 'All India']

# Define regions
region_map = {
    'Andhra Pradesh': 'South', 'Arunachal Pradesh': 'Northeast',
    'Assam': 'Northeast', 'Bihar': 'North', 'Chhattisgarh': 'Central',
    'Gujarat': 'West', 'Haryana': 'North', 'Himachal Pradesh': 'North',
    'Jammu and Kashmir': 'North', 'Jharkhand': 'East', 'Karnataka': 'South',
    'Kerala': 'South', 'Madhya Pradesh': 'Central', 'Maharashtra': 'West',
    'Meghalaya': 'Northeast', 'Mizoram': 'Northeast', 'Nagaland': 'Northeast',
    'Odisha': 'East', 'Punjab': 'North', 'Rajasthan': 'North',
    'Sikkim': 'Northeast', 'Tamil Nadu': 'South', 'Telangana': 'South',
    'Tripura': 'Northeast', 'Uttar Pradesh': 'North', 'Uttarakhand': 'North',
    'West Bengal': 'East'
}

df['Region'] = df['State'].map(region_map)

# Get 2024 columns
std3_reading_col = [c for c in df.columns if '2024' in c and 'Std III' in c and 'read' in c][0]
std5_reading_col = [c for c in df.columns if '2024' in c and 'Std V' in c and 'read' in c][0]
std8_reading_col = [c for c in df.columns if '2024' in c and 'Std VIII' in c and 'read' in c][0]

std3_math_col = [c for c in df.columns if '2024' in c and 'Std III' in c and 'subtraction' in c][0]
std5_math_col = [c for c in df.columns if '2024' in c and 'Std V' in c and 'division' in c][0]
std8_math_col = [c for c in df.columns if '2024' in c and 'Std VIII' in c and 'division' in c][0]

enrollment_col = [c for c in df.columns if '2024' in c and 'enrolled in govt schools' in c][0]

# Create dataframe
comparison_data = pd.DataFrame({
    'State': df['State'],
    'Region': df['Region'],
    'Enrollment': df[enrollment_col],
    'Std_III_Reading': df[std3_reading_col],
    'Std_III_Math': df[std3_math_col],
    'Std_V_Reading': df[std5_reading_col],
    'Std_V_Math': df[std5_math_col],
    'Std_VIII_Reading': df[std8_reading_col],
    'Std_VIII_Math': df[std8_math_col]
})

# Reshape for each standard
std3_data = comparison_data[['State', 'Region', 'Enrollment', 'Std_III_Reading', 'Std_III_Math']].copy()
std3_data.columns = ['State', 'Region', 'Enrollment', 'Reading', 'Math']
std3_data['Standard'] = 'Std III'

std5_data = comparison_data[['State', 'Region', 'Enrollment', 'Std_V_Reading', 'Std_V_Math']].copy()
std5_data.columns = ['State', 'Region', 'Enrollment', 'Reading', 'Math']
std5_data['Standard'] = 'Std V'

std8_data = comparison_data[['State', 'Region', 'Enrollment', 'Std_VIII_Reading', 'Std_VIII_Math']].copy()
std8_data.columns = ['State', 'Region', 'Enrollment', 'Reading', 'Math']
std8_data['Standard'] = 'Std VIII'

# Combine
all_data = pd.concat([std3_data, std5_data, std8_data])

# Calculate R² for each standard
correlations = []
for std in ['Std III', 'Std V', 'Std VIII']:
    std_data = all_data[all_data['Standard'] == std].dropna(subset=['Reading', 'Math'])
    if len(std_data) > 0:
        r, p = stats.pearsonr(std_data['Reading'], std_data['Math'])
        correlations.append({
            'Standard': std,
            'r_squared': r**2,
            'r_text': f'R² = {r**2:.3f}',
            'x_pos': 5,
            'y_pos': 95
        })

corr_df = pd.DataFrame(correlations)

# Your color palette
region_colors = {
    'North': '#77ba99',
    'South': '#ba7798',
    'East': '#9d8bb3',
    'West': '#b39d8b',
    'Central': '#8bb3a6',
    'Northeast': '#b3a68b'
}

# Create dropdown
dropdown = alt.binding_select(
    options=['Std III', 'Std V', 'Std VIII'],
    name='Standard: '
)
selection = alt.selection_point(
    fields=['Standard'],
    bind=dropdown,
    value='Std V'
)

# Bubble chart
bubble_chart = alt.Chart(all_data).mark_circle(
    opacity=0.7,
    stroke='white',
    strokeWidth=1
).encode(
    x=alt.X('Reading:Q',
            title='% Reading Proficiency',
            scale=alt.Scale(domain=[0, 100]),
            axis=alt.Axis(titleFontSize=13, labelFontSize=11)),
    y=alt.Y('Math:Q',
            title='% Math Proficiency',
            scale=alt.Scale(domain=[0, 100]),
            axis=alt.Axis(titleFontSize=13, labelFontSize=11)),
    size=alt.Size('Enrollment:Q',
                  title='% Enrollment',
                  scale=alt.Scale(range=[100, 1000]),
                  legend=alt.Legend(titleFontSize=11, labelFontSize=10)),
    color=alt.Color('Region:N',
                    scale=alt.Scale(
                        domain=list(region_colors.keys()),
                        range=list(region_colors.values())
                    ),
                    legend=alt.Legend(titleFontSize=12, labelFontSize=11)),
    tooltip=[
        alt.Tooltip('State:N', title='State'),
        alt.Tooltip('Region:N', title='Region'),
        alt.Tooltip('Standard:N', title='Standard'),
        alt.Tooltip('Reading:Q', title='Reading %', format='.1f'),
        alt.Tooltip('Math:Q', title='Math %', format='.1f'),
        alt.Tooltip('Enrollment:Q', title='Enrollment %', format='.1f')
    ]
).add_params(selection).transform_filter(selection)

# Regression line
regression = alt.Chart(all_data).mark_line(
    color='#ba7798',
    strokeWidth=3,
    opacity=0.8
).encode(
    x='Reading:Q',
    y='Math:Q'
).transform_filter(selection).transform_regression('Reading', 'Math', method='linear')

# R² text annotation (updates with dropdown)
r_squared_text = alt.Chart(corr_df).mark_text(
    align='left',
    baseline='top',
    dx=0,
    dy=0,
    fontSize=16,
    fontWeight='bold',
    color='#ba7798'
).encode(
    x=alt.value(20),  # Pixel position from left
    y=alt.value(20),  # Pixel position from top
    text='r_text:N'
).transform_filter(selection)

# Combine all layers
final_chart = (bubble_chart + regression + r_squared_text).properties(
    width=700,
    height=600,
    title={
        "text": "Reading vs Math Proficiency Across Indian States (2024)",
        "subtitle": "Do states that excel at reading also excel at math? Trendline shows correlation. Select standard from dropdown.",
        "fontSize": 18,
        "fontWeight": "bold",
        "anchor": "start",
        "subtitleFontSize": 13,
        "subtitleColor": "gray"
    }
).configure_view(
    strokeWidth=0
).configure_axis(
    gridColor='#e0e0e0'
)

# Save

final_chart

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
