# Smoking in the US

The following visualizations are designed to explore patterns of cigarette and vape use, along with demographic and behavioral factors. These charts are based on survey data collected over several years, focusing on the usage of tobacco products, variations by age, gender, and race, and socioeconomic factors. Each chart is crafted with care to provide insights into different aspects of tobacco use.

In [None]:
import pandas as pd
import altair as alt
from pathlib import Path

In [None]:
# Load both survey extracts: the multi-year file first, then the 2020-21 file.
df, df2021 = (
    pd.read_csv(csv_path)
    for csv_path in ("../data/smoking.csv", "../data/smoking_20-21.csv")
)

In [None]:
# Recode numeric survey codes into readable labels.
# Every mapping goes through Series.replace, so a code that is absent from its
# mapping is left exactly as it was.
income_labels = {
    1: '1: Less than $10,000',
    2: '2: $10,000 - $19,999',
    3: '3: $20,000 - $29,999',
    4: '4: $30,000 - $39,999',
    5: '5: $40,000 - $49,999',
    6: '6: $50,000 - $74,999',
    7: '7: $75,000 or more',
}
sex_labels = {1: 'Male', 2: 'Female'}
race_labels = {
    1: 'White',
    2: 'Black/African American',
    3: 'Native American Alaska Native',
    4: 'Native Hawaiian/Other Pacific Islanders',
    5: 'Asian',
    6: 'More than One Race',
    7: 'Hispanic',
}
age_labels = {
    1: '12-17 Years Old',
    2: '18-25 Years Old',
    3: '26-34 Years Old',
    4: '35-49 Years Old',
    5: '50 or Older',
}
# CIGEVER: turn code 2 into 0 so the column can be averaged as a 0/1 flag.
ever_smoked_codes = {2: 0}

for _column, _mapping in (
    ('IRPINC3', income_labels),
    ('IRSEX', sex_labels),
    ('NEWRACE2', race_labels),
    ('CATAG3', age_labels),
    ('CIGEVER', ever_smoked_codes),
):
    df[_column] = df[_column].replace(_mapping)

# Bin the CIG30AV codes into four consumption bands. Bins are right-closed:
# (0, 3], (3, 4], (4, 6], (6, 7]; anything outside (0, 7] becomes NaN.
consumption_edges = [0, 3, 4, 6, 7]
consumption_labels = ['0.5 pack or less', '0.5 pack to 1 pack', '1 pack to 2 packs', '2 packs or more']
df['CIG30AV_grouped'] = pd.cut(df['CIG30AV'], bins=consumption_edges, labels=consumption_labels, right=True)

# The charts refer to this column as "Gender".
df = df.rename(columns={'IRSEX': 'Gender'})

In [None]:
# Subsets used by the charts below.
# `smoke`: rows with CIGEVER == 1 whose CIGTRY is at most 900
# (NOTE(review): values above 900 are presumably survey special codes — confirm in the codebook).
smoke = df.loc[(df['CIGEVER'] == 1) & (df['CIGTRY'] <= 900)]
# `smoke_2021`: the 2021 rows of `smoke`.
smoke_2021 = smoke.loc[smoke['year'].eq(2021)]
# `current_smoker`: the rows of `smoke` with CIGREC == 1.
current_smoker = smoke.loc[smoke['CIGREC'].eq(1)]

In [None]:
# Data for pie chart
# df2021 is not recoded, so CATAG3 == 1 is still the raw code that the recode
# step labels '12-17 Years Old'. Rows with VAPANYEVR >= 80 are dropped
# (NOTE(review): presumably non-response codes — confirm in the codebook).
#
# Rows and columns are selected in a single .loc call, and replace()/rename()
# return new frames, so nothing is assigned onto a slice of df2021 (the
# original chained selection followed by column assignment triggers pandas'
# SettingWithCopyWarning).
df2021_youth = (
    df2021.loc[
        (df2021['CATAG3'] == 1) & (df2021['VAPANYEVR'] < 80),
        ['CIGEVER', 'VAPANYEVR', 'year'],
    ]
    # Per-column recode: 2 -> 0, so 1 means "ever used" and 0 "never used".
    .replace({'CIGEVER': {2: 0}, 'VAPANYEVR': {2: 0}})
    .rename(columns={'CIGEVER': 'Cigarettes', 'VAPANYEVR': 'Vapes'})
)
def prepare_pie_data(df, var):
    """
    Count respondents per (year, usage code) for one product column.

    df  -- frame with a ``year`` column and the 0/1-coded column named by ``var``
    var -- name of the product column; also written into the ``Product`` column

    Returns a frame with columns ``year``, ``var``, ``count``, ``Usage``
    ('Ever Used' for 1, 'Never Used' for 0, NaN for any other code) and
    ``Product``.
    """
    usage_labels = {0: 'Never Used', 1: 'Ever Used'}

    group_sizes = df.groupby(['year', var]).size()
    tally = group_sizes.reset_index(name='count')

    return tally.assign(
        Usage=tally[var].map(usage_labels),
        Product=var,
    )

# One tally per product, built from the same youth frame.
cig_data, vap_data = (
    prepare_pie_data(df2021_youth, product_column)
    for product_column in ('Cigarettes', 'Vapes')
)

# Stack the two tallies into one long table (original row labels are kept).
pie_data = pd.concat([cig_data, vap_data])

# Share of each usage answer within its (year, product) group.
counts_by_pie = pie_data.groupby(['year', 'Product'])['count']
pie_data['percentage'] = counts_by_pie.transform(lambda counts: counts / counts.sum())

In [None]:
# Turn off Altair's row-count check so large data frames can be passed to charts.
# NOTE(review): the default limit is 5000 rows per the Altair docs — confirm for the installed version.
alt.data_transformers.disable_max_rows()

def year_everuse(df):
    """
    Line chart: share of respondents per year who have ever smoked cigarettes.

    df must provide ``year`` and ``CIGEVER`` columns. The y value is the
    per-year mean of ``CIGEVER``, which reads as a proportion when the column
    holds 1 for "ever smoked" and 0 otherwise.

    Returns the configured Altair chart.
    """
    heading = {
        "text": ['Trend of Cigarette Smoking Experience Over Years'],
        "subtitle": ["The smoking population is decreasing over years."],
    }
    title_style = dict(
        anchor='start',  # Title starts from left
        fontSize=24,
        font='Roboto, sans-serif',
        subtitleFont='Roboto, sans-serif',
        subtitleFontSize=16,
    )

    x_year = alt.X('year:N', title='Year', axis=alt.Axis(labelAngle=0))
    # Fixed 0-80% domain, formatted as percentages.
    y_share = alt.Y(
        'mean(CIGEVER):Q',
        scale=alt.Scale(domain=[0, 0.8]),
        title='Percentage of Participants Who Ever Smoked',
    ).axis(format='%', tickCount=5)

    trend = alt.Chart(df).mark_line(point=True, color="#367588").encode(x=x_year, y=y_share)
    trend = trend.configure_axis(grid=False)  # No grid
    trend = trend.configure_view(stroke=None)
    trend = trend.properties(title=heading, width=400)
    return trend.configure_title(**title_style)

In [None]:
# Render the ever-smoked trend from the full recoded frame.
year_everuse(df)

This line chart visualizes the proportion of people who have ever smoked cigarettes by year. Over the years, the proportion of people who have ever smoked shows a slight downward trend, indicating that fewer people are initiating smoking, which could be due to effective public health campaigns, policy changes, or shifts in societal attitudes toward smoking.

In [None]:
def year_firstage(df):
    """
    Line chart: average age when first smoked cigarette by year and gender.

    df must provide ``year``, ``CIGTRY`` and ``Gender`` columns. The mean of
    ``CIGTRY`` is drawn per year with one line per gender, and a grey vertical
    rule marks 2019.

    Returns the configured, layered Altair chart.
    """
    # Both layers share the x axis, so both get the same explicit title.
    # Without it the axis falls back to the raw field name "year", unlike the
    # other charts in this notebook.
    year_title = 'Year'

    line_first = alt.Chart(df).mark_line(point=True).encode(
        alt.Y("mean(CIGTRY)", title='Average Age', scale=alt.Scale(domain=[12, 18])),
        alt.X("year:N", title=year_title, axis=alt.Axis(labelAngle=0)),
        alt.Color("Gender:N", scale=alt.Scale(
            domain=['Female', 'Male'],  # Ensure correct mapping to gender
            range=['#367588', '#dd644e']))
    )
    # Vertical rule at 2019
    rule_2019 = alt.Chart(pd.DataFrame({'year': [2019]})).mark_rule(
        color='grey',
    ).encode(
        x=alt.X('year:N', title=year_title)
    )

    # Combine line chart and rule
    chart = (line_first + rule_2019).configure_axis(
        grid=False
    ).configure_view(
        stroke=None
    ).properties(
        title={
            "text": ['Average Age When First Smoked Cigarette by Year'],
            "subtitle": ['Increasing over time no matter which gender, especially after "Tobacco 21" (2019).']
        },
        width=600
    ).configure_title(
        anchor='start',
        fontSize=24,
        font='Roboto, sans-serif',
        subtitleFont='Roboto, sans-serif',
        subtitleFontSize=16
    )

    return chart

In [None]:
# Use the `smoke` subset (CIGEVER == 1 and CIGTRY <= 900) so only filtered first-use ages are averaged.
year_firstage(smoke)

This chart shows a gradual increase in the average age at which individuals first smoked cigarettes from 2015 to 2021, for both genders. Notably, there is a slight uptick after 2019, aligning with the implementation of the "Tobacco 21" policy, which raised the legal purchasing age for tobacco products to 21. This suggests that the policy may have contributed to delaying the initiation age for smoking, with both males and females showing similar trends over time.

In [None]:
def vap_cig(pie_data):
    """
    Faceted pie charts of cigarette and vape use among the young generation.

    pie_data is a long-format frame with ``year``, ``Product``, ``Usage``,
    ``count`` and ``percentage`` columns. One pie is drawn per product (row)
    and year (column); each slice is labelled with its ``percentage``.

    Returns the configured, faceted Altair chart.
    """
    base = alt.Chart(pie_data).encode(
        # stack=True applies the arcs' polar stacking to the text layer too.
        # Arc marks stack by default but text marks do not, so without it the
        # labels are not placed on their own slice.
        theta=alt.Theta('count:Q', title='Proportion', stack=True),
        color=alt.Color('Usage:N', title='Ever Used').scale(
            domain=['Ever Used', 'Never Used'],
            range=['#dd644e', '#367588'])
    )

    pie_chart = base.mark_arc(outerRadius=80)
    text = base.mark_text(radius=100, size=10, align="center").encode(
        # percentage is numeric, so it is typed quantitative for the number format.
        text=alt.Text("percentage:Q", format=".1%"),
        color=alt.value("#605E5C"))

    combined_chart = (pie_chart + text).facet(
        row=alt.Row('Product:N', title='Percentage of Product Use'),
        column=alt.Column('year:N', title='Year')
    ).resolve_scale(
        # Each facet gets its own theta scale so every pie closes to a full
        # circle even when the respondent totals differ between facets.
        theta='independent'
    ).properties(
        title={
            "text": ['Cigarettes vs Vapes Usage among 12-17 year old in 2020 and 2021'],
            "subtitle":['Vape usage shows higher prevalence than cigarette usage among youth, with slight decreases in both from 2020 to 2021.']
        }
    ).configure_title(
        anchor='start',
        fontSize=24,
        font='Roboto, sans-serif',
        subtitleFont='Roboto, sans-serif',
        subtitleFontSize=16
    )
    return combined_chart

In [None]:
# Render the faceted pies from the combined cigarette/vape tally.
vap_cig(pie_data)

This pie chart visualizes the proportion of 12-17 year-old individuals who have ever used versus never used cigarettes and vape products for the years 2020 and 2021. For cigarettes, the majority of respondents reported never using them, with a slight decline in ever-use from 7.7% in 2020 to 6.4% in 2021. In contrast, vape usage shows a notably higher percentage of ever-users, though it also decreased from 18.4% in 2020 to 14.8% in 2021. This suggests that while vapes are more commonly used than cigarettes in this age group, both products experienced a decrease in usage over the two years.

In [None]:
def _smoker_share_by_race(df):
    """Tabulate, per NEWRACE2 value, the row count, the CIGREC == 1 count and their ratio."""
    is_current = df['CIGREC'].eq(1)
    summary = (
        is_current.groupby(df['NEWRACE2'])
        # 'sum' over the boolean flag counts current smokers; a group with
        # none gets 0 rather than a missing value.
        .agg(Total='size', Smokers='sum')
        .reset_index()
    )
    summary['Smoker_Percentage'] = summary['Smokers'] / summary['Total']
    return summary


def race_smoke(df):
    """
    Bar chart: share of current smokers (CIGREC == 1) within each race group.

    df must provide ``NEWRACE2`` and ``CIGREC`` columns. Bars are sorted by
    share in descending order; race groups without any current smoker are
    drawn at 0% instead of being dropped.

    Returns the configured Altair chart.
    """
    current = _smoker_share_by_race(df)

    # Every group is grey except 'Native American Alaska Native', which is
    # highlighted to match the subtitle.
    color_scale = alt.Scale(
        domain=['White', 'Black/African American', 'Asian', 'Hispanic',
                'More than One Race', 'Native American Alaska Native', 'Native Hawaiian/Other Pacific Islanders'],
        range=['#d3d3d3', '#d3d3d3', '#d3d3d3', '#d3d3d3', '#d3d3d3', '#367588', '#d3d3d3']  # Adjusted colors
    )
    chart = alt.Chart(current).mark_bar().encode(
        x=alt.X('NEWRACE2:N', title='Race',
                sort=alt.EncodingSortField(field="Smoker_Percentage", order="descending"),
                axis=alt.Axis(labelAngle=-50)),
        y=alt.Y('Smoker_Percentage:Q', title='Percentage of Current Smokers').axis(format='%'),
        color=alt.Color('NEWRACE2:N', title='Race', scale=color_scale, legend=None)
    ).properties(
        title={
            "text": ['Percentage of Current Smokers by Race'],
            "subtitle":['Native American and Alaska Native individuals have the highest smoking rates, while Asian individuals report the lowest.']
        }
    ).configure_title(
        anchor='start',
        fontSize=24,
        font='Roboto, sans-serif',
        subtitleFont='Roboto, sans-serif',
        subtitleFontSize=16
    )
    return chart

In [None]:
# Pass the full frame: the denominator for each race is every row, not only smokers.
race_smoke(df)

This bar chart displays the percentage of current smokers among various racial groups. Native American/Alaska Native individuals have the highest percentage of current smokers, while Asian populations show the lowest smoking rates. The data reveals racial disparities in smoking behavior, potentially influenced by socioeconomic and cultural factors.

In [None]:
def violin_frequency(df, var):
    """
    Violin chart: distribution of ``var`` within each age group.

    A density of ``var`` is estimated separately for each ``CATAG3`` value and
    drawn as a centred horizontal area, one column per age group.

    df  -- frame with a ``CATAG3`` column and the column named by ``var``
    var -- name of the column whose density is plotted along the y axis

    Returns the configured Altair chart.
    """
    heading = {
        "text": ['Cigarette Use Frequency (Days Smoked in Past 30 Days) Across Different Age Groups'],
        "subtitle": ["Smoking intensity increases with age, showing a trend toward heavier cigarette use in older age groups."],
    }
    title_style = dict(
        anchor='start',
        fontSize=24,
        font='Roboto, sans-serif',
        subtitleFont='Roboto, sans-serif',
        subtitleFontSize=16,
    )

    # Per-age-group density; the output keeps `var` and adds a 'density' field.
    densities = alt.Chart(df).transform_density(
        var,
        as_=[var, 'density'],
        groupby=['CATAG3'],
    )

    # Centre-stacking the density gives the symmetric violin outline; the
    # x axis itself carries no labels.
    width_channel = (
        alt.X('density:Q')
        .stack('center')
        .impute(None)
        .title(None)
        .axis(labels=False, values=[0], grid=False, ticks=True)
    )
    value_channel = alt.Y(var, title=" Days Smoked in Past 30 Days")
    shade_channel = alt.Color('CATAG3:N', legend=None).scale(scheme="tealblues")
    panel_channel = (
        alt.Column('CATAG3:N', title="Age Group")
        .spacing(0)
        .header(titleOrient='bottom', labelOrient='bottom', labelPadding=0)
    )

    violin = densities.mark_area(orient='horizontal').encode(
        width_channel,
        value_channel,
        shade_channel,
        panel_channel,
    )
    violin = violin.configure_axis(grid=False)
    violin = violin.configure_view(stroke=None)
    violin = violin.properties(title=heading)
    return violin.configure_title(**title_style)

In [None]:
# 2021 rows only, keeping CIG30USE values below 90
# (NOTE(review): values >= 90 are presumably survey special codes — confirm in the codebook).
violin_frequency(smoke_2021.loc[smoke_2021['CIG30USE']<90], 'CIG30USE')

This violin plot demonstrates the distribution of cigarette use across different age groups. The 26-34 years old group shows the broadest distribution, with higher frequencies of both light and heavy smoking. The 18-25 group has a narrower, lighter use pattern, while the 50+ group shows more sporadic but heavier smoking behaviors.

In [None]:
def stacked_cigarette_use(df):
    """
    Stacked bar chart: daily cigarette consumption bands by age group.

    df must provide ``CATAG3`` and ``CIG30AV_grouped`` columns. Each bar is one
    age group, split by consumption band and normalised so the segments show
    proportions of that group's rows.

    Returns the configured Altair chart.
    """
    heading = {
        "text": ['Daily Cigarette Consumption Distribution by Age Group'],
        "subtitle": ["Similar to the previous trend, older age groups demonstrate heavier cigarette use, with younger age groups smoking fewer cigarettes per day."],
    }
    title_style = dict(
        anchor='start',
        fontSize=24,
        font='Roboto, sans-serif',
        subtitleFont='Roboto, sans-serif',
        subtitleFontSize=16,
    )

    age_channel = alt.X('CATAG3:N', title='Age Group', axis=alt.Axis(labelAngle=0))
    # stack='normalize' rescales every bar to the same total height.
    share_channel = alt.Y('count()', stack='normalize', title='Proportion of Smokers')
    band_channel = alt.Color(
        'CIG30AV_grouped:N',
        title='Avg Cigarettes Smoked Per Day',
        scale=alt.Scale(scheme='tealblues'),
    )

    stacked_bar = alt.Chart(df).mark_bar().encode(
        x=age_channel,
        y=share_channel,
        color=band_channel,
    )
    stacked_bar = stacked_bar.configure_view(stroke=None)
    stacked_bar = stacked_bar.properties(title=heading, width=600, height=400)
    return stacked_bar.configure_title(**title_style)

In [None]:
# 2021 rows only, keeping CIG30AV values below 90
# (NOTE(review): values >= 90 are presumably survey special codes — confirm in the codebook).
stacked_cigarette_use(smoke_2021.loc[smoke_2021['CIG30AV']<90])

This stacked bar chart shows the proportion of smokers across different age groups, categorized by the average number of cigarettes smoked per day. Younger age groups tend to smoke fewer cigarettes per day, while middle-aged individuals (35-49 years old) have a higher proportion of heavy smokers. The trend reveals a clear gradation where older age groups are more likely to smoke larger quantities of cigarettes daily.

In [None]:
def tobacco_use(df):
    """
    Line chart: yearly share of respondents who ever used each tobacco product.

    df must provide ``year`` plus the ``CIGEVER``, ``SMKLSSEVR``, ``CIGAREVR``
    and ``PIPEVER`` columns. For each year and product the chart shows the
    fraction of rows whose code equals 1, with the product name written next
    to the last point of its line instead of a legend.

    Returns the configured, layered Altair chart.
    """
    # Display name -> survey column.
    product_columns = {
        'Cigarettes': 'CIGEVER',
        'Smokeless': 'SMKLSSEVR',
        'Cigars': 'CIGAREVR',
        'Pipes': 'PIPEVER',
    }

    def share_coded_one(codes):
        # Fraction of rows in the group whose code is exactly 1.
        return (codes == 1).mean()

    yearly_share = df.groupby('year').agg(
        **{label: (column, share_coded_one) for label, column in product_columns.items()}
    ).reset_index()

    # Long format: one row per (year, product).
    melted_data = yearly_share.melt(
        id_vars='year',
        value_vars=list(product_columns),
        var_name='Product',
        value_name='Percentage',
    )

    year_channel = alt.X('year:O', title='Year', axis=alt.Axis(labelAngle=0))
    share_channel = alt.Y('Percentage:Q', title='Percentage of Ever Use').axis(format='%', tickCount=5)
    product_channel = alt.Color('Product:N', title='Product', legend=None).scale(scheme="tealblues")
    line_chart = alt.Chart(melted_data).mark_line(point=True).encode(
        x=year_channel,
        y=share_channel,
        color=product_channel,
    )

    # One label per product, anchored at the latest year and that year's value.
    label = alt.Chart(melted_data).encode(
        x='max(year):O',
        y=alt.Y('Percentage:Q').aggregate(argmax='year'),
        text='Product',
    )
    text = label.mark_text(align='left', dx=10)

    heading = {
        "text": ['Percentage of Ever Use of Cigarettes, Smokeless Tobacco, Cigars, and Pipes by Year'],
        "subtitle": ['Cigarette use shows the steepest decline among all tobacco products, while other forms of tobacco exhibit more gradual decreases over time.'],
    }
    title_style = dict(
        anchor='start',
        fontSize=24,
        font='Roboto, sans-serif',
        subtitleFont='Roboto, sans-serif',
        subtitleFontSize=16,
    )

    combined = (line_chart + text).configure_axis(grid=False)
    combined = combined.configure_view(stroke=None)
    combined = combined.properties(title=heading, width=700, height=400)
    return combined.configure_title(**title_style)


In [None]:
# Pass the full frame so each yearly share is taken over every row of that year.
tobacco_use(df)

This line chart tracks the percentage of ever use of various tobacco products (Cigarettes, Smokeless Tobacco, Cigars, and Pipes) from 2015 to 2021. Cigarette use is consistently the highest, though it shows a steady decline. Use of cigars and smokeless tobacco remains stable with a slight decrease, while the use of pipes is relatively rare and shows minimal change.