In [72]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

df = pd.read_csv('data.csv')
df.head()

Unnamed: 0,OBJECTID,PostalCode,TotalVaccinated,VaccinatedAge75Up,VaccinatedAge65To74,VaccinatedAge55To64,VaccinatedAge45To54,VaccinatedAge35To44,VaccinatedAge25To34,VaccinatedAge15To24,VaccinatedAge0To14,StartDate,CreateDate
0,1,85190,,,,,,,,,,2021/06/16 07:00:00+00,
1,2,85333,,,,,,,,,,2021/06/16 07:00:00+00,
2,3,85139,3.0,0.0,,0.0,0.0,0.0,,0.0,0.0,2021/06/16 07:00:00+00,
3,4,85343,102.0,17.0,17.0,16.0,17.0,16.0,12.0,,0.0,2021/06/16 07:00:00+00,
4,5,85377,2662.0,817.0,936.0,528.0,146.0,72.0,82.0,72.0,,2021/06/16 07:00:00+00,


In [73]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from datetime import datetime

# Data Cleaning and Processing Functions
def clean_data(df):
    df = df.dropna(subset=['TotalVaccinated'])
    df['StartDate'] = pd.to_datetime(df['StartDate'])
    
    age_columns = ['VaccinatedAge75Up', 'VaccinatedAge65To74', 'VaccinatedAge55To64',
                   'VaccinatedAge45To54', 'VaccinatedAge35To44', 'VaccinatedAge25To34',
                   'VaccinatedAge15To24', 'VaccinatedAge0To14']
    df[age_columns] = df[age_columns].fillna(0)
    
    return df

def combine_age_groups(df):
    df['Seniors'] = df['VaccinatedAge75Up'] + df['VaccinatedAge65To74']
    df['Adults'] = (df['VaccinatedAge55To64'] + df['VaccinatedAge45To54'] + 
                   df['VaccinatedAge35To44'] + df['VaccinatedAge25To34'])
    df['Youth'] = df['VaccinatedAge15To24'] + df['VaccinatedAge0To14']
    
    return df

def add_region_mapping(df):
    def assign_region(postal_code):
        if postal_code < 85200:
            return 'Central'
        elif postal_code < 85300:
            return 'East'
        elif postal_code < 85400:
            return 'West'
        else:
            return 'North'
    
    df['Region'] = df['PostalCode'].apply(assign_region)
    return df

def create_pretty_plot(df):
    fig = plt.figure(figsize=(15, 10))
    gs = fig.add_gridspec(2, 2)

    # Plot 1: Age Groups (Top Left)
    ax1 = fig.add_subplot(gs[0, 0])
    age_categories = ['Seniors', 'Adults', 'Youth']
    age_totals = df[age_categories].sum().sort_values(ascending=True)

    colors = sns.color_palette("viridis", len(age_categories))
    bars = ax1.barh(range(len(age_totals)), age_totals.values, color=colors)
    ax1.set_yticks(range(len(age_totals)))
    ax1.set_yticklabels(age_totals.index)
    ax1.set_xlabel('Total Vaccinations')
    ax1.set_title('Vaccinations by Age Category')

    # Add value labels
    for bar in bars:
        width = bar.get_width()
        ax1.text(width, bar.get_y() + bar.get_height()/2, 
                 f'{int(width):,}', 
                 ha='left', va='center')

    # Plot 2: Vaccination Trends (Top Right) - New Plot
    ax2 = fig.add_subplot(gs[0, 1])
    
    # Calculate daily average vaccinations
    daily_avg = df.groupby('StartDate')['TotalVaccinated'].mean()
    
    # Create the main line plot
    ax2.plot(daily_avg.index, daily_avg.values, 
            color='darkblue', 
            linewidth=2, 
            marker='o',
            markersize=6,
            label='Daily Average')
    
    # Add trend line
    z = np.polyfit(range(len(daily_avg)), daily_avg.values, 1)
    p = np.poly1d(z)
    ax2.plot(daily_avg.index, p(range(len(daily_avg))), 
            "r--", 
            alpha=0.8,
            label='Trend Line')
    
    # Customize the trend plot
    ax2.set_title('Average Daily Vaccination Totals')
    ax2.set_xlabel('Date')
    ax2.set_ylabel('Average Number of Vaccinations')
    ax2.grid(True, linestyle='--', alpha=0.7)
    ax2.tick_params(axis='x', rotation=45)
    ax2.legend()
    
    # Add peak annotation
    max_point = daily_avg.max()
    max_date = daily_avg.idxmax()
    ax2.annotate(f'Peak: {int(max_point):,}',
                xy=(max_date, max_point),
                xytext=(10, 10),
                textcoords='offset points',
                ha='left',
                va='bottom',
                bbox=dict(boxstyle='round,pad=0.5', fc='yellow', alpha=0.5),
                arrowprops=dict(arrowstyle='->'))

    # Plot 3: Regional Distribution (Bottom Left)
    ax3 = fig.add_subplot(gs[1, 0])
    regional_totals = df.groupby('Region')['TotalVaccinated'].sum()
    ax3.pie(regional_totals, labels=regional_totals.index, autopct='%1.1f%%',
            colors=sns.color_palette("Set2"))
    ax3.set_title('Distribution of Vaccinations by Region')

    # Plot 4: Age Distribution by Region (Bottom Right)
    ax4 = fig.add_subplot(gs[1, 1])
    regional_age_dist = df.groupby('Region')[age_categories].sum()
    regional_age_dist.plot(kind='bar', ax=ax4)
    ax4.set_title('Age Distribution by Region')
    ax4.set_ylabel('Total Vaccinations')
    plt.xticks(rotation=45)

    plt.suptitle('Maricopa County COVID-19 Vaccination Analysis\nJune 2021 - October 2021', 
                 fontsize=14, y=1.02)
    plt.tight_layout()

    plt.savefig('pretty_plot.png', dpi=300, bbox_inches='tight')
    plt.close()
def create_ugly_plot(df):
    # Create a figure with inappropriate dimensions (scale issue)
    plt.figure(figsize=(10, 8))  # Too wide and short
    
    # Create multiple subplots with poor layout
    plt.subplot(121)
    
    # Wrong plot type for time series data
    time_data = df.groupby('StartDate')['TotalVaccinated'].sum()
    plt.pie(time_data.values[-10:],  # Using only last 10 values without context
            labels=[str(d.date()) for d in time_data.index[-10:]],
            colors=['red', 'yellow', 'green', 'blue'] * 3)  # Repeating harsh colors
    
    # Error 1: Inappropriate plot type for temporal data
    # Error 2: Cut-off labels due to poor spacing
    plt.title('vacination trends\nover time!!', 
             color='purple',
             pad=-20)  # Negative padding causes overlap
    
    # Add second subplot with different scale
    plt.subplot(122)
    
    # Error 3: Mixed scales and units
    age_data = df[['Seniors', 'Adults', 'Youth']].sum() / 1000000  # Inconsistent scaling
    plt.bar(range(len(age_data)), age_data, 
           width=0.1)  # Too narrow bars
    
    # Error 4: Unreadable text and poor alignment
    plt.xticks(range(len(age_data)), 
               ['OLD\nPPL', 'MIDDLE\nAGE', 'KIDS'],  # Informal and inconsistent labels
               rotation=75,  # Awkward rotation
               fontsize=6)  # Too small font
    
    # Error 5: Missing or misleading axis labels
    plt.ylabel('millions (maybe)?')
    
    # Error 6: Overwhelming grid
    plt.grid(True, which='both', color='green', 
             linestyle='-', linewidth=2)
    
    # Error 7: Random text overlapping data
    plt.text(0.5, max(age_data)/2, 
             'IMPORTANT DATA!!!', 
             fontsize=20, 
             color='red',
             rotation=30)
    
    # Error 8: Poor use of space
    plt.subplots_adjust(wspace=0.8)  # Too much space between subplots
    
    # Error 9: Low resolution output
    plt.savefig('ugly_plot.png', dpi=50, bbox_inches=None)  # Intentionally poor resolution
    plt.close()


def main():
    # Read and process data
    df = pd.read_csv('data.csv')
    df = clean_data(df)
    df = combine_age_groups(df)
    df = add_region_mapping(df)
    
    # Create plots
    create_pretty_plot(df)
    create_ugly_plot(df)
    
    print("Plots have been created successfully!")

if __name__ == "__main__":
    main()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['StartDate'] = pd.to_datetime(df['StartDate'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[age_columns] = df[age_columns].fillna(0)


Plots have been created successfully!


In [74]:
from fpdf import FPDF
import textwrap

def create_pdf_report():
    class PDF(FPDF):
        def chapter_title(self, title):
            self.set_font('Arial', 'B', 15)
            self.cell(0, 10, title, ln=True)
            self.ln(10)

        def chapter_body(self, body):
            self.set_font('Arial', '', 12)
            lines = textwrap.wrap(body, width=90)
            for line in lines:
                self.multi_cell(0, 10, line)
            self.ln()

    # Initialize PDF
    pdf = PDF()
    
    # Pretty Plot Page
    pdf.add_page()
    pdf.image('pretty_plot.png', x=10, y=20, w=190)
    
    # Add caption for pretty plot
    pretty_caption = (
        "Figure 1: Comprehensive analysis of COVID-19 vaccinations in Maricopa County from "
        "June to October 2021. The visualization shows: "
        "(A) Total vaccinations by age category, demonstrating higher vaccination rates among seniors and adults compared to youth; "
        "(B) Temporal trends across regions, revealing varying patterns of vaccine uptake; "
        "(C) Regional distribution of total vaccinations, highlighting geographic patterns; "
        "and (D) Age distribution patterns across different regions. Data has been cleaned "
        "and aggregated to show clear demographic and geographic patterns in vaccination rates."
    )
    pdf.set_y(-55)
    pdf.set_font('Arial', '', 11)
    pdf.multi_cell(0, 5, pretty_caption)
    
    # Ugly Plot Page
    pdf.add_page()
    pdf.image('ugly_plot.png', x=10, y=20, w=190)
    
    # Add intentionally bad caption for ugly plot
    ugly_caption = (
        "fig 1: vacination data showing stuff from maricopa county ZIP CODES!!! "
        "the left thing shows time data (maybe?) & the right thing shows age groups "
        "or something... IMPORTANT ANALYSIS!!! done in 2021!!!!!!"
    )
    pdf.set_y(-45)
    pdf.set_font('Arial', '', 11)
    pdf.multi_cell(0, 5, ugly_caption)
    
    # Code and Documentation Page
    pdf.add_page()
    pdf.set_font('Arial', 'B', 14)
    pdf.cell(0, 10, 'Question Analysis', ln=True)
    pdf.ln(5)

    # Analysis
    documentation = [
        {
            'title': 'Q1. How do the vaccination totals between age groups compare?:',
            'content': (
                "- Adults have the highest vaccination totals, followed by Seniors and Youth. "
                "This indicates that vaccination efforts have been successful in reaching adults "
                "and that vaccination rates are high among this age group.\n"
                "- Among the Adults, in the East region, the vaccination rates are the highest."
                "followed by the Central region and then the West region. "
            )
        },
        {
            'title': 'Q2. How have the average vaccination totals changed over time in Maricopa County? ',
            'content': (
                "- The average vaccination totals have been increasing over time, "
                "peaking around February-March 2022.\n"
                "- The trend line shows a gradual increase in vaccination totals, "
                "indicating a positive trend in vaccination efforts.\n"
                "- We can see a dip in the vaccination trend in the month of August 2021.\n"
            )
        }
    ]
    
    pdf.set_font('Arial', '', 11)
    for section in documentation:
        pdf.set_font('Arial', 'B', 12)
        pdf.cell(0, 10, section['title'], ln=True)
        pdf.set_font('Arial', '', 11)
        pdf.multi_cell(0, 6, section['content'])
        pdf.ln(5)

    pdf.add_page()
    pdf.set_font('Arial', 'B', 14)
    pdf.cell(0, 10, 'Data Processing and Visualization Steps', ln=True)
    pdf.ln(5)
    # Adding the documentation
    documentation = [
        {
            'title': '1. Data Preparation:',
            'content': (
                "- Input: 'data.csv' containing Maricopa County vaccination data\n"
                "- Cleaned missing values in vaccination columns\n"
                "- Converted dates to datetime format\n"
                "- Created age group categories (Seniors, Adults, Youth)"
            )
        },
        {
            'title': '2. Data Processing:',
            'content': (
                "- Combined age groups:\n"
                "  * Seniors: 75+ and 65-74\n"
                "  * Adults: 55-64, 45-54, 35-44, and 25-34\n"
                "  * Youth: 15-24 and 0-14\n"
                "- Created regional mapping based on postal codes"
            )
        },
        {
            'title': '3. Required Packages:',
            'content': (
                "- pandas\n"
                "- matplotlib\n"
                "- seaborn\n"
                "- numpy\n"
                "- fpdf"
            )
        },
        {
            'title': '4. Execution Steps:',
            'content': (
                "1. Install required packages:\n"
                "   pip install pandas matplotlib seaborn numpy fpdf\n"
                "2. Place data.csv in the same directory as the script\n"
                "3. Run the Python script to generate visualizations and PDF report"
            )
        }
    ]
    
    pdf.set_font('Arial', '', 11)
    for section in documentation:
        pdf.set_font('Arial', 'B', 12)
        pdf.cell(0, 10, section['title'], ln=True)
        pdf.set_font('Arial', '', 11)
        pdf.multi_cell(0, 6, section['content'])
        pdf.ln(5)
    
    # Save the PDF
    pdf.output('vaccination_analysis.pdf')

def main():
    create_pdf_report()
    print("PDF report has been generated successfully!")

if __name__ == "__main__":
    main()

PDF report has been generated successfully!
