In [None]:
from pathlib import Path
import pandas as pd

# Directory holding the processed data, resolved relative to the working directory.
BASE = Path("..", "data", "processed")
data_path = BASE.joinpath("combined_state_summary_2022.csv")

# Read the CSV into a DataFrame and preview its first rows.
df = pd.read_csv(data_path)
df.head()


In [None]:
from pathlib import Path
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Apply seaborn's "whitegrid" theme to every figure drawn from here on.
# set_theme() is the preferred interface; sns.set() is only an alias for it.
sns.set_theme(style="whitegrid")

# Directory holding the processed data, resolved relative to the working directory.
# NOTE(review): this load repeats the one in the first cell; one of the two
# can be dropped once the notebook's import/setup cells are consolidated.
BASE = Path("..") / "data" / "processed"
data_path = BASE / "combined_state_summary_2022.csv"

# Read the CSV into a DataFrame and preview its first rows.
df = pd.read_csv(data_path)
df.head()


In [None]:
# Only a cell's final expression is rendered automatically, so the shape and
# column names are printed explicitly instead of being silently discarded.
print(df.shape)
print(df.columns.tolist())

# Summary statistics; as the last expression, this is rendered as a table.
df.describe()


In [None]:
# Five rows with the largest incidence_rate_adj, trimmed to state name and rate.
(
    df.nlargest(5, "incidence_rate_adj")
    .loc[:, ["State", "incidence_rate_adj"]]
)


In [None]:
# Five rows with the smallest incidence_rate_adj, trimmed to state name and rate.
(
    df.nsmallest(5, "incidence_rate_adj")
    .loc[:, ["State", "incidence_rate_adj"]]
)


In [None]:
# Five rows with the largest obesity_rate, trimmed to state name and rate.
(
    df.nlargest(5, "obesity_rate")
    .loc[:, ["State", "obesity_rate"]]
)


In [None]:
# Five rows with the largest smoking_rate, trimmed to state name and rate.
(
    df.nlargest(5, "smoking_rate")
    .loc[:, ["State", "smoking_rate"]]
)


In [None]:
# Histogram of incidence_rate_adj with a KDE curve overlaid.
fig, ax = plt.subplots(figsize=(10, 5))
sns.histplot(df["incidence_rate_adj"], kde=True, ax=ax)

# Explicit labels replace the default axis label taken from the column name.
ax.set_title("Distribution of Age-Adjusted Colon & Rectum Cancer Incidence (2022)")
ax.set_xlabel("Incidence rate per 100,000")
ax.set_ylabel("Number of states")
plt.show()


In [None]:
# Scatter of obesity_rate against incidence_rate_adj with a fitted regression line.
fig, ax = plt.subplots(figsize=(8, 6))
sns.regplot(x="obesity_rate", y="incidence_rate_adj", data=df, ax=ax)
ax.set(
    title="Obesity vs Colon & Rectum Cancer Incidence (2022)",
    xlabel="Obesity rate (%)",
    ylabel="Incidence rate per 100,000",
)
plt.show()


In [None]:
# Scatter of smoking_rate against incidence_rate_adj with a fitted regression line.
fig, ax = plt.subplots(figsize=(8, 6))
sns.regplot(x="smoking_rate", y="incidence_rate_adj", data=df, ax=ax)
ax.set(
    title="Smoking vs Colon & Rectum Cancer Incidence (2022)",
    xlabel="Smoking rate (%)",
    ylabel="Incidence rate per 100,000",
)
plt.show()


In [None]:
# Pairwise correlations between the two risk-factor rates and the two incidence columns.
corr_columns = ["obesity_rate", "smoking_rate", "incidence_rate_adj", "incidence_cases"]
corr_matrix = df[corr_columns].corr()

# Colour scale is pinned to [-1, 1] so the colormap is centred on zero correlation.
fig, ax = plt.subplots(figsize=(7, 5))
sns.heatmap(corr_matrix, annot=True, cmap="coolwarm", vmin=-1, vmax=1, ax=ax)
ax.set_title("Correlation Matrix")
plt.show()


In [None]:
# Scatter of obesity_rate against incidence_rate_adj, each point labelled
# with the value from the "Abbrev" column.
fig, ax = plt.subplots(figsize=(12, 8))
ax.scatter(df["obesity_rate"], df["incidence_rate_adj"])

# Skip rows missing a coordinate or a label: a text label cannot be placed
# at a NaN position, and a missing abbreviation has nothing to show.
labelled = df.dropna(subset=["obesity_rate", "incidence_rate_adj", "Abbrev"])

# Small shift (in data units) so a label does not sit on top of its marker.
label_offset = 0.1

# Iterate the three columns in lockstep rather than via iterrows(), which
# builds a full Series per row just to read three values from it.
for x_val, y_val, abbrev in zip(
    labelled["obesity_rate"],
    labelled["incidence_rate_adj"],
    labelled["Abbrev"],
):
    ax.text(x_val + label_offset, y_val + label_offset, abbrev, fontsize=8)

ax.set_xlabel("Obesity rate (%)")
ax.set_ylabel("Incidence rate per 100,000")
ax.set_title("Obesity vs Colon & Rectum Cancer Incidence by State (2022)")
plt.show()
