## Diabetes Health Indicators (BRFSS 2015): Exploratory Analysis

In [None]:
# Import dependencies
import pandas as pd
import os
import matplotlib.pyplot as plt
import numpy as np

In [None]:
# Load the survey data. pd.read_csv already returns a DataFrame, so no extra
# pd.DataFrame(...) conversion is needed. The path is relative, so it is
# resolved against the notebook's working directory.
diabetes_df = pd.read_csv("diabetes_indicators.csv")

# List the columns so we know how to clean it up
diabetes_df.columns

## William

In [None]:
# Narrow the full table down to the variables of interest.
# .copy() makes df_trimmed an independent frame, so later edits to it
# never write back into diabetes_df.
columns_of_interest = [
    "Diabetes_012",
    "Income",
    "Education",
    "Age",
    "HvyAlcoholConsump",
    "Fruits",
    "Veggies",
    "CholCheck",
    "HighChol",
    "PhysActivity",
    "BMI",
]
df_trimmed = diabetes_df.loc[:, columns_of_interest].copy()
df_trimmed.head()

In [None]:
# Count respondents for every (fruit consumption, diabetes status) combination
dia_fruits = (
    df_trimmed
    .groupby(["Fruits", "Diabetes_012"])
    .size()
    .reset_index(name="count")
)

# Express each count as a percentage of its own fruit-consumption group.
# transform("sum") broadcasts the per-group total back onto every row, so the
# percentage is computed in one step for however many "Fruits" categories exist,
# instead of slicing out each category and repeating the same arithmetic.
group_totals = dia_fruits.groupby("Fruits")["count"].transform("sum")
fruitstack = dia_fruits.assign(proportion=(dia_fruits["count"] / group_totals) * 100)
fruitstack

In [None]:
# Reshape fruitstack: one row per fruit-consumption group, one column per diabetes status
pivot_df = fruitstack.pivot(index='Fruits', columns='Diabetes_012', values='proportion')

# Relabel rows and columns through explicit code -> label mappings instead of
# assigning a list by position: a label can never land on the wrong column, and
# a category that happens to be absent no longer raises a length-mismatch error.
status_labels = {0.0: 'No Diabetes', 1.0: 'Prediabetes', 2.0: 'Diabetes'}
index_mapping = {0.0: 'Consume fruit less than once per day', 1.0: 'Consume fruit once or more per day'}
pivot_df = pivot_df.rename(index=index_mapping, columns=status_labels)

# Grouped bar chart: one cluster per fruit-consumption group, one bar per diabetes status.
# rot=0 keeps the (long) group labels horizontal.
ax = pivot_df.plot(kind='bar', figsize=(10, 6), rot=0)
ax.set_title('Proportion of Respondents with Diabetes\n(Non-fruit consumers vs. fruit consumers)')
ax.set_xlabel('Fruits')
ax.set_ylabel('Percentage')
ax.legend(title='Diabetes Status', title_fontsize=13)
ax.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()

## Chinna

## Abigail

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load the relevant dataset.
# Prefer the CSV that the notebook's first load cell reads (relative path), so
# the cell runs on any machine; fall back to the original author's local
# download only if that file is not present.
# NOTE(review): assumes diabetes_indicators.csv is the same BRFSS 2015 extract as
# the legacy file (the column names used below match) - confirm.
file_path = 'diabetes_indicators.csv'
legacy_file_path = '/Users/muadrashid/Downloads/archive-6/diabetes_012_health_indicators_BRFSS2015.csv'
try:
    data = pd.read_csv(file_path)
except FileNotFoundError:
    file_path = legacy_file_path
    data = pd.read_csv(file_path)

# Map the education codes to their corresponding labels.
# Dicts keep insertion order, so the display order is derived from the mapping
# itself instead of being maintained as a second hand-written list.
education_labels = {
    1: 'Never attended/KG',
    2: 'Grade 1-8',
    3: 'Grade 9-11',
    4: '12-GED',
    5: 'College 1-3',
    6: 'College Graduate'
}
education_order = list(education_labels.values())

# Map the age codes to their corresponding labels
age_labels = {
    1: '18-24',
    2: '25-29',
    3: '30-34',
    4: '35-39',
    5: '40-44',
    6: '45-49',
    7: '50-54',
    8: '55-59',
    9: '60-64',
    10: '65-69',
    11: '70-74',
    12: '75-79',
    13: '80+'
}
age_order = list(age_labels.values())

# Map the income codes to their corresponding labels
income_labels = {
    1: 'Less than $10,000',
    2: '$10,000 to <$15,000',
    3: '$15,000 to <$20,000',
    4: '$20,000 to <$25,000',
    5: '$25,000 to <$35,000',
    6: '$35,000 to <$50,000',
    7: '$50,000 to <$75,000',
    8: '$75,000 or more'
}
income_order = list(income_labels.values())

# Replace the numeric codes with ordered categorical labels.
# Codes that are not in a mapping become NaN after .map().
for column, labels, order in (
    ('Education', education_labels, education_order),
    ('Age', age_labels, age_order),
    ('Income', income_labels, income_order),
):
    data[column] = pd.Categorical(data[column].map(labels), categories=order, ordered=True)


def plot_bmi_boxplot(frame, x, title, xlabel, order=None, rotate_ticks=False):
    """Draw and show a box plot of BMI grouped by one column.

    Parameters
    ----------
    frame : DataFrame holding a 'BMI' column and the column named by `x`.
    x : name of the grouping column placed on the x axis.
    title : figure title.
    xlabel : x-axis label.
    order : optional list fixing the left-to-right order of the groups.
    rotate_ticks : when True, rotate the x tick labels by 45 degrees
        (used for the long category labels).
    """
    plt.figure(figsize=(10, 6))
    sns.boxplot(x=x, y='BMI', data=frame, order=order)
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel('BMI')
    if rotate_ticks:
        plt.xticks(rotation=45)
    plt.show()


# Visualization 1: BMI vs Income
plot_bmi_boxplot(data, 'Income', 'BMI vs Income', 'Income', order=income_order, rotate_ticks=True)

# Visualization 2: BMI vs Education
plot_bmi_boxplot(data, 'Education', 'BMI vs Education', 'Education', order=education_order, rotate_ticks=True)

# Visualization 3: BMI vs Age
plot_bmi_boxplot(data, 'Age', 'BMI vs Age', 'Age', order=age_order, rotate_ticks=True)

# Visualization 4: BMI distribution by diabetes outcome
plot_bmi_boxplot(
    data,
    'Diabetes_012',
    'BMI vs Diabetes Outcome',
    'Diabetes Outcome (0 = No Diabetes, 1 = Pre-Diabetes, 2 = Diabetes)',
)


## Muad

## Sophia

## Conclusions and Bonus Workspace