In [24]:
import pandas as pd

# Load the pregnancy purchase dataset from the sibling data directory.
pregnancy_csv_path = "../data/pregnancy.csv"
df = pd.read_csv(pregnancy_csv_path)

# Question 1: What is the most common pregnancy-related material item bought and used by non-pregnant people?

# Pseudocode: 
## 1. Filter the dataset for non-pregnant people only (0's)
## 2. Select only the columns with data containing items purchased
## 3. Sum up the purchases for each column
## 4. Identify the item with the highest purchase count

In [32]:
# Keep only the rows where the "PREGNANT" column is 0 (not pregnant).
# The result is bound to `non_pregnant_df` because that is the name the later
# item-count cell reads; binding it as `not_pregnant_df` left that cell with an
# undefined variable when the notebook is run top to bottom on a fresh kernel.
non_pregnant_df = df[df["PREGNANT"] == 0]

In [28]:
# Print every column label so the purchase-item columns can be picked out by name.
column_names = df.columns # column labels of the loaded dataset, used to choose item categories
print(column_names)

Index(['Pregnancy Test', 'Birth Control', 'Feminine Hygiene', 'Folic Acid',
       'Prenatal Vitamins', 'Prenatal Yoga', 'Body Pillow', 'Ginger Ale',
       'Sea Bands', 'Stopped buying ciggies', 'Cigarettes',
       'Smoking Cessation', 'Stopped buying wine', 'Wine', 'Maternity Clothes',
       'PREGNANT'],
      dtype='object')


In [None]:
# Column names treated as pregnancy-related purchase items.
pregnancy_items = [
    "Prenatal Vitamins",
    "Pregnancy Test",
    "Body Pillow",
    "Ginger Ale",
    "Sea Bands",
    "Maternity Clothes",
    "Folic Acid",
]


In [30]:
# Restrict the non-pregnant rows to the pregnancy-related item columns only.
selected_columns = non_pregnant_df.loc[:, pregnancy_items]

# Column-wise totals: one summed value per item column.
item_counts = selected_columns.sum(axis=0)

print(item_counts)

Prenatal Vitamins    44
Pregnancy Test       13
Body Pillow          12
Ginger Ale           38
Sea Bands            15
Maternity Clothes    59
Folic Acid            6
dtype: int64


In [31]:
# item_counts is indexed by item column name, so idxmax() yields the name of the
# item whose summed value is largest.
most_common_item = item_counts.idxmax() # label of the column with the highest sum
print(most_common_item)

Maternity Clothes


In [48]:
def most_common_item_not_preg(df, pregnancy_items):
    """Return the item column with the largest total among non-pregnant rows.

    Rows are kept when their "PREGNANT" value equals 0; the columns listed in
    ``pregnancy_items`` are then summed and the label of the largest sum is
    returned.
    """
    is_not_pregnant = df["PREGNANT"] == 0
    totals_per_item = df.loc[is_not_pregnant, pregnancy_items].sum()
    return totals_per_item.idxmax()

# Columns treated as pregnancy-related purchase items.
pregnancy_items = [
    "Prenatal Vitamins",
    "Pregnancy Test",
    "Body Pillow",
    "Ginger Ale",
    "Sea Bands",
    "Maternity Clothes",
    "Folic Acid",
]
top_item = most_common_item_not_preg(df, pregnancy_items)
print(f"Most common pregnancy-related item: {top_item}")

Most common pregnancy-related item: Maternity Clothes


# Question 2: Are people who stop buying wine more likely to be pregnant?

# Pseudocode:
## 1: Filter the dataset to just people who stopped buying wine
## 2: Select the PREGNANT column for the people who stopped buying wine
## 3: Calculate the percentage of people who stopped buying wine that are also pregnant

In [44]:
# Keep only the rows flagged as having stopped buying wine.
stopped_buying_wine_df = df[df["Stopped buying wine"] == 1]

# PREGNANT values for that subset.
pregnant_column = stopped_buying_wine_df["PREGNANT"]

# Mean of the PREGNANT column within the subset
# (assumes PREGNANT is coded 0/1, so the mean is the pregnant share -- TODO confirm).
mean_pregnant = pregnant_column.mean()

# Express the share as a percentage. The result deliberately does NOT reuse the name
# `percent_pregnant_stopped_buying_wine`: a function with that name is defined later in
# this notebook, and sharing the name means re-running this cell afterwards would replace
# the function with a float and break its call.
pct_pregnant_among_stopped = mean_pregnant * 100

print(f"Percentage: {pct_pregnant_among_stopped:.2f}%")

Percentage: 60.00%


In [49]:
# function
def percent_pregnant_stopped_buying_wine(df):
    """Return 100 times the mean of "PREGNANT" over rows where "Stopped buying wine" is 1."""
    stopped_mask = df["Stopped buying wine"] == 1
    pregnant_share = df.loc[stopped_mask, "PREGNANT"].mean()
    return pregnant_share * 100

# Evaluate the helper once, then format the value to two decimals for display.
pct_from_function = percent_pregnant_stopped_buying_wine(df)
print(f"Percentage of women that stopped buying wine and are pregnant: {pct_from_function:.2f}%")

Percentage of women that stopped buying wine and are pregnant: 60.00%


In [50]:
# Switch to the cereal dataset.
# NOTE: this rebinds `df`, which held the pregnancy data until now; the pregnancy
# cells above cannot be re-run after this point without reloading that file first.
cereal_csv_path = "../data/cereal.csv"
df = pd.read_csv(cereal_csv_path)
print(df.head())

                        name mfr type  calories  protein  fat  sodium  fiber  \
0                  100% Bran   N    C        70        4    1     130   10.0   
1          100% Natural Bran   Q    C       120        3    5      15    2.0   
2                   All-Bran   K    C        70        4    1     260    9.0   
3  All-Bran with Extra Fiber   K    C        50        4    0     140   14.0   
4             Almond Delight   R    C       110        2    2     200    1.0   

   carbo  sugars  potass  vitamins  shelf  weight  cups     rating  
0    5.0       6     280        25      3     1.0  0.33  68.402973  
1    8.0       8     135         0      3     1.0  1.00  33.983679  
2    7.0       5     320        25      3     1.0  0.33  59.425505  
3    8.0       0     330        25      3     1.0  0.50  93.704912  
4   14.0       8      -1        25      3     1.0  0.75  34.384843  


# Question 1: What are the top 5 highest rated cereal products with <100 calories?

# Pseudocode: 
## 1. Filter the dataset to keep only cereals with <100 calories
## 2. Sort the filtered data by rating in descending order
## 3. Select the top 5 cereals with the highest ratings
## 4. Keep only name, calories, rating columns 


In [52]:
# Keep cereals whose calories value is below 100.
df_less_than_100 = df[df["calories"] < 100]
# Order by rating, highest first.
descending_df = df_less_than_100.sort_values("rating", ascending=False)
# First five rows of the sorted frame.
top_5_df = descending_df.iloc[:5]
# Only the columns needed to answer the question.
result_df = top_5_df[["name", "calories", "rating"]]
print(result_df)

                         name  calories     rating
3   All-Bran with Extra Fiber        50  93.704912
64     Shredded Wheat 'n'Bran        90  74.472949
65  Shredded Wheat spoon size        90  72.801787
0                   100% Bran        70  68.402973
63             Shredded Wheat        80  68.235885


In [53]:
def high_rating_low_cal_cereals(df, max_calories=100, top_n=5):
    """Return the highest-rated cereals whose calories are below a threshold.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns "name", "calories" and "rating".
    max_calories : int or float, default 100
        Exclusive upper bound; only rows with ``calories < max_calories`` are kept.
    top_n : int, default 5
        Maximum number of rows to return.

    Returns
    -------
    pandas.DataFrame
        Up to ``top_n`` rows sorted by "rating" in descending order, restricted
        to the columns "name", "calories" and "rating". With the default
        arguments this is the top 5 cereals under 100 calories.
    """
    low_calorie = df[df["calories"] < max_calories]
    return (
        low_calorie
        .sort_values(by="rating", ascending=False)
        .head(top_n)
        .loc[:, ["name", "calories", "rating"]]
    )

# Run the helper on the cereal data and print the resulting name/calories/rating table.
top_5_cereals = high_rating_low_cal_cereals(df)
print(top_5_cereals)


                         name  calories     rating
3   All-Bran with Extra Fiber        50  93.704912
64     Shredded Wheat 'n'Bran        90  74.472949
65  Shredded Wheat spoon size        90  72.801787
0                   100% Bran        70  68.402973
63             Shredded Wheat        80  68.235885


# Question 2: Do higher rated cereals tend to have more calories than lower rated cereals?

# Pseudocode
## 1. Sort the dataset by rating in descending order
## 2. Select the top 10 highest-rated cereals
## 3. Compute their average calories
## 4. Select the bottom 10 lowest-rated cereals
## 5. Compute their average calories.
## 6. Find the difference between the two means 

In [59]:
# Order all cereals by rating, best first.
sort_df = df.sort_values("rating", ascending=False)

# The ten best-rated and the ten worst-rated rows of the sorted frame.
top_10 = sort_df.iloc[:10]
bottom_10 = sort_df.iloc[-10:]

# Mean calories within each group, and the gap between them (top minus bottom).
top_10_avg_calories = top_10.loc[:, "calories"].mean()
bottom_10_avg_calories = bottom_10.loc[:, "calories"].mean()
cal_diff = top_10_avg_calories - bottom_10_avg_calories

print(top_10_avg_calories)
print(bottom_10_avg_calories)
print(f"Difference in average calories: {cal_diff}")





74.0
116.0
Difference in average calories: -42.0


In [67]:
def compare_cals_by_rating(df, top_n=10, bottom_n=10):
    """Compare mean calories of the best-rated and worst-rated rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns "rating" and "calories".
    top_n : int, default 10
        Number of highest-rated rows to average.
    bottom_n : int, default 10
        Number of lowest-rated rows to average.

    Returns
    -------
    dict
        Three entries: the mean calories of the top ``top_n`` rows, the mean
        calories of the bottom ``bottom_n`` rows, and "Difference" (top mean
        minus bottom mean). The first two keys are built from the arguments
        ("Top {top_n} Avg Calories", "Bottom {bottom_n} Avg Calories") so the
        labels always describe the groups that were actually averaged; with
        the defaults they read "Top 10 ..." and "Bottom 10 ...".
    """
    sort_df = df.sort_values(by="rating", ascending=False)
    top_avg_cals = sort_df.head(top_n)["calories"].mean()
    bottom_avg_cals = sort_df.tail(bottom_n)["calories"].mean()
    cals_diff = top_avg_cals - bottom_avg_cals

    return {
        f"Top {top_n} Avg Calories": top_avg_cals,
        f"Bottom {bottom_n} Avg Calories": bottom_avg_cals,
        "Difference": cals_diff,
    }

# Call with the default top_n=10 / bottom_n=10 and print the returned summary dict.
result = compare_cals_by_rating(df)
print(result)

{'Top 10 Avg Calories': 74.0, 'Bottom 10 Avg Calories': 116.0, 'Difference': -42.0}
