<a href="https://colab.research.google.com/github/tammyd/CFDB_Notebook/blob/main/CFDB_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd

# Load the CSV file
file_path = '/content/catfood.csv'

# Columns unneeded for analysis ('discontinued' is dropped only after it has
# been used for filtering below).
to_drop = ['asin', 'imageUrl', 'catfood', 'automatedUpdate', 'raw', 'baby', 'veterinary', 'discontinued']

# Build the working frame in one chained expression (no in-place mutation), so
# re-running this cell always yields the same `df`.
df = (
    pd.read_csv(file_path)
    # KEEP only the rows whose 'discontinued' value is 0; every other row is
    # removed (presumably a non-zero value marks a discontinued product - confirm).
    .loc[lambda frame: frame['discontinued'] == 0]
    .drop(columns=to_drop)
    # Index the rows by the 'id' column.
    .set_index('id')
)


In [None]:
# Break each product's comma-separated 'ingredients' string into one column per
# position (ing_0, ing_1, ...); the frame is as wide as the longest list and
# shorter lists are padded with missing values.
# NOTE(review): a plain ',' split also cuts through parenthesised sub-lists such
# as 'giblets (liver, heart, kidney)' - confirm that is acceptable.
ingredients_split = (
    df['ingredients']
    .str.split(',', expand=True)
    .add_prefix('ing_')
)

# Number of missing entries in every positional column.
none_counts = ingredients_split.isna().sum()

# Display the counts
print("Total number of rows:", len(ingredients_split))
print("Count of 'None' (NaN) values in each 'ing_' column:")
print(none_counts)

# Swap the raw 'ingredients' text column for the positional columns.
df = pd.concat([df, ingredients_split], axis=1).drop(columns=['ingredients'])

# Show the rows with no value in ing_6, i.e. products listing fewer than seven ingredients.
df.loc[df['ing_6'].isna()]

Total number of rows: 2848
Count of 'None' (NaN) values in each 'ing_' column:
ing_0         0
ing_1         0
ing_2         0
ing_3         0
ing_4         1
           ... 
ing_130    2847
ing_131    2847
ing_132    2847
ing_133    2847
ing_134    2847
Length: 135, dtype: int64


Unnamed: 0_level_0,brand,flavor,protein,fat,fibre,moisture,ash,source,updated,ing_0,...,ing_125,ing_126,ing_127,ing_128,ing_129,ing_130,ing_131,ing_132,ing_133,ing_134
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3279,Vital Essentials,Turkey Patties Frozen Grain Free,12.0,5.0,0.5,75.0,3.0,https://www.vitalessentialsraw.com/product/tur...,2023-01-29,Turkey,...,,,,,,,,,,
4631,Snappy Tom,Lites Chicken With Salmon,11.0,0.5,1.0,85.0,1.0,https://www.snappytom.com/product-page/snappy-...,2022-02-09,Chicken,...,,,,,,,,,,
4632,Snappy Tom,Lites Tuna,15.0,0.5,1.0,85.0,1.0,https://www.snappytom.com/product-page/snappy-...,2022-02-09,Tuna,...,,,,,,,,,,
4634,Snappy Tom,Lites Tuna With Pumpkin,14.0,0.5,1.0,85.0,1.0,https://www.snappytom.com/product-page/snappy-...,2022-02-09,Tuna,...,,,,,,,,,,
4639,Snappy Tom,Naturals Sardine Cutlet With Salmon,10.0,1.0,1.0,85.0,3.0,https://www.snappytom.com/product-page/snappy-...,2022-02-09,Sardine,...,,,,,,,,,,
4640,Snappy Tom,Naturals Tuna Temptations With Salmon,15.0,2.5,1.0,85.0,1.0,https://www.snappytom.com/product-page/snappy-...,2022-02-09,Tuna,...,,,,,,,,,,
4642,Snappy Tom,Naturals Tuna With Whitebait And Crabmeat,16.0,1.0,1.0,85.0,1.5,https://www.snappytom.com/product-page/snappy-...,2022-02-09,Tuna,...,,,,,,,,,,
5012,Wysong,Canine-Feline Beef,10.0,8.0,1.7,75.0,3.0,https://www.wysong.net/epigen-canned,2022-02-20,Beef,...,,,,,,,,,,
5013,Wysong,Canine-Feline Chicken,10.0,8.0,1.5,75.0,3.0,https://www.wysong.net/epigen-canned,2022-02-20,Chicken,...,,,,,,,,,,
5014,Wysong,Canine-Feline Duck,10.0,8.0,1.5,75.0,3.0,https://www.wysong.net/epigen-canned,2022-02-20,Duck,...,,,,,,,,,,


In [None]:
# Scoring algorithm copied from catfooddb:
#  * Ingredients: start at 2
#   *              + 1 if the first ingredient is a protein
#   *              + 0.5 for each other protein in positions 2-4
#   *              - 1 if the top ingredient is a by-product or filler
#   *              - 0.5 for each other by-product
#   *              + 1 if no fillers
#   *              + 1 if no by-products
#   *              + 1.5 if < 15 ingredients (+ 1 if < 20)
#   *              - 1 if any undesirable preservative


# define the various types of ingredients
def get_protein_adjectives():
    """Return the list of protein "adjective" terms.

    A fresh list of lowercase strings is built on every call, in the fixed
    order written below. 'humanely raised' appears twice in the list; the
    repetition is kept so the returned value is unchanged.
    """
    adjective_terms = (
        'deboned', 'fresh deboned', 'organic', 'boneless',
        'boneless/skinless', 'de-boned', 'whole', 'fresh',
        'hydrolyzed', 'raw', 'lamb', 'cutlets',
        'flaked', 'shredded', 'flakes', 'freeze-dried',
        'dried', 'baby', 'meat', 'dehydrated',
        'fresh whole', 'whole atlantic', 'dehydrated whole', 'fresh angus',
        'fresh plains', 'fresh yorkshire', 'fresh whole pacific', 'grass-fed',
        'raw grass-fed', 'finely ground', 'king', 'ocean caught',
        'humanely raised', 'pasture raised', 'wild caught', 'sustainably sourced',
        'humanely raised', 'wild pacific', 'wild atlantic', 'tongol',
    )
    return list(adjective_terms)

def get_protein_specifics():
    """Return the list of protein "specific" terms.

    A fresh list of lowercase strings is built on every call, in the fixed
    order written below. 'liver' appears twice in the list; the repetition is
    kept so the returned value is unchanged. Some entries contain commas and
    parentheses, e.g. 'giblets (liver, heart, kidney)'.
    """
    specific_terms = (
        'heart', 'thigh', 'liver', 'lung', 'liver',
        'giblets', 'meal', 'white meat', 'filets', 'red meat',
        'meat meal', 'whole meat', 'cutlets', 'tripe', 'meat',
        '(boneless, skinless breast)', 'livers', 'gizzards', 'gizzard', 'hearts',
        'necks', 'giblets (liver, heart, kidney)', 'with ground bone', '(ground with bone)', 'kidney',
        'lungs', 'trachea', 'skin', 'with bone', 'spleen',
    )
    return list(specific_terms)



def is_protein_source(ingredient):
    """Placeholder check for whether ``ingredient`` is a protein source.

    Not implemented yet: the argument is ignored and the result is always
    False.
    """
    return False

def is_byproduct(ingredient):
    """Placeholder check for whether ``ingredient`` is a by-product.

    Not implemented yet: the argument is ignored and the result is always
    False.
    """
    return False

def is_fillter(ingredient):
    """Placeholder check for whether ``ingredient`` is a filler.

    Not implemented yet: the argument is ignored and the result is always
    False.
    """
    return False

def is_undesirable_perservative(ingredient):
    """Placeholder check for whether ``ingredient`` is an undesirable preservative.

    Not implemented yet: the argument is ignored and the result is always
    False.
    """
    return False



def get_ingredient_score(row):
  """Gather the per-product ingredient flags and counts used for scoring.

  Args:
    row: a pandas Series for one product (for example a DataFrame row) whose
      ``ing_<n>`` entries hold the ingredients in listed order. The columns
      are zero-based, so ``ing_0`` is the first-listed ingredient.

  Returns:
    None as far as this block shows: the flags and counters are computed but
    not yet combined into a score.
  """

  def matches(column, predicate):
    # A missing position (short ingredient list or absent column) never
    # matches. Present values are stripped, mirroring the clean-up that the
    # counting loop below applies before calling the same predicates.
    value = row.get(column)
    if pd.isna(value):
      return False
    return predicate(value.strip())

  # Position flags. 'ing_0' is the top ingredient; the previous code started
  # at 'ing_1', which shifted every position check one ingredient too late.
  is_top_protein = matches('ing_0', is_protein_source)
  is_second_ing_protein = matches('ing_1', is_protein_source)
  is_third_ing_protein = matches('ing_2', is_protein_source)
  is_fourth_ing_protein = matches('ing_3', is_protein_source)
  is_fifth_ing_protein = matches('ing_4', is_protein_source)

  is_top_filler = matches('ing_0', is_fillter)
  is_top_byproduct = matches('ing_0', is_byproduct)

  num_fillers = 0
  num_proteins = 0
  num_byproducts = 0
  num_undesirable_perservatives = 0

  # Tally every category across all populated ingredient columns. The checks
  # are independent, so one ingredient may count towards several categories.
  for column in row.index:
    if column.startswith('ing_') and pd.notna(row[column]):
      ingredient = row[column].strip()
      if is_protein_source(ingredient):
        num_proteins += 1
      if is_byproduct(ingredient):
        num_byproducts += 1
      if is_fillter(ingredient):
        num_fillers += 1
      if is_undesirable_perservative(ingredient):
        num_undesirable_perservatives += 1


