In [154]:
from huggingface_hub import notebook_login
import pandas as pd
from datasets import Dataset
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
#from google.colab import drive
from transformers import AutoTokenizer
from transformers import DataCollatorWithPadding
import evaluate
import numpy as np
from thefuzz import fuzz
from thefuzz import process
import dedupe
import pandas_dedupe

In [155]:
complete_food_data = pd.read_excel('/Users/anthonyhakim/DSI/good-food-purchasing/data/raw/Name Normalization_Combined_Example.xlsx')

In [156]:
columns_list = ['Product Type', 'Food Product Category']
renaming_mapping = {'Product Type': 'text', 'Food Product Category': 'labels'}

In [157]:
class DataSet():
    
    def __init__(self, dataframe):
        self.data = dataframe
        self.labels_grouping = None
        self.threshold = None
        self.dataset_observations = len(self.data)
        
    def clean_data(self, columns_list: list, renaming_mapping: dict) -> pd.DataFrame:
    
        self.data = self.data[columns_list]
        self.data.rename(columns=renaming_mapping, inplace=True)
        self.data['labels'] = self.data['labels'].str.lower().str.split(',').str[0].str.strip()
        self.data.dropna(inplace=True)
        self.data.reset_index(drop=True, inplace=True)
        self.data['labels'] = self.data['labels'].str.replace('prequalified: ', '')
        self.data = self.data.drop_duplicates().reset_index(drop=True)
    
    def group_labels(self, category: str, category_label: str, labels_column = 'labels', matching_function= fuzz.ratio, thresh = 75) -> dict:
        
        self.threshold = thresh
    
        #subset dataframe by category
        if category or category_label == None:
            column_name = self.data[[labels_column]]
        else:
            column_name = self.data[self.data[category] == category_label][[labels_column]]
        
        column_name['value_counts_column'] = column_name['labels'].map(column_name['labels'].value_counts())
        column_name = column_name.sort_values('value_counts_column', ascending=False).reset_index(drop=True)
        
        column_name = column_name.labels.values.flatten().tolist()
        print('number of non unique entries in column', len(column_name))
        
        #Create list of dictionaries of subgroups
        label_groups = {}
        for label_entry in column_name:
            counter = 0
            for label_group_key in label_groups.keys():
                if matching_function(label_entry, label_group_key) >= thresh:
                    counter += 1
                    if label_entry != label_group_key:
                        label_groups[label_group_key].append(label_entry)
            if counter == 0:
                label_groups[label_entry] = []
                
        self.labels_grouping = label_groups
        
    def clean_labels(self, add_list, rm_list):
        
        def alter_food_set(labels_set, add_list, rm_list):
            print('number of elements in food set before additions', len(labels_set))
            labels_set.update(add_list)
            labels_set.difference_update(rm_list)
            
            print('number of elements in food set after additions', len(labels_set))
            
        def add_and_remove_keys_from_dict(input_dict, keys_to_add, keys_to_remove, default_value = []):
            for key in keys_to_remove:
                if key in input_dict:
                    del input_dict[key]

            for key in keys_to_add:
                if key not in input_dict:
                    input_dict[key] = default_value
            
        def find_best_match(misspelled_value, choices):
            best_match = process.extractOne(misspelled_value, choices)
            return best_match[0]
            
        self.add_list = add_list
        self.rm_list = rm_list
        self.labels_set = set(self.labels_grouping)
        alter_food_set(self.labels_set, self.add_list, self.rm_list)
        add_and_remove_keys_from_dict(self.labels_grouping, self.add_list, self.rm_list)
        
        self.data['correct_labels'] = self.data['labels'].apply(lambda x: find_best_match(x, self.labels_set))
        self.data = self.data[['text', 'correct_labels']]
        self.data.rename(columns={'correct_labels': 'labels'}, inplace=True)

    def factorize_dataset(self):
        self.data['labels'], self.factorized_label_mapping = pd.factorize(self.data['labels'])
        
    def shuffle_dataset(self):
        self.data = shuffle(self.data)
        self.data.reset_index(drop=True, inplace=True)

In [174]:
fpg_dataset = DataSet(complete_food_data)

In [175]:
fpg_dataset.clean_data(columns_list, renaming_mapping)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.data.rename(columns=renaming_mapping, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.data['labels'] = self.data['labels'].str.lower().str.split(',').str[0].str.strip()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.data.dropna(inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in 

In [176]:
fpg_dataset.data

Unnamed: 0,text,labels
0,Carrot Baby Petite Peeled,fruit
1,Kiwi 33 count,fruit
2,Pear D' Anjou 135ct,fruit
3,"Sweet Potato Peeled/Cut 1""",roots & tubers
4,"Sweet Potato Dice 3/4"" Peeled",roots & tubers
...,...,...
5348,FC Beef Burger 69101R Daily Choice 21#,beef
5349,F/C Beef Taco Filling 4/5#,beef
5350,BEEF MEATBALLS NO SAUCE 1 OZ 10 LB,beef
5351,Beef Hot Dog 15# 88719 Applegate 144ct,beef


In [177]:
fpg_dataset.group_labels(category=None ,category_label=None, thresh=85, matching_function=fuzz.ratio)

number of non unique entries in column 5353


In [178]:
fpg_dataset.labels_grouping

{'condiments & snacks': [],
 'meals': [],
 'vegetables': [],
 'non-food': [],
 'grain products': [],
 'beverages': [],
 'fruit': [],
 'chicken': [],
 'roots & tubers': [],
 'beef': [],
 'cheese': [],
 'pork': [],
 'milk': [],
 'milk & dairy': [],
 'seafood': [],
 'turkey': [],
 'yogurt': [],
 'legumes': [],
 'eggs': [],
 'tree nuts & seeds': [],
 'fish (wild)': [],
 'rice': [],
 'meat': ['meats', 'meats', 'meats'],
 'butter': [],
 'fish (farm-raised)': ['fish (farmed-raised)', 'fish (farmed-raised)'],
 'produce': [],
 'turkey & other poultry': [],
 'bread': []}

In [179]:
fpg_add_list =['anthony']
fpg_rm_list =['turkey']

In [180]:
fpg_dataset.clean_labels(fpg_add_list, fpg_rm_list)

number of elements in food set before additions 28
number of elements in food set after additions 28


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.data.rename(columns={'correct_labels': 'labels'}, inplace=True)


In [181]:
fpg_dataset.labels_grouping

{'condiments & snacks': [],
 'meals': [],
 'vegetables': [],
 'non-food': [],
 'grain products': [],
 'beverages': [],
 'fruit': [],
 'chicken': [],
 'roots & tubers': [],
 'beef': [],
 'cheese': [],
 'pork': [],
 'milk': [],
 'milk & dairy': [],
 'seafood': [],
 'yogurt': [],
 'legumes': [],
 'eggs': [],
 'tree nuts & seeds': [],
 'fish (wild)': [],
 'rice': [],
 'meat': ['meats', 'meats', 'meats'],
 'butter': [],
 'fish (farm-raised)': ['fish (farmed-raised)', 'fish (farmed-raised)'],
 'produce': [],
 'turkey & other poultry': [],
 'bread': [],
 'anthony': []}

In [182]:
fpg_dataset.data

Unnamed: 0,text,labels
0,Carrot Baby Petite Peeled,fruit
1,Kiwi 33 count,fruit
2,Pear D' Anjou 135ct,fruit
3,"Sweet Potato Peeled/Cut 1""",roots & tubers
4,"Sweet Potato Dice 3/4"" Peeled",roots & tubers
...,...,...
5348,FC Beef Burger 69101R Daily Choice 21#,beef
5349,F/C Beef Taco Filling 4/5#,beef
5350,BEEF MEATBALLS NO SAUCE 1 OZ 10 LB,beef
5351,Beef Hot Dog 15# 88719 Applegate 144ct,beef


In [184]:
fpg_dataset.factorize_dataset()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.data['labels'], self.factorized_label_mapping = pd.factorize(self.data['labels'])


In [185]:
fpg_dataset.data

Unnamed: 0,text,labels
0,Carrot Baby Petite Peeled,0
1,Kiwi 33 count,0
2,Pear D' Anjou 135ct,0
3,"Sweet Potato Peeled/Cut 1""",1
4,"Sweet Potato Dice 3/4"" Peeled",1
...,...,...
5348,FC Beef Burger 69101R Daily Choice 21#,6
5349,F/C Beef Taco Filling 4/5#,6
5350,BEEF MEATBALLS NO SAUCE 1 OZ 10 LB,6
5351,Beef Hot Dog 15# 88719 Applegate 144ct,6


In [186]:
fpg_dataset.factorized_label_mapping

Index(['fruit', 'roots & tubers', 'milk', 'grain products',
       'condiments & snacks', 'beverages', 'beef', 'turkey & other poultry',
       'meals', 'pork', 'milk & dairy', 'fish (wild)', 'legumes', 'chicken',
       'cheese', 'eggs', 'non-food', 'tree nuts & seeds', 'meat', 'vegetables',
       'seafood', 'fish (farm-raised)', 'yogurt', 'rice', 'butter', 'bread',
       'produce'],
      dtype='object')

In [191]:
fpg_dataset.shuffle_dataset()

In [192]:
fpg_dataset.data

Unnamed: 0,text,labels
0,CROISSANT BUTR RTB LG 3.25Z,4
1,POTATO BOUREKAS 96 CT 1.5 OZ (USA),8
2,BEANS GARBANZO LOW SALT CANNED,12
3,COOKIE OREO CRUMBS MED BULK,4
4,BREAD SLICED CINN RAISIN 16OZ,3
...,...,...
5348,RED FRESNO CHILE 10#,19
5349,FC Chix Wings Sect 1/20# Spare Time 38315,13
5350,BREAD SLICED PULLMAN WHITE,3
5351,"CRUST, PIZA RICE ITLN 10.63",4
