In [83]:
import pandas as pd
import re

In [8]:
# Path to the raw recipe CSV.
f_raw = '../static/data/death-and-co-raw-data.csv'

# Labels applied to the raw frame's columns, in file order.
all_columns = [
    'row',
    'page',
    'name',
    'ingredient',
    'simple_ingredient',
    'quantity',
    'notes',
    'empty',
    'empty2',
]

# Subset of the labels above that is discarded right after loading.
drop_columns = [
    'row',
    'page',
    'notes',
    'empty',
    'empty2',
]

In [16]:
# Read the raw CSV, relabel its columns, and drop the ones we don't need.
df_raw = pd.read_csv(f_raw)
# Assigning .columns raises ValueError if the file's column count does not
# match len(all_columns), which doubles as a cheap schema check.
df_raw.columns = all_columns
# Name the axis via the `columns=` keyword: the positional form
# `drop(labels, 'columns')` was deprecated in pandas 1.3 and removed in 2.0,
# where it raises TypeError.
df_raw = df_raw.drop(columns=drop_columns)

In [17]:
# Preview the first rows of the loaded frame.
# NOTE(review): the saved output below still lists row/page/notes/empty/empty2,
# which the previous cell drops -- presumably stale; re-run to refresh.
df_raw.head()

Unnamed: 0,row,page,name,ingredient,simple_ingredient,quantity,notes,empty,empty2
0,2,,,,,,,,
1,3,139.0,20TH CENTURY,BEEFEATER LONDON DRY GIN,GIN (LONDON DRY),1.5,"Shake all the ingredients with ice, then double",,
2,4,139.0,20TH CENTURY,MARIE BRIZARD WHITE CRÈME DE CACAO,LIQUEUR (CRÈME DE CACAO),0.75,strain into a coupe. No garnish.,,
3,5,139.0,20TH CENTURY,COCCHI AMERICANO,APERITIF (COCCHI AMERICANO),0.75,,,
4,6,139.0,20TH CENTURY,LEMON JUICE,JUICE (LEMON),0.75,,,


In [22]:
# The raw CSV uses fully-empty rows as separators between cocktails.
# Work on a copy of df_raw and discard those separator rows.
df = df_raw.copy().dropna(how='all')

In [23]:
# Preview the cleaned frame; the separator rows (all-NaN) should be gone.
df.head()

Unnamed: 0,name,ingredient,simple_ingredient,quantity
1,20TH CENTURY,BEEFEATER LONDON DRY GIN,GIN (LONDON DRY),1.5
2,20TH CENTURY,MARIE BRIZARD WHITE CRÈME DE CACAO,LIQUEUR (CRÈME DE CACAO),0.75
3,20TH CENTURY,COCCHI AMERICANO,APERITIF (COCCHI AMERICANO),0.75
4,20TH CENTURY,LEMON JUICE,JUICE (LEMON),0.75
6,AIRMAIL,RON DEL BARRILITO 3-STAR RUM,RUM (SPANISH),1.0


In [31]:
# Group the ingredient rows by cocktail name (one group per cocktail).
grouped_cocktails = df.groupby('name')

In [130]:
# Matches 'SIMPLE SIMPLE (SPECIFIC (MORE SPECIFIC))' as 2 groups: the text
# before the first parenthesis, and everything inside the outermost pair.
# Raw string so `\(` / `\)` reach the regex engine as written instead of being
# parsed as (invalid) Python string escapes.
name_regex = r'([^\(\)]*)\((.*)\)'

# cocktail name -> {verbose ingredient -> ingredient record}
cocktails = {}
# Distinct values of the `simple_ingredient` column, e.g. 'GIN (LONDON DRY)'.
ingredients = set()
# Distinct top-level categories parsed out of those values, e.g. 'GIN'.
simple_ingredients = set()
# Distinct values of the `ingredient` column, e.g. 'BEEFEATER LONDON DRY GIN'.
verbose_ingredients = set()

# Iterating a GroupBy yields (group key, sub-frame) pairs directly, so there is
# no need to walk `.groups` and call `get_group` for each key.
for cocktail, group in grouped_cocktails:
    cocktail_ingredients = {}

    for _, item in group.iterrows():
        # Look values up by column label; positional access such as `item[1]`
        # on a label-indexed Series is deprecated in recent pandas.
        verbose_ingredient = item['ingredient']
        ingredient = item['simple_ingredient']

        # Rows without a categorised ingredient cannot be parsed; skip them.
        if pd.isna(ingredient):
            continue

        # Category is the part before the parenthesis; values with no
        # parenthesis (e.g. 'ABSINTHE') are used whole.
        ingredient_match = re.match(name_regex, ingredient)
        simple_ingredient = ingredient_match.group(1) if ingredient_match else ingredient
        simple_ingredient = simple_ingredient.strip()

        ingredient_obj = {
            'cocktail': cocktail,
            'ingredient': ingredient,
            'simple_ingredient': simple_ingredient,
            'verbose_ingredient': verbose_ingredient,
            'quantity': item['quantity'],
        }

        # Key on the most specific ingredient. The original used the undefined
        # name `ingredient_verbose` here, which fails on a fresh kernel.
        cocktail_ingredients[verbose_ingredient] = ingredient_obj

        ingredients.add(ingredient)
        simple_ingredients.add(simple_ingredient)
        verbose_ingredients.add(verbose_ingredient)

    cocktails[cocktail] = cocktail_ingredients

In [131]:
# Sanity check: sizes of the three ingredient vocabularies (coarsest to most
# specific) followed by the number of cocktails collected.
print(len(simple_ingredients), len(ingredients),len(verbose_ingredients), len(cocktails))

46 222 451 486


In [133]:
# Inspect the top-level categories.
# NOTE(review): the saved output contains 'APERITIF', 'APPERITIF' and
# 'APÉRITIF' as separate entries -- presumably spelling variants of one
# category that should be normalised in the source data; confirm.
simple_ingredients

{'ABSINTHE',
 'AGAVE',
 'AMARO',
 'APERITIF',
 'APPERITIF',
 'APÉRITIF',
 'AU CHOIX',
 'BATAVIA ARRACK',
 'BEER',
 'BITTERS',
 'BRANDY',
 'BUTTER',
 'CACHAÇA',
 'CHARTREUSE',
 'CHERRY HEERING',
 'CIDER',
 'CORDIAL',
 'CREAM',
 'EGG',
 'FRUIT',
 'GIN',
 'JUICE',
 'LEAF',
 'LIQUEUR',
 'MADEIRA',
 'MINT',
 'MIX',
 'OTHER',
 'PUREE',
 'RIM',
 'RUM',
 'SALT',
 'SHERRY',
 'SHRUB',
 'SODA',
 'SPICE',
 'SUGAR',
 'SUGAR CUBE',
 'SYRUP',
 'TWIST',
 'VEGETABLE',
 'VERMOUTH',
 'VODKA',
 'WATER',
 'WHISKEY',
 'WINE'}

In [132]:
# Inspect the categorised ingredients ('CATEGORY (SPECIFIC)' strings).
# NOTE(review): the saved output includes 'APPERITIF (...)' spellings and an
# 'APÉRITIF (???)' entry -- these look like data-entry issues; confirm.
ingredients

{'ABSINTHE',
 'AGAVE (MEZCAL)',
 'AGAVE (TEQUILA ANEJO)',
 'AGAVE (TEQUILA BLANCO)',
 'AGAVE (TEQUILA REPOSADO)',
 'AMARO (APERITIVO)',
 'AMARO (CYNAR)',
 'AMARO (FERNET)',
 'AMARO (MEDIUM)',
 'AMARO (MILD)',
 'APERITIF (COCCHI AMERICANO)',
 'APERITIF (GENTIANE-QUINA)',
 'APERITIF (RASPBERRY)',
 'APERITIF (SALERS GENTIAN)',
 'APPERITIF (LILLET BLANC)',
 'APPERITIF (LILLET ROSÉ)',
 'APPERITIF (LILLET ROUGE)',
 'APÉRITIF (???)',
 'AU CHOIX',
 'BATAVIA ARRACK',
 'BEER (DOUBLE/IMPERIAL IPA)',
 'BEER (SCHWARZBIER/DUNKEL)',
 'BEER (SWEET/MILK STOUT)',
 'BEER (VIENNA LAGER)',
 'BEER (WITBIER)',
 'BITTERS (ALLSPICE)',
 'BITTERS (APPLE)',
 'BITTERS (AROMATIC)',
 'BITTERS (BECHEROVKA)',
 'BITTERS (CELERY)',
 'BITTERS (CHERRY BARK AND VANILLA)',
 'BITTERS (CHOCOLATE)',
 'BITTERS (CREOLE)',
 'BITTERS (GRAPEFRUIT)',
 'BITTERS (LAVENDER)',
 'BITTERS (ORANGE)',
 'BITTERS (TIKI)',
 'BRANDY (APPLE)',
 'BRANDY (CHERRY)',
 'BRANDY (GRAPE)',
 'BRANDY (PEAR)',
 'BRANDY (PISCO)',
 'BUTTER (APPLE)',
 'BUTTER

In [134]:
# Inspect the brand-level ingredient names.
# NOTE(review): the saved output lists 'ANCHO CHILE-INFUSED ...' twice, once
# with a hyphen and once with an en dash, so near-duplicates exist; consider
# normalising dashes before de-duplicating.
verbose_ingredients

{'ABBOTT’S BITTERS',
 'ACACIA HONEY SYRUP',
 'ACID PHOSPHATE',
 'AGAVE NECTAR',
 'AGAVE SYRUP',
 'AGED BALSAMIC VINEGAR',
 'ALCHEMIA CHOCOLATE VODKA',
 'ALVEAR FESTIVAL PALE CREAM SHERRY',
 'AMARETTO',
 'AMARO AVERNA',
 'AMARO CIOCIARO',
 'AMARO LUCANO',
 'AMARO MELETTI',
 'AMARO NARDINI',
 'AMARO NONINO',
 'AMERICAN FRUITS BLACK CURRANT CORDIAL',
 'ANCHO CHILE-INFUSED DOLIN ROUGE VERMOUTH',
 'ANCHO CHILE–INFUSED DOLIN ROUGE VERMOUTH',
 'ANCHOR GENEVIEVE GIN',
 'ANCHOR JUNIPERO GIN',
 'ANGOSTURA 5-YEAR RUM',
 'ANGOSTURA BITTERS',
 'ANGOSTURA ORANGE BITTERS',
 'APEROL',
 'APPLE BUTTER',
 'APPLE JUICE',
 'APPLETON ESTATE RESERVE RUM',
 'APPLETON ESTATE V/X RUM',
 'APPLETON WHITE RUM',
 'APRICOT-INFUSED FAMOUS GROUSE SCOTCH',
 'AVIATION GIN',
 'AVUÁ AMBURANA CACHAÇA',
 'BACARDI RON SUPERIOR LIMITED EDITION',
 'BAKER’S BOURBON',
 'BANANA CHIP–INFUSED GOSLING’S BLACK SEAL RUM',
 'BANANA SYRUP',
 'BANKS 5-ISLAND WHITE RUM',
 'BAR CODE BAKED APPLE BITTERS',
 'BARBADILLO PRINCIPE AMONTILLADO S

In [87]:
# Scratch check of a greedier variant of the ingredient pattern (group 1 may
# itself contain parentheses here).
# `test_name` was not defined anywhere in this notebook, so the cell failed on
# a fresh kernel. This sample is an assumption chosen to reproduce the saved
# output of `m.group(2)` below -- TODO confirm the intended input.
test_name = 'FRUIT (BLACKBERRY)'
# Raw string so `\(` / `\)` are not parsed as invalid Python string escapes.
# NOTE: this rebinds the module-level `name_regex` used by the aggregation cell.
name_regex = r'(.*)\((.*)\)'
m = re.match(name_regex, test_name)

In [97]:
# Second capture group: the text inside the outermost parentheses.
m.group(2)

'BLACKBERRY'

In [91]:
# Scratch string with leading and trailing spaces, used to try out str.strip().
s = ' asdf asdf '

In [92]:
# strip() trims the surrounding whitespace but keeps the inner space.
s.strip()

'asdf asdf'