## Find possible matches with Jaccard similarity

In [1]:
import pandas as pd

In [2]:
def contains_tokens(attr,tokens):
    """Return True when every entry of ``tokens`` is a word of ``attr``.

    ``attr`` is split on whitespace and each word is lower-cased before the
    comparison. ``tokens`` are compared exactly as given (they are not
    lower-cased), so callers should pass lower-case tokens. An empty
    ``tokens`` yields True.
    """
    words = [word.lower() for word in attr.split()]
    for token in tokens:
        if token not in words:
            return False
    return True

In [5]:
# Sanity check of contains_tokens: the name is mixed-case, the tokens are
# lower-case, and all three tokens occur in the name.
attr = 'Sanibel Imperial red ale'
tokens = ['imperial','red','ale']
contains_tokens(attr,tokens)

True

In [6]:
# Load merged_train.csv from the Structured/Beer dataset directory.
train = pd.read_csv('../../Structured/Beer/merged_train.csv')

In [7]:
# Keep only the rows whose label is 0 (presumably the non-matching pairs);
# .copy() detaches the result from `train`.
train_neg = train[train['label']==0].copy()

In [8]:
# Distinct left-table beer names among the label-0 rows, in value_counts()
# order (most frequent first).
l_beernames = list(train_neg['ltable_Beer_Name'].value_counts().keys())

In [9]:
# Distinct right-table beer names among the label-0 rows, in value_counts()
# order (most frequent first).
r_beername = list(train_neg['rtable_Beer_Name'].value_counts().keys())

In [10]:
# Left-table names that contain all of the words "imperial", "red" and "ale".
l_beernames_selected = [
    name
    for name in l_beernames
    if contains_tokens(name, ['imperial','red','ale'])
]

In [11]:
# Right-table names that contain all of the words "imperial", "red" and "ale".
r_beername_selected = [
    name
    for name in r_beername
    if contains_tokens(name, ['imperial','red','ale'])
]

In [12]:
def levenshtein(s1, s2):
    """Return the Levenshtein edit distance between ``s1`` and ``s2``.

    The distance is computed row by row, keeping only the previous row of
    the dynamic-programming table.
    """
    # Order the inputs so that each table row has len(shorter) + 1 cells.
    longer, shorter = (s2, s1) if len(s1) < len(s2) else (s1, s2)

    # Turning an empty sequence into `longer` takes len(longer) insertions.
    if len(shorter) == 0:
        return len(longer)

    prev = list(range(len(shorter) + 1))
    for row, ch_long in enumerate(longer, start=1):
        cur = [row]
        for col, ch_short in enumerate(shorter):
            # Rows are one cell longer than `shorter`, hence the col + 1
            # offset when reading the cell directly above.
            cur.append(min(
                prev[col + 1] + 1,                 # insertion
                cur[col] + 1,                      # deletion
                prev[col] + (ch_long != ch_short), # substitution (free on match)
            ))
        prev = cur

    return prev[-1]

In [38]:
def jaccard_similarity_withlowercase(list1, list2):
    """Return the case-insensitive Jaccard similarity of two token lists.

    Both inputs are lower-cased and de-duplicated; the score is the number of
    shared tokens divided by the number of distinct tokens overall.

    The union is sized from the token sets rather than from the raw list
    lengths, so a token repeated inside one name (e.g. "Ale Mania Imperial
    Red Ale") does not inflate the denominator and two identical token lists
    always score 1.0.

    Returns 0.0 when both inputs are empty instead of dividing by zero.
    """
    tokens1 = {tok.lower() for tok in list1}
    tokens2 = {tok.lower() for tok in list2}
    union_size = len(tokens1 | tokens2)
    if union_size == 0:
        return 0.0
    return float(len(tokens1 & tokens2)) / union_size

In [39]:
# Example: word-level similarity between one selected left-table name and one
# selected right-table name.
jaccard_similarity_withlowercase(l_beernames_selected[0].split(),r_beername_selected[1].split())

0.23076923076923078

In [40]:
def findMoreSimilarAttr(val,val_list):
    """Return the entry of ``val_list`` most similar to ``val``, or "".

    Similarity is the word-level ``jaccard_similarity_withlowercase`` score
    between ``val`` and each candidate. The best candidate is returned only
    when its score is strictly greater than 0.5; otherwise "" is returned.
    On ties the earliest candidate in ``val_list`` wins. An empty
    ``val_list`` yields "" (``max()`` would otherwise raise ValueError).
    """
    if len(val_list) == 0:
        return ""

    # Tokenise the query once rather than once per candidate.
    val_tokens = val.split()
    similarities = [
        jaccard_similarity_withlowercase(candidate.split(), val_tokens)
        for candidate in val_list
    ]
    best_score = max(similarities)
    if best_score <= 0.5:
        return ""
    return val_list[similarities.index(best_score)]

In [42]:
# Pair every selected left-table name with its best right-table candidate,
# skipping names for which findMoreSimilarAttr returned "" (no candidate
# scored above its threshold).
similarAttributes = []
for lbeer in l_beernames_selected:
    most_similar = findMoreSimilarAttr(lbeer,r_beername_selected)
    # Compare by value: `is not ""` tests object identity against a literal,
    # which is implementation-dependent and a SyntaxWarning on Python 3.8+.
    if most_similar != "":
        similarAttributes.append((lbeer,most_similar))

In [43]:
# Display the collected (left name, best right-table match) pairs.
similarAttributes

[('Roundabout Imperial Red Ale', '406 Imperial Red Ale'),
 ('Deranger Imperial Red Ale', 'Laurelwood Organic Deranger Imperial Red Ale'),
 ('Double Dread Imperial Red Ale', 'Wildcard Double Down Imperial Red Ale'),
 ('Three Beavers Imperial Red Ale',
  'Howe Sound Three Beavers Imperial Red Ale'),
 ('Lavery Imperial Red Ale', '406 Imperial Red Ale'),
 ('Photobomb Imperial Red Ale', '406 Imperial Red Ale'),
 ('Eruption Imperial Red Ale', 'Worthy Eruption Imperial Red Ale'),
 ('Lagunitas Imperial Red Ale', 'Lagunitas Imperial Red Ale'),
 ('Ale Mania Imperial Red Ale', 'Ale Mania Imperial Red Ale'),
 ('Deputation Imperial Red Ale', 'Brannon_s Deputation Imperial Red Ale'),
 ('Buffalo Soul Jah Imperial Red Ale',
  'Three Heads Buffalo Soul Jah Imperial Red Ale'),
 ('Stupiphany Imperial Red Ale', 'Rusty Truck Stupiphany Imperial Red Ale'),
 ('Sledgehammer Imperial Red Ale', '406 Imperial Red Ale'),
 ('Arsenal Imperial Red Ale', '406 Imperial Red Ale'),
 ('Double Dragon Imperial Red Ale', 'W

In [49]:
# Tabulate the candidate pairs: one row per (left name, right name) tuple.
possibleMatchings = pd.DataFrame(data = similarAttributes,columns = ['ltable_Beer_Name','rtable_Beer_Name'])

In [51]:
# Save the candidate pairs to new_positives.csv without the index column.
possibleMatchings.to_csv('new_positives.csv',index=False)

## Create new training data
Manually edit the possible-matchings file to remove pairs whose names clearly do not match.

In [19]:
# Load the candidate pairs from possibleMatchings.csv.
# NOTE(review): presumably this is the manually curated version of the
# candidates saved earlier — confirm how this file is produced.
possibleMatchings = pd.read_csv('possibleMatchings.csv')

In [20]:
# Load the two source tables of the Beer dataset.
tableA = pd.read_csv("../../Structured/Beer/tableA.csv")
tableB = pd.read_csv("../../Structured/Beer/tableB.csv")

In [21]:
# Attach tableA's columns to each candidate pair by matching ltable_Beer_Name
# against tableA's Beer_Name (pd.merge defaults to an inner join, so pairs
# without a matching tableA row are dropped).
# NOTE(review): if Beer_Name is not unique in tableA, a pair is repeated once
# per matching row — confirm uniqueness.
p_newtrain = pd.merge(possibleMatchings,tableA,left_on='ltable_Beer_Name',right_on='Beer_Name')
p_newtrain.head(10)

Unnamed: 0,ltable_Beer_Name,rtable_Beer_Name,id,Beer_Name,Brew_Factory_Name,Style,ABV
0,Deranger Imperial Red Ale,Laurelwood Organic Deranger Imperial Red Ale,1458,Deranger Imperial Red Ale,Laurelwood Public House & Brewery,American Amber / Red Ale,8.60 %
1,Three Beavers Imperial Red Ale,Howe Sound Three Beavers Imperial Red Ale,487,Three Beavers Imperial Red Ale,Howe Sound Inn & Brewing Company,American Amber / Red Ale,7.50 %
2,Eruption Imperial Red Ale,Worthy Eruption Imperial Red Ale,486,Eruption Imperial Red Ale,Worthy Brewing Co. .,American Amber / Red Ale,8.00 %
3,Lagunitas Imperial Red Ale,Lagunitas Imperial Red Ale,9,Lagunitas Imperial Red Ale,Lagunitas Brewing Company,American Amber / Red Ale,7.80 %
4,Ale Mania Imperial Red Ale,Ale Mania Imperial Red Ale,2934,Ale Mania Imperial Red Ale,Ale-Mania,American Amber / Red Ale,9.50 %
5,Deputation Imperial Red Ale,Brannon_s Deputation Imperial Red Ale,3126,Deputation Imperial Red Ale,Brannon 's Pub and Brewery,American Amber / Red Ale,8.00 %
6,Buffalo Soul Jah Imperial Red Ale,Three Heads Buffalo Soul Jah Imperial Red Ale,833,Buffalo Soul Jah Imperial Red Ale,Three Heads Brewing,American Amber / Red Ale,9.20 %
7,Stupiphany Imperial Red Ale,Rusty Truck Stupiphany Imperial Red Ale,1794,Stupiphany Imperial Red Ale,Rusty Truck Brewing Company / Roadhouse 101,American Amber / Red Ale,8.00 %
8,Barrel Aged Imperial Red Ale,Barley_s Barrel Aged Ulysses Imperial Red Ale,2526,Barrel Aged Imperial Red Ale,Boone Valley Brewing,American Amber / Red Ale,9.99 %
9,Tongue Buckler - Imperial Red Ale,Ballast Point Tongue Buckler Imperial Red Ale,29,Tongue Buckler - Imperial Red Ale,Ballast Point Brewing Company,American Amber / Red Ale,10.00 %


In [22]:
# Attach tableB's columns by matching rtable_Beer_Name against tableB's
# Beer_Name (inner join). Columns present on both sides get pandas' default
# suffixes: "_x" for the left (tableA-derived) side, "_y" for tableB.
# NOTE(review): if Beer_Name is not unique in tableB, a pair is repeated once
# per matching row — confirm uniqueness.
new_train = pd.merge(p_newtrain,tableB,left_on='rtable_Beer_Name',right_on='Beer_Name')
new_train.head(10)

Unnamed: 0,ltable_Beer_Name,rtable_Beer_Name,id_x,Beer_Name_x,Brew_Factory_Name_x,Style_x,ABV_x,id_y,Beer_Name_y,Brew_Factory_Name_y,Style_y,ABV_y
0,Deranger Imperial Red Ale,Laurelwood Organic Deranger Imperial Red Ale,1458,Deranger Imperial Red Ale,Laurelwood Public House & Brewery,American Amber / Red Ale,8.60 %,411,Laurelwood Organic Deranger Imperial Red Ale,Laurelwood Public House & Brewery,American Strong Ale,8.60 %
1,Three Beavers Imperial Red Ale,Howe Sound Three Beavers Imperial Red Ale,487,Three Beavers Imperial Red Ale,Howe Sound Inn & Brewing Company,American Amber / Red Ale,7.50 %,2583,Howe Sound Three Beavers Imperial Red Ale,Howe Sound Brewing,American Strong Ale,7.50 %
2,Eruption Imperial Red Ale,Worthy Eruption Imperial Red Ale,486,Eruption Imperial Red Ale,Worthy Brewing Co. .,American Amber / Red Ale,8.00 %,2582,Worthy Eruption Imperial Red Ale,Worthy Brewing Company,American Strong Ale,8 %
3,Lagunitas Imperial Red Ale,Lagunitas Imperial Red Ale,9,Lagunitas Imperial Red Ale,Lagunitas Brewing Company,American Amber / Red Ale,7.80 %,90,Lagunitas Imperial Red Ale,Lagunitas Brewing Company &#40; Heineken &#41;,American Strong Ale,7.80 %
4,Ale Mania Imperial Red Ale,Ale Mania Imperial Red Ale,2934,Ale Mania Imperial Red Ale,Ale-Mania,American Amber / Red Ale,9.50 %,1551,Ale Mania Imperial Red Ale,Ale Mania &#40; Fritz Ale &#41;,American Strong Ale,9.50 %
5,Deputation Imperial Red Ale,Brannon_s Deputation Imperial Red Ale,3126,Deputation Imperial Red Ale,Brannon 's Pub and Brewery,American Amber / Red Ale,8.00 %,1679,Brannon_s Deputation Imperial Red Ale,Brannon_s Pub and Brewery,American Strong Ale,8.79 %
6,Buffalo Soul Jah Imperial Red Ale,Three Heads Buffalo Soul Jah Imperial Red Ale,833,Buffalo Soul Jah Imperial Red Ale,Three Heads Brewing,American Amber / Red Ale,9.20 %,2883,Three Heads Buffalo Soul Jah Imperial Red Ale,Custom Brewcrafters,American Strong Ale,9.20 %
7,Stupiphany Imperial Red Ale,Rusty Truck Stupiphany Imperial Red Ale,1794,Stupiphany Imperial Red Ale,Rusty Truck Brewing Company / Roadhouse 101,American Amber / Red Ale,8.00 %,698,Rusty Truck Stupiphany Imperial Red Ale,Rusty Truck Brewing / Roadhouse 101,American Strong Ale,8 %
8,Barrel Aged Imperial Red Ale,Barley_s Barrel Aged Ulysses Imperial Red Ale,2526,Barrel Aged Imperial Red Ale,Boone Valley Brewing,American Amber / Red Ale,9.99 %,1682,Barley_s Barrel Aged Ulysses Imperial Red Ale,Barley_s Brewing Company,American Strong Ale,8.60 %
9,Tongue Buckler - Imperial Red Ale,Ballast Point Tongue Buckler Imperial Red Ale,29,Tongue Buckler - Imperial Red Ale,Ballast Point Brewing Company,American Amber / Red Ale,10.00 %,1638,Ballast Point Tongue Buckler Imperial Red Ale,Ballast Point Brewing Company,American Strong Ale,10 %


In [23]:
# Drop the join-key columns and the per-table ids; the remaining
# "<attribute>_x" / "<attribute>_y" columns are kept.
new_train = new_train.drop(['ltable_Beer_Name','rtable_Beer_Name','id_x','id_y'],axis=1)

In [24]:
# Build the rename map from pandas' merge suffixes to table prefixes:
# "<name>_x" (left side of the merge) becomes "ltable_<name>" and
# "<name>_y" (right side) becomes "rtable_<name>".
new_column_names = {}
for col in list(new_train):
    # Test for the full "_x"/"_y" suffix and strip only that suffix.
    # Checking endswith("x") would also catch any name that merely ends in
    # "x", and str.replace("_x", "") would delete "_x" wherever it occurs
    # inside the name.
    if col.endswith("_x"):
        new_column_names[col] = "ltable_" + col[:-len("_x")]
    elif col.endswith("_y"):
        new_column_names[col] = "rtable_" + col[:-len("_y")]
    # Columns carrying neither suffix are left out of the map, so a later
    # rename keeps their names instead of labelling them as right-table.

In [25]:
import random 
# Apply the ltable_/rtable_ names collected in new_column_names; columns not
# present in the mapping keep their current names.
new_train =new_train.rename(columns = new_column_names)

In [26]:
import random
# Draw one distinct id per row from 900..999 (sampling without replacement).
# No seed is set, so the ids differ between runs.
# NOTE(review): random.sample raises ValueError when new_train has more than
# 100 rows, and nothing here checks these ids against ids already used in the
# training data — confirm that 900..999 is a free range.
newPositivesIds=random.sample(range(900,1000),new_train.shape[0])

In [27]:
# Add a constant `label` column with value 1 as the first column (in place).
new_train.insert(loc=0,column='label',value=1)

In [28]:
# Assign the sampled ids as a new last column, one per row in row order.
new_train['id']=newPositivesIds

In [29]:
# Preview the first rows of the finished table.
new_train.head()

Unnamed: 0,label,ltable_Beer_Name,ltable_Brew_Factory_Name,ltable_Style,ltable_ABV,rtable_Beer_Name,rtable_Brew_Factory_Name,rtable_Style,rtable_ABV,id
0,1,Deranger Imperial Red Ale,Laurelwood Public House & Brewery,American Amber / Red Ale,8.60 %,Laurelwood Organic Deranger Imperial Red Ale,Laurelwood Public House & Brewery,American Strong Ale,8.60 %,942
1,1,Three Beavers Imperial Red Ale,Howe Sound Inn & Brewing Company,American Amber / Red Ale,7.50 %,Howe Sound Three Beavers Imperial Red Ale,Howe Sound Brewing,American Strong Ale,7.50 %,917
2,1,Eruption Imperial Red Ale,Worthy Brewing Co. .,American Amber / Red Ale,8.00 %,Worthy Eruption Imperial Red Ale,Worthy Brewing Company,American Strong Ale,8 %,949
3,1,Lagunitas Imperial Red Ale,Lagunitas Brewing Company,American Amber / Red Ale,7.80 %,Lagunitas Imperial Red Ale,Lagunitas Brewing Company &#40; Heineken &#41;,American Strong Ale,7.80 %,925
4,1,Ale Mania Imperial Red Ale,Ale-Mania,American Amber / Red Ale,9.50 %,Ale Mania Imperial Red Ale,Ale Mania &#40; Fritz Ale &#41;,American Strong Ale,9.50 %,977


In [30]:
# Write the new positive samples to newPositiveSamples.csv without the index
# column.
new_train.to_csv('newPositiveSamples.csv',index=False)