# Exp 20 
analyze nearest neighbors of positive and negative samples

In [2]:
import pandas as pd

In [3]:
def read_datasets(*paths):
    datasets= []
    for path in paths:
        datasets.append(pd.read_csv(path))
    return datasets

In [4]:
def getAttributes(df,ignore_columns=['id','label'],left_prefix='ltable_',right_prefix='rtable_'):
    attributes = set()
    for att in list(df):
        if att not in ignore_columns:
            attribute = att.replace(left_prefix,"").replace(right_prefix,"")
            attributes.add(attribute)
    return attributes

In [5]:
import numpy as np
def getMeanEditDistance(attribute_values,sep,comparator):
    distances = []
    for attr_val in attribute_values:
        edit_dist = comparator.distance(attr_val.split(sep)[0],attr_val.split(sep)[1])
        distances.append(edit_dist)
    distances = np.array(distances)
    return np.mean(distances)

In [6]:
def calculateMeanEditDistances(nn_dataset,opposite_label_data,attributes,str_comparator,left_prefix='ltable_',
                               right_prefix='rtable_',sep="|"):
    mean_distances = {}
    for att in attributes:
        nn_values = pd.merge(nn_dataset,opposite_label_data,left_on=att,right_on='id')
        left_col = nn_values[left_prefix+att].astype('str')
        right_col = nn_values[right_prefix+att].astype('str')
        nn_values_distribution = (left_col+"|"+right_col).value_counts()
        mean_edit_distances = getMeanEditDistance(nn_values_distribution.keys(),sep=sep,comparator=str_comparator)
        mean_distances[att] = mean_edit_distances
    return mean_distances

In [7]:
import re
from nltk.util import ngrams

def getNGrams(sentence,N):
    s = sentence.lower()
    s = re.sub(r'[^a-zA-Z0-9\s]', ' ', s)
    tokens = [token for token in s.split(" ") if token != ""]
    return list(ngrams(tokens,N))

In [30]:
def commonNGrams(sentences):
    result = set(sentences[0])
    for s in sentences[1:]:
        result.intersection_update(s)
    return result

def getLongestNGramInCommon(sentences):
    maxNGramLength = min(list(map(lambda sent:len(sent.split()),sentences)))
    currentMax = 0
    maxNGram = []
    for i in range(maxNGramLength+1):
        ngrams = list(map(lambda sent: getNGrams(sent,i),sentences))
        intersection = commonNGrams(ngrams)
        if len(intersection)>0:
            currentMax = len(intersection)
            maxNGram = intersection
    return maxNGram

In [33]:
s1 = "ciao simpatici belli"
s2 = "mucconi simpatici"
s3 = "niente in comune simpatici"
getLongestNGramInCommon([s1,s2,s3])

{('simpatici',)}

In [68]:
def findPatternsInNeighbors(nn_dataset,opposite_label_data,attributes,k,left_prefix='ltable_',right_prefix='rtable_'):
    commonPatterns = {}
    for att in attributes:
        occurrences = nn_dataset[att].value_counts()
        occurrences_df = occurrences.rename_axis('unique_id').reset_index(name='counts')
        occurrences_df = occurrences_df.head(k)
        nn_values = pd.merge(occurrences_df,opposite_label_data,left_on='unique_id',right_on='id')
        left_common_ngrams = getLongestNGramInCommon(nn_values[left_prefix+att])
        right_common_ngrams = getLongestNGramInCommon(nn_values[right_prefix+att])
        commonPatterns[left_prefix+att] = left_common_ngrams
        commonPatterns[right_prefix+att] = right_common_ngrams
    return commonPatterns

## Itunes-Amazon

In [69]:
datasets = read_datasets('iTunes-Amazon/experiments-results/exp3/positives_nn.csv',
                         'iTunes-Amazon/experiments-results/exp3/negatives_nn.csv',
                        '../Structured/itunes-amazon/positives.csv',
                        '../Structured/itunes-amazon/negatives.csv')

In [70]:
attributes = list(getAttributes(datasets[2]))
attributes

['Song_Name',
 'Time',
 'Artist_Name',
 'Genre',
 'Price',
 'Released',
 'Album_Name',
 'CopyRight']

In [75]:
findPatternsInNeighbors(datasets[0],datasets[3],attributes,3)

{'ltable_Song_Name': [],
 'rtable_Song_Name': [],
 'ltable_Time': [],
 'rtable_Time': [],
 'ltable_Artist_Name': [],
 'rtable_Artist_Name': [],
 'ltable_Genre': {('music',)},
 'rtable_Genre': [],
 'ltable_Price': {('album', 'only')},
 'rtable_Price': {('1', '29')},
 'ltable_Released': {('11',)},
 'rtable_Released': {('2011',), ('march',)},
 'ltable_Album_Name': {('the',)},
 'rtable_Album_Name': [],
 'ltable_CopyRight': {('interscope', 'records')},
 'rtable_CopyRight': {('c',), ('interscope',)}}

In [64]:
from similarity.ngram import NGram
str_comparator = NGram(4)
calculateMeanEditDistances(datasets[1],datasets[2],attributes,str_comparator)

{'Song_Name': 0.07347332319132763,
 'Released': 0.8090666817651884,
 'CopyRight': 0.745830117605375,
 'Time': 0.12369791666666666,
 'Genre': 0.4930203195749618,
 'Price': 0.24583333333333335,
 'Artist_Name': 0.0,
 'Album_Name': 0.16491070240313319}

## Beer

In [87]:
datasets = read_datasets('beerAdvo/experiment-results/positives_nn.csv',
                         'beerAdvo/experiment-results/negatives_nn.csv',
                        '../Structured/Beer/positives.csv',
                        '../Structured/Beer/negatives.csv')

In [88]:
attributes = list(getAttributes(datasets[2]))
attributes

['Beer_Name', 'Style', 'Brew_Factory_Name', 'ABV']

In [92]:
from similarity.normalized_levenshtein import NormalizedLevenshtein
str_comparator = NormalizedLevenshtein()
calculateMeanEditDistances(datasets[1],datasets[2],attributes,str_comparator)

{'Beer_Name': 0.347152097590012,
 'Style': 0.7214912280701754,
 'Brew_Factory_Name': 0.40166168767268356,
 'ABV': 0.41200828157349906}

In [93]:
findPatternsInNeighbors(datasets[0],datasets[3],attributes,5)

{'ltable_Beer_Name': [],
 'rtable_Beer_Name': [],
 'ltable_Style': {('american',)},
 'rtable_Style': [],
 'ltable_Brew_Factory_Name': [],
 'rtable_Brew_Factory_Name': [],
 'ltable_ABV': [],
 'rtable_ABV': []}

In [86]:
att = "Beer_Name"
k = 10
occurrences = datasets[0][att].value_counts()
occurrences_df = occurrences.rename_axis('unique_id').reset_index(name='counts')
occurrences_df = occurrences_df.head(k)
nn_values = pd.merge(occurrences_df,datasets[3],left_on='unique_id',right_on='id')
nn_values[['ltable_Beer_Name','rtable_Beer_Name']].head(10)

Unnamed: 0,ltable_Beer_Name,rtable_Beer_Name
0,Big Bear Amber Ale,Big Horn Buttface Amber Ale
1,Road Rash Red,Kettle House Bourbon Barrel Road Rash Imperial...
2,Flying Monkey Amber Ale,Flying Monkeys Amber Ale
3,Eruption Imperial Red Ale,De Koninck TSTBRW 01 Imperial Red Ale
4,Frog Island Amber Ale,Heavy Seas Desert Island Series American Honey...
5,Imperial Red Ale,ShuBrew Heart and Sole Imperial Red Ale
6,Amber Ale With Spanish Orange,Big Bear Mountain Orange Blossom Amber Ale
7,Lower De Boom Barleywine,21st Amendment Bourbon Barrel Lower da Boom Ba...
8,Barrel Aged Riverside Red,Blue Pants Big & Tall # 2 : Barrel Aged Belgia...
9,Alaskan Barley Wine ( Pilot Series ),Hermit Thrush Abbey Series # 1 - Jolly Abbot B...


## Amazon Google

In [67]:
datasets = read_datasets('Amazon-Google/experiment_results/testpositives_nn.csv',
                         'Amazon-Google/experiment_results/testnegatives_nn.csv',
                        '../Structured/Amazon-Google/positives.csv',
                        '../Structured/Amazon-Google/negatives.csv')

In [69]:
attributes = list(getAttributes(datasets[2]))
attributes

['manufacturer', 'price', 'title']

In [74]:
str_comparator = NGram(5)
calculateMeanEditDistances(datasets[1],datasets[2],attributes,str_comparator)

{'manufacturer': 0.06009868719670978,
 'price': 0.3907952487482689,
 'title': 0.5150678803144639}

## Walmart Amazon

In [78]:
datasets = read_datasets('WalmartAmazon/experiment_results/testpositives_nn.csv',
                         'WalmartAmazon/experiment_results/testnegatives_nn.csv',
                        '../Structured/Walmart-Amazon/positives.csv',
                        '../Structured/Walmart-Amazon/negatives.csv')

In [79]:
attributes = list(getAttributes(datasets[2]))
attributes

['modelno', 'category', 'title', 'price', 'brand']

In [82]:
from similarity.jarowinkler import JaroWinkler
str_comparator = JaroWinkler()
calculateMeanEditDistances(datasets[1],datasets[2],attributes,str_comparator)

{'modelno': 0.5512313923121839,
 'category': 0.45083646597846566,
 'title': 0.21392036864369224,
 'price': 0.4589821222606689,
 'brand': 0.1146129773043387}