# Exp 20 
Analyze the nearest neighbors of the positive and negative samples.

In [3]:
import pandas as pd

In [49]:
def read_datasets(*paths):
    """Load each CSV file in ``paths`` with ``pd.read_csv``.

    Returns a list holding one DataFrame per path, in the order the paths
    were passed.
    """
    return [pd.read_csv(csv_path) for csv_path in paths]

In [22]:
def getAttributes(df, ignore_columns=('id', 'label'), left_prefix='ltable_', right_prefix='rtable_'):
    """Return the set of attribute names found in the columns of ``df``.

    Columns listed in ``ignore_columns`` are skipped. For every other column,
    a leading ``left_prefix`` or ``right_prefix`` is removed, so that e.g.
    ``ltable_title`` and ``rtable_title`` both map to ``title``. Columns
    carrying neither prefix are returned unchanged.

    Parameters:
        df: object whose iteration yields column names (e.g. a DataFrame).
        ignore_columns: column names to leave out of the result.
        left_prefix: prefix marking left-table columns.
        right_prefix: prefix marking right-table columns.

    Returns:
        A set of attribute names (unordered).
    """
    attributes = set()
    for column in df:
        if column in ignore_columns:
            continue
        # Strip the prefix only at the start of the name: str.replace would
        # also remove the same text occurring in the middle of a column name.
        if left_prefix and column.startswith(left_prefix):
            column = column[len(left_prefix):]
        elif right_prefix and column.startswith(right_prefix):
            column = column[len(right_prefix):]
        attributes.add(column)
    return attributes

In [77]:
import numpy as np
def getMeanEditDistance(attribute_values,sep,comparator):
    """Average ``comparator.distance`` over a collection of joined value pairs.

    Each item of ``attribute_values`` is a string of the form
    ``<left><sep><right>``. It is split on ``sep`` and the first two pieces
    are passed to ``comparator.distance``; any further pieces are ignored.

    Returns the mean of all distances as computed by ``np.mean``.
    """
    def pair_distance(joined):
        pieces = joined.split(sep)
        return comparator.distance(pieces[0], pieces[1])

    return np.mean(np.array([pair_distance(joined) for joined in attribute_values]))

In [75]:
def calculateMeanEditDistances(nn_dataset,opposite_label_data,attributes,str_comparator,left_prefix='ltable_',
                               right_prefix='rtable_',sep="|"):
    """Compute, per attribute, the mean distance between left and right values.

    For every attribute in ``attributes``, ``nn_dataset`` is merged with
    ``opposite_label_data`` on ``nn_dataset[att] == opposite_label_data['id']``
    (presumably ``nn_dataset[att]`` holds the id of a nearest-neighbour row;
    verify against the code that writes the *_nn.csv files). The left- and
    right-prefixed columns of the merged rows are cast to strings, joined with
    ``sep``, de-duplicated, and handed to ``getMeanEditDistance``.

    Parameters:
        nn_dataset: DataFrame with one column per attribute used as merge key.
        opposite_label_data: DataFrame with an ``id`` column and the
            ``left_prefix``/``right_prefix`` attribute columns.
        attributes: iterable of attribute names.
        str_comparator: object exposing ``distance(a, b)``.
        left_prefix, right_prefix: column prefixes of the two tables.
        sep: delimiter used to join a (left, right) pair into one string.
            Values that themselves contain ``sep`` will be split incorrectly.

    Returns:
        Dict mapping each attribute to its mean distance.
    """
    mean_distances = {}
    for att in attributes:
        nn_values = pd.merge(nn_dataset,opposite_label_data,left_on=att,right_on='id')
        left_col = nn_values[left_prefix+att].astype('str')
        right_col = nn_values[right_prefix+att].astype('str')
        # Join with `sep` rather than a hard-coded "|": getMeanEditDistance
        # splits on `sep`, so both sides must agree on the delimiter.
        nn_values_distribution = (left_col+sep+right_col).value_counts()
        # Only the distinct pairs (the index of value_counts) are compared.
        mean_edit_distances = getMeanEditDistance(nn_values_distribution.keys(),sep=sep,comparator=str_comparator)
        mean_distances[att] = mean_edit_distances
    return mean_distances

In [101]:
import re
from nltk.util import ngrams

def getNGrams(sentence,N):
    """Return the word N-grams of ``sentence`` as a list of tuples.

    The sentence is lower-cased, every character that is not a letter, digit
    or whitespace is replaced by a space, and the result is split into tokens
    before being passed to ``nltk.util.ngrams``.
    """
    s = sentence.lower()
    s = re.sub(r'[^a-zA-Z0-9\s]', ' ', s)
    # split() without an argument breaks on any whitespace run; split(" ")
    # left tabs and newlines (which the regex keeps) inside the tokens.
    tokens = s.split()
    return list(ngrams(tokens,N))

In [98]:
# Sample sentence for trying out the n-gram helpers.
s = (
    "Natural-language processing (NLP) is an area of computer science "
    "and artificial intelligence concerned with the interactions "
    "between computers and human (natural) languages."
)
def intersection(first, *others):
    """Return the set of items of ``first`` that occur in every iterable of ``others``.

    With no ``others`` the result is simply ``set(first)``.
    """
    common = set(first)
    return common.intersection(*others)
def getLongestNGramInCommon(sentences):
    """Return the longest word n-grams shared by all ``sentences``.

    N-grams are produced by ``getNGrams``. Starting from N = 1, N is
    increased while at least one n-gram occurs in every sentence.

    Parameters:
        sentences: sequence of strings.

    Returns:
        The set of n-gram tuples of the greatest length that every sentence
        contains, or None when ``sentences`` is empty or the sentences have
        no token in common.
    """
    if not sentences:
        return None
    # Bound N by the token count getNGrams actually produces: it strips
    # punctuation, so counting str.split() pieces can give a different number.
    maxNGramLength = min(len(getNGrams(sent,1)) for sent in sentences)
    maxNGram = None
    # N starts at 1 (a 0-gram is meaningless) and includes the maximum length.
    for n in range(1,maxNGramLength+1):
        ngrams_first = getNGrams(sentences[0],n)
        ngrams_others = [getNGrams(sent,n) for sent in sentences[1:]]
        # Unpack so each sentence's n-grams is intersected separately.
        intersect = set(ngrams_first).intersection(*ngrams_others)
        if not intersect:
            # A shared (n+1)-gram would contain a shared n-gram, so no
            # longer match can exist once this length has none.
            break
        maxNGram = intersect
    return maxNGram

In [111]:
def intersection(elements, *others):
    """Return the set of items of ``elements`` found in every collection of ``others``.

    NOTE(review): the original body was left unfinished. It is completed here
    so that a call passing several collections as separate arguments keeps
    working; confirm this matches the intended semantics.
    """
    current_incommon = set(elements)
    for other in others:
        current_incommon &= set(other)
    return current_incommon

In [121]:
# Quick check: unigrams shared by all three sentences.
s1, s2, s3 = "ciao belli", "ciao belli mucconi", "niente in comune a parte ciao"
intersection(*(getNGrams(sentence,1) for sentence in (s2,s1,s3)))

{('ciao',)}

## iTunes-Amazon

In [60]:
# One DataFrame per path; datasets[i] corresponds to the i-th path below.
datasets = read_datasets(
    'iTunes-Amazon/experiments-results/exp3/positives_nn.csv',
    'iTunes-Amazon/experiments-results/exp3/negatives_nn.csv',
    '../Structured/itunes-amazon/positives.csv',
    '../Structured/itunes-amazon/negatives.csv',
)

In [61]:
# Attribute names derived from the columns of datasets[2]; the bare name
# on the last line displays the list as the cell output.
attributes = [*getAttributes(datasets[2])]
attributes

['Song_Name',
 'Released',
 'CopyRight',
 'Time',
 'Genre',
 'Price',
 'Artist_Name',
 'Album_Name']

In [64]:
from similarity.ngram import NGram

# Compare values with an NGram(4) comparator.
str_comparator = NGram(4)
calculateMeanEditDistances(
    nn_dataset=datasets[1],
    opposite_label_data=datasets[2],
    attributes=attributes,
    str_comparator=str_comparator,
)

{'Song_Name': 0.07347332319132763,
 'Released': 0.8090666817651884,
 'CopyRight': 0.745830117605375,
 'Time': 0.12369791666666666,
 'Genre': 0.4930203195749618,
 'Price': 0.24583333333333335,
 'Artist_Name': 0.0,
 'Album_Name': 0.16491070240313319}

## Beer

In [50]:
# One DataFrame per path; datasets[i] corresponds to the i-th path below.
datasets = read_datasets(
    'beerAdvo/experiment-results/positives_nn.csv',
    'beerAdvo/experiment-results/negatives_nn.csv',
    '../Structured/Beer/positives.csv',
    '../Structured/Beer/negatives.csv',
)

In [52]:
# Attribute names derived from the columns of datasets[2]; the bare name
# on the last line displays the list as the cell output.
attributes = [*getAttributes(datasets[2])]
attributes

['ABV', 'Style', 'Brew_Factory_Name', 'Beer_Name']

In [53]:
# calculateMeanEditDistances requires a comparator; the original call omitted
# it and would raise a TypeError with the current signature.
# NOTE(review): the recorded output of this cell (mean distances well above 1)
# looks like a raw Levenshtein edit distance - confirm that this is the
# comparator that was originally used.
from similarity.levenshtein import Levenshtein

calculateMeanEditDistances(datasets[1],datasets[2],attributes,Levenshtein())

{'ABV': 2.4210526315789473,
 'Style': 18.0,
 'Brew_Factory_Name': 15.038461538461538,
 'Beer_Name': 10.515151515151516}

## Amazon Google

In [67]:
# One DataFrame per path; datasets[i] corresponds to the i-th path below.
datasets = read_datasets(
    'Amazon-Google/experiment_results/testpositives_nn.csv',
    'Amazon-Google/experiment_results/testnegatives_nn.csv',
    '../Structured/Amazon-Google/positives.csv',
    '../Structured/Amazon-Google/negatives.csv',
)

In [69]:
# Attribute names derived from the columns of datasets[2]; the bare name
# on the last line displays the list as the cell output.
attributes = [*getAttributes(datasets[2])]
attributes

['manufacturer', 'price', 'title']

In [74]:
# Compare values with an NGram(5) comparator.
str_comparator = NGram(5)
calculateMeanEditDistances(
    nn_dataset=datasets[1],
    opposite_label_data=datasets[2],
    attributes=attributes,
    str_comparator=str_comparator,
)

{'manufacturer': 0.06009868719670978,
 'price': 0.3907952487482689,
 'title': 0.5150678803144639}

## Walmart Amazon

In [78]:
# One DataFrame per path; datasets[i] corresponds to the i-th path below.
datasets = read_datasets(
    'WalmartAmazon/experiment_results/testpositives_nn.csv',
    'WalmartAmazon/experiment_results/testnegatives_nn.csv',
    '../Structured/Walmart-Amazon/positives.csv',
    '../Structured/Walmart-Amazon/negatives.csv',
)

In [79]:
# Attribute names derived from the columns of datasets[2]; the bare name
# on the last line displays the list as the cell output.
attributes = [*getAttributes(datasets[2])]
attributes

['modelno', 'category', 'title', 'price', 'brand']

In [82]:
from similarity.jarowinkler import JaroWinkler

# Compare values with a Jaro-Winkler comparator.
str_comparator = JaroWinkler()
calculateMeanEditDistances(
    nn_dataset=datasets[1],
    opposite_label_data=datasets[2],
    attributes=attributes,
    str_comparator=str_comparator,
)

{'modelno': 0.5512313923121839,
 'category': 0.45083646597846566,
 'title': 0.21392036864369224,
 'price': 0.4589821222606689,
 'brand': 0.1146129773043387}