### Aggregating binary ratings (algebraic approach)

Algorithm reimplementation from: Dalvi, Nilesh, et al. "Aggregating crowdsourced binary ratings." Proceedings of the 22nd international conference on World Wide Web. ACM, 2013.

In [32]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import utils

from sklearn.metrics import confusion_matrix
from numpy import linalg as la



In [33]:
#BEERS
init_data = pd.read_csv("data/beers-matrix-ec.csv")
init_data.head()


Unnamed: 0,tid,attrName,label,ec-missing-value,ec-default-value,ec-top-value,ec-valid-number,ec-cardinality-vio,ec-lookup-attr,ec-pattern-value,ec-valid-ssn
0,0,tid,0,-1,-1,-1,-1,0,0,0,0
1,1,tid,0,-1,-1,1,-1,0,0,0,0
2,2,tid,0,-1,-1,1,-1,0,0,0,0
3,3,tid,0,-1,-1,1,-1,0,0,0,0
4,4,tid,0,-1,-1,1,-1,0,0,0,0


In [34]:
#Generate assignment matrix: G

ec_data = init_data[["ec-missing-value","ec-default-value","ec-top-value","ec-valid-number","ec-cardinality-vio","ec-lookup-attr","ec-pattern-value","ec-valid-ssn"]]
ec_data.head()


Unnamed: 0,ec-missing-value,ec-default-value,ec-top-value,ec-valid-number,ec-cardinality-vio,ec-lookup-attr,ec-pattern-value,ec-valid-ssn
0,-1,-1,-1,-1,0,0,0,0
1,-1,-1,1,-1,0,0,0,0
2,-1,-1,1,-1,0,0,0,0
3,-1,-1,1,-1,0,0,0,0
4,-1,-1,1,-1,0,0,0,0


In [35]:
# Q - Initial matrix
Q = np.asmatrix(ec_data.values)

#### Assignment matrix

In [36]:
# G - assignment matrix
ec_data = ec_data.replace(-1, 1)
G = np.asmatrix(ec_data.values)
G.shape

(26510, 8)

In [37]:
# assignment matrix
G

matrix([[1, 1, 1, ..., 0, 0, 0],
        [1, 1, 1, ..., 0, 0, 0],
        [1, 1, 1, ..., 0, 0, 0],
        ...,
        [1, 0, 0, ..., 0, 0, 0],
        [1, 1, 1, ..., 0, 0, 0],
        [1, 0, 0, ..., 0, 0, 0]])

#### Rating generation model

In [38]:
def coin_flip(prob): #parameter prob denotes the probability of each trial
    n = 1  # number of trials, 
    s = np.random.binomial(n, prob, 1)
    return s[0]

In [39]:
apply_flip = lambda i: i if(i == 0) else ( i if(coin_flip(0.8)==1) else -i )

#apply_flip = lambda i: i==0 and i or (coin_flip(0.5)==1 and i or -i)

vectorized_flip = np.vectorize(apply_flip)
U = vectorized_flip(G)
U.shape

(26510, 8)

In [40]:
# ratings matrix
U

matrix([[-1,  1,  1, ...,  0,  0,  0],
        [ 1,  1,  1, ...,  0,  0,  0],
        [ 1,  1, -1, ...,  0,  0,  0],
        ...,
        [ 1,  0,  0, ...,  0,  0,  0],
        [ 1,  1,  1, ...,  0,  0,  0],
        [ 1,  0,  0, ...,  0,  0,  0]])

In [41]:
# D[j][k] denotes the difference between agreements and disagreements between users j and k
D = np.dot(U.T, U)
D

matrix([[26510,  8961,  9271,  3875,   699,  1646,    10,     0],
        [ 8961, 25503,  9129,  3855,   711,  1701,     4,     0],
        [ 9271,  9129, 25503,  3875,   719,  1803,    18,     0],
        [ 3875,  3855,  3875, 11175,   285,     0,     0,     0],
        [  699,   711,   719,   285,  2079,   166,     0,     0],
        [ 1646,  1701,  1803,     0,   166,  4820,     0,     0],
        [   10,     4,    18,     0,     0,     0,    34,     0],
        [    0,     0,     0,     0,     0,     0,     0,     0]])

In [42]:
# C[j][k] denotes the number of common items rated by users j and k
C = np.dot(G.T, G)
C

matrix([[26510, 25503, 25503, 11175,  2079,  4820,    34,     0],
        [25503, 25503, 25503, 11175,  2027,  4693,    34,     0],
        [25503, 25503, 25503, 11175,  2027,  4693,    34,     0],
        [11175, 11175, 11175, 11175,   907,     0,     0,     0],
        [ 2079,  2027,  2027,   907,  2079,   378,     0,     0],
        [ 4820,  4693,  4693,     0,   378,  4820,     2,     0],
        [   34,    34,    34,     0,     0,     2,    34,     0],
        [    0,     0,     0,     0,     0,     0,     0,     0]])

In [43]:
#(a[:,None]==b).astype(int)
(C!=0).astype(int)

matrix([[1, 1, 1, 1, 1, 1, 1, 0],
        [1, 1, 1, 1, 1, 1, 1, 0],
        [1, 1, 1, 1, 1, 1, 1, 0],
        [1, 1, 1, 1, 1, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 0, 0],
        [1, 1, 1, 0, 1, 1, 1, 0],
        [1, 1, 1, 0, 0, 1, 1, 0],
        [0, 0, 0, 0, 0, 0, 0, 0]])

#### Preparing top eigenvectors of the two user-user matrices

In [44]:

w_d, v_d = la.eig(D)

#top eigenvector of matrix D, which is the difference between agreements and disagreements between users
top_v_d = v_d[:, np.argmax(w_d)]
print(top_v_d)

w_c, v_c = la.eig(C)
#top eigenvector of matrix C, which is the number of common items rated by users
top_v_c = v_c[:, np.argmax(w_c)]
print(top_v_c) 


[[5.77931524e-01]
 [5.55209541e-01]
 [5.61717178e-01]
 [1.90448814e-01]
 [2.91219402e-02]
 [7.13606075e-02]
 [3.97076611e-04]
 [0.00000000e+00]]
[[0.55864317]
 [0.55169093]
 [0.55169093]
 [0.25881275]
 [0.04530878]
 [0.10077035]
 [0.00068274]
 [0.        ]]


In [45]:

#print(w)
#print(v)

# calculate eigenvectors and eigenvalues
#values, vectors = eig(D)
# create matrix from eigenvectors
#Q = v
# create inverse of eigenvectors matrix
#R = la.inv(Q)
# create diagonal matrix from eigenvalues
#L = np.diag(w)
# reconstruct the original matrix
#B_reconstr = Q.dot(L).dot(R)
#print(B_reconstr)

In [46]:
def compute_hadamard_vectorized(ms, ns):
    #uu = np.asarray(ms).flatten()
    #gg = np.asarray(ns).flatten()
    hadamard_op= lambda i, j: 0 if (j==0) else i/j
    hadamard_vectorized = np.vectorize(hadamard_op)
    w_hat = hadamard_vectorized(ms, ns)
    return w_hat

In [47]:
ms = [10, 20, 30, 40, 50, 23, 0,4]
ns = [5, 1, 10, 40, 0]
np.array(compute_hadamard_vectorized(top_v_c, top_v_d)).flatten()

array([0.96662519, 0.99366255, 0.98215071, 1.35896223, 1.55582983,
       1.41212847, 1.7194235 , 0.        ])

#### Hadamard product calculation

In [48]:
def compute_hadamard(ms, ns):
    uu = np.asarray(ms).flatten()
    gg = np.asarray(ns).flatten()
    w_hat = list(map( lambda i, j: 0 if (j==0) else i/j, uu, gg))
    return w_hat

In [49]:
#ms = [10, 20, 30, 40, 50]
#ns = [5, 1, 10, 40, 0]
#compute_hadamard(ms, ns)

In [50]:
w_hat= compute_hadamard(top_v_d, top_v_c)
w_hat

[1.0345271473907736,
 1.0063778705379018,
 1.0181736727355541,
 0.7358556248185232,
 0.6427438160365514,
 0.7081508687092986,
 0.5815902822314302,
 0]

In [51]:
def compute_user_reliability(w_hat):
    w_signs = np.sign(w_hat)
    w_abs= np.abs(w_hat)
    reliability = list(map(lambda s,a: s*max(a,1.0) , w_signs, w_abs ))
    return reliability

In [52]:
u_reliablility = compute_user_reliability(w_hat)
u_reliablility

[1.0345271473907736,
 1.0063778705379018,
 1.0181736727355541,
 1.0,
 1.0,
 1.0,
 1.0,
 0.0]

### item quality:


In [53]:
qs= np.multiply(U, u_reliablility)
q_hat = np.sign(np.sum(qs, axis=1))
q_tmp = np.asarray(np.where(q_hat < 0.0, 0, q_hat))
q_predicted =q_tmp.astype(int)
q_predicted.shape, U.shape

((26510, 1), (26510, 8))

In [54]:
#utils.f1(conf_matrix)

In [55]:
def get_top_eigenvector(D):
    #preparing eigenvectors:
    w_d, v_d = la.eig(D)
    #top eigenvector of matrix D
    top_v_d = v_d[:, np.argmax(w_d)]
    return top_v_d

In [56]:
def create_indicator_mtrx(C):
    I = (C!=0).astype(int)
    return I

In [64]:
def aggregating_labels_alg1(ec_data, prob):
    #ec_data = init_data[["ec-missing-value","ec-default-value","ec-top-value","ec-valid-number","ec-cardinality-vio","ec-lookup-attr","ec-pattern-value","ec-valid-ssn"]]
    # Q - Initial matrix
    Q = np.asmatrix(ec_data.values)
    # G - assignment matrix
    ec_data = ec_data.replace(-1, 1)
    G = np.asmatrix(ec_data.values)

    #Rating generation model:
    apply_flip = lambda i: i if(i == 0) else ( i if(coin_flip(prob)==1) else -i )

    vectorized_flip = np.vectorize(apply_flip)
    U = vectorized_flip(G)

    # D[j][k] denotes the difference between agreements and disagreements between users j and k
    D = np.dot(U.T, U)
    # C[j][k] denotes the number of common items rated by users j and k
    C = np.dot(G.T, G)

    top_v_d = get_top_eigenvector(D)    
    top_v_c = get_top_eigenvector(C)
    
    hadamard = compute_hadamard_vectorized(top_v_d, top_v_c)
    w_hat= np.array(hadamard).flatten()
    u_reliablility = compute_user_reliability(w_hat)
    qs= np.multiply(U, u_reliablility)
    q_hat = np.sign(np.sum(qs, axis=1))
    q_tmp = np.asarray(np.where(q_hat < 0.0, 0, q_hat))
    q_predicted =q_tmp.astype(int)
    return u_reliablility, q_predicted

In [65]:

I=(C!=0).astype(int)
top_v_i = get_top_eigenvector(I)
hadamard1 = compute_hadamard_vectorized(D, C)
hadamard2 = compute_hadamard_vectorized(hadamard1, top_v_i)
print(hadamard2)
get_top_eigenvector(hadamard2).shape

[[2.37693168 0.8351835  0.86407613 0.82421568 0.79917039 0.81170737
  0.69909755 0.        ]
 [0.8351835  2.37693168 0.85084144 0.81996167 0.83374367 0.86153011
  0.27963902 0.        ]
 [0.86407613 0.85084144 2.37693168 0.82421568 0.84312475 0.91319152
  1.25837559 0.        ]
 [1.11342188 1.10767518 1.11342188 3.21096503 1.00895814 0.
  0.         0.        ]
 [0.90667111 0.94589503 0.95653801 0.84735357 2.69666558 1.18424997
  0.         0.        ]
 [0.92089451 0.97741917 1.03602984 0.         1.18424997 2.69666558
  0.         0.        ]
 [0.94440148 0.37776059 1.69992266 0.         0.         0.
  3.21096503 0.        ]
 [0.         0.         0.         0.         0.         0.
  0.         0.        ]]


(8, 1)

In [66]:
def aggregating_labels_alg2(ec_data, prob):
    #ec_data = init_data[["ec-missing-value","ec-default-value","ec-top-value","ec-valid-number","ec-cardinality-vio","ec-lookup-attr","ec-pattern-value","ec-valid-ssn"]]
    # Q - Initial matrix
    Q = np.asmatrix(ec_data.values)
    # G - assignment matrix
    ec_data = ec_data.replace(-1, 1)
    G = np.asmatrix(ec_data.values)

    #Rating generation model:
    apply_flip = lambda i: i if(i == 0) else ( i if(coin_flip(prob)==1) else -i )

    vectorized_flip = np.vectorize(apply_flip)
    U = vectorized_flip(G)

    # D[j][k] denotes the difference between agreements and disagreements between users j and k
    D = np.dot(U.T, U)
    # C[j][k] denotes the number of common items rated by users j and k
    C = np.dot(G.T, G)
    #create indicator matrix from C
    I = create_indicator_mtrx(C)       
    top_v_i = get_top_eigenvector(I)
    
    hadamard1 = compute_hadamard_vectorized(D, C)
    hadamard2 = compute_hadamard_vectorized(hadamard1, top_v_i)
    w_tmp = get_top_eigenvector(hadamard2)
    w_hat= np.array(w_tmp).flatten()
    u_reliablility = compute_user_reliability(w_hat)

    qs= np.multiply(U, u_reliablility)
    q_hat = np.sign(np.sum(qs, axis=1))
    q_tmp = np.asarray(np.where(q_hat < 0.0, 0, q_hat))
    q_predicted =q_tmp.astype(int)
    return u_reliablility, q_predicted

In [78]:
#ALGORITHM 1
init_data = pd.read_csv("data/address-matrix-ec.csv")

ec_data = init_data[["ec-missing-value","ec-default-value","ec-top-value","ec-valid-number",
                     "ec-cardinality-vio","ec-lookup-attr","ec-valid-data-type","ec-unused-column",
                     "ec-pattern-value","ec-value-len","ec-value-len-Hampelx84","ec-value-len-evt",
                     "ec-value-len-1-5-IQR","ec-value-len-z-test","ec-value-len-trimmed-range",
                     "ec-value-len-winsorized-range"]]

print("ALGORITHM 1: all ec rules")
for p in np.linspace(0.1, 1, num=15, endpoint=False):
    u_reliablility, q_predicted = aggregating_labels_alg1(ec_data, p)
    truth_labels = np.asarray(init_data[["label"]].values)
    conf_matrix=confusion_matrix(truth_labels, q_predicted)
    print("Probability:",p)
    utils.f1(conf_matrix)

    
    
ec_data_outlier = init_data[["ec-pattern-value","ec-value-len","ec-value-len-Hampelx84","ec-value-len-evt",
                     "ec-value-len-1-5-IQR","ec-value-len-z-test","ec-value-len-trimmed-range",
                     "ec-value-len-winsorized-range"]]
print("ALGORITHM 1: outlier rule for the value length distribution")
for p in np.linspace(0.1, 1, num=15, endpoint=False):
    u_reliablility, q_predicted = aggregating_labels_alg1(ec_data_outlier, p)
    truth_labels = np.asarray(init_data[["label"]].values)
    conf_matrix=confusion_matrix(truth_labels, q_predicted)
    print("Probability:",p)
    utils.f1(conf_matrix)

ALGORITHM 1: all ec rules
Probability: 0.1
Precision: 0.1547, Recall: 0.0174, F-1: 0.0312
Probability: 0.16
Precision: 0.3492, Recall: 0.9523, F-1: 0.5110
Probability: 0.22000000000000003
Precision: 0.2458, Recall: 0.0933, F-1: 0.1353
Probability: 0.28
Precision: 0.2785, Recall: 0.1583, F-1: 0.2018
Probability: 0.34
Precision: 0.3003, Recall: 0.2367, F-1: 0.2647
Probability: 0.4
Precision: 0.3493, Recall: 0.6705, F-1: 0.4593
Probability: 0.4600000000000001
Precision: 0.3430, Recall: 0.5693, F-1: 0.4281
Probability: 0.52
Precision: 0.3365, Recall: 0.4711, F-1: 0.3926
Probability: 0.5800000000000001
Precision: 0.3240, Recall: 0.3628, F-1: 0.3423
Probability: 0.64
Precision: 0.3079, Recall: 0.2675, F-1: 0.2863
Probability: 0.7000000000000001
Precision: 0.2874, Recall: 0.1837, F-1: 0.2242
Probability: 0.76
Precision: 0.2591, Recall: 0.1136, F-1: 0.1579
Probability: 0.8200000000000001
Precision: 0.3503, Recall: 0.9392, F-1: 0.5102
Probability: 0.88
Precision: 0.3471, Recall: 0.9750, F-1: 0.

In [79]:
#ALGORITHM 2
init_data = pd.read_csv("data/address-matrix-ec.csv")
ec_data = init_data[["ec-missing-value","ec-default-value","ec-top-value","ec-valid-number",
                     "ec-cardinality-vio","ec-lookup-attr","ec-valid-data-type","ec-unused-column",
                     "ec-pattern-value","ec-value-len","ec-value-len-Hampelx84","ec-value-len-evt",
                     "ec-value-len-1-5-IQR","ec-value-len-z-test","ec-value-len-trimmed-range",
                     "ec-value-len-winsorized-range"]]
print("ALGORITHM 2: all ec rules")

for p in np.linspace(0.1, 1, num=15, endpoint=False):
    u_reliablility, q_predicted = aggregating_labels_alg2(ec_data, p)
    truth_labels = np.asarray(init_data[["label"]].values)
    conf_matrix=confusion_matrix(truth_labels, q_predicted)
    print("Probability:",p)
    utils.f1(conf_matrix)
    
ec_data_outlier = init_data[["ec-pattern-value","ec-value-len","ec-value-len-Hampelx84","ec-value-len-evt",
                     "ec-value-len-1-5-IQR","ec-value-len-z-test","ec-value-len-trimmed-range",
                     "ec-value-len-winsorized-range"]]
print("ALGORITHM 2: outlier rule for the value length distribution")
for p in np.linspace(0.1, 1, num=15, endpoint=False):
    u_reliablility, q_predicted = aggregating_labels_alg2(ec_data_outlier, p)
    truth_labels = np.asarray(init_data[["label"]].values)
    conf_matrix=confusion_matrix(truth_labels, q_predicted)
    print("Probability:",p)
    utils.f1(conf_matrix)

ALGORITHM 2: all ec rules
Probability: 0.1
Precision: 0.2169, Recall: 0.0311, F-1: 0.0544
Probability: 0.16
Precision: nan, Recall: 0.0000, F-1: nan


  precision = tp/(tp+fp)


Probability: 0.22000000000000003
Precision: nan, Recall: 0.0000, F-1: nan
Probability: 0.28
Precision: nan, Recall: 0.0000, F-1: nan
Probability: 0.34
Precision: nan, Recall: 0.0000, F-1: nan
Probability: 0.4
Precision: nan, Recall: 0.0000, F-1: nan
Probability: 0.4600000000000001
Precision: nan, Recall: 0.0000, F-1: nan
Probability: 0.52
Precision: nan, Recall: 0.0000, F-1: nan
Probability: 0.5800000000000001
Precision: nan, Recall: 0.0000, F-1: nan
Probability: 0.64
Precision: nan, Recall: 0.0000, F-1: nan
Probability: 0.7000000000000001
Precision: nan, Recall: 0.0000, F-1: nan
Probability: 0.76
Precision: nan, Recall: 0.0000, F-1: nan
Probability: 0.8200000000000001
Precision: nan, Recall: 0.0000, F-1: nan
Probability: 0.88
Precision: 0.3463, Recall: 0.8664, F-1: 0.4949
Probability: 0.9400000000000001
Precision: 0.3434, Recall: 0.9365, F-1: 0.5025
ALGORITHM 2: outlier rule for the value length distribution
Probability: 0.1
Precision: 0.3561, Recall: 0.0563, F-1: 0.0973
Probability: 