In [None]:
import pandas as pd
import numpy as np

def toy_data():
    """
    Build the small table of crowdsourced answers used by the checks below.

    Returns a DataFrame with one row per answer and the columns
    `task`, `performer` and `label`, ordered by task and then by performer.
    """
    # Answers grouped by task: each entry is (performer, label).
    answers_by_task = {
        't1': [('p1', 0), ('p2', 0), ('p3', 1)],
        't2': [('p1', 1), ('p2', 1), ('p4', 0)],
        't3': [('p2', 0), ('p3', 2)],
    }
    rows = [
        (task, performer, label)
        for task, answers in answers_by_task.items()
        for performer, label in answers
    ]
    return pd.DataFrame(rows, columns=['task', 'performer', 'label'])

In [None]:
%pip install wget

import wget
import os

# Reference outputs for the toy example (probabilities, labels, confusion matrices).
# A file that already exists in the working directory is not fetched again,
# so the cell can be re-run safely.
for toy_file in ('toy_probas.csv', 'toy_labels.csv', 'toy_confusion_matrices.csv'):
    if not os.path.exists(toy_file):
        wget.download('http://tlk.s3.yandex.net/course/aggregation/' + toy_file)

Collecting wget
  Downloading wget-3.2.zip (10 kB)
Building wheels for collected packages: wget
  Building wheel for wget (setup.py) ... [?25l[?25hdone
  Created wheel for wget: filename=wget-3.2-py3-none-any.whl size=9672 sha256=55193bdd02247ea347ef058e01a453bd537d480fb6a8c2353f9fb0ac1e61bb54
  Stored in directory: /root/.cache/pip/wheels/a1/b6/7c/0e63e34eb06634181c63adacca38b79ff8f35c37e3c13e3c02
Successfully built wget
Installing collected packages: wget
Successfully installed wget-3.2


'toy_confusion_matrices.csv'

In [None]:
# Read the three reference CSV files from the working directory, in this order:
# probabilities, labels, confusion matrices.
toy_probas, toy_labels, toy_mat = (
    pd.read_csv(csv_name)
    for csv_name in ('toy_probas.csv', 'toy_labels.csv', 'toy_confusion_matrices.csv')
)

In [None]:
# Display the table read from 'toy_probas.csv'.
toy_probas

Unnamed: 0,task,0,1,2
0,t1,0.49654,0.225305,0.278155
1,t2,0.212554,0.498793,0.288653
2,t3,0.439131,0.265675,0.295194


In [None]:
# Display the table read from 'toy_labels.csv'.
toy_labels

Unnamed: 0,task,0
0,t1,0
1,t2,1
2,t3,0


In [None]:
# Display the table read from 'toy_confusion_matrices.csv'.
toy_mat

Unnamed: 0,performer,true_label,0,1,2
0,p1,0,0.4,0.3,0.3
1,p1,1,0.3,0.4,0.3
2,p1,2,0.333333,0.333333,0.333333
3,p2,0,0.454545,0.272727,0.272727
4,p2,1,0.3,0.4,0.3
5,p2,2,0.333333,0.333333,0.333333
6,p3,0,0.272727,0.363636,0.363636
7,p3,1,0.333333,0.333333,0.333333
8,p3,2,0.333333,0.333333,0.333333
9,p4,0,0.333333,0.333333,0.333333


In [None]:
# Display the toy answers table built by toy_data().
toy_data()

Unnamed: 0,task,performer,label
0,t1,p1,0
1,t1,p2,0
2,t1,p3,1
3,t2,p1,1
4,t2,p2,1
5,t2,p4,0
6,t3,p2,0
7,t3,p3,2


In [None]:
# Distinct values of the `task` column in the toy answers.
toy_data().task.unique()


array(['t1', 't2', 't3'], dtype=object)

# Task 1

In this task, you will need to implement a Dawid-Skene model with [Laplace smoothing](https://en.wikipedia.org/wiki/Additive_smoothing). This model is pretty similar to the standard Dawid-Skene model. The only difference is that we'll use a smoothed estimator for the elements of the confusion matrices. The $c$-th row of a performer's confusion matrix is the vector of probabilities of each label being answered by that performer when the correct label for a task is $c$:

$$
e^w[c, k] = \frac{K + a_{w,k,c}}{LK + t_{w,c}},
$$
where $a_{w,k,c}$ is the number of times performer $w$ gave the answer $k$ when the correct label was $c$, and $t_{w,c}$ is the number of tasks with correct label $c$ answered by performer $w$. $L$ is the total number of classes (you need to infer it from the input data), and $K$ is a fixed smoothing hyperparameter.

You need to implement an EM optimization of the model parameters and infer the truth labels. For the initialization, you'll need to implement and use the majority vote.

**Important implementation details**

You need to implement a model that is slightly simpler than the standard Dawid-Skene:
1. Assume that the prior class probability equals $1/L$ and does not change in the M-step.
2. If you look at the formula above, you can notice that we use "hard EM": during the M-step, instead of the probability distribution, we just take the mode labels and assume that they are true ones with probability 1.

In [None]:
import random

class MajorityVote:
    """Aggregates answers by picking, for every task, the most frequent label."""

    def fit_predict(self, data: pd.DataFrame) -> pd.Series:
        """
        Returns a Series indexed by task containing inferred labels.

        `data` must have the columns `task` and `label` (one row per answer).
        Tasks appear in the result in the order of their first occurrence in
        `data`. When several labels share the highest vote count, the smallest
        of them is returned, so the result is deterministic and reproducible.
        """
        winners = {}
        # sort=False keeps tasks in order of first appearance.
        for task, votes in data.groupby('task', sort=False)['label']:
            counts = votes.value_counts()
            # All labels that reached the top count; ties go to the smallest label.
            winners[task] = counts[counts == counts.max()].index.min()
        return pd.Series(winners)

In [None]:
class SmoothedDawidSkene:
    """
    Hard-EM Dawid-Skene aggregation with additively smoothed confusion matrices.

    The prior over classes is fixed to 1/L (L = number of distinct labels in
    the data). The M-step treats the current mode label of every task as the
    true one and estimates, for each performer w, true label c and answer k:

        e[w][c, k] = (K + a[w, k, c]) / (L * K + t[w, c])

    where a[w, k, c] counts answers k given by w on tasks whose label is c and
    t[w, c] counts all answers of w on such tasks.

    After fitting, `confusion_matrices_` holds a DataFrame indexed by
    (`performer`, `true_label`) whose columns are the labels.
    """

    def __init__(self, n_iter: int, K: int) -> None:
        """
        n_iter: number of EM iterations (E-step followed by M-step).
        K: smoothing hyperparameter from the formula in the class docstring.
        """
        self._n_iter = n_iter
        self._K = K

        # Distinct tasks / performers / labels in order of first appearance;
        # filled in when the model is fitted.
        self._tasks = None
        self._performers = None
        self._labels = None
        self.confusion_matrices_ = None

    def _E_step(self, data: pd.DataFrame, confusion_matrices: pd.DataFrame) -> pd.DataFrame:
        """
        E-step of the EM-algorithm. Takes the data and confusion
        matrices and returns a Pandas data frame indexed by `task`
        containing columns with probabilities for each label to be
        the true one.
        """
        n_tasks = len(self._tasks)
        n_performers = len(self._performers)
        n_labels = len(self._labels)

        # Bring the matrices into the canonical (performer, true label, answer)
        # order explicitly instead of relying on the row order of the input.
        canonical_rows = pd.MultiIndex.from_product([self._performers, self._labels])
        matrices = (
            confusion_matrices
            .reindex(index=canonical_rows, columns=self._labels)
            .to_numpy(dtype=float)
            .reshape(n_performers, n_labels, n_labels)
        )
        # Work in log space so that long products of probabilities cannot underflow.
        with np.errstate(divide='ignore'):
            log_matrices = np.log(matrices)

        task_codes = pd.Index(self._tasks).get_indexer(data['task'])
        performer_codes = pd.Index(self._performers).get_indexer(data['performer'])
        answer_codes = pd.Index(self._labels).get_indexer(data['label'])

        # answer_loglik[i, c] = log P(answer i | true label c, performer of answer i)
        answer_loglik = log_matrices[performer_codes, :, answer_codes]

        # Start from the log of the uniform prior 1/L and add up the answers of each task.
        log_posterior = np.full((n_tasks, n_labels), -np.log(n_labels))
        for c in range(n_labels):
            log_posterior[:, c] += np.bincount(
                task_codes, weights=answer_loglik[:, c], minlength=n_tasks
            )

        # Normalise each row; subtracting the row maximum first keeps exp() stable.
        log_posterior -= log_posterior.max(axis=1, keepdims=True)
        posterior = np.exp(log_posterior)
        posterior /= posterior.sum(axis=1, keepdims=True)

        return pd.DataFrame(
            posterior,
            index=pd.Index(self._tasks, name='task'),
            columns=self._labels,
        )

    def _M_step(self, data: pd.DataFrame, inferred_labels: pd.Series) -> pd.DataFrame:
        """
        M-step of the EM-algorithm. Takes the data and inferred
        labels and returns a DataFrame indexed by `performer`
        and `true_label` where columns are labels and contains values of the corresponding
        confusion matrix.

        `inferred_labels` is only read, never modified. Rows for
        (performer, true label) pairs without any answers are uniform (1/L).
        """
        n_performers = len(self._performers)
        n_labels = len(self._labels)
        label_index = pd.Index(self._labels)

        performer_codes = pd.Index(self._performers).get_indexer(data['performer'])
        answer_codes = label_index.get_indexer(data['label'])
        # Inferred true label of the task every answer belongs to.
        true_codes = label_index.get_indexer(data['task'].map(inferred_labels))
        # Answers whose task has no usable inferred label are left out of the counts.
        known = true_codes >= 0

        # counts[w, c, k] = number of answers k given by performer w on tasks labelled c.
        flat = (performer_codes[known] * n_labels + true_codes[known]) * n_labels + answer_codes[known]
        counts = np.bincount(flat, minlength=n_performers * n_labels * n_labels)
        counts = counts.reshape(n_performers, n_labels, n_labels)
        totals = counts.sum(axis=2, keepdims=True)

        denominator = n_labels * self._K + totals
        # Pre-fill with 1/L: this is what the formula gives for unseen rows and it
        # also covers the K == 0 case, where the denominator would be zero.
        matrices = np.full(counts.shape, 1 / n_labels)
        np.divide(self._K + counts, denominator, out=matrices, where=denominator > 0)

        rows = pd.MultiIndex.from_product(
            [self._performers, self._labels], names=['performer', 'true_label']
        )
        return pd.DataFrame(
            matrices.reshape(n_performers * n_labels, n_labels),
            index=rows,
            columns=pd.Index(self._labels, dtype=object),
        )

    def _fit(self, data: pd.DataFrame) -> pd.DataFrame:
        """
        Runs the EM loop and returns the probabilities of the last E-step with
        the labels' native dtype as columns. Sets `confusion_matrices_`.
        """
        if len(data) == 0:
            raise ValueError('data must contain at least one answer')

        self._labels = pd.unique(data['label'])
        self._performers = pd.unique(data['performer'])
        self._tasks = pd.unique(data['task'])

        # Initialise the confusion matrices from the majority vote labels.
        mv_labels = MajorityVote().fit_predict(data)
        self.confusion_matrices_ = self._M_step(data, mv_labels)

        probas = None
        for _ in range(self._n_iter):
            probas = self._E_step(data, self.confusion_matrices_)
            # Hard EM: the mode label of each task is treated as the true one.
            inferred_labels = probas.idxmax(axis='columns')
            self.confusion_matrices_ = self._M_step(data, inferred_labels)

        if probas is None:
            # n_iter == 0: report the posterior implied by the initial matrices.
            probas = self._E_step(data, self.confusion_matrices_)
        return probas

    def fit_predict_proba(self, data: pd.DataFrame) -> pd.DataFrame:
        """
        Fits the model and returns probabilities for
        each label to be the true one.

        The result is indexed by `task`; its columns are the labels, stored in
        an object-dtype index.
        """
        probas = self._fit(data)
        probas.columns = probas.columns.astype('object')
        return probas

    def fit_predict(self, data: pd.DataFrame) -> pd.Series:
        """
        Infers true labels.

        Returns a Series named `labels`, indexed by `task`.
        """
        # idxmax is taken before the columns are cast to object so that the
        # result keeps the labels' native dtype.
        labels = self._fit(data).idxmax(axis='columns')
        return labels.rename('labels')

In [None]:
# There is a holy war on what the majority vote should predict in case of
# the same number of votes. Theoretically, it's better to predict a random label,
# but this makes the algorithm non-deterministic and affects reproducibility.

mv_toy = MajorityVote().fit_predict(toy_data())
# In toy_data(), task 't3' has one vote for label 0 and one vote for label 2;
# this check expects the tie to be resolved to 0.
assert mv_toy['t3'] == 0

In [None]:
# Fit the model on the toy answers with n_iter=10 and K=3.
# fit_predict() calls fit_predict_proba() internally, so the second call below
# fits the model again from scratch on a fresh copy of the toy data.
dawid_skene = SmoothedDawidSkene(10, 3)
probas = dawid_skene.fit_predict_proba(toy_data())
labels = dawid_skene.fit_predict(toy_data())

task
t1    0
t2    1
t3    0
dtype: int64
task
t1    0
t2    1
t3    0
dtype: int64


In [None]:
# Compare the model outputs on the toy data with the reference CSV files.

# Labels: first data column of the file, indexed by task, renamed to match `labels`.
toy_labels = pd.read_csv('toy_labels.csv', index_col='task').iloc[:,0]
toy_labels.name = 'labels'
pd.testing.assert_series_equal(toy_labels, labels)
# Probabilities: CSV headers are read as strings, so the columns are replaced
# with the integer labels in an object-dtype index before comparing.
toy_probas = pd.read_csv('toy_probas.csv', index_col='task')
toy_probas.columns = pd.Index([0, 1, 2], dtype=object)
pd.testing.assert_frame_equal(toy_probas, probas)
# Confusion matrices: indexed by (performer, true_label), same column fix as above.
toy_confusion_matrices = pd.read_csv('toy_confusion_matrices.csv', index_col=['performer', 'true_label'])
toy_confusion_matrices.columns = pd.Index([0, 1, 2], dtype=object)
pd.testing.assert_frame_equal(toy_confusion_matrices, dawid_skene.confusion_matrices_)
print('Tests passed!')

Tests passed!


In [None]:
# Reload the reference confusion matrices indexed by (performer, true_label),
# relabel the columns with the integer labels, and display the table.
toy_confusion_matrices = pd.read_csv('toy_confusion_matrices.csv', index_col=['performer', 'true_label'])
toy_confusion_matrices.columns = pd.Index([0, 1, 2], dtype=object)
toy_confusion_matrices

Unnamed: 0_level_0,Unnamed: 1_level_0,0,1,2
performer,true_label,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
p1,0,0.4,0.3,0.3
p1,1,0.3,0.4,0.3
p1,2,0.333333,0.333333,0.333333
p2,0,0.454545,0.272727,0.272727
p2,1,0.3,0.4,0.3
p2,2,0.333333,0.333333,0.333333
p3,0,0.272727,0.363636,0.363636
p3,1,0.333333,0.333333,0.333333
p3,2,0.333333,0.333333,0.333333
p4,0,0.333333,0.333333,0.333333


Now let's run our aggregation on TlkRelevance2. This is a benchmark dataset for comparing categorical aggregation methods.

In [None]:
%pip install crowd-kit



In [None]:
from crowdkit.datasets import load_dataset
# `df` is passed to the aggregation model below and `gt` to get_acc().
# NOTE(review): the model reads the columns `task`, `performer` and `label` from `df`,
# and get_acc() iterates `gt` as (task, label) pairs — confirm that the installed
# crowd-kit version returns the dataset with this schema.
df, gt = load_dataset('relevance-2')

Downloading relevance-2 from remote
Unpacking relevance-2.zip


In [None]:
def get_acc(gt, res):
    """Stable accuracy calculation without Pandas magic.

    gt:  ground truth; anything with `.items()` yielding (task, label) pairs,
         e.g. a pandas Series indexed by task or a plain dict.
    res: predictions, indexable by task (`res[task]`).

    Returns the fraction of tasks in `gt` whose label equals `res[task]`.
    Raises ZeroDivisionError when `gt` is empty; a task missing from `res`
    makes the `res[task]` lookup fail.
    """
    total = 0
    correct = 0
    # `.items()` is available on every pandas version and on dicts, whereas
    # `Series.iteritems()` no longer exists in pandas 2.x.
    for task, gt_label in gt.items():
        total += 1
        if gt_label == res[task]:
            correct += 1
    return correct / total

Let's fit our model.

In [None]:
%%time
# Fit the model on `df` (n_iter=10, K=3) and keep the inferred label of every task.
smoothed_ds = SmoothedDawidSkene(10, 3)  # don't change the parameters
ds_result = smoothed_ds.fit_predict(df)

In [None]:
# Display the labels returned by fit_predict().
ds_result

task
t30685    1
t30008    0
t36316    0
t15145    1
t44785    0
         ..
t95222    0
t83525    0
t49227    0
t96106    1
t16185    1
Name: labels, Length: 99319, dtype: int64

We expect the accuracy of our Dawid-Skene (DS) model to be higher than that of the majority vote (MV).

In [None]:
# Majority-vote baseline on the same answers. It is computed in this cell
# because no earlier cell defines `mv_result`.
mv_result = MajorityVote().fit_predict(df)

mv_acc = get_acc(gt, mv_result)
ds_acc = get_acc(gt, ds_result)
print(f'MV: {mv_acc}\nDS: {ds_acc}')
# The smoothed Dawid-Skene model is expected to beat the baseline.
assert mv_acc < ds_acc

print('Test passed!')

In [None]:
# Recompute and display the accuracy of the model's labels against `gt`.
ds_acc = get_acc(gt, ds_result)
ds_acc

0.817442206568112

Save the result and upload it to Yandex.Contest.

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; this only works inside a Colab runtime.
drive.mount('/content/drive')

# Drive folder that the result file is written to in the next cell.
# NOTE(review): the folder is hard-coded and presumably has to exist already — adjust for your own Drive.
path = '/content/drive/My Drive/Colab Notebooks/craud/hw4/'

Mounted at /content/drive


In [None]:
# Write the inferred labels to result.csv inside the folder given by `path`.
ds_result.to_csv(path + 'result.csv')  # upload result.csv