# Task 1: Working with a dataset with categorical features

### Step 1, Reading the data

In [185]:
import pandas as pd
import numpy as np
import sklearn as sk
import krippendorff

class DataSet:
    def __init__(self, RawY=None, X=None, Y=None, File=None, Consensus=None, SplitY=None, ConfidenceWeights=None):
        self.RawY = RawY
        self.X = X
        self.Y = Y
        self.SplitY = SplitY
        self.File = File
        self.Consensus = Consensus
        self.ConfidenceWeights = ConfidenceWeights

trainSet = DataSet(File="assets/a3_train_final_partial.tsv")
testSet = DataSet(File="assets/a3_test.tsv")
datasets = [trainSet, testSet]
name2dataset = {"train":trainSet, "test":testSet}

def toNumOrNan(x):
    try:
        if (x == "-1"):
            return np.nan
        return int(x)
    except Exception:
        return np.nan

for dset in datasets:
    df = pd.read_table(dset.File, names=['opinion', 'text'])
    df = df.sample(frac=1, random_state=0)

    dset.RawY = df["opinion"]
    dset.X = df["text"]

In [186]:
# The trainset has annotator disagreements
# https://towardsdatascience.com/assessing-annotator-disagreements-in-python-to-build-a-robust-dataset-for-machine-learning-16c74b49f043
for dset in [trainSet]:
    splitOpinion = dset.RawY.str.split('/', expand=True)
    splitOpinion = splitOpinion.applymap(toNumOrNan).transpose()
    
    # since we don't know who the annotators are who wrote what should be arbitrary
    splitOpinion = pd.DataFrame(data=[sk.utils.shuffle(list(splitOpinion.iloc[:,c]), random_state=c) for c in splitOpinion.columns]).transpose()

    dset.SplitY = splitOpinion
    dset.Consensus = krippendorff.alpha(reliability_data=splitOpinion, value_domain=[0,1])

In [187]:
# Weiging annotations, https://arxiv.org/pdf/2208.06161.pdf
#   SPA makes one key assumption: The degree to
#   which labels are absent must be independent of the
#   true item-agreements ni⊥Pi.
from collections import Counter
import random

def getMostLikelyAndItsWeight(col):
    answer2count = Counter([x for x in col if x in [0,1]])
    nAnnotators = float(len(answer2count))

    mostPopularAnswer = sorted(answer2count, reverse=True)[0]
    mostPopularCount = answer2count[mostPopularAnswer]

    # agreement = % is the most popular - % isn't the most popular
    del answer2count[mostPopularAnswer]
    agreement = float(mostPopularCount - sum(answer2count.values()))/nAnnotators

    #using weight = number of annotators
    weight = nAnnotators

    return (weight*agreement, mostPopularAnswer)

for dset in [trainSet]:
    df = pd.DataFrame()
    
    weight,mostpop = zip(*[
        getMostLikelyAndItsWeight(dset.SplitY.iloc[:,c]) 
        for c in dset.SplitY.columns
    ])

    dset.ConfidenceWeights = pd.Series(list(weight))
    dset.Y = pd.Series(list(mostpop))

for dset in [testSet]:
    dset.Y = dset.RawY

In [188]:
import sklearn as sk
from sklearn import ensemble
from sklearn import pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

classifier = sk.ensemble.RandomForestClassifier(n_jobs=-1)
classifierName = f"{type(classifier).__name__}"

pipe = sk.pipeline.Pipeline([    
  ("tfid", sk.feature_extraction.text.TfidfVectorizer()),
  (classifierName, classifier)
])
display(pipe)
#display(pipe.get_params())

In [189]:
pipe.fit(trainSet.X, trainSet.Y, **{f'{classifierName}__sample_weight':trainSet.ConfidenceWeights})

In [194]:
pos_label = 1

df = pd.DataFrame()
df["Text"] = testSet.X
df["Real Opinion"] = testSet.Y
df["Predicted Opinion"] = pipe.predict(testSet.X)

display(df)

f1 = sk.metrics.f1_score(df["Real Opinion"], df["Predicted Opinion"], pos_label=pos_label),

print(f1)

Unnamed: 0,Text,Real Opinion,Predicted Opinion
1360,Worried about side effects in the future,,0.0
511,I mean people should know that vaccines doesn’...,,0.0
9,#Poison,,1.0
393,I don’t understand why there are people on her...,,0.0
471,I have not been “vaccinated” and I have never ...,,0.0
...,...,...,...
763,I’m a week out of my pfirst Pfizer vaccine and...,,0.0
835,Lie. The best way to protect yourself from vir...,,1.0
1216,This is objectively very fucking good.,,1.0
559,I was gonna get it until a close friend of min...,,0.0


ValueError: Classification metrics can't handle a mix of unknown and binary targets