In [24]:
import sys
sys.path.append("..")

from sklearn.model_selection import train_test_split
import pandas as pd
import torch
from sklearn.model_selection import GroupShuffleSplit 
import matplotlib.pyplot as plt
import nltk
from torch import optim
from nltk.corpus import stopwords
from models import bert
from transformers import BertTokenizer, BertForSequenceClassification, BertPreTrainedModel, BertModel
import json
from torch.utils.data import DataLoader
from ray import tune
from ray.air import Checkpoint, session
from ray.tune.schedulers import ASHAScheduler
from functools import partial


In [25]:
# Load the annotation splits.
# NOTE(review): only the training CSV is decoded as latin-1; the test CSV uses
# read_csv's default encoding — confirm both files are really encoded differently.
train_df = pd.read_csv('../data/train_older_adult_annotations.csv',delimiter=',', encoding='latin-1')
test_df = pd.read_csv('../data/test_annotations.csv', delimiter=',')
# df = pd.concat([test_df, train_df])

# Load the per-respondent side tables.
# age_anxiety_df = pd.read_csv('../data/age_anxiety_full_responses.csv', delimiter=',')
# age_experience_df = pd.read_csv('../data/age_experience_responses.csv', delimiter=',')
demographics_df = pd.read_csv('../data/demographics_responses.csv', delimiter=',')
anxiety_score_df = pd.read_csv('../data/respondent_anxiety_table.csv', delimiter=',')

# Keep only respondents that appear in the training annotations.
# .copy() turns each filtered result into an independent frame: the
# respondent_id column is reassigned on these frames later, and assigning into
# a boolean-mask selection would otherwise raise SettingWithCopyWarning.
train_respondents = train_df['respondent_id']
anxiety_score_df = anxiety_score_df[anxiety_score_df['respondent_id'].isin(train_respondents)].copy()
demographics_df = demographics_df[demographics_df['respondent_id'].isin(train_respondents)].copy()

# df1 = pd.merge(demographics_df, anxiety_score_df, on='respondent_id')
# merged_df = pd.merge(df, df1, on='respondent_id')

# Sentiment scale; a label's position in this list is its integer class id.
sentiment_labels = [
    'Very negative',
    'Somewhat negative',
    'Neutral',
    'Somewhat positive',
    'Very positive',
]
# Respondent identifiers in order of first appearance in the training annotations.
total_annotator_ids = train_df['respondent_id'].unique().tolist()

# Forward (int -> value) lookups, each paired with its inverse (value -> int).
id2label = dict(enumerate(sentiment_labels))
label2id = {label: idx for idx, label in id2label.items()}

id2annotator = dict(enumerate(total_annotator_ids))
annotator2id = {annotator: idx for idx, annotator in id2annotator.items()}

# Respondent-level tables: only the respondent id needs re-encoding.
for side_table in (anxiety_score_df, demographics_df):
    side_table["respondent_id"] = side_table["respondent_id"].map(annotator2id)

# Annotation splits: encode the label and the respondent id, then switch to the
# column names 'annotator_id' / 'text'.
# NOTE(review): annotator2id is built from train_df only, and Series.map yields
# NaN for keys missing from the dict — so any test respondent (or any label not
# in sentiment_labels) silently becomes NaN here. Confirm this is intended.
column_renames = {'respondent_id':'annotator_id', 'unit_text':'text'}
for split_df in (train_df, test_df):
    split_df["annotation"] = split_df["annotation"].map(label2id)
    split_df["respondent_id"] = split_df["respondent_id"].map(annotator2id)
    split_df.rename(columns=column_renames, inplace=True)

def add_disagreement_columns(df):
    """Return a copy of ``df`` with per-unit annotation agreement columns.

    Adds two columns, appended in this order:

    * ``unique_annotations`` - number of distinct non-null ``annotation``
      values among all rows sharing the row's ``unit_id``.
    * ``disagreement`` - True when ``unique_annotations`` is greater than 1.

    Row order is left untouched. Rows whose ``unit_id`` is missing are
    dropped, which matches the inner merge this helper replaces (groupby
    skips null keys, so those rows never found a match). Unlike the merge
    version, surviving rows keep their original index labels.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``unit_id`` and ``annotation`` columns.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``df`` itself is not modified.
    """
    out = df[df['unit_id'].notna()].copy()
    # transform() broadcasts each group's count back onto its rows, so no
    # merge / temporary sort key is needed to keep the original ordering.
    out['unique_annotations'] = out.groupby('unit_id')['annotation'].transform('nunique')
    out['disagreement'] = out['unique_annotations'] > 1
    return out


train_df = add_disagreement_columns(train_df)
test_df = add_disagreement_columns(test_df)


# grouped = merged_df.groupby('unit_id')['annotation'].nunique().reset_index()
# grouped.columns = ['unit_id', 'unique_annotations']
# merged_df = merged_df.merge(grouped, on='unit_id')
# merged_df['disagreement'] = merged_df['unique_annotations'] > 1



In [26]:
# Write the processed annotation splits to the working directory (no index column).
for split_df, csv_name in ((train_df, 'train_new_agr.csv'), (test_df, 'test_new_agr.csv')):
    split_df.to_csv(csv_name, index=False)

# Side tables are ordered by respondent_id before being written; the sorted
# frames replace the unsorted ones in the notebook namespace.
anxiety_score_df = anxiety_score_df.sort_values(by='respondent_id')
anxiety_score_df.to_csv('anxiety_score.csv', index=False)

demographics_df = demographics_df.sort_values(by='respondent_id')
demographics_df.to_csv('demographics.csv', index=False)

In [27]:
# Number of distinct training units on which annotators disagree.
# dropna=False keeps the count identical to len(Series.unique()).
train_df.loc[train_df["disagreement"], "unit_id"].nunique(dropna=False)

13179

In [28]:
# Number of rows in the test split.
test_df.shape[0]

1419

In [29]:
# Number of distinct annotator ids in the test split (NaN, if present, counts as one value).
test_df["annotator_id"].nunique(dropna=False)

878

In [30]:
# Inspect the re-encoded respondent ids of the demographics table
# (the table was sorted by respondent_id in an earlier cell).
demographics_df["respondent_id"]

0          0
1          1
2          2
3          3
4          4
        ... 
1478    1476
1479    1477
1480    1478
1481    1479
1482    1480
Name: respondent_id, Length: 1481, dtype: int64

In [31]:
# Spot check: original respondent identifier stored under integer annotator id 105.
id2annotator[105]

'R_3fP575Tw6XFk2ZQ'

In [32]:
# Display the complete integer id -> respondent identifier mapping.
# NOTE(review): this prints one entry per training respondent; consider
# previewing only a few entries to keep the notebook output small.
id2annotator

{0: 'R_1I5depz6ASpP2YF',
 1: 'R_27sAwU5dMyvKR8P',
 2: 'R_2vYT6suV4FhYAHk',
 3: 'R_2BnzNYeKLHABEei',
 4: 'R_A6vEunsl5EoPDpv',
 5: 'R_3fJfPSVmVDNnwSV',
 6: 'R_2WBVl0niLFn9elJ',
 7: 'R_3r05gWBZNv3oDVT',
 8: 'R_2RVSQMPKzVgYL6h',
 9: 'R_pyBFvQ1UFMyVIK5',
 10: 'R_1nOZMXdItb3ndyg',
 11: 'R_2DYyJkrMpwLQa1R',
 12: 'R_1DNgjAY3UEWZ0cY',
 13: 'R_um1uN1qA26hCGWt',
 14: 'R_3iFkZsqgPlEdip0',
 15: 'R_3O2oZt1CSyajAp3',
 16: 'R_3hhxJZ7lbzmKDRa',
 17: 'R_yryBGY1HcMufVDP',
 18: 'R_1Q583LMM1Y0MnTQ',
 19: 'R_ROat0DEbUWO4USZ',
 20: 'R_2dYEKmjPPDzHdWK',
 21: 'R_88rUOre21x1uFzj',
 22: 'R_2v8rV7Ca1iS18to',
 23: 'R_0H7jeIMm5pV11fz',
 24: 'R_1Q5e81duDXvXxKj',
 25: 'R_1d0KM7jMHnInzID',
 26: 'R_6Gpg9oK8GbVKrfz',
 27: 'R_2P09pxi3f97SjmC',
 28: 'R_2eWGOh7ayjbChpo',
 29: 'R_2ZJilZyKgHWlt5P',
 30: 'R_1laGR7wF0PmMJcj',
 31: 'R_3NCJr1AtKOCIlQA',
 32: 'R_32MhJB47fCWq7KD',
 33: 'R_V23HpjP1cmZge8V',
 34: 'R_33BfNBilcVzC8U2',
 35: 'R_2Pcy0501eyoVd8a',
 36: 'R_1gd12wOeCAzyYmG',
 37: 'R_2CB8bFwIiD7UPAl',
 38: 'R_1Cg5DUX77xWG2N

In [33]:
# Spot check of the inverse mapping: respondent identifier -> integer annotator id.
annotator2id["R_3fP575Tw6XFk2ZQ"]

105