# Ratings merging and filtering

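This notebook associates Community Notes with their related ratings (RQ2 in the paper): it filters the raw ratings files down to the notes under study and a fixed date cutoff, then adds per-note agreement levels to the notes table.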

## Loading in data

In [None]:
import pandas as pd
import numpy as np

In [3]:
# Only the note IDs are needed at this point (they are used to filter the
# ratings shards), so read just the `noteId` column rather than loading every
# column of the notes file and then deleting the frame.
note_ids = pd.read_csv('notes_with_bias.csv', usecols=['noteId'])['noteId'].unique()

## Matching IDs and date cutoff for ratings and notes sub-datasets

In [4]:
# Ratings whose `createdAtMillis` is at or after this Unix-epoch timestamp
# (milliseconds) are excluded. 1706379015852 ms == 2024-01-27 18:10:15 UTC.
CUTOFF_MILLIS = 1706379015852

files = ['ratings-00000.tsv',
         'ratings-00001.tsv',
         'ratings-00002.tsv',
         'ratings-00003.tsv',
         'ratings-00004.tsv',
         'ratings-00005.tsv',
         'ratings-00006.tsv',
         'ratings-00007.tsv']

# Keep only ratings created before the cutoff whose note is in `note_ids`.
# The filtered shards are collected in a list and concatenated once at the
# end: calling pd.concat inside the loop copies every previously accumulated
# row again on each iteration.
filtered_parts = []
for file in files:
    df = pd.read_csv(file, sep='\t')
    filtered_parts.append(
        df[(df['createdAtMillis'] < CUTOFF_MILLIS) & df['noteId'].isin(note_ids)]
    )
    # Drop the reference to the unfiltered shard before reading the next one.
    del df

merged_data = pd.concat(filtered_parts, ignore_index=True)
del filtered_parts

merged_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9553456 entries, 0 to 9553455
Data columns (total 32 columns):
 #   Column                                Dtype 
---  ------                                ----- 
 0   noteId                                int64 
 1   raterParticipantId                    object
 2   createdAtMillis                       int64 
 3   version                               int64 
 4   agree                                 int64 
 5   disagree                              int64 
 6   helpful                               int64 
 7   notHelpful                            int64 
 8   helpfulnessLevel                      object
 9   helpfulOther                          int64 
 10  helpfulInformative                    int64 
 11  helpfulClear                          int64 
 12  helpfulEmpathetic                     int64 
 13  helpfulGoodSources                    int64 
 14  helpfulUniqueContext                  int64 
 15  helpfulAddressesClaim           

In [5]:
# Save the filtered ratings as CSV, leaving out the DataFrame's row index.
file_path = 'ratings_filtered.csv'
merged_data.to_csv(path_or_buf=file_path, index=False)

In [6]:
# Load the full notes table (all columns); the per-note agreement columns
# computed below are added to this frame.
notes = pd.read_csv('notes_with_bias.csv')

In [7]:
# `ratings` is a second name for the same DataFrame object as `merged_data`;
# no copy is made by this assignment.
ratings = merged_data

## Calculating and adding agreement levels

In [8]:
# Preview the first rows of the filtered ratings.
ratings.head()

Unnamed: 0,noteId,raterParticipantId,createdAtMillis,version,agree,disagree,helpful,notHelpful,helpfulnessLevel,helpfulOther,...,notHelpfulMissingKeyPoints,notHelpfulOutdated,notHelpfulHardToUnderstand,notHelpfulArgumentativeOrBiased,notHelpfulOffTopic,notHelpfulSpamHarassmentOrAbuse,notHelpfulIrrelevantSources,notHelpfulOpinionSpeculation,notHelpfulNoteNotNeeded,ratedOnTweetId
0,1430937459051929600,5684B38EB58FD8BE75ABA37F0BE040EC70380B002ADF9D...,1655318468371,2,0,0,0,0,HELPFUL,0,...,0,0,0,0,0,0,0,0,0,-1
1,1530296836271091712,5684B38EB58FD8BE75ABA37F0BE040EC70380B002ADF9D...,1665445691380,2,0,0,0,0,NOT_HELPFUL,0,...,1,0,0,0,0,0,0,0,0,-1
2,1530566246416228352,5684B38EB58FD8BE75ABA37F0BE040EC70380B002ADF9D...,1665445712749,2,0,0,0,0,NOT_HELPFUL,0,...,0,0,0,1,0,0,0,0,0,-1
3,1534997217689407488,5684B38EB58FD8BE75ABA37F0BE040EC70380B002ADF9D...,1655331973679,2,0,0,0,0,SOMEWHAT_HELPFUL,0,...,0,0,0,1,0,0,0,0,0,-1
4,1535826508035592192,5684B38EB58FD8BE75ABA37F0BE040EC70380B002ADF9D...,1655331889900,2,0,0,0,0,SOMEWHAT_HELPFUL,0,...,0,0,0,1,0,0,0,0,0,-1


In [9]:
# Sum the `agree` and `disagree` columns of the ratings per note and attach
# the totals to the notes table. The left join keeps every note; notes with no
# matching ratings get NaN totals.
# Any `agree`/`disagree` columns left over from a previous run of this cell
# are dropped first, so re-running it does not create `agree_x`/`agree_y`
# duplicates (on a first run the drop is a no-op).
notes = (
    notes
    .drop(columns=['agree', 'disagree'], errors='ignore')
    .merge(ratings.groupby('noteId')[['agree', 'disagree']].sum(), on='noteId', how='left')
)

In [10]:
# Replace missing (NaN) vote totals with zero in both columns at once.
notes[['agree', 'disagree']] = notes[['agree', 'disagree']].fillna(0)

In [11]:
# Agreement = agree votes divided by (agree + disagree) votes, per note.
# When both totals are zero the result is 0/0, i.e. NaN.
notes['agreement'] = notes['agree'].div(notes['agree'].add(notes['disagree']))

In [12]:
# Undefined ratios (NaN, e.g. 0/0 for a note with no agree or disagree votes)
# are stored as 0.
# NOTE(review): this makes "no votes" indistinguishable from "every vote
# disagrees" in the `agreement` column — confirm that is intended.
notes.loc[notes['agreement'].isna(), 'agreement'] = 0

In [13]:
# Preview the notes table, which now includes the `agree`, `disagree` and
# `agreement` columns.
notes.head()

Unnamed: 0,noteId,noteAuthorParticipantId,createdAtMillis,tweetId,classification,believable,harmful,validationDifficulty,misleadingOther,misleadingFactualError,...,url,url_in_sources,bias_score,factuality_score,currentStatus,participantId,enrollmentState,agree,disagree,agreement
0,1537142913737428992,5684B38EB58FD8BE75ABA37F0BE040EC70380B002ADF9D...,1655318404027,1377030478167937024,MISINFORMED_OR_POTENTIALLY_MISLEADING,BELIEVABLE_BY_MANY,CONSIDERABLE_HARM,EASY,0,1,...,"['forbes.com', 'washingtonpost.com']",True,0.5,3.0,NEEDS_MORE_RATINGS,5684B38EB58FD8BE75ABA37F0BE040EC70380B002ADF9D...,earnedIn,0.0,0.0,0.0
1,1537204430730211328,5684B38EB58FD8BE75ABA37F0BE040EC70380B002ADF9D...,1655333070821,1537196168953974784,MISINFORMED_OR_POTENTIALLY_MISLEADING,BELIEVABLE_BY_MANY,CONSIDERABLE_HARM,EASY,0,1,...,"['nytimes.com', 'wsj.com', 'nytimes.com']",True,0.67,3.67,NEEDS_MORE_RATINGS,5684B38EB58FD8BE75ABA37F0BE040EC70380B002ADF9D...,earnedIn,0.0,0.0,0.0
2,1540422295029551104,5684B38EB58FD8BE75ABA37F0BE040EC70380B002ADF9D...,1656100269455,1540087463099736065,MISINFORMED_OR_POTENTIALLY_MISLEADING,BELIEVABLE_BY_MANY,CONSIDERABLE_HARM,EASY,0,1,...,['cnn.com'],True,1.0,3.0,NEEDS_MORE_RATINGS,5684B38EB58FD8BE75ABA37F0BE040EC70380B002ADF9D...,earnedIn,0.0,0.0,0.0
3,1586769867381669889,5684B38EB58FD8BE75ABA37F0BE040EC70380B002ADF9D...,1667150391800,1586411168880807936,NOT_MISLEADING,,,,0,0,...,"['nytimes.com', 'washingtonpost.com']",True,1.0,3.5,NEEDS_MORE_RATINGS,5684B38EB58FD8BE75ABA37F0BE040EC70380B002ADF9D...,earnedIn,0.0,0.0,0.0
4,1640795953472114688,5684B38EB58FD8BE75ABA37F0BE040EC70380B002ADF9D...,1680031214479,1640773789679230977,NOT_MISLEADING,,,,0,0,...,"['npr.org', 'statista.com']",True,0.5,4.0,NEEDS_MORE_RATINGS,5684B38EB58FD8BE75ABA37F0BE040EC70380B002ADF9D...,earnedIn,0.0,0.0,0.0


In [14]:
# Save the notes table, now carrying the agreement columns, as CSV without the
# DataFrame's row index.
file_path = 'notes_with_bias_2.csv'
notes.to_csv(path_or_buf=file_path, index=False)