# Note bias calculation

This notebook calculates and aggregates source bias and factuality scores for notes (RQ2 in the paper)

## Loading in data

In [None]:
import pandas as pd
import numpy as np

In [24]:
sources = pd.read_excel('annotated_sources_final.xlsx')

In [None]:
notes = pd.read_csv('notes_updated.csv')

In [26]:
sources.head()

Unnamed: 0,url,frequency,merged_urls,country,type,bias (MBFC),bias (AS),bias (AF),bias (final),factuality (MBFC),reliability (AF)
0,twitter.com,60168,"['pic.twitter.com', 'ads.twitter.com', 'blog.t...",USA,Social Media/Platforms,,,,,,
1,wikipedia.org,39750,"['m.wikipedia.org', 'th.m.wikipedia.org', 'lt....",USA,Dictionary/Encyclopedia,Least Biased,,,Center,Mixed Factuality,
2,x.com,23052,"['ads.x.com', 'help.x.com', 'business.x.com', ...",USA,Social Media/Platforms,,,,,,
3,youtube.com,18370,"['m.youtube.com', 'tv.youtube.com', 'music.you...",USA,Social Media/Platforms,,,,,,
4,bbc.co.uk,10522,"['bbc.com', 'news.bbc.co.uk', 'genome.ch.bbc.c...",GBR,News,Left-Center,Center,Middle,Center,High Factuality,"Reliable, Analysis/Fact Reporting"


In [27]:
notes.head()

Unnamed: 0,noteId,noteAuthorParticipantId,createdAtMillis,tweetId,classification,believable,harmful,validationDifficulty,misleadingOther,misleadingFactualError,...,misleadingSatire,notMisleadingOther,notMisleadingFactuallyCorrect,notMisleadingOutdatedButNotWhenWritten,notMisleadingClearlySatire,notMisleadingPersonalOpinion,trustworthySources,summary,isMediaNote,url
0,1537142913737428992,5684B38EB58FD8BE75ABA37F0BE040EC70380B002ADF9D...,1655318404027,1377030478167937024,MISINFORMED_OR_POTENTIALLY_MISLEADING,BELIEVABLE_BY_MANY,CONSIDERABLE_HARM,EASY,0,1,...,0,0,0,0,0,0,1,Forbes has a good rundown of the investigation...,0,"['forbes.com', 'washingtonpost.com']"
1,1537145358521839617,5684B38EB58FD8BE75ABA37F0BE040EC70380B002ADF9D...,1655318986910,1536848327979016193,NOT_MISLEADING,,,,0,0,...,0,0,0,0,1,1,0,They are expressing a personal opinion in a st...,0,[]
2,1537147343715282945,5684B38EB58FD8BE75ABA37F0BE040EC70380B002ADF9D...,1655319460217,1537080831751102467,MISINFORMED_OR_POTENTIALLY_MISLEADING,BELIEVABLE_BY_MANY,LITTLE_HARM,EASY,0,0,...,0,0,0,0,0,0,1,Teslas purchased after 12/31/19 are not eligib...,0,['cleanvehiclerebate.org']
3,1537204430730211328,5684B38EB58FD8BE75ABA37F0BE040EC70380B002ADF9D...,1655333070821,1537196168953974784,MISINFORMED_OR_POTENTIALLY_MISLEADING,BELIEVABLE_BY_MANY,CONSIDERABLE_HARM,EASY,0,1,...,0,0,0,0,0,0,1,The Jan 6th riots were encouraged by the sitti...,0,"['nytimes.com', 'wsj.com', 'nytimes.com']"
4,1540422295029551104,5684B38EB58FD8BE75ABA37F0BE040EC70380B002ADF9D...,1656100269455,1540087463099736065,MISINFORMED_OR_POTENTIALLY_MISLEADING,BELIEVABLE_BY_MANY,CONSIDERABLE_HARM,EASY,0,1,...,0,0,0,0,0,0,1,The Committee has been found by numerous court...,0,['cnn.com']


## Combining primary URL and associated merged urls together into a list of URLs

In [28]:
import ast
sources['merged_urls'] = sources['merged_urls'].fillna("[]").apply(ast.literal_eval)
sources['all_urls'] = sources.apply(lambda row: [row['url']] + row['merged_urls'], axis=1)

In [29]:
sources.head()

Unnamed: 0,url,frequency,merged_urls,country,type,bias (MBFC),bias (AS),bias (AF),bias (final),factuality (MBFC),reliability (AF),all_urls
0,twitter.com,60168,"[pic.twitter.com, ads.twitter.com, blog.twitte...",USA,Social Media/Platforms,,,,,,,"[twitter.com, pic.twitter.com, ads.twitter.com..."
1,wikipedia.org,39750,"[m.wikipedia.org, th.m.wikipedia.org, lt.wikip...",USA,Dictionary/Encyclopedia,Least Biased,,,Center,Mixed Factuality,,"[wikipedia.org, m.wikipedia.org, th.m.wikipedi..."
2,x.com,23052,"[ads.x.com, help.x.com, business.x.com, commun...",USA,Social Media/Platforms,,,,,,,"[x.com, ads.x.com, help.x.com, business.x.com,..."
3,youtube.com,18370,"[m.youtube.com, tv.youtube.com, music.youtube....",USA,Social Media/Platforms,,,,,,,"[youtube.com, m.youtube.com, tv.youtube.com, m..."
4,bbc.co.uk,10522,"[bbc.com, news.bbc.co.uk, genome.ch.bbc.co.uk,...",GBR,News,Left-Center,Center,Middle,Center,High Factuality,"Reliable, Analysis/Fact Reporting","[bbc.co.uk, bbc.com, news.bbc.co.uk, genome.ch..."


In [30]:
notes['url'] = notes['url'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)

## Calculating the relative frequency of our annotated sources in the whole dataset

In [31]:

all_urls_set = set().union(*sources.head(500)['all_urls'])

def check_urls_in_sources(urls):
    if isinstance(urls, str):
        urls = [urls]
    return any(url in all_urls_set for url in urls)

notes['url_in_sources'] = notes['url'].apply(check_urls_in_sources)

count_included_urls = notes['url_in_sources'].sum()

total_notes = len(notes)
percentage_included = (count_included_urls / total_notes) * 100 if total_notes else 0

print(f"Number of notes with at least one URL found in the first 500 rows of 'all_urls' in 'sources': {count_included_urls}")
print(f"Percentage of such notes: {percentage_included:.2f}%")

Number of notes with at least one URL found in the first 500 rows of 'all_urls' in 'sources': 306578
Percentage of such notes: 56.25%


In [32]:
top500sources = sources.head(500)

print(f"Total frequency in sources is {top500sources['frequency'].sum()}")
print(f"Total frequency of news sources is {top500sources[top500sources['type']=='News']['frequency'].sum()}")
print(f"Percentage of news sources is {top500sources[top500sources['type']=='News']['frequency'].sum() / top500sources['frequency'].sum()}")

Total frequency in sources is 457934
Total frequency of news sources is 167294
Percentage of news sources is 0.3653233872130045


## Calculating bias scores

In [33]:
from tqdm import tqdm
tqdm.pandas()

bias_to_score = {
    'Right': -2,
    'Right-Center': -1,
    'Center': 0,
    'Left-Center': 1,
    'Left': 2
}

sources['bias_score'] = sources['bias (final)'].map(bias_to_score)

flat_sources = pd.DataFrame([(index, url, bias_score) for index, urls, bias_score in sources[['all_urls', 'bias_score']].itertuples() for url in urls], columns=['source_index', 'url', 'bias_score'])
url_to_bias = flat_sources.groupby('url')['bias_score'].mean().to_dict()

def calculate_bias(urls):
    if isinstance(urls, list) and urls:
        biases = [url_to_bias.get(url, np.nan) for url in urls]
        valid_biases = [bias for bias in biases if not np.isnan(bias)]
        if valid_biases:
            return np.mean(valid_biases)
    return np.nan

notes['bias_score'] = notes['url'].progress_apply(calculate_bias)

notes.head()

100%|██████████| 544995/544995 [00:03<00:00, 156478.54it/s]


Unnamed: 0,noteId,noteAuthorParticipantId,createdAtMillis,tweetId,classification,believable,harmful,validationDifficulty,misleadingOther,misleadingFactualError,...,notMisleadingFactuallyCorrect,notMisleadingOutdatedButNotWhenWritten,notMisleadingClearlySatire,notMisleadingPersonalOpinion,trustworthySources,summary,isMediaNote,url,url_in_sources,bias_score
0,1537142913737428992,5684B38EB58FD8BE75ABA37F0BE040EC70380B002ADF9D...,1655318404027,1377030478167937024,MISINFORMED_OR_POTENTIALLY_MISLEADING,BELIEVABLE_BY_MANY,CONSIDERABLE_HARM,EASY,0,1,...,0,0,0,0,1,Forbes has a good rundown of the investigation...,0,"[forbes.com, washingtonpost.com]",True,0.5
1,1537145358521839617,5684B38EB58FD8BE75ABA37F0BE040EC70380B002ADF9D...,1655318986910,1536848327979016193,NOT_MISLEADING,,,,0,0,...,0,0,1,1,0,They are expressing a personal opinion in a st...,0,[],False,
2,1537147343715282945,5684B38EB58FD8BE75ABA37F0BE040EC70380B002ADF9D...,1655319460217,1537080831751102467,MISINFORMED_OR_POTENTIALLY_MISLEADING,BELIEVABLE_BY_MANY,LITTLE_HARM,EASY,0,0,...,0,0,0,0,1,Teslas purchased after 12/31/19 are not eligib...,0,[cleanvehiclerebate.org],False,
3,1537204430730211328,5684B38EB58FD8BE75ABA37F0BE040EC70380B002ADF9D...,1655333070821,1537196168953974784,MISINFORMED_OR_POTENTIALLY_MISLEADING,BELIEVABLE_BY_MANY,CONSIDERABLE_HARM,EASY,0,1,...,0,0,0,0,1,The Jan 6th riots were encouraged by the sitti...,0,"[nytimes.com, wsj.com, nytimes.com]",True,0.666667
4,1540422295029551104,5684B38EB58FD8BE75ABA37F0BE040EC70380B002ADF9D...,1656100269455,1540087463099736065,MISINFORMED_OR_POTENTIALLY_MISLEADING,BELIEVABLE_BY_MANY,CONSIDERABLE_HARM,EASY,0,1,...,0,0,0,0,1,The Committee has been found by numerous court...,0,[cnn.com],True,1.0


## Calculating factuality scores

In [34]:
from tqdm import tqdm
tqdm.pandas()

factuality_to_score = {
    'Very High Factuality': 5,
    'High Factuality': 4,
    'Mostly Factual': 3,
    'Mixed Factuality': 2,
    'Low Factuality': 1,
    'Very Low Factuality': 0,
    'Satire': 0
}

sources['factuality_score'] = sources['factuality (MBFC)'].map(factuality_to_score)

flat_sources = pd.DataFrame([(index, url, factuality_score) for index, urls, factuality_score in sources[['all_urls', 'factuality_score']].itertuples() for url in urls], columns=['source_index', 'url', 'factuality_score'])
url_to_factuality = flat_sources.groupby('url')['factuality_score'].mean().to_dict()

def calculate_factuality(urls):
    if isinstance(urls, list) and urls:
        factuality_scores = [url_to_factuality.get(url, np.nan) for url in urls]
        valid_scores = [score for score in factuality_scores if not np.isnan(score)]
        if valid_scores:
            return np.mean(valid_scores)
    return np.nan

notes['factuality_score'] = notes['url'].progress_apply(calculate_factuality)

notes.head()

100%|██████████| 544995/544995 [00:03<00:00, 151236.74it/s]


Unnamed: 0,noteId,noteAuthorParticipantId,createdAtMillis,tweetId,classification,believable,harmful,validationDifficulty,misleadingOther,misleadingFactualError,...,notMisleadingOutdatedButNotWhenWritten,notMisleadingClearlySatire,notMisleadingPersonalOpinion,trustworthySources,summary,isMediaNote,url,url_in_sources,bias_score,factuality_score
0,1537142913737428992,5684B38EB58FD8BE75ABA37F0BE040EC70380B002ADF9D...,1655318404027,1377030478167937024,MISINFORMED_OR_POTENTIALLY_MISLEADING,BELIEVABLE_BY_MANY,CONSIDERABLE_HARM,EASY,0,1,...,0,0,0,1,Forbes has a good rundown of the investigation...,0,"[forbes.com, washingtonpost.com]",True,0.5,3.0
1,1537145358521839617,5684B38EB58FD8BE75ABA37F0BE040EC70380B002ADF9D...,1655318986910,1536848327979016193,NOT_MISLEADING,,,,0,0,...,0,1,1,0,They are expressing a personal opinion in a st...,0,[],False,,
2,1537147343715282945,5684B38EB58FD8BE75ABA37F0BE040EC70380B002ADF9D...,1655319460217,1537080831751102467,MISINFORMED_OR_POTENTIALLY_MISLEADING,BELIEVABLE_BY_MANY,LITTLE_HARM,EASY,0,0,...,0,0,0,1,Teslas purchased after 12/31/19 are not eligib...,0,[cleanvehiclerebate.org],False,,
3,1537204430730211328,5684B38EB58FD8BE75ABA37F0BE040EC70380B002ADF9D...,1655333070821,1537196168953974784,MISINFORMED_OR_POTENTIALLY_MISLEADING,BELIEVABLE_BY_MANY,CONSIDERABLE_HARM,EASY,0,1,...,0,0,0,1,The Jan 6th riots were encouraged by the sitti...,0,"[nytimes.com, wsj.com, nytimes.com]",True,0.666667,3.666667
4,1540422295029551104,5684B38EB58FD8BE75ABA37F0BE040EC70380B002ADF9D...,1656100269455,1540087463099736065,MISINFORMED_OR_POTENTIALLY_MISLEADING,BELIEVABLE_BY_MANY,CONSIDERABLE_HARM,EASY,0,1,...,0,0,0,1,The Committee has been found by numerous court...,0,[cnn.com],True,1.0,3.0


## Getting overview of score distribution for both bias and factuality

In [35]:
notes_nonan = notes.dropna(subset=['bias_score'])
notes_nonan = notes_nonan.dropna(subset=['factuality_score'])
len(notes_nonan)

155016

In [36]:
notes_nonan['bias_score'] = notes_nonan['bias_score'].round(2)
notes_nonan.bias_score.value_counts()

 0.00    70173
 1.00    57213
 0.50     6780
-1.00     6607
 0.67     4262
         ...  
-1.60        1
-1.80        1
-1.75        1
-0.14        1
 0.58        1
Name: bias_score, Length: 66, dtype: int64

In [37]:
notes_nonan['factuality_score'] = notes_nonan['factuality_score'].round(2)
notes_nonan.factuality_score.value_counts()

4.00    55259
2.00    39870
3.00    29977
5.00    12373
3.50     4284
        ...  
2.22        1
2.90        1
4.12        1
2.89        1
2.57        1
Name: factuality_score, Length: 94, dtype: int64

## Adding note status labels into the data

In [None]:
note_status = pd.read_csv('noteStatusHistory-00000.tsv', sep="\t")

In [39]:
note_status.head()

Unnamed: 0,noteId,noteAuthorParticipantId,createdAtMillis,timestampMillisOfFirstNonNMRStatus,firstNonNMRStatus,timestampMillisOfCurrentStatus,currentStatus,timestampMillisOfLatestNonNMRStatus,mostRecentNonNMRStatus,timestampMillisOfStatusLock,lockedStatus,timestampMillisOfRetroLock,currentCoreStatus,currentExpansionStatus,currentGroupStatus,currentDecidedBy,currentModelingGroup
0,1529283490486812673,81ED8E6CEC0FBCF3AB2C16F57A25B0C5C6BE3F96631F48...,1653444571503,,,1710729983975,NEEDS_MORE_RATINGS,,,1674003000000.0,NEEDS_MORE_RATINGS,,NEEDS_MORE_RATINGS,NEEDS_MORE_RATINGS,,CoreModel (v1.1),
1,1529288840854347782,AD7D7259DB3EA3A39D9605EB30D806CBEABDE945E2A613...,1653445847133,,,1710729983975,NEEDS_MORE_RATINGS,,,1674003000000.0,NEEDS_MORE_RATINGS,,NEEDS_MORE_RATINGS,NEEDS_MORE_RATINGS,NEEDS_MORE_RATINGS,CoreModel (v1.1),13.0
2,1529284698874081280,1C711540D4B87D068865F1645FAA915359E8DA023A8347...,1653444859604,,,1710729983975,NEEDS_MORE_RATINGS,,,1674003000000.0,NEEDS_MORE_RATINGS,,NEEDS_MORE_RATINGS,NEEDS_MORE_RATINGS,,CoreModel (v1.1),
3,1529288639947034624,1C711540D4B87D068865F1645FAA915359E8DA023A8347...,1653445799230,,,1710729983975,NEEDS_MORE_RATINGS,,,1674003000000.0,NEEDS_MORE_RATINGS,,NEEDS_MORE_RATINGS,NEEDS_MORE_RATINGS,,CoreModel (v1.1),
4,1529292633243275264,B8BE7E1FC4596B0FD69D252BB8E8F7AEBD628780872DEC...,1653446751305,,,1710729983975,NEEDS_MORE_RATINGS,,,1674003000000.0,NEEDS_MORE_RATINGS,,NEEDS_MORE_RATINGS,NEEDS_MORE_RATINGS,NEEDS_MORE_RATINGS,CoreModel (v1.1),13.0


In [40]:
notes_nonan = pd.merge(notes_nonan, note_status[['noteId', 'currentStatus']], on='noteId', how='left')

In [41]:
notes_nonan.head()

Unnamed: 0,noteId,noteAuthorParticipantId,createdAtMillis,tweetId,classification,believable,harmful,validationDifficulty,misleadingOther,misleadingFactualError,...,notMisleadingClearlySatire,notMisleadingPersonalOpinion,trustworthySources,summary,isMediaNote,url,url_in_sources,bias_score,factuality_score,currentStatus
0,1537142913737428992,5684B38EB58FD8BE75ABA37F0BE040EC70380B002ADF9D...,1655318404027,1377030478167937024,MISINFORMED_OR_POTENTIALLY_MISLEADING,BELIEVABLE_BY_MANY,CONSIDERABLE_HARM,EASY,0,1,...,0,0,1,Forbes has a good rundown of the investigation...,0,"[forbes.com, washingtonpost.com]",True,0.5,3.0,NEEDS_MORE_RATINGS
1,1537204430730211328,5684B38EB58FD8BE75ABA37F0BE040EC70380B002ADF9D...,1655333070821,1537196168953974784,MISINFORMED_OR_POTENTIALLY_MISLEADING,BELIEVABLE_BY_MANY,CONSIDERABLE_HARM,EASY,0,1,...,0,0,1,The Jan 6th riots were encouraged by the sitti...,0,"[nytimes.com, wsj.com, nytimes.com]",True,0.67,3.67,NEEDS_MORE_RATINGS
2,1540422295029551104,5684B38EB58FD8BE75ABA37F0BE040EC70380B002ADF9D...,1656100269455,1540087463099736065,MISINFORMED_OR_POTENTIALLY_MISLEADING,BELIEVABLE_BY_MANY,CONSIDERABLE_HARM,EASY,0,1,...,0,0,1,The Committee has been found by numerous court...,0,[cnn.com],True,1.0,3.0,NEEDS_MORE_RATINGS
3,1586769867381669889,5684B38EB58FD8BE75ABA37F0BE040EC70380B002ADF9D...,1667150391800,1586411168880807936,NOT_MISLEADING,,,,0,0,...,0,0,1,"Clinton is not alone in this claim, the F.B.I....",0,"[nytimes.com, washingtonpost.com]",True,1.0,3.5,NEEDS_MORE_RATINGS
4,1640795953472114688,5684B38EB58FD8BE75ABA37F0BE040EC70380B002ADF9D...,1680031214479,1640773789679230977,NOT_MISLEADING,,,,0,0,...,0,0,1,"Between 1982 and March 2023, 73 out of the 141...",0,"[npr.org, statista.com]",True,0.5,4.0,NEEDS_MORE_RATINGS


## Adding user enrollment labels into the data

In [42]:
noteauthors = pd.read_csv('userEnrollment-00000.tsv', sep="\t")

In [43]:
noteauthors.head()

Unnamed: 0,participantId,enrollmentState,successfulRatingNeededToEarnIn,timestampOfLastStateChange,timestampOfLastEarnOut,modelingPopulation,modelingGroup
0,B2D8708DD64F4A263D237C309DDFC40F96962E5B6CF8E2...,newUser,5,1709578909625,1,CORE,13.0
1,5684B38EB58FD8BE75ABA37F0BE040EC70380B002ADF9D...,earnedIn,5,1709573510566,1,CORE,13.0
2,1C0972F584F1BD912FB957D99A854609C93213D0A9C361...,newUser,5,1709578909625,1,CORE,13.0
3,78B383AA66981F722CB82AACB24CF853937CAC0F680DDB...,earnedIn,5,1709578909625,1,CORE,13.0
4,CAA3DEA7CFC07BD080F8BEFA3E6BC1575E6DE1BE2D3BBD...,earnedIn,5,1709578909625,1,CORE,13.0


In [44]:
notes_nonan = pd.merge(notes_nonan, noteauthors[['participantId', 'enrollmentState']], left_on='noteAuthorParticipantId', right_on='participantId', how='left')

In [45]:
notes_nonan.head()

Unnamed: 0,noteId,noteAuthorParticipantId,createdAtMillis,tweetId,classification,believable,harmful,validationDifficulty,misleadingOther,misleadingFactualError,...,trustworthySources,summary,isMediaNote,url,url_in_sources,bias_score,factuality_score,currentStatus,participantId,enrollmentState
0,1537142913737428992,5684B38EB58FD8BE75ABA37F0BE040EC70380B002ADF9D...,1655318404027,1377030478167937024,MISINFORMED_OR_POTENTIALLY_MISLEADING,BELIEVABLE_BY_MANY,CONSIDERABLE_HARM,EASY,0,1,...,1,Forbes has a good rundown of the investigation...,0,"[forbes.com, washingtonpost.com]",True,0.5,3.0,NEEDS_MORE_RATINGS,5684B38EB58FD8BE75ABA37F0BE040EC70380B002ADF9D...,earnedIn
1,1537204430730211328,5684B38EB58FD8BE75ABA37F0BE040EC70380B002ADF9D...,1655333070821,1537196168953974784,MISINFORMED_OR_POTENTIALLY_MISLEADING,BELIEVABLE_BY_MANY,CONSIDERABLE_HARM,EASY,0,1,...,1,The Jan 6th riots were encouraged by the sitti...,0,"[nytimes.com, wsj.com, nytimes.com]",True,0.67,3.67,NEEDS_MORE_RATINGS,5684B38EB58FD8BE75ABA37F0BE040EC70380B002ADF9D...,earnedIn
2,1540422295029551104,5684B38EB58FD8BE75ABA37F0BE040EC70380B002ADF9D...,1656100269455,1540087463099736065,MISINFORMED_OR_POTENTIALLY_MISLEADING,BELIEVABLE_BY_MANY,CONSIDERABLE_HARM,EASY,0,1,...,1,The Committee has been found by numerous court...,0,[cnn.com],True,1.0,3.0,NEEDS_MORE_RATINGS,5684B38EB58FD8BE75ABA37F0BE040EC70380B002ADF9D...,earnedIn
3,1586769867381669889,5684B38EB58FD8BE75ABA37F0BE040EC70380B002ADF9D...,1667150391800,1586411168880807936,NOT_MISLEADING,,,,0,0,...,1,"Clinton is not alone in this claim, the F.B.I....",0,"[nytimes.com, washingtonpost.com]",True,1.0,3.5,NEEDS_MORE_RATINGS,5684B38EB58FD8BE75ABA37F0BE040EC70380B002ADF9D...,earnedIn
4,1640795953472114688,5684B38EB58FD8BE75ABA37F0BE040EC70380B002ADF9D...,1680031214479,1640773789679230977,NOT_MISLEADING,,,,0,0,...,1,"Between 1982 and March 2023, 73 out of the 141...",0,"[npr.org, statista.com]",True,0.5,4.0,NEEDS_MORE_RATINGS,5684B38EB58FD8BE75ABA37F0BE040EC70380B002ADF9D...,earnedIn


In [46]:
file_path = 'notes_with_bias.csv'
notes_nonan.to_csv(file_path, index=False)