In [None]:
# Model setup: build the sentiment classifier used by the scoring cell and
# load a DistilBERT tokenizer.
from transformers import AutoTokenizer, pipeline

# No model/checkpoint argument is passed; only the task name is given.
sentiment_pipeline = pipeline(task="sentiment-analysis")

# NOTE(review): `tokenizer` is not referenced by the cells visible here --
# confirm whether anything else depends on it before removing.
tokenizer = AutoTokenizer.from_pretrained(
    "distilbert-base-uncased"
)


In [31]:
import time
import csv
import os
import json
import math

In [41]:
# Score every article of one topic with the sentiment pipeline and store the
# annotated JSON under a directory named after the topic.
directory = "Article-Bias-Prediction/data/jsons"
topic = "democratic_party"

all_topics = set()
all_topics.add(topic)

# Number of characters per chunk handed to the classifier.
CHUNK_SIZE = 512

# Annotated copies are written to "<topic>/<filename>"; create the target
# directory up front so the first write cannot fail with FileNotFoundError.
os.makedirs(topic, exist_ok=True)

for filename in os.listdir(directory):
    in_file = f"{directory}/{filename}"

    # Explicit UTF-8 keeps parsing independent of the platform's default
    # encoding. The file is closed again before the (slow) model call.
    with open(in_file, "r", encoding="utf-8") as read_file:
        file = json.load(read_file)

    # Only articles of the selected topic are scored and written out.
    if file["topic"] != topic:
        continue

    content = file["content"]
    # Slicing clamps at the end of the string, so the last chunk may be shorter.
    chunks = [content[i:i + CHUNK_SIZE] for i in range(0, len(content), CHUNK_SIZE)]
    if not chunks:
        # An empty article yields no chunks and therefore no sentiment; the
        # averaging step downstream divides by the number of chunks, so such
        # articles are skipped instead of being written with empty lists.
        continue

    # Chunks are cut by characters, not tokens; truncation guards against a
    # chunk that still tokenizes to more tokens than the model accepts.
    sentiment = sentiment_pipeline(chunks, truncation=True)
    file["sentiment"] = [s["label"] for s in sentiment]
    file["sentiment_score"] = [s["score"] for s in sentiment]

    with open(f"{topic}/{filename}", "w", encoding="utf-8") as write_file:
        json.dump(file, write_file)


In [43]:
# Collapse the per-chunk sentiment of every annotated article in `directory`
# into one signed score per article and export the result as a CSV summary.
directory = "democratic_party"
summary_name = "summary.csv"
out_file = f"{directory}/{summary_name}"

data = []

for filename in os.listdir(directory):
    # The summary is written into this same directory, so on a re-run it shows
    # up in the listing; it is not an article and must not be parsed as JSON.
    if filename == summary_name:
        continue

    in_file = f"{directory}/{filename}"

    with open(in_file, "r", encoding='utf-8', errors='ignore') as read_file:
        file = json.loads(read_file.read(), strict=False)

    sentiments = file["sentiment"]
    scores = file["sentiment_score"]
    if not sentiments:
        # No chunks were scored for this article; a mean over zero chunks is
        # undefined (and would raise ZeroDivisionError), so leave it out.
        continue

    # Signed mean: POSITIVE chunks add their score, every other label subtracts it.
    current_score = 0
    for label, score in zip(sentiments, scores):
        current_score += score if label == "POSITIVE" else -score
    current_score /= len(sentiments)
    current_sentiment = "POSITIVE" if current_score > 0 else "NEGATIVE"

    # Drop the article text so it does not end up in the CSV; tolerate records
    # where one of the keys is absent.
    file.pop("content", None)
    file.pop("content_original", None)
    file["total_sentiment"] = current_sentiment
    file["total_sentiment_score"] = current_score

    data.append(file)

# newline="" is what the csv module expects for files it writes to; the
# context manager closes the file even if writing a row fails.
with open(out_file, "w", newline="", encoding="utf-8") as write_file:
    writer = csv.writer(write_file)
    for i, dp in enumerate(data):
        if i == 0:
            # Header comes from the first record; later rows are written
            # positionally, so all records are assumed to share its key order.
            writer.writerow(dp.keys())

        writer.writerow(dp.values())
        

In [48]:
import polars as pl

# Load the summary CSV of a single topic directory.
directory = "politics"
df = pl.read_csv(f"{directory}/summary.csv")

# Per bias label: number of articles and mean article-level sentiment score,
# ordered by the bias label. The last expression is the cell's displayed output.
df.group_by("bias_text").agg(
    pl.len(),
    pl.col("total_sentiment_score").mean(),
).sort("bias_text")

bias_text,len,total_sentiment_score
str,u32,f64
"""center""",772,-0.427389
"""left""",950,-0.403039
"""right""",1071,-0.443822


In [51]:
# Topic directories whose summary.csv files are combined into one frame.
topics = [
    "democratic_party",
    "food",
    "immigration",
    "politics",
    "republican_party",
]
filepaths = [f"{name}/summary.csv" for name in topics]

# Scan all summaries lazily (with date parsing enabled) and materialize them.
data = (
    pl.scan_csv(filepaths, try_parse_dates=True)
    .collect()
)
# NOTE(review): `data` is passed through the DataFrame constructor again to
# produce `df`; presumably redundant after collect() -- confirm.
df = pl.DataFrame(data)

In [65]:
# Names of the sources that occur in more than 100 rows of `df`.
sources = (
    df.get_column("source")
    .value_counts(sort=True)
    .filter(pl.col("count") > 100)
    .get_column("source")
    .to_list()
)

In [103]:
# Restrict the combined frame to rows whose source is in `sources`.
filtered = df.filter(pl.col("source").is_in(sources))

# Global polars display setting: row limit for printed tables set to -1.
pl.Config.set_tbl_rows(-1)

# Article count and mean sentiment score per (topic, bias) pair, limited to the
# two party topics. Rows are sorted by topic only, so the order of the bias
# rows inside a topic is whatever the grouping produced.
res = (
    filtered
    .group_by(["topic", "bias_text"])
    .agg(
        pl.len(),
        pl.col("total_sentiment_score").mean(),
    )
    .sort("topic")
    .filter(pl.col("topic").is_in(["republican_party", "democratic_party"]))
)
res

topic,bias_text,len,total_sentiment_score
str,str,u32,f64
"""democratic_party""","""left""",70,-0.159792
"""democratic_party""","""right""",54,-0.457754
"""democratic_party""","""center""",20,-0.258535
"""republican_party""","""center""",69,-0.326017
"""republican_party""","""right""",110,-0.375123
"""republican_party""","""left""",218,-0.328608


In [104]:
import pandas as pd

# Render `res` without its "len" column as a LaTeX tabular (no index column)
# and print the markup.
latex_table = res.drop("len").to_pandas().to_latex(index=False)
print(latex_table)

\begin{tabular}{llr}
\toprule
topic & bias_text & total_sentiment_score \\
\midrule
democratic_party & left & -0.159792 \\
democratic_party & right & -0.457754 \\
democratic_party & center & -0.258535 \\
republican_party & center & -0.326017 \\
republican_party & right & -0.375123 \\
republican_party & left & -0.328608 \\
\bottomrule
\end{tabular}

