In [15]:
import pandas as pd
import plotly.express as px
import plotly.colors as pc

In [2]:
# Load the news dataset CSV; the path is relative to the notebook's working directory
DATASET_PATH = "data/news_dataset_all_refactored.csv"
df = pd.read_csv(DATASET_PATH)

In [3]:
# Preview the first five rows to sanity-check the columns that were read
df.head(n=5)

Unnamed: 0,content,domain,authors,type
0,"Life is an illusion, at least on a quantum lev...",express.co.uk,Sean Martin,rumor
1,"For as long as he can remember, Malcolm Turnbu...",theshovel.com.au,The Shovel,other
2,Senators from the Australian Greens will be st...,theshovel.com.au,The Shovel,other
3,Headline: Bitcoin & Blockchain Searches Exceed...,beforeitsnews.com,The Pirate'S Cove,fake
4,Red Alert: Bond Yields Are SCREAMING “Inflatio...,beforeitsnews.com,Phoenix Capital Research,fake


In [4]:
# Summarize column names, dtypes and memory footprint of the loaded frame.
# NOTE(review): pandas leaves out per-column non-null counts by default on frames
# larger than `display.max_info_rows`; pass show_counts=True if they are needed.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2468197 entries, 0 to 2468196
Data columns (total 4 columns):
 #   Column   Dtype 
---  ------   ----- 
 0   content  object
 1   domain   object
 2   authors  object
 3   type     object
dtypes: object(4)
memory usage: 75.3+ MB


In [12]:
# Relative frequency of each domain across the whole dataset
domain_counts = df["domain"].value_counts(normalize=True)

# Keep only domains that account for strictly more than 1% of the rows
MIN_DOMAIN_SHARE = 0.01
valid_domains = domain_counts.loc[domain_counts.gt(MIN_DOMAIN_SHARE)].index

# Restrict the dataframe to rows published by one of those domains
df_filtered = df.loc[df["domain"].isin(valid_domains)]

In [13]:
# Number of rows for every (type, domain) pair present in the filtered data
prop_df = df_filtered.groupby(["type", "domain"]).size().reset_index(name="count")

# Turn counts into within-type shares: each row's count divided by the total
# count of its type, so the proportions sum to 1 inside every type. The built-in
# "sum" transform is vectorized, unlike a Python lambda evaluated per group.
type_totals = prop_df.groupby("type")["count"].transform("sum")
prop_df["proportion"] = prop_df["count"] / type_totals

In [19]:
# Light-to-dark red palette, one shade per domain. Plotly cycles through the
# sequence, so shades repeat if more than eight domains are plotted.
color_discrete_sequence = [
    "#fdd1cd",
    "#fba49c",
    "#f97b70",
    "#f75e53",
    "#f25c4e",
    "#d74f43",
    "#b54137",
    "#912e2b",
]

# Grouped bars: one cluster per type, one bar per domain; bar height is the
# "proportion" column of prop_df.
fig = px.bar(
    prop_df,
    x="type",
    y="proportion",
    color="domain",
    barmode="group",
    color_discrete_sequence=color_discrete_sequence,
)

# White text on a transparent background (presumably for embedding on a dark
# page; on a light theme the labels will be hard to see).
fig.update_layout(
    plot_bgcolor="rgba(0,0,0,0)",
    paper_bgcolor="rgba(0,0,0,0)",
    font=dict(color="white"),
    # The title was centred but never given any text, leaving the figure
    # untitled; set the text together with the centring.
    title=dict(text="Share of each domain within each type", x=0.5),
    legend_title_text="Domain",
    legend=dict(x=1, y=1),
    xaxis=dict(title="Type"),
    yaxis=dict(title="Proportion"),
)

fig.show()

In [22]:
# Relative frequency of every distinct value in the authors column, most common first
author_shares = df["authors"].value_counts(normalize=True)
author_shares

authors
unknown                                                                                                        5.124222e-01
The Associated Press                                                                                           2.326273e-02
Truth Broadcast Network                                                                                        9.287751e-03
The Phaser                                                                                                     6.842647e-03
Posted On                                                                                                      4.647117e-03
                                                                                                                   ...     
Geoff Ho, Richard Branson                                                                                      4.051540e-07
Budhaditya Bhattacharjee, Shaurya Arya, Debleena Sarkar, Nico Parungo, Necta Casiple, Jennifer Ong             4.051540e-07
