In [32]:
import pandas as pd
from transformers import BertTokenizer
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px

In [None]:
# Read the two CSV exports of the news dataset into separate frames.
df1 = pd.read_csv("data/news_dataset_all_2.csv")
df2 = pd.read_csv("data/news_dataset_all.csv")

In [None]:
# Preview the first rows of df1.
df1.head()

In [None]:
# (rows, columns) of df1.
df1.shape

In [None]:
# Preview the first rows of df2.
df2.head()

In [None]:
# (rows, columns) of df2.
df2.shape

In [None]:
# Rows of df2 whose (type, content) pair occurs more than once. keep=False
# flags every member of a duplicate group, not only the repeats.
duplicates = df2[df2.duplicated(subset=["type", "content"], keep=False)]

In [None]:
# How many rows are involved in (type, content) duplication.
duplicates.shape

In [None]:
# Same check on content alone; this rebinds `duplicates`.
duplicates = df2[df2.duplicated(subset=["content"], keep=False)]

In [None]:
# How many rows share their content with at least one other row.
duplicates.shape

In [None]:
# Drop rows with repeated content, keeping the last occurrence of each.
df2 = df2.drop_duplicates(subset=["content"], keep="last")

In [None]:
# Shape of df2 after de-duplication.
df2.shape

In [None]:
# Distinct values of the `type` column in each frame.
print(f"df1 types: {df1['type'].unique()}")
print(f"df2 types: {df2['type'].unique()}")

In [None]:
# Missing values per column in df1.
df1.isna().sum()

In [None]:
# Missing values per column in df2.
df2.isna().sum()

In [None]:
# Left join: keep every df1 row and attach df2's columns wherever the
# (type, content) pair matches; df1 rows without a match get NaN in them.
# validate="many_to_one" makes pandas raise MergeError if df2 carries repeated
# keys, so the join can never silently multiply df1 rows.
df = pd.merge(
    left=df1,
    right=df2,
    how="left",
    on=["type", "content"],
    validate="many_to_one",
)

In [None]:
# (rows, columns) of the merged frame.
df.shape

In [None]:
# Preview the merged frame.
df.head()

In [None]:
# Missing values per column after the merge.
df.isna().sum()

In [None]:
# Rows where both token_count and language are missing. The .copy() makes
# null_rows an independent frame, so adding or overwriting columns on it later
# does not write into a slice of df (no SettingWithCopyWarning).
null_rows = df[df[['token_count', 'language']].isnull().all(axis=1)].copy()

In [None]:
# How many rows lack both token_count and language.
null_rows.shape

In [None]:
# Preview those rows.
null_rows.head()

In [None]:
# Load the pretrained "bert-base-uncased" tokenizer.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

In [None]:
# Module-level call counter; tokenize_text increments it through `global`.
tokenized_text_count = 0

In [None]:
def tokenize_text(text, progress_every=1000):
    """Return the number of token ids ``tokenizer.encode`` produces for ``text``.

    Relies on two module-level names: ``tokenizer`` (the encoder) and
    ``tokenized_text_count`` (a running call counter, incremented on each call).

    Args:
        text: The text to tokenize. ``None`` and float NaN (how pandas
            represents missing values) are treated as an empty string instead
            of being passed to the tokenizer.
        progress_every: Print the running call count once every this many
            calls. Pass 0 or ``None`` to print nothing.

    Returns:
        int: Length of the encoded sequence, with no truncation and no padding.
    """
    global tokenized_text_count
    tokenized_text_count += 1
    # One printed line per row floods the notebook output on large frames;
    # report progress only periodically.
    if progress_every and tokenized_text_count % progress_every == 0:
        print(tokenized_text_count)
    # `text != text` is true only for NaN.
    if text is None or (isinstance(text, float) and text != text):
        text = ""
    return len(tokenizer.encode(text, truncation=False, padding=False))

In [None]:
# Count tokens for the rows that are missing token_count/language. `assign`
# returns a new frame, so nothing is written into a slice of df, and the
# function is passed directly instead of through a lambda wrapper.
null_rows = null_rows.assign(token_count=null_rows["content"].apply(tokenize_text))

In [None]:
# Copy the computed counts into df; the assignment aligns on the row labels
# that null_rows shares with df.
df.loc[null_rows.index, "token_count"] = null_rows["token_count"]
# Mark the same rows as English.
# NOTE(review): the value is hard-coded, not detected — confirm these rows really are English.
df.loc[null_rows.index, "language"] = "en"

In [None]:
# Re-check missing values after the back-fill.
df.isna().sum()

In [None]:
# Row count per language value.
df["language"].value_counts()

In [None]:
# Replace the index with a fresh 0..n-1 range, in place.
df.reset_index(drop=True, inplace=True)

In [None]:
# Column dtypes and non-null counts.
df.info()

In [None]:
df.head()

In [None]:
# Persist the merged frame, without the index column, as a checkpoint.
df.to_csv("data/news_dataset_all_3.csv", index=False)

In [None]:
# Reload the checkpoint; every later cell works on this CSV round trip of df.
df = pd.read_csv("data/news_dataset_all_3.csv")

In [None]:
# Box plot of token_count per article type, drawn through the explicit
# Figure/Axes interface.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=df, x="type", y="token_count", ax=ax)
ax.set_title("Token Count Distribution by Type")
ax.set_xlabel("Type")
ax.set_ylabel("Token Count")
# Tilt the category labels so they do not overlap.
plt.setp(ax.get_xticklabels(), rotation=45)
fig.tight_layout()
plt.show()

In [None]:
# Independent copy of the rows with at most 512 tokens (the bound is
# inclusive, despite the variable name).
less_than_512 = df.loc[df["token_count"].le(512)].copy()

In [None]:
# Same box plot, restricted to the rows with at most 512 tokens. The title
# states the filter so this figure cannot be mistaken for the unfiltered one.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=less_than_512, x="type", y="token_count", ax=ax)
ax.set_title("Token Count Distribution by Type (<= 512 tokens)")
ax.set_xlabel("Type")
ax.set_ylabel("Token Count")
# Tilt the category labels so they do not overlap.
plt.setp(ax.get_xticklabels(), rotation=45)
fig.tight_layout()
plt.show()

In [None]:
# From here on df holds only the rows with at most 512 tokens.
df = less_than_512.copy()

In [None]:
# Fraction of rows per domain.
df["domain"].value_counts(normalize=True)

In [None]:
# Share of rows contributed by each domain (normalized counts).
domain_counts = df['domain'].value_counts(normalize=True)

# Minimum share a domain needs to be kept: 0.007, i.e. 0.7% of the rows.
MIN_DOMAIN_SHARE = 0.007

# Keep only the domains whose share exceeds the threshold.
valid_domains = domain_counts[domain_counts > MIN_DOMAIN_SHARE].index

# Rows belonging to those domains; df itself is left untouched.
df_filtered = df[df['domain'].isin(valid_domains)]

In [None]:
# Number of rows for every (type, domain) pair, as a flat frame.
pair_sizes = df_filtered.groupby(['type', 'domain']).size()
prop_df = pair_sizes.reset_index(name='count')

# Divide each count by the total of its type, giving the domain's share
# within that type.
type_totals = prop_df.groupby('type')['count'].transform('sum')
prop_df['proportion'] = prop_df['count'] / type_totals

In [None]:
# Grouped bars: one cluster per type, one bar per domain, height = share of
# that domain within the type.
fig, ax = plt.subplots(figsize=(8, 6))
sns.barplot(x='type', y='proportion', hue='domain', data=prop_df, ax=ax)

# Titles, axis labels and legend placement.
ax.set_title('Proportions of Each Domain Within Each Type')
ax.set_xlabel('Type')
ax.set_ylabel('Proportion')
ax.legend(title='Domain', loc='upper right')
fig.tight_layout()
plt.show()

In [None]:
# Fraction of rows per authors value.
df["authors"].value_counts(normalize=True)

In [None]:
# Replace missing authors with the placeholder "unknown". The result is
# assigned back to the column: calling fillna(inplace=True) on a column
# selection is chained assignment, which newer pandas versions warn about and
# which does not update df when Copy-on-Write is enabled.
df["authors"] = df["authors"].fillna("unknown")

In [None]:
df.head()

In [None]:
# Remaining missing values per column.
df.isna().sum()

In [None]:
# Keep only the rows whose language value is "en", as an independent copy.
df = df[df["language"] == "en"].copy()

In [None]:
# Rows per label as a two-column frame: Type, Count.
category_counts = (
    df["type"].value_counts().rename_axis("Type").reset_index(name="Count")
)

# Shared style fragments for the dark theme.
white_text = dict(color="white")
bare_axis = dict(showgrid=False, tickfont=white_text)

fig = px.bar(
    category_counts,
    x="Type",
    y="Count",
    text="Count",
    title="Label Distribution",
    labels={"Type": "Label", "Count": "Frequency"},
)

# Bar colour and the colour of the count printed on each bar.
fig.update_traces(marker_color="#f45c4e", textfont_color="white")

# Transparent plot and paper backgrounds, white text, no grid lines.
fig.update_layout(
    plot_bgcolor="rgba(0,0,0,0)",
    paper_bgcolor="rgba(0,0,0,0)",
    font=white_text,
    title_font=white_text,
    xaxis=bare_axis,
    yaxis=bare_axis,
)

fig.show()

In [None]:
def make_as_others(label):
    """Collapse the labels "satire", "junksci" and "unknown" into "other".

    Any other label is returned unchanged.
    """
    # Tuple membership compares with ==, like the original chained comparison.
    merged_labels = ("satire", "junksci", "unknown")
    return "other" if label in merged_labels else label

In [None]:
# Relabel every row: satire / junksci / unknown become "other".
df["type"] = df["type"].map(make_as_others)

In [None]:
# Discard the rows labelled "political"; keep the rest as an independent copy.
df = df.loc[df["type"].ne("political")].copy()

In [None]:
# Renumber the rows 0..n-1 after the filtering above.
df = df.reset_index(drop=True)

In [None]:
# Rows per label as a two-column frame: Type, Count.
category_counts = (
    df["type"].value_counts().rename_axis("Type").reset_index(name="Count")
)

# Shared style fragments for the dark theme.
white_text = dict(color="white")
bare_axis = dict(showgrid=False, tickfont=white_text)

fig = px.bar(
    category_counts,
    x="Type",
    y="Count",
    text="Count",
    title="Label Distribution",
    labels={"Type": "Label", "Count": "Frequency"},
)

# Bar colour and the colour of the count printed on each bar.
fig.update_traces(marker_color="#f45c4e", textfont_color="white")

# Transparent plot and paper backgrounds, white text, no grid lines.
fig.update_layout(
    plot_bgcolor="rgba(0,0,0,0)",
    paper_bgcolor="rgba(0,0,0,0)",
    font=white_text,
    title_font=white_text,
    xaxis=bare_axis,
    yaxis=bare_axis,
)

fig.show()

In [None]:
df.head()

In [None]:
# Export only the content, domain, authors and type columns, without the index.
df[["content", "domain", "authors", "type"]].to_csv("data/news_dataset_all_refactored.csv", index=False)