---
---

# **Data Cleaning and Analysis**

---

---


Import Libraries


In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from transformers import BertTokenizer

# import fasttext

---
---

## **All Data**

---

---


### **Cleaning**


In [None]:
df = pd.read_csv("data/news_cleaned_2018_02_13.csv")

In [None]:
df.shape

In [None]:
df.head()

In [None]:
df.info()

In [None]:
df.isna().sum()

In [None]:
df.dropna(subset=["content", "type"], inplace=True)

In [None]:
df.isna().sum()

In [None]:
df.shape

In [None]:
df = df[["id", "content", "domain", "title", "authors", "meta_keywords", "type"]]

In [None]:
duplicates = df[df.duplicated(subset=["content", "type"], keep=False)]

In [None]:
duplicates.shape

In [None]:
duplicates.head()

In [None]:
df = df.drop_duplicates(subset=["content", "type"], keep="last")

In [None]:
df.shape

In [None]:
duplicates = df[df.duplicated(subset=["content"], keep=False)]

In [None]:
duplicates.shape

In [None]:
duplicates.head(10)

In [None]:
result = duplicates.groupby("content").filter(lambda x: x["type"].nunique() > 1)

In [None]:
result.shape

In [None]:
result.head(5)

In [None]:
df = df.drop_duplicates(subset=["content"], keep="last")

In [None]:
df.reset_index(drop=True, inplace=True)

In [None]:
df.info()

In [None]:
df.to_csv("data/news_dataset_all_2.csv", index=False)

In [None]:
# NOTE(review): the result of this dropna() is discarded by the next cell,
# which reassigns `df` from data/news_dataset_all_2.csv. That file was written
# before this call, so any rows with missing values come back on reload.
df = df.dropna()

In [None]:
df = pd.read_csv("data/news_dataset_all_2.csv")

In [None]:
df["type"].unique()

In [None]:
df = df[
    df["type"].isin(
        [
            "reliable",
            "political",
            "bias",
            "conspiracy",
            "fake",
            "junksci",
            "rumor",
            "satire",
            "unknown",
            "unreliable",
        ]
    )
]

In [None]:
df.shape

In [None]:
# Number of rows per label, as a two-column frame ("Type", "Count").
category_counts = (
    df["type"].value_counts().rename_axis("Type").reset_index(name="Count")
)

fig = (
    px.bar(
        category_counts,
        x="Type",
        y="Count",
        text="Count",
        title="Label Distribution",
        labels={"Type": "Label", "Count": "Frequency"},
    )
    # Bar colour and white value labels.
    .update_traces(marker_color="#f45c4e", textfont_color="white")
    # Dark-theme layout: fully transparent backgrounds, white text, no grid.
    .update_layout(
        plot_bgcolor="rgba(0,0,0,0)",
        paper_bgcolor="rgba(0,0,0,0)",
        font={"color": "white"},
        title_font={"color": "white"},
        xaxis={"showgrid": False, "tickfont": {"color": "white"}},
        yaxis={"showgrid": False, "tickfont": {"color": "white"}},
    )
)

fig.show()

In [None]:
df.isna().sum()

In [None]:
df.head()

In [None]:
df.info()

In [None]:
df.reset_index(drop=True, inplace=True)

In [None]:
df.to_csv("data/news_dataset_all_2.csv", index=False)

### **Language Detection**


In [None]:
# NOTE(review): `fasttext` is not in scope at this point. The `import fasttext`
# line in the imports cell is commented out, so this call raises NameError
# unless that import is restored first. The model path is also absolute.
model = fasttext.load_model("/home/g03-s2025/lid.176.bin")

In [None]:
processed_text_count = 0

In [None]:
def detect_language(text):
    """Return the top predicted language label for ``text``, prefix removed.

    Newlines are replaced with spaces before the text is handed to
    ``model.predict``. After a successful prediction the module-level
    ``processed_text_count`` is incremented and printed as a progress
    indicator.
    """
    global processed_text_count

    single_line = text.replace("\n", " ")
    labels = model.predict(single_line)[0]
    language = labels[0].replace("__label__", "")

    processed_text_count += 1
    print(processed_text_count)
    return language

In [None]:
df["language"] = df["content"].apply(
    lambda x: detect_language(str(x)) if pd.notnull(x) else None
)

In [None]:
# Number of rows per detected language, as a ("Language", "Count") frame.
category_counts = (
    df["language"]
    .value_counts()
    .rename_axis("Language")
    .reset_index(name="Count")
)

fig = (
    px.bar(
        category_counts,
        x="Language",
        y="Count",
        text="Count",
        title="Language Distribution",
        labels={"Language": "Language", "Count": "Frequency"},
    )
    # Bar colour and white value labels.
    .update_traces(marker_color="#f45c4e", textfont_color="white")
    # Dark-theme layout: fully transparent backgrounds, white text, no grid.
    .update_layout(
        plot_bgcolor="rgba(0,0,0,0)",
        paper_bgcolor="rgba(0,0,0,0)",
        font={"color": "white"},
        title_font={"color": "white"},
        xaxis={"showgrid": False, "tickfont": {"color": "white"}},
        yaxis={"showgrid": False, "tickfont": {"color": "white"}},
    )
)

fig.show()

In [None]:
df = df[df["language"] == "en"]

In [None]:
df.reset_index(drop=True, inplace=True)

In [None]:
df.drop(columns=["language"], inplace=True)

In [None]:
# Label frequencies for the rows that remain after the language filter.
category_counts = (
    df["type"].value_counts().rename_axis("Type").reset_index(name="Count")
)

fig = (
    px.bar(
        category_counts,
        x="Type",
        y="Count",
        text="Count",
        title="Label Distribution",
        labels={"Type": "Label", "Count": "Frequency"},
    )
    # Bar colour and white value labels.
    .update_traces(marker_color="#f45c4e", textfont_color="white")
    # Dark-theme layout: fully transparent backgrounds, white text, no grid.
    .update_layout(
        plot_bgcolor="rgba(0,0,0,0)",
        paper_bgcolor="rgba(0,0,0,0)",
        font={"color": "white"},
        title_font={"color": "white"},
        xaxis={"showgrid": False, "tickfont": {"color": "white"}},
        yaxis={"showgrid": False, "tickfont": {"color": "white"}},
    )
)

fig.show()

In [None]:
df.to_csv("data/news_dataset_all_refactored.csv", index=False)

### **Tokenization**


In [None]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

In [None]:
tokenized_text_count = 0

In [None]:
def tokenize_text(text):
    """Return how many token ids ``tokenizer.encode`` produces for ``text``.

    Encoding is requested with truncation and padding disabled. Before
    encoding, the module-level ``tokenized_text_count`` is incremented and
    printed as a progress indicator.
    """
    global tokenized_text_count

    tokenized_text_count += 1
    print(tokenized_text_count)

    token_ids = tokenizer.encode(text, truncation=False, padding=False)
    return len(token_ids)

In [None]:
# Store the token count of every article next to its content.
df["token_count"] = df["content"].apply(tokenize_text)

In [None]:
# Histogram of per-article token counts, with percentile markers so the plot
# actually shows what its title announces.
fig = go.Figure()

fig.add_trace(
    go.Histogram(
        x=df["token_count"],
        nbinsx=500,
        name="Token Count Distribution",
        marker_color="#f45c4e",  # Set bar color
    )
)

# The title promises percentiles, but none were drawn before. Mark a few with
# dashed vertical lines.
# NOTE(review): the percentile set below is a reviewer choice; adjust as needed.
percentile_levels = (50, 90, 95, 99)
percentile_values = np.percentile(df["token_count"], percentile_levels)
for level, value in zip(percentile_levels, percentile_values):
    fig.add_vline(
        x=float(value),
        line_dash="dash",
        line_color="white",
        annotation_text=f"p{level}: {value:.0f}",
        annotation_position="top",
    )

fig.update_layout(
    title="Token Count Distribution with Percentiles",
    xaxis_title="Token Count",
    yaxis_title="Frequency",
    plot_bgcolor="rgba(0,0,0,0)",  # Transparent plot background
    paper_bgcolor="rgba(0,0,0,0)",  # Transparent overall background
    font=dict(color="white"),  # White text
    title_font=dict(color="white"),
    xaxis=dict(showgrid=False, tickfont=dict(color="white")),
    yaxis=dict(showgrid=False, tickfont=dict(color="white")),
)

fig.show()

In [None]:
less_than_512 = df[df["token_count"] <= 512].copy()

In [None]:
# Label frequencies for the articles kept in `less_than_512`.
category_counts = (
    less_than_512["type"]
    .value_counts()
    .rename_axis("Type")
    .reset_index(name="Count")
)

fig = (
    px.bar(
        category_counts,
        x="Type",
        y="Count",
        text="Count",
        title="Label Distribution",
        labels={"Type": "Label", "Count": "Frequency"},
    )
    # Bar colour and white value labels.
    .update_traces(marker_color="#f45c4e", textfont_color="white")
    # Dark-theme layout: fully transparent backgrounds, white text, no grid.
    .update_layout(
        plot_bgcolor="rgba(0,0,0,0)",
        paper_bgcolor="rgba(0,0,0,0)",
        font={"color": "white"},
        title_font={"color": "white"},
        xaxis={"showgrid": False, "tickfont": {"color": "white"}},
        yaxis={"showgrid": False, "tickfont": {"color": "white"}},
    )
)

fig.show()

In [None]:
def make_as_others(label):
    """Map the "satire", "junksci" and "unknown" labels to "other".

    Every other label is returned unchanged.
    """
    merged_labels = ("satire", "junksci", "unknown")
    return "other" if label in merged_labels else label

In [None]:
# Fold the minor labels into "other" for every row.
less_than_512["type"] = less_than_512["type"].apply(make_as_others)

In [None]:
# Label frequencies after the minor labels were merged into "other".
category_counts = (
    less_than_512["type"]
    .value_counts()
    .rename_axis("Type")
    .reset_index(name="Count")
)

fig = (
    px.bar(
        category_counts,
        x="Type",
        y="Count",
        text="Count",
        title="Label Distribution",
        labels={"Type": "Label", "Count": "Frequency"},
    )
    # Bar colour and white value labels.
    .update_traces(marker_color="#f45c4e", textfont_color="white")
    # Dark-theme layout: fully transparent backgrounds, white text, no grid.
    .update_layout(
        plot_bgcolor="rgba(0,0,0,0)",
        paper_bgcolor="rgba(0,0,0,0)",
        font={"color": "white"},
        title_font={"color": "white"},
        xaxis={"showgrid": False, "tickfont": {"color": "white"}},
        yaxis={"showgrid": False, "tickfont": {"color": "white"}},
    )
)

fig.show()

In [None]:
df = less_than_512.copy()

In [None]:
df.reset_index(drop=True, inplace=True)

In [None]:
df.info()

In [None]:
df.to_csv("data/news_dataset_all_refactored.csv", index=False)

### **Sampling**


In [2]:
df = pd.read_csv("data/news_dataset_all_refactored.csv")

In [3]:
df["type"].value_counts()

type
reliable      925532
fake          425765
bias          354055
conspiracy    280798
rumor         273374
other         178076
unreliable     30597
Name: count, dtype: int64

In [4]:
SAMPLE_SIZE = 5

In [None]:
# min_count = df["type"].value_counts().min()
# sample_size_per_label = min(SAMPLE_SIZE, min_count)

In [5]:
# Draw up to SAMPLE_SIZE rows per label (every row when a label has fewer),
# reproducibly via a fixed random_state.
#
# The groups are iterated and concatenated instead of going through
# DataFrameGroupBy.apply: apply operating on the grouping column is deprecated
# in recent pandas and emits a warning, and later cells still need the "type"
# column in the result. Group order and the original row index are preserved,
# so the same rows are selected as before.
per_label_samples = [
    group.sample(n=min(len(group), SAMPLE_SIZE), random_state=42)
    for _, group in df.groupby("type")
]
# pd.concat rejects an empty list, so fall back to an empty frame with the
# same columns when there are no groups at all.
sampled_df = pd.concat(per_label_samples) if per_label_samples else df.iloc[0:0]

  sampled_df = df.groupby("type", group_keys=False).apply(


In [6]:
sampled_df["type"].value_counts()

type
bias          5
conspiracy    5
fake          5
other         5
reliable      5
rumor         5
unreliable    5
Name: count, dtype: int64

In [None]:
# sampled_df = pd.concat(sampled_dfs).reset_index(drop=True)

In [7]:
sampled_df.reset_index(drop=True, inplace=True)

In [8]:
sampled_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35 entries, 0 to 34
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   content  35 non-null     object
 1   domain   35 non-null     object
 2   authors  35 non-null     object
 3   type     35 non-null     object
dtypes: object(4)
memory usage: 1.2+ KB


In [9]:
sampled_df.rename(columns={"type": "label"}, inplace=True)

In [10]:
def preprocess_data(content_series, author_series):
    """
    Join each author with its content using a " [SEP] " separator for BERT input

    Args:
        content_series: Series holding the article text
        author_series: Series holding the author information

    Returns:
        Series of strings shaped like "author [SEP] content"
    """
    # Cast both sides to str so non-string values can be concatenated.
    authors_as_text = author_series.astype(str)
    content_as_text = content_series.astype(str)
    return authors_as_text + " [SEP] " + content_as_text

In [11]:
combined_texts = preprocess_data(sampled_df["content"], sampled_df["authors"])

In [12]:
combined_texts.head()

0    unknown [SEP] by: otterwood\n\nThe surge in Ca...
1    unknown [SEP] Several Republican and Democrati...
2    Margarita Bogatova [SEP] Internet, gadget, gam...
3    unknown [SEP] Democrats voted unanimously to s...
4    unknown [SEP] Jump to: navigation\n\nWhat link...
dtype: object

In [13]:
sampled_df["combined_texts"] = combined_texts
sampled_df.drop(columns=["content", "authors", "domain"], inplace=True)

In [14]:
sampled_df.head()

Unnamed: 0,label,combined_texts
0,bias,unknown [SEP] by: otterwood\n\nThe surge in Ca...
1,bias,unknown [SEP] Several Republican and Democrati...
2,bias,"Margarita Bogatova [SEP] Internet, gadget, gam..."
3,bias,unknown [SEP] Democrats voted unanimously to s...
4,bias,unknown [SEP] Jump to: navigation\n\nWhat link...


In [15]:
sampled_df.to_csv("data/sampled_dataset.csv", index=False)