## Exploratory analysis of the [Fake-News-Detection-dataset](https://huggingface.co/datasets/Pulk17/Fake-News-Detection-dataset)

In [None]:
import pandas as pd
from datasets import load_dataset
from typing import cast

# Source dataset on the Hugging Face Hub:
# https://huggingface.co/datasets/Pulk17/Fake-News-Detection-dataset
DATASET_REPO = "Pulk17/Fake-News-Detection-dataset"

# Two views are loaded from the same repository: `ds` is the "train" split
# obtained through `load_dataset`, `df` is the train.tsv file read directly
# into a pandas DataFrame (tab-separated).
ds = load_dataset(DATASET_REPO, split="train")
df = pd.read_csv(f"hf://datasets/{DATASET_REPO}/train.tsv", sep="\t")

### Size of the dataset (rows and columns)

In [30]:
# Show the DataFrame dimensions as a (rows, columns) tuple.
print(f"Size: {df.shape}")

Size: (30000, 6)


### Description of the dataset columns

In [43]:
# Human-readable description of each column, keyed by column name so the
# pairing does not depend on the order (or the number) of columns in `df`.
column_descriptions = {
    "Unnamed: 0": "The unique identifier for each news article.",
    "title": "The title of the news article.",
    "text": "The content of the news article.",
    "subject": "The subject indicates the category of the news article.",
    "date": "The publication date of the news article.",
    "label": "The label indicating whether the news article is real (1) or fake (0).",
}

# `descriptions` stays a plain list aligned with df.columns; a column that has
# no entry above gets a visible placeholder instead of raising a length
# mismatch error when the DataFrame is built.
descriptions = [
    column_descriptions.get(column, "(no description available)")
    for column in df.columns
]

df_description = pd.DataFrame({
    "Column name": df.columns,
    "Description": descriptions
})

df_description.index = df_description.index + 1  # Start index at 1

# Last expression of the cell: the styled table is what gets displayed.
df_description.style.set_table_styles([
    {'selector': 'th',
     'props': [('text-align', 'left')]},          # left-align header cells
    {'selector': 'td',
     'props': [('text-align', 'left')]}           # left-align data cells
])

Unnamed: 0,Column name,Description
1,Unnamed: 0,The unique identifier for each news article.
2,title,The title of the news article.
3,text,The content of the news article.
4,subject,The subject indicates the category of the news article.
5,date,The publication date of the news article.
6,label,The label indicating whether the news article is real (1) or fake (0).


### Number of real vs. fake news articles

In [50]:
# Split the dataset by label value: 1 is treated as real news, 0 as fake news.
real_news = ds.filter(lambda row: row["label"] == 1)
fake_news = ds.filter(lambda row: row["label"] == 0)

# Share of each class relative to the full dataset, expressed in percent.
total_articles = len(ds)
percent_real = (len(real_news) / total_articles) * 100
percent_fake = (len(fake_news) / total_articles) * 100

# One record per class; percentages are pre-formatted with two decimals.
df_news_count = pd.DataFrame([
    {
        "Type of News": "Real News",
        "Count": len(real_news),
        "Percentage": f"{percent_real:.2f}%",
    },
    {
        "Type of News": "Fake News",
        "Count": len(fake_news),
        "Percentage": f"{percent_fake:.2f}%",
    },
])

# Left-align both header ('th') and data ('td') cells of the rendered table.
left_aligned_cells = [
    {'selector': cell_tag, 'props': [('text-align', 'left')]}
    for cell_tag in ('th', 'td')
]

# Last expression of the cell: the styled table is what gets displayed.
df_news_count.style.set_table_styles(left_aligned_cells)



Unnamed: 0,Type of News,Count,Percentage
0,Real News,14522,48.41%
1,Fake News,15478,51.59%


### Duplicates in the dataset + example

In [58]:
# NOTE(review): sqldf is no longer used in this cell; the import is kept in
# case later cells call it. Move it to the top import cell or drop it once
# that is confirmed.
from pandasql import sqldf

# Rows whose (title, text) pair occurs more than once. keep=False flags every
# member of a duplicate group, so the count includes the first occurrence too.
df_duplicates = df[df.duplicated(subset=['title', 'text'], keep=False)]
print("Number of duplicate rows based on 'title' and 'text':", len(df_duplicates))

# Example of a duplicated article, looked up by its exact title. A boolean
# mask replaces the former SQL query: it avoids copying the whole DataFrame
# into SQLite for a single equality filter and cannot break on quote
# characters inside the title.
example_title = 'YOU WON’T BELIEVE HIS PUNISHMENT! HISPANIC STORE OWNER Swindles Tax Payers Out Of $1,116,924.27 In Latest Food Stamp Scam'

# reset_index(drop=True) numbers the matching rows from 0, as the SQL result did.
df[df["title"] == example_title].reset_index(drop=True)

Number of duplicate rows based on 'title' and 'text': 5196


Unnamed: 0.1,Unnamed: 0,title,text,subject,date,label
0,16043,YOU WON’T BELIEVE HIS PUNISHMENT! HISPANIC STO...,How did this man come to OWN this store? There...,Government News,"Jun 19, 2017",0
1,18464,YOU WON’T BELIEVE HIS PUNISHMENT! HISPANIC STO...,How did this man come to OWN this store? There...,left-news,"Jun 19, 2017",0
