# Create Dataset for Training

In [None]:
# Install gdwon for download dataset from google drive
! pip install gdown



In [None]:
import csv
import os
import pandas as pd
import re

## Factual News Dataset


In [None]:
# Download the factual-news source files from Google Drive
# (saved as dataset_detik.csv and dataset_kompas.csv, per the download log).
# NOTE(review): recent gdown releases mark the --id flag as deprecated — confirm
# the installed version still accepts it.
!gdown --id 15SDTi4XXqFqk4IXFYLiLulQy2jIUEnDW
!gdown --id 1zFH1XNe7Pd5g_9nSslYnHgLxv-qCkJtb

Downloading...
From: https://drive.google.com/uc?id=15SDTi4XXqFqk4IXFYLiLulQy2jIUEnDW
To: /content/dataset_detik.csv
100% 9.76M/9.76M [00:00<00:00, 138MB/s]
Downloading...
From: https://drive.google.com/uc?id=1zFH1XNe7Pd5g_9nSslYnHgLxv-qCkJtb
To: /content/dataset_kompas.csv
100% 7.58M/7.58M [00:00<00:00, 18.9MB/s]


In [None]:
# Downloaded factual-news files, one per outlet
ds_detik = "dataset_detik.csv"
ds_kompas = "dataset_kompas.csv"
csv_files = [ds_detik, ds_kompas]

# Load every file and stack the frames under a fresh 0..n-1 index
data_frames = list(map(pd.read_csv, csv_files))
combined_df = pd.concat(data_frames, ignore_index=True)

# Keep the first row for each distinct title, then strip newlines from the titles
combined_df = combined_df.drop_duplicates(subset=['title'], keep='first').assign(
    title=lambda frame: frame['title'].str.replace('\n', '')
)

# Draw a random sample of 3674 rows; the fixed seed makes the draw repeatable
sample_df = combined_df.sample(n=3674, random_state=42)

# Persist the sample without the index column
sample_df.to_csv('dataset_factual_news.csv', index=False)

print("CSV files merged successfully!")


CSV files merged successfully!


In [None]:
# Reload the saved factual-news sample, then show its schema and first rows
df = pd.read_csv("dataset_factual_news.csv")
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3674 entries, 0 to 3673
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   title       3674 non-null   object
 1   link        3674 non-null   object
 2   date        3674 non-null   object
 3   content     3674 non-null   object
 4   is_fake     3674 non-null   int64 
 5   media_bias  3674 non-null   object
dtypes: int64(1), object(5)
memory usage: 172.3+ KB


Unnamed: 0,title,link,date,content,is_fake,media_bias
0,Epy Kusnandar dan Yogi Gamblez Positif Narkoba,https://megapolitan.kompas.com/read/2024/05/11...,2024-05-11,"JAKARTA, KOMPAS.com - Polisi mengungkap hasil ...",0,netral
1,"Cerita Calon Haji, Rasakan Kemudahan Skrining ...",https://news.detik.com/berita/d-7304275/cerita...,22/04/2024,Salah satu peserta program Jaminan Kesehatan N...,0,netral
2,Polisi: Kami Butuh Partisipasi Warga untuk Ata...,https://megapolitan.kompas.com/read/2024/05/07...,2024-05-07,"JAKARTA, KOMPAS.com - Kapolres Metro Jakarta ...",0,netral
3,Pengunjung Ragunan Tembus 25 Ribu pada Sabtu S...,https://news.detik.com/berita/d-7335065/pengun...,11/05/2024,Sebanyak 25.555 orang tercatat mengunjungi Tam...,0,netral
4,Mensos Beri Santunan ke Ahli Waris Korban Banj...,https://news.detik.com/berita/d-7305239/mensos...,22/04/2024,Menteri Sosial Tri Rismaharini melakukan kunju...,0,netral


## Hoax News Dataset

In [None]:
# function to write data to CSV file
def write_to_csv(data, filename):
    """Write `data` to `filename` as a CSV file with a fixed header row.

    The file is created or overwritten, so re-running a cell yields the same
    file instead of appending a second header and a duplicate copy of the rows.

    Args:
        data: iterable of rows; each row is a sequence of values in the order
            title, link, date, content, is_fake, media_bias.
        filename: path of the CSV file to write.
    """
    # mode='w' truncates any earlier output; newline='' lets the csv module
    # control the line endings itself.
    with open(filename, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        # Write header row
        writer.writerow(['title', 'link', 'date', 'content', 'is_fake', 'media_bias'])
        # Write data rows
        writer.writerows(data)

In [None]:
# Antara hoax dataset, two parts
# (saved as dataset_antara_hoaks.csv and dataset_antara_hoaks_part2.csv, per the download log)
!gdown --id 1X1M3WqTQTkyi4pdltgC23KCQvrOusRt7
!gdown --id 1dZp7_uuI8AZut28PVyIn2PcbLnQLl6XP

# Kompas hoax dataset (saved as dataset_kompas_hoaks_new.csv)
!gdown --id 1BmORte8YUONeL8dOrpxwXumfCJ2YaxOV

# Tempo hoax dataset (saved as dataset_tempo_hoaks_20_24.csv)
!gdown --id 1CbfvfAucyw20uGQ8RJCVATO-QorQvke6

Downloading...
From: https://drive.google.com/uc?id=1X1M3WqTQTkyi4pdltgC23KCQvrOusRt7
To: /content/dataset_antara_hoaks.csv
100% 1.43M/1.43M [00:00<00:00, 110MB/s]
Downloading...
From: https://drive.google.com/uc?id=1dZp7_uuI8AZut28PVyIn2PcbLnQLl6XP
To: /content/dataset_antara_hoaks_part2.csv
100% 909k/909k [00:00<00:00, 74.6MB/s]
Downloading...
From: https://drive.google.com/uc?id=1BmORte8YUONeL8dOrpxwXumfCJ2YaxOV
To: /content/dataset_kompas_hoaks_new.csv
100% 5.81M/5.81M [00:00<00:00, 81.8MB/s]
Downloading...
From: https://drive.google.com/uc?id=1CbfvfAucyw20uGQ8RJCVATO-QorQvke6
To: /content/dataset_tempo_hoaks_20_24.csv
100% 6.50M/6.50M [00:00<00:00, 17.4MB/s]


### Antara Hoax News Dataset

In [None]:
# Stack both Antara hoax exports into one frame and save it as a single CSV
antara_csv_files = ["dataset_antara_hoaks.csv", "dataset_antara_hoaks_part2.csv"]
antara_frames = list(map(pd.read_csv, antara_csv_files))
antara_df = pd.concat(antara_frames, ignore_index=True)
antara_df.to_csv('dataset_antara_hoaks_full.csv', index=False)

def extract_content(text):
    """Return the lowercased text found between the start and end phrases.

    The input is lowercased first. The result is the span after the first
    "jakarta (antara/jacx)" and before the nearest following
    "namun, benarkah", with surrounding whitespace stripped.

    Returns None when no such start/end pair exists in the text.
    """
    opening_markers = ("jakarta (antara/jacx)",)
    closing_markers = ("namun, benarkah",)

    # Escape each phrase so it is matched literally, then OR them together
    opening = "|".join(re.escape(marker) for marker in opening_markers)
    closing = "|".join(re.escape(marker) for marker in closing_markers)

    # DOTALL lets the lazy middle group run across line breaks
    found = re.compile(f"({opening})(.*?)({closing})", re.DOTALL).search(text.lower())
    if found is None:
        return None
    return found.group(2).strip()

# Re-read the merged Antara file row by row and replace each article body with
# its extracted content.
# newline='' is what the csv module expects for its input file; the explicit
# encoding keeps the result independent of the platform default.
with open('dataset_antara_hoaks_full.csv', 'r', newline='', encoding='utf-8') as csvfile:
    reader = csv.reader(csvfile)

    # Skip the column-name row: write_to_csv writes its own header, so passing
    # this row through would leave a second header line in the output file.
    next(reader, None)

    # Create a list to store the modified data
    modified_data = []

    # Iterate over each data row in the CSV file
    for row in reader:
        if not row:
            # csv.reader yields an empty list for a blank line
            continue

        title, link, date, content, is_fake, media_bias = row[:6]
        extracted_content = extract_content(content)

        # Append data to list
        modified_data.append([title, link, date, extracted_content, is_fake, media_bias])

# File name for the CSV
filename = 'dataset_antara_hoaks_clean.csv'

# Write data to CSV file (the input file is already closed at this point)
write_to_csv(modified_data, filename)

print(f"Data has been written to {filename}")

Data has been written to dataset_antara_hoaks_clean.csv


In [None]:
# Load the cleaned Antara file, drop rows that have no title or no extracted
# content, and save the result.
antara_df = pd.read_csv("dataset_antara_hoaks_clean.csv")
antara_df = antara_df.dropna(subset=['title','content'])
# index=False keeps the row index out of the file, so no "Unnamed: 0" column
# appears when the file is read back.
antara_df.to_csv("dataset_hoax_antara.csv", index=False)
antara_df.info()
antara_df.tail()

<class 'pandas.core.frame.DataFrame'>
Index: 660 entries, 1 to 949
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   title       660 non-null    object
 1   link        660 non-null    object
 2   date        0 non-null      object
 3   content     660 non-null    object
 4   is_fake     660 non-null    object
 5   media_bias  660 non-null    object
dtypes: object(6)
memory usage: 36.1+ KB


Unnamed: 0,title,link,date,content,is_fake,media_bias
942,KFC bagi paket camilan gratis peringati Peremp...,https://www.antaranews.com/berita/2031894/hoak...,,- pesan berantai mengatasnamakan restoran cepa...,1,right
943,Kandungan potasium pada vaksin Pfizer-Moderna ...,https://www.antaranews.com/berita/2030609/hoak...,,- sebuah narasi yang diklaim berasal dari seor...,1,right
947,Pemerintah tidak sediakan kompensasi saat vaks...,https://www.antaranews.com/berita/2016444/hoak...,,- sebuah pesan beredar di aplikasi whatsapp da...,1,right
948,Pemilik e-KTP dapat bantuan Rp600 ribu,https://www.antaranews.com/berita/2016030/hoak...,,- sebuah unggahan di media sosial facebook men...,1,right
949,Foto mobil diklaim sebagai produk Esemka,https://www.antaranews.com/berita/2013048/hoak...,,- sebuah foto beredar di media sosial yang dik...,1,right


### Kompas Hoax News Dataset

In [None]:
def extract_content(text):
    """Return the lowercased text found between a start phrase and an end phrase.

    The input is lowercased first. Matching begins at the leftmost occurrence
    of any start phrase and stops at the nearest following end phrase; the
    text in between is returned with surrounding whitespace stripped.

    Returns an empty string when no such start/end pair exists in the text.
    """
    opening_markers = (
        "kompas.com - ",
        ".kompas.com",
        "kompas.com-",
    )
    closing_markers = (
        "berdasarkan penelusuran tim",
        "berdasarkan hasil penelusuran tim",
        "namun, narasi tersebut tidak benar",
        "setelah ditelusuri",
        "berdasarkan konfirmasi tim",
    )

    # Escape each phrase so it is matched literally, then OR them together
    opening = "|".join(re.escape(marker) for marker in opening_markers)
    closing = "|".join(re.escape(marker) for marker in closing_markers)

    # DOTALL lets the lazy middle group run across line breaks
    found = re.compile(f"({opening})(.*?)({closing})", re.DOTALL).search(text.lower())
    if found is None:
        return ''
    return found.group(2).strip()

# Read the Kompas hoax file row by row and replace each article body with its
# extracted content.
# newline='' is what the csv module expects for its input file; the explicit
# encoding keeps the result independent of the platform default.
with open('dataset_kompas_hoaks_new.csv', 'r', newline='', encoding='utf-8') as csvfile:
    reader = csv.reader(csvfile)

    # Skip the column-name row: write_to_csv writes its own header, so passing
    # this row through would leave a second header line in the output file.
    next(reader, None)

    # Create a list to store the modified data
    modified_data = []

    # Iterate over each data row in the CSV file
    for row in reader:
        if not row:
            # csv.reader yields an empty list for a blank line
            continue

        title, link, date, content, is_fake, media_bias = row[:6]
        extracted_content = extract_content(content)

        # Append data to list
        modified_data.append([title, link, date, extracted_content, is_fake, media_bias])

# File name for the CSV
filename = 'dataset_kompas_hoaks_clean.csv'

# Write data to CSV file (the input file is already closed at this point)
write_to_csv(modified_data, filename)

print(f"Data has been written to {filename}")

Data has been written to dataset_kompas_hoaks_clean.csv


In [None]:
# Load the cleaned Kompas file, drop rows that have no title or no extracted
# content, and save the result.
kompas_df = pd.read_csv("dataset_kompas_hoaks_clean.csv")
kompas_df = kompas_df.dropna(subset=['title','content'])
# index=False keeps the row index out of the file, so no "Unnamed: 0" column
# appears when the file is read back.
kompas_df.to_csv("dataset_hoax_kompas.csv", index=False)
kompas_df.info()
kompas_df.tail()

<class 'pandas.core.frame.DataFrame'>
Index: 2366 entries, 1 to 2528
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   title       2366 non-null   object
 1   link        2366 non-null   object
 2   date        2366 non-null   object
 3   content     2366 non-null   object
 4   is_fake     2366 non-null   object
 5   media_bias  2366 non-null   object
dtypes: object(6)
memory usage: 129.4+ KB


Unnamed: 0,title,link,date,content,is_fake,media_bias
2522,CDC Akui Tes PCR Tidak Bisa Bedakan Virus Coro...,https://www.kompas.com/cekfakta/read/2022/01/1...,2022-01-11,- beredar narasi yang menyebut bahwa pusat pen...,1,netral
2523,Video Pesawat Garuda Indonesia Mendarat Darura...,https://www.kompas.com/cekfakta/read/2022/01/1...,2022-01-11,- beredar video di media sosial facebook yang ...,1,netral
2524,Surat Pengangkatan Guru Honorer Usia 35 Tahun ...,https://www.kompas.com/cekfakta/read/2022/01/1...,2022-01-11,"- di media sosial facebook, beredar surat beri...",1,netral
2526,Kartu Keluarga Akan Dicetak Seperti KTP Mulai ...,https://www.kompas.com/cekfakta/read/2022/01/1...,2022-01-10,- beredar informasi di media sosial facebook y...,1,netral
2528,Komedian Sule Meninggal Dunia,https://www.kompas.com/cekfakta/read/2022/01/0...,2022-01-08,- beredar informasi di media sosial facebook y...,1,netral


### Tempo Hoax News Dataset

In [None]:
def extract_content(text):
    """Return the lowercased text that precedes the first end phrase.

    The input is lowercased first. Everything from the start of the text up to
    the earliest occurrence of any end phrase is returned with surrounding
    whitespace stripped.

    Returns an empty string when none of the end phrases occurs in the text.
    """
    closing_markers = (
        "namun, benarkah",
        "namun benarkah",
        "lantas benarkah",
        "benarkah",
        "lantas, benarkah",
    )

    # Escape each phrase so it is matched literally, then OR them together
    closing = "|".join(re.escape(marker) for marker in closing_markers)

    # Anchored at the start of the text; DOTALL lets the lazy group span lines
    found = re.compile(f"^(.*?)(?:{closing})", re.DOTALL).search(text.lower())
    if found is None:
        return ''
    return found.group(1).strip()

# Read the Tempo hoax file row by row and replace each article body with its
# extracted content.
# newline='' is what the csv module expects for its input file; the explicit
# encoding keeps the result independent of the platform default.
with open('dataset_tempo_hoaks_20_24.csv', 'r', newline='', encoding='utf-8') as csvfile:
    reader = csv.reader(csvfile)

    # Skip the column-name row: write_to_csv writes its own header, so passing
    # this row through would leave a second header line in the output file.
    next(reader, None)

    # Create a list to store the modified data
    modified_data = []

    # Iterate over each data row in the CSV file
    for row in reader:
        if not row:
            # csv.reader yields an empty list for a blank line
            continue

        title, link, date, content, is_fake, media_bias = row[:6]
        extracted_content = extract_content(content)

        # Append data to list
        modified_data.append([title, link, date, extracted_content, is_fake, media_bias])

# File name for the CSV
filename = 'dataset_tempo_hoaks_20_24_clean.csv'

# Write data to CSV file (the input file is already closed at this point)
write_to_csv(modified_data, filename)

print(f"Data has been written to {filename}")

Data has been written to dataset_tempo_hoaks_20_24_clean.csv


In [None]:
# Load the cleaned Tempo file, drop rows that have no title or no extracted
# content, and save the result.
tempo_df = pd.read_csv("dataset_tempo_hoaks_20_24_clean.csv")
tempo_df = tempo_df.dropna(subset=['title','content'])
# index=False keeps the row index out of the file, so no "Unnamed: 0" column
# appears when the file is read back.
tempo_df.to_csv("dataset_hoax_tempo.csv", index=False)
tempo_df.info()
tempo_df.tail()

<class 'pandas.core.frame.DataFrame'>
Index: 688 entries, 1 to 1349
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   title       688 non-null    object
 1   link        688 non-null    object
 2   date        688 non-null    object
 3   content     688 non-null    object
 4   is_fake     688 non-null    object
 5   media_bias  688 non-null    object
dtypes: object(6)
memory usage: 37.6+ KB


Unnamed: 0,title,link,date,content,is_fake,media_bias
1345,Video FIFA dan AFC Gelar Pertandingan Ulang In...,http://cekfakta.tempo.co/fakta/2884/keliru-vid...,"Selasa, 7 Mei 2024 10:31 WIB",sebuah video beredar di facebook oleh akun ini...,1,left
1346,COVID-19 Merupakan Hasil Konspirasi Rockefelle...,http://cekfakta.tempo.co/fakta/2883/keliru-cov...,"Selasa, 7 Mei 2024 10:12 WIB",sebuah video reels berdurasi 1 menit 21 detik ...,1,left
1347,Pengakuan Sri Mulyani Tentang Obat Hipertensi ...,http://cekfakta.tempo.co/fakta/2881/keliru-pen...,"Senin, 6 Mei 2024 22:21 WIB",sebuah akun facebook mengunggah sebuah video d...,1,left
1348,Narasi yang Mengatakan Bahaya Polusi Udara Han...,http://cekfakta.tempo.co/fakta/2878/keliru-nar...,"Jumat, 3 Mei 2024 08:36 WIB",sebuah narasi beredar di facebook [arsip] yang...,1,left
1349,Video Berisi Klaim Semua Ahli Jantung dan Tera...,http://cekfakta.tempo.co/fakta/2877/keliru-vid...,"Jumat, 3 Mei 2024 08:15 WIB",sebuah akun facebook [arsip] membagikan video ...,1,left


## Combine Hoax News Dataset

In [None]:
# Cleaned per-outlet hoax files
ds_hoax_antara = "dataset_hoax_antara.csv"
ds_hoax_kompas = "dataset_hoax_kompas.csv"
ds_hoax_tempo = "dataset_hoax_tempo.csv"
hoax_csv_files = [ds_hoax_antara, ds_hoax_kompas, ds_hoax_tempo]

# Load each file into its own frame
hoax_data_frames = list(map(pd.read_csv, hoax_csv_files))

# Stack the frames, keep the first row per title, and drop every column whose
# name starts with "Unnamed" (the index column saved alongside the data)
hoax_combined_df = (
    pd.concat(hoax_data_frames, ignore_index=True)
    .drop_duplicates(subset=['title'], keep='first')
    .loc[:, lambda frame: ~frame.columns.str.contains('^Unnamed')]
)

# Persist the combined hoax dataset without the index column
hoax_combined_df.to_csv('dataset_hoax_news.csv', index=False)

print("CSV files merged successfully!")

CSV files merged successfully!


In [None]:
# Reload the combined hoax dataset, then show its schema and first rows
hoax_df = pd.read_csv("dataset_hoax_news.csv")
hoax_df.info()
hoax_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3655 entries, 0 to 3654
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   title       3655 non-null   object
 1   link        3655 non-null   object
 2   date        3048 non-null   object
 3   content     3655 non-null   object
 4   is_fake     3655 non-null   int64 
 5   media_bias  3655 non-null   object
dtypes: int64(1), object(5)
memory usage: 171.5+ KB


Unnamed: 0,title,link,date,content,is_fake,media_bias
0,NIK KTP DKI Jakarta akan dinonatifkan permanen...,https://www.antaranews.com/berita/4129155/hoak...,,– sebuah unggahan di facebook mengimbau kepada...,1,right
1,"Kominfo, Raffi Ahmad hingga Rudy Salim ajukan ...",https://www.antaranews.com/berita/4125624/hoak...,,– sebuah unggahan video menarasikan sejumlah p...,1,right
2,Presiden Iran selamat dari kecelakaan helikopter,https://www.antaranews.com/berita/4123608/hoak...,,– presiden iran ebrahim raisi dinyatakan menin...,1,right
3,RS Sri Ratu Medan tutup pascaviral,https://www.antaranews.com/berita/4122069/hoak...,,– kasus viral di media sosial tiktoker asal me...,1,right
4,Video tepung gorengan campur narkoba,https://www.antaranews.com/berita/4119549/hoak...,,- beredar video pada platform instagram yang m...,1,right


## Merge Factual and Hoax News Dataset

In [None]:
# The two class-specific datasets built above
ds_factual = "dataset_factual_news.csv"
ds_hoax = "dataset_hoax_news.csv"
csv_files = [ds_factual, ds_hoax]

# Load both files and stack them (factual rows first) under a fresh index
data_frames = list(map(pd.read_csv, csv_files))
combined_df = pd.concat(data_frames, ignore_index=True)

# Persist the final training dataset without the index column
combined_df.to_csv('dataset_factual_hoax_news.csv', index=False)

print("CSV files merged successfully!")

CSV files merged successfully!


In [None]:
# Reload the merged factual + hoax dataset, then show its schema and first rows
df = pd.read_csv("dataset_factual_hoax_news.csv")
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7329 entries, 0 to 7328
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   title       7329 non-null   object
 1   link        7329 non-null   object
 2   date        6722 non-null   object
 3   content     7329 non-null   object
 4   is_fake     7329 non-null   int64 
 5   media_bias  7329 non-null   object
dtypes: int64(1), object(5)
memory usage: 343.7+ KB


Unnamed: 0,title,link,date,content,is_fake,media_bias
0,Epy Kusnandar dan Yogi Gamblez Positif Narkoba,https://megapolitan.kompas.com/read/2024/05/11...,2024-05-11,"JAKARTA, KOMPAS.com - Polisi mengungkap hasil ...",0,netral
1,"Cerita Calon Haji, Rasakan Kemudahan Skrining ...",https://news.detik.com/berita/d-7304275/cerita...,22/04/2024,Salah satu peserta program Jaminan Kesehatan N...,0,netral
2,Polisi: Kami Butuh Partisipasi Warga untuk Ata...,https://megapolitan.kompas.com/read/2024/05/07...,2024-05-07,"JAKARTA, KOMPAS.com - Kapolres Metro Jakarta ...",0,netral
3,Pengunjung Ragunan Tembus 25 Ribu pada Sabtu S...,https://news.detik.com/berita/d-7335065/pengun...,11/05/2024,Sebanyak 25.555 orang tercatat mengunjungi Tam...,0,netral
4,Mensos Beri Santunan ke Ahli Waris Korban Banj...,https://news.detik.com/berita/d-7305239/mensos...,22/04/2024,Menteri Sosial Tri Rismaharini melakukan kunju...,0,netral


## Save to Google Drive

In [None]:
# Copy the merged dataset from the Colab runtime into Google Drive.
from google.colab import drive
import shutil

# Mount Drive at /content/drive so the destination path below is reachable
drive.mount('/content/drive')

# Source: the merged CSV under /content; destination: a folder inside My Drive
colab_file_path = '/content/dataset_factual_hoax_news.csv'
drive_file_path = '/content/drive/My Drive/Colab Notebooks/Bangkit | PukulRata/dataset_factual_hoax_news.csv'

# NOTE(review): presumably the destination folder must already exist in Drive — confirm
shutil.copyfile(colab_file_path, drive_file_path)

Mounted at /content/drive


'/content/drive/My Drive/Colab Notebooks/Bangkit | PukulRata/dataset_factual_hoax_news.csv.csv'