# DATA PRE-PROCESSING

## IMPORT LIBRARY

In [None]:
import os
import pandas as pd
from pathlib import Path
from nltk.tokenize import RegexpTokenizer
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from sklearn.utils import shuffle
import re
import nltk
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\prata\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\prata\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## RAW DATA

### GET ALL THE RAW DATA FROM DIRECTORY

In [11]:
base_dir = Path('../data/raw/base')

# Make empty dictionary to set place for all of the data   
datasets = {}

# Loop all files and insert to the dictionary
for file_name in os.listdir(base_dir):
    if file_name.endswith('.xlsx'):
        var_name = file_name.replace('.xlsx', '').lower()
        datasets[var_name] = pd.read_excel(base_dir/file_name)

# print(f"Keys: {list(datasets.keys())}") # list of all keys
# datasets['dataset_cnn_10k']  # access specific file

### MODIFIED CNN DATA

In [12]:
cnn_data = datasets['dataset_cnn_10k']

# Check shape and the sum of n/a data
print(cnn_data.shape)
cnn_data.isna().sum()
# na_data = cnn_data[cnn_data.isna().any(axis=1)]

# Remove n/a data
clear_data = cnn_data.dropna()
clear_data.isna().sum()
print(clear_data.shape)

# Modified the header into lowercase
clear_data.columns = clear_data.columns.str.lower()

# Remove unnecessary data and add 'hoax' label
clear_data['info'] = clear_data['title'] + clear_data['fulltext'].astype(str)
clear_data = clear_data.drop(['title', 'timestamp', 'fulltext', 'tags', 'author', 'url'], axis=1)
clear_data['hoax'] = 0

# Save the modified data into new csv file
clear_data.to_csv('../data/raw/modified/cnn_modified.csv', index=False, sep=',', header=True)

(10000, 6)
(9627, 6)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clear_data['info'] = clear_data['title'] + clear_data['fulltext'].astype(str)


### MODIFIED KOMPAS DATA

In [13]:
kompas_data = datasets['dataset_kompas_4k']

# Check shape and the sum of n/a data
print(kompas_data.shape)
kompas_data.isna().sum()

# Remove n/a data
clear_data = kompas_data.dropna()
clear_data.isna().sum()
print(clear_data.shape)

# Modified the header into lowercase
clear_data.columns = clear_data.columns.str.lower()

# Remove unnecessary data and add 'hoax' label
clear_data['info'] = clear_data['title'] + clear_data['fulltext'].astype(str)
clear_data = clear_data.drop(['title', 'timestamp', 'fulltext', 'tags', 'author', 'url'], axis=1)
clear_data['hoax'] = 0

# Save the modified data into new csv file
clear_data.to_csv('../data/raw/modified/kompas_modified.csv', index=False, sep=',', header=True)

(4750, 6)
(4286, 6)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clear_data['info'] = clear_data['title'] + clear_data['fulltext'].astype(str)


### MODIFIED TEMPO DATA

In [14]:
tempo_data = datasets['dataset_tempo_6k']

print(tempo_data.shape)
tempo_data.isna().sum()

clear_data = tempo_data.dropna()
clear_data.isna().sum()
print(clear_data.shape)

clear_data.columns = clear_data.columns.str.lower()

clear_data['info'] = clear_data['title'] + clear_data['fulltext'].astype(str)
clear_data = clear_data.drop(['title', 'fulltext', 'timestamp', 'tags', 'author', 'url'], axis=1)
clear_data['hoax'] = 0

clear_data.to_csv('../data/raw/modified/tempo_modified.csv', index=False, sep=',', header=True)

(6592, 6)
(6591, 6)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clear_data['info'] = clear_data['title'] + clear_data['fulltext'].astype(str)


### MODIFIED TURNBACKHOAX DATA

In [15]:
turnbackhoax_data = datasets['dataset_turnbackhoax_10k']

print(turnbackhoax_data.shape)
turnbackhoax_data.isna().sum()

clear_data = turnbackhoax_data.dropna()
clear_data.isna().sum()
print(clear_data.shape)

clear_data.columns = clear_data.columns.str.lower()

clear_data['info'] = clear_data['title'] + clear_data['fulltext'].astype(str)
clear_data = clear_data.drop(['title','fulltext', 'timestamp', 'tags', 'author', 'url'], axis=1)
clear_data['hoax'] = 1

clear_data.to_csv('../data/raw/modified/turnbackhoax_modified.csv', index=False, sep=',', header=True)

(10384, 6)
(10381, 6)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clear_data['info'] = clear_data['title'] + clear_data['fulltext'].astype(str)


## PROCESS THE MODIFIED DATA

### GET ALL THE RAW DATA FROM DIRECTORY

In [16]:
base_path = Path('../data/raw/modified')

modified_data = {}

for file_name in os.listdir(base_path):
    if (file_name.endswith('.csv')):
        key_name = file_name.replace('.csv', '').lower()
        modified_data[key_name] = pd.read_csv(base_path/file_name)

# print(list(modified_data.keys()))

#### Function for Pre-Processing Data in NLP

In [None]:
# Setup tools
factory = StemmerFactory()
stemmer = factory.create_stemmer()
stopword_factory = StopWordRemoverFactory()
tokenizer = RegexpTokenizer(r'\w+')

# Define additional stopwords that need
custom_stopwords = {
    'yg', 'dg', 'rt', 'dgn', 'ny', 'dll', 'tsb', 'dr', 'pd',
    'scroll', 'resume', 'advertisement', 'iklan', 'sponsor', 'promo', 'baca',
    'klik', 'lanjut', 'selengkapnya', 'like', 'share', 'comment', 'subscribe',
    'follow', 'video', 'foto', 'gambar', 'infografis', 'caption', 'deskripsi',
    'sumber', 'reporter', 'editor', 'wartawan', 'penulis', 'kontributor', 'publisher',
    'simak', 'tonton', 'lihat', 'dengar', 'unduh', 'download',
    'republika', 'kompas', 'detik', 'tempo', 'cnn', 'bbc', 'rt', 'via',
    'twitter', 'facebook', 'instagram', 'youtube', 'tiktok',
    'halaman', 'kategori', 'narasi', 'verifikasi', 'referensi',
    'error', 'tagar', 'tulis', 'komen', 'read',
    'www', 'https', 'http', 'com', 'net', 'co', 'id'
}

# merge custom stopwords and default stopwords
stopwords = set(stopword_factory.get_stop_words()).union(custom_stopwords)

# exclude some words that don't need to be stemmed
excluded_from_stemming = {
    'politik', 'ekonomi', 'tokoh', 'jakarta', 'indonesia', 'pemerintah',
    'demokrasi', 'korupsi', 'hukum', 'budaya', 'sejarah', 'teknologi'
}

def preprocess_indonesian(text):
    # ensure the data are text or string
    if not isinstance(text, str) or not text.strip():
        return ""

    # case folding (lowercase + remove escape characters)
    text = text.lower().strip()

    # remove data that are not important
    text = re.sub(r'(https?://\S+|www\.\S+|\S+\.(com|id|net|org|co)(/\S*)?)', '', text, flags=re.IGNORECASE) #url + domain
    text = re.sub(r'\b\w+@\w+\.\w+', '', text)  #email
    text = re.sub(r'[^\w\s]', ' ', text) #symbols and punctuations  
    text = re.sub(r'\s+', ' ', text).strip() #space after several operations

    # tokenize the string (sentences into words)
    tokens = tokenizer.tokenize(text)

    # Filter the token from short token, stopwords, and remain words about URL
    tokens = [
        t for t in tokens 
        if len(t) > 3 and t not in stopwords and not re.match(r'^(http|https|www|com|net|co|id)$', t)
    ]

    # Stemming the words (only take the base form of a word)
    tokens = [t if t in excluded_from_stemming else stemmer.stem(t) for t in tokens]

    #return clean string
    return ' '.join(tokens)

### CLEANING CNN DATA

In [18]:
cnn_modified = modified_data['cnn_modified']
print(cnn_modified.shape)

# Remove whitespace and duplicate title value
cnn_modified['info'] = cnn_modified['info'].str.strip()
cnn_modified = cnn_modified.drop_duplicates(subset=['info'])
print(cnn_modified.shape)

# Re-calculate data index after remove duplicate value if any
cnn_modified = cnn_modified.reset_index(drop=True)

# Apply preprocessing function to all of the data
cnn_modified['cleaned_info'] = cnn_modified['info'].apply(preprocess_indonesian)

# Show results
# print("Original:", cnn_modified['title'].iloc[5])
# print("Cleaned:", cnn_modified['cleaned_title'].iloc[5])

cnn_modified.to_csv('../data/cleaned/clean_for_each/cnn_clean.csv', index=False, sep=',', header=True)

(9627, 2)
(9627, 2)


### CLEANING KOMPAS DATA

In [19]:
kompas_modified = modified_data['kompas_modified']
print(cnn_modified.shape)

# Remove whitespace and duplicate title value
kompas_modified['info'] = kompas_modified['info'].str.strip()
kompas_modified = kompas_modified.drop_duplicates(subset=['info'])
print(kompas_modified.shape)

# Re-calculate data index after remove duplicate value if any
kompas_modified = kompas_modified.reset_index(drop=True)

# Apply preprocessing function to all of the data
kompas_modified['cleaned_info'] = kompas_modified['info'].apply(preprocess_indonesian)

# Show results
# print("Original:", kompas_modified['title'].iloc[5])
# print("Cleaned:", kompas_modified['cleaned_title'].iloc[5])

kompas_modified.to_csv('../data/cleaned/clean_for_each/kompas_clean.csv', index=False, sep=',', header=True)

(9627, 3)
(4286, 2)


### CLEANING TEMPO DATA

In [20]:
tempo_modified = modified_data['tempo_modified']
print(tempo_modified.shape)

tempo_modified['info'] = tempo_modified['info'].str.strip()
tempo_modified = tempo_modified.drop_duplicates(subset=['info'])
print(tempo_modified.shape)

tempo_modified = tempo_modified.reset_index(drop=True)

tempo_modified['cleaned_info'] = tempo_modified['info'].apply(preprocess_indonesian)

tempo_modified.to_csv('../data/cleaned/clean_for_each/tempo_clean.csv', index=False, sep=',', header=True)

(6591, 2)
(6591, 2)


### CLEANING TURNBACKHOAX DATA

In [21]:
# Function for knowing all of the categories in the beginning of string
def extract_prefix(text):
    # Check the pattern [KATEGORI], (KATEGORI), or KATEGORI
    match = re.search(r'^(?:\[([^\]]+)\]|\(([^)]+)\)|([^:\s]+):)', str(text))
    if match:
        # Return back the match group (ignore the NONE)
        return next(item for item in match.groups() if item is not None)
    return None

In [22]:
# Function for removing all of the prefix
def remove_prefixes(text, prefix_list):
    text = text.strip()
    for prefix in prefix_list:
        if prefix in text:
            return text[len(prefix)+2:].strip()
    return text

In [23]:
turnbackhoax_modified = modified_data['turnbackhoax_modified']

# find the prefix in all of the strings and remove them from strings
prefixes = turnbackhoax_modified['info'].apply(extract_prefix).dropna().unique()
print(prefixes)
turnbackhoax_modified['info'] = turnbackhoax_modified['info'].apply(lambda data: remove_prefixes(data, prefixes))

['SALAH' 'FALSE'
 'SALAH) Akun Whatsapp Kepala BKPSDM Kab Tangerang Hendar Herawan “+6285841645716”hasil periksa fakta Rahmah an. \nAkun palsu. BKPSDM Kab Tangerang tidak pernah menawarkan/menjanjikan apapun atau melakukan pungutan apapun terkait pelayanan kepegawaian di BKPSDM Kab Tangerang.\nSelengkapnya di penjelasan.\n===[KATEGORI'
 'SALAH) Akun Whatsapp Bupati Bojonegoro Anna Mu’awanah “+628133728109”Hasil periksa fakta Rahmah an. \nAkun palsu. Kepala Bidang Pengelolaan Informasi dan Komunikasi Publik (PIKP) Dinas Kominfo Kabupaten Bojonegoro, Nanang Dwi Cahyono, memastikan bahwa nomor yang beredar bukan milik Bupati Anna.\nSelengkapnya di penjelasan.\n===[KATEGORI'
 'SALAH) Akun Whatsapp Kadis Kominfosanti Ketut Suwarmawan Kabupaten Buleleng “+6281324422253”Hasil periksa fakta Ari Dwi Prasetyo.\nAkun palsu. Melansir Cirt.bulelengkab.go.id, menerangkan bahwa akun Whatsapp dengan nomor tersebut bukanlah akun Whatsapp resmi dari Kadis Komunikasi, Informatika, Persandian, dan Statist

In [24]:
turnbackhoax_modified['info'] = turnbackhoax_modified['info'].str.strip()
turnbackhoax_modified = turnbackhoax_modified.drop_duplicates(subset=['info'])
turnbackhoax_modified = turnbackhoax_modified.reset_index(drop=True)
# print(turnbackhoax_modified['hoax'].value_counts())

turnbackhoax_modified['cleaned_info'] = turnbackhoax_modified['info'].apply(preprocess_indonesian)
# print("Original:", turnbackhoax_modified['title'].iloc[9])
# print("Cleaned:", turnbackhoax_modified['cleaned_title'].iloc[9])
# turnbackhoax_modified

turnbackhoax_modified.to_csv('../data/cleaned/clean_for_each/turnbackhoax_clean.csv', index=False, sep=',', header=True)

### MERGE DATA AND SAVE INTO DIRECTORIES

In [25]:
true_data = pd.concat([cnn_modified, kompas_modified, tempo_modified], ignore_index=True)
true_data['info'] = true_data['cleaned_info']
true_data = true_data.drop('cleaned_info', axis=1)

true_data_shuffled = shuffle(true_data).reset_index(drop=True)
print(true_data_shuffled.shape)

true_data_shuffled.to_csv('../data/cleaned/clean_part_merged/true_data.csv', index=False, sep=',', header=True)

(20504, 2)


In [26]:
false_data = turnbackhoax_modified
false_data['info'] = false_data['cleaned_info']
false_data = false_data.drop('cleaned_info', axis=1)

false_data.to_csv('../data/cleaned/clean_part_merged/false_data.csv', index=False, sep=',', header=True)

In [27]:
# merge true (fact) and false (hoax) data
all_clean_data = pd.concat([true_data, false_data], ignore_index=True)
all_clean_data_shuffled = shuffle(all_clean_data).reset_index(drop=True)

all_clean_data_shuffled.to_csv('../data/cleaned/clean_all_merged/clean_data_all.csv', index=False, sep=',', header=True)