# **Importing resources**

In [1]:
# Mount Google Drive at /content/gdrive so files stored in the Drive can be
# read and written through ordinary file paths for the rest of the notebook.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [2]:
# Base folder for every dataset file used below. The trailing slash matters:
# later cells build paths as f'{dir}Raw/...' and f'{dir}Cleaned/...'.
# NOTE(review): `dir` shadows Python's built-in dir(); renaming it would
# require updating every cell that references it.
dir  = '/content/gdrive/MyDrive/CSCI 199/Methodology/Datasets/'

In [3]:
!pip -q install langdetect scikit-learn

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/981.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m153.6/981.5 kB[0m [31m4.4 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m972.8/981.5 kB[0m [31m15.4 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m12.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for langdetect (setup.py) ... [?25l[?25hdone


In [4]:
import pandas as pd
import nltk
nltk.download('punkt')

from nltk.tokenize import sent_tokenize
import numpy as np
import re

from langdetect import detect
from langdetect import DetectorFactory
DetectorFactory.seed = 0

from sklearn.model_selection import train_test_split

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


# **Functions**

In [5]:
# Column layout of the frame returned by get_headlineLead.
_HL_COLUMNS = ['text', 'label']

# A body line containing any of these substrings is never used as a lead.
_SKIP_MARKERS = (', GMA Integrated News', 'RELATED STORY', 'BASAHIN:', 'MAKI-BALITA:')

# Whole-word, case-insensitive match for the words that disqualify a line.
_CREDIT_WORDS = re.compile(
    r'\b(photo|photos|image|images|picture|pictures|courtesy|post|inquirer|contributed)\b[\s:\-]*',
    re.IGNORECASE,
)

# A lead must contain at least this many whitespace-separated tokens.
_MIN_LEAD_WORDS = 10


def _clean_sentence(sentence):
    """Strip one body line and remove URLs, @mentions and .com domain tokens."""
    sentence = sentence.strip()
    sentence = re.sub(r"http\S+", "", sentence)      # Remove URLs
    sentence = re.sub(r"@\S+", "", sentence)         # Remove mentions
    # The dot is escaped so only real ".com" tokens are removed (an unescaped
    # dot also deletes ordinary words such as "recommended"), and the suffix
    # is optional so a bare "example.com" is removed too.
    sentence = re.sub(r"\S+\.com\S*", "", sentence)  # Remove domain names
    return sentence


def _first_lead_sentence(text):
    """Return the first cleaned line of ``text`` that passes the lead filters, or None."""
    for line in text.split('\n'):
        candidate = _clean_sentence(line)
        # Blank lines have zero tokens, so this check also rejects them.
        if len(candidate.split()) < _MIN_LEAD_WORDS:
            continue
        if any(marker in candidate for marker in _SKIP_MARKERS):
            continue
        if _CREDIT_WORDS.search(candidate):
            continue
        return candidate
    return None


def _is_blank(value):
    """True for missing values (NaN/None) and for empty or whitespace-only strings."""
    if isinstance(value, str):
        return not value.strip()
    return bool(pd.isna(value))


def get_headlineLead(df):
    """Collect headlines and lead sentences from a frame of scraped articles.

    Articles are visited in row order. An article whose headline is detected
    as English is skipped entirely. For every other article the headline is
    emitted, followed by the first line of the body that survives cleaning
    and filtering (see ``_first_lead_sentence``), if there is one.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'Title' and a 'Text' column. The index may hold any
        labels; rows are read directly, never looked up by index value.

    Returns
    -------
    pandas.DataFrame
        Columns 'text' and 'label' ('label' is always the empty string) with
        a fresh 0..n-1 index. Missing or blank headlines are left out, but
        the lead of such an article is still kept.
    """
    records = []

    # Iterate over the values themselves: using the index label from
    # iterrows() as an .iloc position only works for a default RangeIndex.
    for title, body in zip(df['Title'], df['Text']):
        try:
            lang = detect(title)
        except Exception:
            # Detection fails on missing, non-string or feature-less titles;
            # such articles are kept. Unlike a bare `except:`, this does not
            # swallow KeyboardInterrupt or SystemExit.
            lang = ""

        # Skip English headlines
        if lang == 'en':
            continue

        if not _is_blank(title):
            records.append({'text': title, 'label': ''})

        # str() turns a missing body into "nan", which is too short to qualify.
        lead = _first_lead_sentence(str(body))
        if lead is not None:
            records.append({'text': lead, 'label': ''})

    # Build the frame once instead of concatenating inside the loop.
    return pd.DataFrame(records, columns=_HL_COLUMNS)

# **Cleaning the Datasets**

## **Abante**

In [6]:
# Read the two Abante scrape dumps separately ...
abante1 = pd.read_csv(f'{dir}Raw/Abante_2025-01-30_NEWS.csv')
abante2 = pd.read_csv(f'{dir}Raw/Abante_2025-01-31_NEWS.csv')

# ... then stack them; ignore_index gives the result one continuous index
abante_parts = [abante1, abante2]
abante = pd.concat(abante_parts, ignore_index=True)
abante

Unnamed: 0.1,Unnamed: 0,Section,Title,Author,Date,Text
0,0,ABANTV,AbanTV – Showbiz | Planong paglipat ng Eat Bul...,,"May 31, 2023","PLANONG PAGLIPAT NG EAT BULAGA SA TV5, SUMINGA..."
1,1,NEWS,Mga negosyante mula Bangkok type mamuhunan sa PH,Abante News,"May 31, 2023",Maglalagak ng $2.5 billion o P140 bilyon na pa...
2,2,NEWS,Marcos nanawagan na pabakunahan mga tsikiting,Abante News,"May 31, 2023",Nanawagan si Pangulong Ferdinand “Bongbong” Ma...
3,3,NEWS,Tulfo nakahanda 120 panukalang batas,Abante News,"May 31, 2023",Itutulak ni ACT-CIS party-list Rep. Erwin Tulf...
4,4,NEWS,BBM inarbor mga ‘tambaloslos’ kay Sara,Abante News,"May 31, 2023",NAGPAABOT ng pagbati si Pangulong Ferdinand “B...
...,...,...,...,...,...,...
11495,6495,ENTERTAINMENT,Angelica araw-araw iniiyakan si baby Amila,,"October 26, 2022","Halos araw-araw yata ay emosyonal, at umiiyak ..."
11496,6496,OPINION,Graftbuster Cascolan sa DOH,,"October 26, 2022",My own definition of leadership is this: The c...
11497,6497,OPINION,Ako Bicol Cup 2022 aarangkada na!,,"October 26, 2022",Magandang araw muli mga kababayan kong Bicolan...
11498,6498,ENTERTAINMENT,Bongga ni Sharifa: Ynez Veneracion hinarap mis...,,"October 26, 2022","Sa kauna-unahang pagkakataon ay nakita, nakasa..."


In [7]:
# Tally the articles in each section as a two-column table
section_counts = (
    abante["Section"]
    .value_counts()
    .rename_axis("Section")
    .reset_index(name="Article Count")
)

section_counts

Unnamed: 0,Section,Article Count
0,NEWS,3105
1,SPORTS,2937
2,ENTERTAINMENT,2614
3,METRO,1003
4,LIFESTYLE,729
5,OPINION,378
6,VISMIN,339
7,ABANTV,182


In [8]:
# Keep only the rows whose section is NEWS, METRO or VISMIN, then renumber them from 0
in_kept_section = abante["Section"].isin(["NEWS", "METRO", "VISMIN"])
filtered_abante = abante[in_kept_section].reset_index(drop=True)
filtered_abante

Unnamed: 0.1,Unnamed: 0,Section,Title,Author,Date,Text
0,1,NEWS,Mga negosyante mula Bangkok type mamuhunan sa PH,Abante News,"May 31, 2023",Maglalagak ng $2.5 billion o P140 bilyon na pa...
1,2,NEWS,Marcos nanawagan na pabakunahan mga tsikiting,Abante News,"May 31, 2023",Nanawagan si Pangulong Ferdinand “Bongbong” Ma...
2,3,NEWS,Tulfo nakahanda 120 panukalang batas,Abante News,"May 31, 2023",Itutulak ni ACT-CIS party-list Rep. Erwin Tulf...
3,4,NEWS,BBM inarbor mga ‘tambaloslos’ kay Sara,Abante News,"May 31, 2023",NAGPAABOT ng pagbati si Pangulong Ferdinand “B...
4,5,NEWS,Presyo ng bilihin bumaba noong Mayo – BSP,Abante News,"May 31, 2023",Maaaring bumaba ang inflation nitong Mayo dahi...
...,...,...,...,...,...,...
4442,6477,NEWS,2 pang Omicron variant binabantayan,Abante News,"October 26, 2022",Hinikayat ng isang eksperto ang publiko na mag...
4443,6478,NEWS,Duda sa pagkamaty ng Bilibid middleman! Sino n...,Abante News,"October 26, 2022",Nais ni Pangulong Ferdinand ‘Bongbong’ Marcos ...
4444,6479,NEWS,15M Pinoy napraning sa droga – WHO,Abante News,"October 26, 2022",Tinaya ng World Health Organization (WHO) na m...
4445,6480,NEWS,Nursing topnotcher proud na ginapang pag-aaral,Abante News,"October 26, 2022",‘Di na kailangang magyabang kung nagsusumigaw ...


In [9]:
# Extract headline / lead rows from the filtered Abante articles
abanteHL_df = get_headlineLead(df=filtered_abante)
abanteHL_df

Unnamed: 0,text,label
0,Mga negosyante mula Bangkok type mamuhunan sa PH,
1,Maglalagak ng $2.5 billion o P140 bilyon na pa...,
2,Marcos nanawagan na pabakunahan mga tsikiting,
3,Nanawagan si Pangulong Ferdinand “Bongbong” Ma...,
4,Tulfo nakahanda 120 panukalang batas,
...,...,...
8312,Tinaya ng World Health Organization (WHO) na m...,
8313,Nursing topnotcher proud na ginapang pag-aaral,
8314,‘Di na kailangang magyabang kung nagsusumigaw ...,
8315,"Mycah Go, Lady Blazers maaga dinispatsa Beda",


In [10]:
# Save the full Abante headline/lead set; index=False leaves the row index out of the file
abante_full_path = f'{dir}Cleaned/abante_HL_full.csv'
abanteHL_df.to_csv(abante_full_path, index=False)

## **Bandera**

In [11]:
# Load the raw Bandera scrape
bandera_raw_path = f'{dir}Raw/Bandera_2025-02-01_NEWS.csv'
bandera = pd.read_csv(bandera_raw_path)
bandera

Unnamed: 0.1,Unnamed: 0,Title,Author,Date,Text
0,0,"Bagyong Betty nakapasok na sa bansa, ‘Signal n...",Pauline del Rosario,"May 27,",PHOTO: Facebook/Dost_pagasa\n\nPHOTO: Facebook...
1,1,Super Typhoon lalo pang lalakas habang papalap...,Pauline del Rosario,"May 26,",PHOTO: Facebook/Dost_pagasa\n\nPHOTO: Facebook...
2,2,"Super Typhoon papasok na sa Biyernes o Sabado,...",Pauline del Rosario,"May 25,",PHOTO: Facebook/Dost_pagasa\n\nPHOTO: Facebook...
3,3,"7 sugatan, P300M halaga ng pinsala sa nasunog ...",Pauline del Rosario,"May 22,",INQUIRER photo\n\nINQUIRER photo\nPITO ang nai...
4,4,"BSP nagbabala sa modus na ‘sangla-ATM’, mga mg...",Pauline del Rosario,"May 21,",INQUIRER file photo\n\nINQUIRER file photo\nNA...
...,...,...,...,...,...
4995,4995,PAGASA: Amihan umiihip na,Leifbilly Begas,"September 26,",\nNAGSISIMULA na umanong umihip ang Hanging Am...
4996,4996,19 POGO worker naospital sa ‘food poisoning’,John Roson,"September 26,",\nLABING-siyam na Chinese national na nagtatra...
4997,4997,No show ni Duterte sa turnover ceremony ng AFP...,Bella Cariaso,"September 26,",RODRIGO DUTERTE\n\nRODRIGO DUTERTE\nIPINAGTANG...
4998,4998,Duterte tiwala pa rin kay Albayalde,Bella Cariaso,"September 26,",\nSINABI ng Palasyo na tiwala pa rin si Pangul...


In [12]:
# Extract headline / lead rows from the Bandera articles
banderaHL_df = get_headlineLead(df=bandera)
banderaHL_df

Unnamed: 0,text,label
0,"Bagyong Betty nakapasok na sa bansa, ‘Signal n...",
1,PUMASOK na ng ating bansa ang binabantayang Su...,
2,Super Typhoon lalo pang lalakas habang papalap...,
3,UNTI-UNTI nang lumalapit sa ating bansa ang Su...,
4,"Super Typhoon papasok na sa Biyernes o Sabado,...",
...,...,...
9255,IPINAGTANGGOL ng Palasyo ang hindi pagsipot ni...,
9256,Duterte tiwala pa rin kay Albayalde,
9257,SINABI ng Palasyo na tiwala pa rin si Pangulon...,
9258,DOH kinumpirmang diphtheria ang ikinamatay ng ...,


In [13]:
# Save the full Bandera headline/lead set; index=False leaves the row index out of the file
bandera_full_path = f'{dir}Cleaned/bandera_HL_full.csv'
banderaHL_df.to_csv(bandera_full_path, index=False)

## **Balita**

In [14]:
# Load the raw Balita scrape
balita_raw_path = f'{dir}Raw/Balita_2025-01-31_NEWS.csv'
balita = pd.read_csv(balita_raw_path)
balita

Unnamed: 0,Title,Author,Date,Text
0,"GMA Network, ‘di inasahan ang tuluyang pagkala...",Raymond Lumagsao,"May 31, 2023",Ikinalungkot ng GMA Kapuso Network ang hindi i...
1,Mga may-ari ng nawasak na bahay sa bagyong Bet...,Rommel Tabbad,"May 31, 2023",Tumanggap na ng tulong pinansyal ang mga pamil...
2,"'Betty', inaasahang lalabas ng PAR ngayong Huw...",Balita Online,"May 31, 2023","Inaasahan ng Philippine Atmospheric, Geophysic..."
3,"BI, nagbabala vs call center job scam sa Myanm...",Balita Online,"May 31, 2023",Nagbabala ang Bureau of Immigration (BI) niton...
4,Sandro Marcos sa kaarawan ni VP Duterte: ‘We a...,Balita Online,"May 31, 2023",Para kay Senior Deputy Majority Leader at Iloc...
...,...,...,...,...
4992,"Guro-vlogger sa Misamis Occidental, may libren...",Richard de Leon,"January 10, 2023",Patuloy na kinalulugdan ng mga netizen ang vir...
4993,"Voter registration, matumal pa rin; publiko, h...",Mary Ann Santiago,"January 10, 2023",Hinikayat ng Commission on Elections (Comelec)...
4994,Ama ni McCoy de Leon hinggil sa hiwalayan ng M...,Nicole Therise Marcelo,"January 10, 2023",Hiyang-hiya raw ang ama ni McCoy de Leon na si...
4995,"DA, aprubado ang pag-angkat ng 21,060 MT na si...",Balita Online,"January 10, 2023",Sa hangaring mapababa ang tumataas na presyo n...


In [15]:
# Extract headline / lead rows from the Balita articles
balitaHL_df = get_headlineLead(df=balita)
balitaHL_df

Unnamed: 0,text,label
0,"GMA Network, ‘di inasahan ang tuluyang pagkala...",
1,Ikinalungkot ng GMA Kapuso Network ang hindi i...,
2,Mga may-ari ng nawasak na bahay sa bagyong Bet...,
3,Tumanggap na ng tulong pinansyal ang mga pamil...,
4,"'Betty', inaasahang lalabas ng PAR ngayong Huw...",
...,...,...
8517,Hiyang-hiya raw ang ama ni McCoy de Leon na si...,
8518,"DA, aprubado ang pag-angkat ng 21,060 MT na si...",
8519,Sa hangaring mapababa ang tumataas na presyo n...,
8520,"Chad Kinis, ginaya 'Bakit ka sad' post ni Donn...",


In [16]:
# Save the full Balita headline/lead set; index=False leaves the row index out of the file
balita_full_path = f'{dir}Cleaned/balita_HL_full.csv'
balitaHL_df.to_csv(balita_full_path, index=False)

# **Splitting the Datasets**

In [17]:
# Row counts of the three full headline/lead sets, in the order Abante, Bandera, Balita
abante_size, bandera_size, balita_size = (
    len(frame) for frame in (abanteHL_df, banderaHL_df, balitaHL_df)
)

print(f"Abante: {abante_size}, Bandera: {bandera_size}, Balita: {balita_size}")

Abante: 8317, Bandera: 9260, Balita: 8522


In [18]:
target_size = 25000  # Required total number of rows across the three sources

# Use all from Abante, distribute the rest equally.
# Every size is capped by what is actually available (and Abante by the target
# itself), so the numbers printed below always match what .head() can return.
abante_final_size = min(abante_size, target_size)
remaining_target = target_size - abante_final_size
bandera_final_size = min(bandera_size, remaining_target // 2)
# Balita takes whatever is still missing: the odd row of an uneven split, plus
# any shortfall left when Bandera has fewer rows than its half.
balita_final_size = min(balita_size, target_size - (abante_final_size + bandera_final_size))

# Print distribution
print(f"Abante: {abante_final_size}, Bandera: {bandera_final_size}, Balita: {balita_final_size}")

# Make a shortfall visible instead of silently producing a smaller dataset
planned_total = abante_final_size + bandera_final_size + balita_final_size
if planned_total < target_size:
    print(f"Warning: only {planned_total} of the {target_size} requested rows are available")

Abante: 8317, Bandera: 8341, Balita: 8342


In [19]:
# Trim every source to its allotted size by keeping its leading rows
# (the original row order is kept; nothing is shuffled here)
abante_final = abanteHL_df.head(n=abante_final_size)
bandera_final = banderaHL_df.head(n=bandera_final_size)
balita_final = balitaHL_df.head(n=balita_final_size)

abante_final

Unnamed: 0,text,label
0,Mga negosyante mula Bangkok type mamuhunan sa PH,
1,Maglalagak ng $2.5 billion o P140 bilyon na pa...,
2,Marcos nanawagan na pabakunahan mga tsikiting,
3,Nanawagan si Pangulong Ferdinand “Bongbong” Ma...,
4,Tulfo nakahanda 120 panukalang batas,
...,...,...
8312,Tinaya ng World Health Organization (WHO) na m...,
8313,Nursing topnotcher proud na ginapang pag-aaral,
8314,‘Di na kailangang magyabang kung nagsusumigaw ...,
8315,"Mycah Go, Lady Blazers maaga dinispatsa Beda",


In [20]:
# Destination file -> trimmed frame, written in the listed order without the index column
final_outputs = {
    f'{dir}Cleaned/abante_HL_final.csv': abante_final,
    f'{dir}Cleaned/bandera_HL_final.csv': bandera_final,
    f'{dir}Cleaned/balita_HL_final.csv': balita_final,
}
for final_path, final_frame in final_outputs.items():
    final_frame.to_csv(final_path, index=False)

In [21]:
# Shuffle and split every source on its own: 20% test rows, fixed seed 42
split_options = dict(test_size=0.2, random_state=42, shuffle=True)

abante_train, abante_test = train_test_split(abante_final, **split_options)
bandera_train, bandera_test = train_test_split(bandera_final, **split_options)
balita_train, balita_test = train_test_split(balita_final, **split_options)

In [22]:
# Destination file -> split frame for every news site, written in the listed order
split_outputs = {
    f'{dir}Cleaned/abante_HL_train.csv': abante_train,
    f'{dir}Cleaned/abante_HL_test.csv': abante_test,
    f'{dir}Cleaned/bandera_HL_train.csv': bandera_train,
    f'{dir}Cleaned/bandera_HL_test.csv': bandera_test,
    f'{dir}Cleaned/balita_HL_train.csv': balita_train,
    f'{dir}Cleaned/balita_HL_test.csv': balita_test,
}
for split_path, split_frame in split_outputs.items():
    split_frame.to_csv(split_path, index=False)

In [23]:
# Draw fixed-size subsets without replacement (seed 42) from every split.
# Per-site sizes add up to 7999 train rows (2666 + 2666 + 2667) and 1999 test rows (666 + 666 + 667).
sample_options = dict(random_state=42, replace=False)

abante_train_sample = abante_train.sample(n=2666, **sample_options)
abante_test_sample = abante_test.sample(n=666, **sample_options)

bandera_train_sample = bandera_train.sample(n=2666, **sample_options)
bandera_test_sample = bandera_test.sample(n=666, **sample_options)

balita_train_sample = balita_train.sample(n=2667, **sample_options)
balita_test_sample = balita_test.sample(n=667, **sample_options)

In [24]:
# Destination file -> sampled split for every news site, written in the listed order
sample_outputs = {
    f'{dir}Cleaned/abante_HL_train_sample.csv': abante_train_sample,
    f'{dir}Cleaned/abante_HL_test_sample.csv': abante_test_sample,
    f'{dir}Cleaned/bandera_HL_train_sample.csv': bandera_train_sample,
    f'{dir}Cleaned/bandera_HL_test_sample.csv': bandera_test_sample,
    f'{dir}Cleaned/balita_HL_train_sample.csv': balita_train_sample,
    f'{dir}Cleaned/balita_HL_test_sample.csv': balita_test_sample,
}
for sample_path, sample_frame in sample_outputs.items():
    sample_frame.to_csv(sample_path, index=False)

# **Combining the Datasets**

In [25]:
# Stack the three full (untrimmed) headline/lead sets under one continuous index
full_parts = [abanteHL_df, banderaHL_df, balitaHL_df]
combined_HL = pd.concat(full_parts, ignore_index=True)
combined_HL

Unnamed: 0,text,label
0,Mga negosyante mula Bangkok type mamuhunan sa PH,
1,Maglalagak ng $2.5 billion o P140 bilyon na pa...,
2,Marcos nanawagan na pabakunahan mga tsikiting,
3,Nanawagan si Pangulong Ferdinand “Bongbong” Ma...,
4,Tulfo nakahanda 120 panukalang batas,
...,...,...
26094,Hiyang-hiya raw ang ama ni McCoy de Leon na si...,
26095,"DA, aprubado ang pag-angkat ng 21,060 MT na si...",
26096,Sa hangaring mapababa ang tumataas na presyo n...,
26097,"Chad Kinis, ginaya 'Bakit ka sad' post ni Donn...",


In [26]:
# Save the combined full set. `dir` already ends with '/', so no extra slash
# is added before 'Cleaned' (keeps the path in the same form as the other cells).
combined_HL.to_csv(f'{dir}Cleaned/combined_HL_full.csv', index=False)

In [27]:
# Stack the three size-trimmed sets under one continuous index
final_parts = [abante_final, bandera_final, balita_final]
combined_final = pd.concat(final_parts, ignore_index=True)
combined_final

Unnamed: 0,text,label
0,Mga negosyante mula Bangkok type mamuhunan sa PH,
1,Maglalagak ng $2.5 billion o P140 bilyon na pa...,
2,Marcos nanawagan na pabakunahan mga tsikiting,
3,Nanawagan si Pangulong Ferdinand “Bongbong” Ma...,
4,Tulfo nakahanda 120 panukalang batas,
...,...,...
24995,Isinalaysay ng isang contestant na si Ellowe A...,
24996,"Galvez, nanumpa na bilang DND chief",
24997,Nanumpa sa tungkulin sina Department of Nation...,
24998,Guilty sa graft case: Ex-Maguindanao Governor ...,


In [28]:
# Save the combined trimmed set. `dir` already ends with '/', so no extra slash
# is added before 'Cleaned' (keeps the path in the same form as the other cells).
combined_final.to_csv(f'{dir}Cleaned/combined_HL_final.csv', index=False)

In [29]:
# Stack the per-site train splits (the unsampled ones) under one continuous index
train_parts = [abante_train, bandera_train, balita_train]
train_combined = pd.concat(train_parts, ignore_index=True)
train_combined

Unnamed: 0,text,label
0,"Sa 110, missing 33 pa",
1,Masarap talagang chumibog ng malalamig na pagk...,
2,Todas sa sama ng panahon 43 na — NDRRMC,
3,Mga gov’t worker may tig-20K bonus pa,
4,3 nirapido ng ‘Bonnet Gang’ sa kotse,
...,...,...
19993,"Kabataan Partylist, nakiisa sa kilos-protesta ...",
19994,"DOH, nagbabala sa publiko vs karaniwang sakit ...",
19995,Tuloy ang transport strike sa Marso 6 hanggang...,
19996,"Barko sa Palawan, nasunog, lumubog; 2 tripulan...",


In [30]:
# Stack the per-site test splits (the unsampled ones) under one continuous index
test_parts = [abante_test, bandera_test, balita_test]
test_combined = pd.concat(test_parts, ignore_index=True)
test_combined

Unnamed: 0,text,label
0,10 alagang hayop nalitson sa sunog,
1,"Kagawad, isa pa dedbol, tserman kritikal sa ts...",
2,Suwerte para sa isang magdyowa mula sa North Y...,
3,Isang limang-taong gulang na batang lalaki ang...,
4,"10 pumuga sa Pasay, balik-selda na lahat",
...,...,...
4997,Tuesday Vargas hindi nasikmurang binastos ni R...,
4998,"Anne Curtis, nagdiwang ng kaarawan sa ‘It’s Sh...",
4999,Kaagad na itinanggi ni Police Regional Office ...,
5000,Kamakailan lamang ay ibinida ng social media p...,


In [31]:
# Write the combined train split first, then the combined test split, both without the index column
combined_split_outputs = {
    f'{dir}Cleaned/combined_HL_train.csv': train_combined,
    f'{dir}Cleaned/combined_HL_test.csv': test_combined,
}
for combined_path, combined_frame in combined_split_outputs.items():
    combined_frame.to_csv(combined_path, index=False)

In [32]:
# Stack the per-site sampled train subsets under one continuous index
train_sample_parts = [abante_train_sample, bandera_train_sample, balita_train_sample]
train_sample_combined = pd.concat(train_sample_parts, ignore_index=True)
train_sample_combined

Unnamed: 0,text,label
0,Bagama’t patuloy pa sa pananasala ang Severe T...,
1,Bitayin mga promotor ng private army – Bato,
2,Altai mining sa Sibuyan kakalkalin ng Senado,
3,Sinalakay ng mga tauhan ng Bureau of Customs (...,
4,"Susan Ople biyaheng Saudi, kukulitin P4B utang...",
...,...,...
7994,Naospital ang social media star at viral singe...,
7995,Kaya raw niya naging kaibigan si Darryl dahil ...,
7996,Inanunsyo ng Miss Universe Philippines Organiz...,
7997,"Marcos, lumipad na pa-Japan",


In [33]:
# Stack the per-site sampled test subsets under one continuous index
test_sample_parts = [abante_test_sample, bandera_test_sample, balita_test_sample]
test_sample_combined = pd.concat(test_sample_parts, ignore_index=True)
test_sample_combined

Unnamed: 0,text,label
0,Isang umano’y tinaguriang ‘shabu queen’ at lid...,
1,Anthrax infection kumalat sa Cagayan,
2,TESDA: Mga tech-voc graduate swak sa trabaho,
3,Nagkamit ng unang pwesto ang isang Filipina st...,
4,NEDA inaprub tapyas taripa sa e-vehicle,
...,...,...
1994,"Kinaroroonan ni Teves, nananatiling 'misteryo'",
1995,"Viy Cortez pinahanap ang lumuhod, nagmakaawang...",
1996,"Ginebra, kampeon sa PBA Commissioner's Cup--Ba...",
1997,Isang bouquet na gawa sa sari-saring pagkain t...,


In [34]:
# Write the combined sampled train set first, then the combined sampled test set, both without the index column
combined_sample_outputs = {
    f'{dir}Cleaned/combined_HL_train_8000.csv': train_sample_combined,
    f'{dir}Cleaned/combined_HL_test_2000.csv': test_sample_combined,
}
for combined_sample_path, combined_sample_frame in combined_sample_outputs.items():
    combined_sample_frame.to_csv(combined_sample_path, index=False)