In [None]:
pip install rapidfuzz

In [None]:
# ==============================================
# A). EXTERNAL KNOWLEDGE BASE FETCHER
# ==============================================
import pandas as pd
import requests
import re
import json

print(" INITIALIZING KNOWLEDGE ACQUISITION ")

# 1. GENERATE STOPWORDS CSV
# The first line of `csv_content` is the CSV column header; every following
# line is a comma-separated run of Filipino stopwords.
csv_content = """word
ang,sa,ng,mga,at,ay,na,ni,si,ko,ako,niya,kanya,kanyang,siyang
tayo,kami,kayo,sila,ito,iyan,iyon,nito,niyan,noon,ngayon
kanilang,kanila,namin,inyo,aming,aking,niyo,nila,kay,kina
hindi,dahil,isang,naman,noong,umano,matapos,sinabi,nitong
bilang,mula,para,pero,kung,upang,habang,bago,pagkatapos
pa,din,rin,lamang,lang,ba,kasi,pala,sana,daw,raw,maging
mo,ka,po,opo,may,wala,mas,bawat,iba,lahat,kapwa,tulad
gaya,ngunit,subalit,datapwat,kundi,kapag,maski,samantala
basta,kaysa,sakali,kabilang,ayon,tungo,laban,ukol,hinggil
ano,sino,saan,kailan,paano,bakit,alin,ilan,sang,naging
muli,man,kaya,dito,diyan,doon,san,jan,ganito,ganyan
"""
# Peel the header off BEFORE tokenizing. Tokenizing the whole literal would
# emit the header text "word" as a data row, turning it into a stopword.
header, *stopword_rows = csv_content.splitlines()
words = [
    token.strip()
    for row in stopword_rows
    for token in row.split(",")
    if token.strip()
]

# One stopword per line under a single header row.
with open("filipino_stopwords.csv", "w", encoding='utf-8') as f:
    f.write(header + "\n")
    f.writelines(word + "\n" for word in words)
print("‚úÖ 'filipino_stopwords.csv' created.")

# 2. FETCH OFFICIAL PH GEOGRAPHIC CODES (PSGC)
# Using the master branch raw link which is the most stable
base_url = "https://raw.githubusercontent.com/clavearnel/philippines-region-province-citymun-brgy/master/json"

def fetch_json(filename, timeout=30):
    """Download `{base_url}/{filename}` and return its records as a list.

    Args:
        filename: Name of the JSON file under `base_url`.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the notebook forever.

    Returns:
        A list of records. If the payload is a dict, the first list-valued
        entry is returned. An empty list is returned when the download or
        JSON decoding fails, when the payload is a list whose first element
        is a string, or when the payload has any other shape. Failures are
        reported with a printed message instead of an exception so the caller
        can fall back to a built-in database.
    """
    url = f"{base_url}/{filename}"
    print(f"‚¨áÔ∏è Downloading: {filename}...")
    try:
        r = requests.get(url, timeout=timeout)
        r.raise_for_status()
        data = r.json()
    except Exception as e:
        # Deliberately best-effort: report and let the caller use its fallback.
        print(f"‚ùå Error fetching {url}: {e}")
        return []

    if isinstance(data, dict):
        # If data is wrapped like {"records": [...]}, try to find the list
        print(f"‚ö†Ô∏è Warning: {filename} is a Dictionary, looking for list content...")
        for key, val in data.items():
            if isinstance(val, list):
                return val
        return [] # Fail safe

    if isinstance(data, list):
        # Ensure elements are dicts (only the first element is inspected)
        if len(data) > 0 and isinstance(data[0], str):
            print(f"‚ùå Error: {filename} contains strings, expected objects.")
            return []
        return data

    print(f"‚ùå Error: Unknown format for {filename}")
    return []

# Pull the three PSGC reference tables (each is a list; empty on failure).
regions = fetch_json("refregion.json")
provinces = fetch_json("refprovince.json")
cities = fetch_json("refcitymun.json")

print(f"‚úÖ Downloaded: {len(regions)} Regions, {len(provinces)} Provinces, {len(cities)} Cities/Towns")

# 3. BUILD HIERARCHY DB
print("‚öôÔ∏è Building Relational Hierarchy...")

if regions and cities:
    # regCode -> region description
    reg_map = {
        rec['regCode']: rec.get('regDesc', 'Unknown')
        for rec in regions
        if isinstance(rec, dict) and 'regCode' in rec
    }

    # provCode -> province name plus the code of the region it belongs to
    prov_map = {
        rec['provCode']: {
            'name': rec.get('provDesc', 'Unknown'),
            'regCode': rec.get('regCode', '00'),
        }
        for rec in provinces
        if isinstance(rec, dict) and 'provCode' in rec
    }

    location_db = []

    for city in cities:
        if not isinstance(city, dict):
            continue

        name = city.get('citymunDesc', '').title()
        prov_code = city.get('provCode')

        if prov_code in prov_map:
            # Known province: take its name and look the region up through it.
            province_rec = prov_map[prov_code]
            prov_name = province_rec['name'].title()
            reg_name = reg_map.get(province_rec['regCode'], "Unknown")
        else:
            # No province record: default to Metro Manila (NCR) and use the
            # city's own region field when it has one.
            prov_name = "Metro Manila"
            reg_name = city['regDesc'] if city.get('regDesc') else "Unknown"

        location_db.append({
            'alias': name.lower(),
            'official_name': name,
            'province': prov_name,
            'region': reg_name,
            'type': 'Local'
        })
else:
    print("‚ö†Ô∏è CRITICAL WARNING: Download failed. Using fallback minimal database.")
    # Fallback so pipeline doesn't crash
    location_db = [
        {'alias': 'manila', 'official_name': 'Manila', 'province': 'Metro Manila', 'region': 'NCR', 'type': 'Local'},
        {'alias': 'quezon city', 'official_name': 'Quezon City', 'province': 'Metro Manila', 'region': 'NCR', 'type': 'Local'},
        {'alias': 'cebu', 'official_name': 'Cebu City', 'province': 'Cebu', 'region': 'Region VII', 'type': 'Local'},
        {'alias': 'davao', 'official_name': 'Davao City', 'province': 'Davao del Sur', 'region': 'Region XI', 'type': 'Local'}
    ]

# Add Major Countries (International Scope)
countries = ['China', 'United States', 'USA', 'America', 'Japan', 'South Korea',
             'Singapore', 'Australia', 'Canada', 'Russia', 'Ukraine', 'UK', 'Saudi Arabia']
location_db.extend(
    {'alias': country.lower(), 'official_name': country, 'province': 'N/A', 'region': 'International', 'type': 'International'}
    for country in countries
)

# Save: keep the first row for each alias, then write the lookup table to disk.
df_loc = pd.DataFrame(location_db).drop_duplicates(subset='alias')
df_loc.to_csv("master_locations.csv", index=False)
print(f"‚úÖ DATABASE READY: 'master_locations.csv' ({len(df_loc)} entries)")

In [None]:
# ==============================================
# B). TALA CORE PIPELINE (HYBRID + UNSUPERVISED)
# ==============================================
import json
import pandas as pd
import numpy as np
import re
import nltk
from rapidfuzz import process, fuzz
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
from textblob import TextBlob
from nltk.corpus import stopwords

#  1. SETUP & LOADING
print(" INITIALIZING TALA ")

# Load Knowledge Base: alias -> {official_name, province, region, type}
try:
    loc_df = pd.read_csv("master_locations.csv")
    loc_db = loc_df.set_index('alias').T.to_dict()
    print(f"‚úÖ Loaded Knowledge Base: {len(loc_db)} Locations")
except FileNotFoundError:
    print("‚ùå Error: 'master_locations.csv' not found. Run Block 1.")
    loc_db = {}

# Load Stopwords. Still best-effort (the pipeline can run without the custom
# list), but the failure is reported instead of being swallowed, and
# `except Exception` no longer traps KeyboardInterrupt / SystemExit the way a
# bare `except:` does.
try:
    stop_df = pd.read_csv("filipino_stopwords.csv")
    # dropna/astype guard against blank or NA-parsed rows putting a float NaN
    # into the stopword list.
    custom_stops = set(stop_df['word'].dropna().astype(str).str.lower())
except Exception as exc:
    print(f"Warning: could not load 'filipino_stopwords.csv' ({exc}); continuing without custom stopwords.")
    custom_stops = set()

# Make sure NLTK's stopword corpus is available locally before reading it.
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords', quiet=True)

# Combined Filipino + English stop list. Sorted so the list order is the same
# on every run; kept as a list so it can be passed as `stop_words=` later on.
final_stop_list = sorted(custom_stops.union(stopwords.words('english')))

#  2. FULL INGESTION
files = ['train.json', 'test.json', 'validation.json']
raw_data = []

print("\n PHASE 1: FULL INGESTION ")
import random # Ensure random is imported

for filename in files:
    try:
        with open(filename, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except FileNotFoundError:
        print(f"‚ùå Missing: {filename}")
        continue

    # Tag every record with the file it came from.
    # NOTE(review): assumes each file holds a JSON list of dict records — confirm.
    for entry in data:
        entry['src'] = filename
    raw_data.extend(data)
    print(f"‚úÖ Loaded {len(data):,} from {filename}")

# Stop here with a clear message when nothing was loaded; otherwise the
# 'website' lookup below fails on an empty frame with an opaque KeyError.
if not raw_data:
    raise FileNotFoundError(
        f"No articles were loaded; expected at least one of {files} in the working directory."
    )

# This mixes AbanteTNT with the other outlets before processing starts
random.seed(42)
random.shuffle(raw_data)

df = pd.DataFrame(raw_data)
print(f"üìä Total Dataset: {len(df):,} Articles")
print("Outlet Check (Verify Mix):")
print(df['website'].value_counts().head())


#  3. CLEANING
print("\n PHASE 2: PROCESSING ")
def clean_text(text_obj):
    """Reduce an article body to a space-joined string of lowercase content words.

    Args:
        text_obj: The article body. A list is joined with spaces (items are
            converted with `str`); anything else is converted with `str`.

    Returns:
        The lowercased text with every character other than ASCII letters and
        whitespace removed (so digits, punctuation and accented letters are
        dropped), keeping only words longer than three characters that are
        not in `final_stop_list`.
    """
    if isinstance(text_obj, list):
        # map(str, ...) keeps a stray non-string item from raising TypeError.
        text = " ".join(map(str, text_obj))
    else:
        text = str(text_obj)

    letters_only = re.sub(r'[^a-zA-Z\s]', '', text.lower())

    # Set membership is O(1); testing each token against the list directly
    # would rescan the whole stop list for every word of every article.
    stop_set = set(final_stop_list)
    return " ".join(
        w for w in letters_only.split() if len(w) > 3 and w not in stop_set
    )

# Derive the cleaned text column that feeds the vectorizer and topic model.
df['clean_text'] = df['body'].apply(clean_text)

#  4. HYBRID TOPIC MODELING (The Logic Fix)
print("\n PHASE 3: HYBRID AI TRAINING ")

def _cap_outlet(outlet_rows):
    """Return at most 3000 rows sampled (fixed seed) from one outlet's articles."""
    take = min(len(outlet_rows), 3000)
    return outlet_rows.sample(n=take, random_state=42)

# Strategy: fit on a per-outlet capped subset, then predict on the full dataset.
training_df = df.groupby('website').apply(_cap_outlet).reset_index(drop=True)

print(f"üß† Training Vectorizer on {len(training_df):,} balanced articles...")
vectorizer = TfidfVectorizer(
    max_df=0.90,
    min_df=5,
    max_features=5000,
    stop_words=final_stop_list,
)
tfidf_train = vectorizer.fit_transform(training_df['clean_text'])

print("üß† Training Topic Model...")
nmf_model = NMF(n_components=12, random_state=42, init='nndsvd')
nmf_model.fit(tfidf_train)

# Label each topic with its three highest-weighted terms, upper-cased.
feature_names = vectorizer.get_feature_names_out()
topic_labels = {
    topic_idx: " | ".join(
        feature_names[j].upper() for j in weights.argsort()[:-4:-1]
    )
    for topic_idx, weights in enumerate(nmf_model.components_)
}

print("üöÄ Applying Topic Model to FULL Dataset...")
tfidf_full = vectorizer.transform(df['clean_text'])
doc_topic_weights = nmf_model.transform(tfidf_full)
# Each article gets the topic with the largest weight.
df['topic_id'] = doc_topic_weights.argmax(axis=1)
df['topic_label'] = df['topic_id'].map(topic_labels)

#  5. ENTITY INTELLIGENCE (Locations & Personas)
print("\n PHASE 4: ENTITY RESOLUTION ")

def resolve_entities(text_obj):
    """Split capitalized phrases in an article into known locations and other names.

    Args:
        text_obj: The article body; a list is joined with spaces, anything
            else is converted with `str`.

    Returns:
        A `(locations, candidates)` tuple. `locations` is a de-duplicated list
        of "Official Name (Region)" strings for phrases whose lowercase form
        is a key of `loc_db`. `candidates` lists, in order of appearance and
        with repeats, every other capitalized phrase that is at least four
        characters long and not blacklisted.
    """
    if isinstance(text_obj, list):
        text = " ".join(text_obj)
    else:
        text = str(text_obj)

    # Outlet names and caption boilerplate that should never count as entities.
    blacklist = {'Abante', 'Tonite', 'News', 'Photo', 'Source', 'Courtesy'}

    found_locs = set()
    potential_people = []

    # A candidate is a run of one or more whitespace-separated Capitalized words.
    for phrase in re.findall(r'([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*)', text):
        if len(phrase) >= 4 and phrase not in blacklist:
            key = phrase.lower()
            if key not in loc_db:
                potential_people.append(phrase)
            else:
                record = loc_db[key]
                # Output: "City Name (Region)"
                found_locs.add(f"{record['official_name']} ({record['region']})")

    return list(found_locs), potential_people

from collections import Counter

print("Extracting & Mapping Locations...")
# Each result is a (locations, candidate_names) pair; split it into two columns.
results = df['body'].apply(resolve_entities)
df['locations'] = results.apply(lambda pair: pair[0])
df['raw_entities'] = results.apply(lambda pair: pair[1])

#  6. UNSUPERVISED PERSONA CLUSTERING
print("üß† Normalizing Personas (Fuzzy Logic)...")
# 1. Gather every candidate name across all articles (repeats kept for counting)
all_people = []
for entity_list in df['raw_entities']:
    all_people.extend(entity_list)

# 2. Identify "Anchors" (Top 200 most frequent names)
name_counts = Counter(all_people)
top_anchors = [name for name, _count in name_counts.most_common(200)]

def normalize_personas(name_list):
    """Map an article's candidate names onto anchor personas, keeping at most five.

    Each distinct name is fuzzy-matched against `top_anchors` using
    `fuzz.token_set_ratio` with a cutoff of 90. A match is replaced by its
    anchor; an unmatched name is kept only if it contains a space; anything
    else is dropped.

    Args:
        name_list: Candidate names for one article.

    Returns:
        Up to five distinct personas in order of first appearance. Preserving
        that order makes the selection reproducible; slicing a `set` would
        pick an arbitrary five that can change from run to run.
    """
    max_personas = 5
    personas = []
    emitted = set()   # personas already in the result
    checked = set()   # raw names already resolved, so repeats skip the fuzzy match

    for name in name_list:
        if name in checked:
            continue
        checked.add(name)

        # score_cutoff=90 means very high similarity required
        match = process.extractOne(name, top_anchors, scorer=fuzz.token_set_ratio, score_cutoff=90)
        if match:
            persona = match[0]  # Map to Anchor
        elif " " in name:
            persona = name      # Unmatched multi-word name: keep as-is
        else:
            continue            # Unmatched single word: discard

        if persona not in emitted:
            emitted.add(persona)
            personas.append(persona)
            if len(personas) == max_personas:
                break  # later names can no longer affect the result

    return personas

# Resolve each article's raw entity list into its personas column.
df['personas'] = df['raw_entities'].map(normalize_personas)

#  7. EXPORT & DATE REPAIR
print("\n PHASE 5: FINALIZING & DATE REPAIR ")

def simple_sentiment(text):
    """Return TextBlob's polarity for `text`, negated when a positive score co-occurs with "hindi".

    Args:
        text: The text to score; "hindi" is looked for as a plain substring.

    Returns:
        The polarity reported by TextBlob, with its sign flipped only when
        it is positive and `text` contains "hindi".
    """
    polarity = TextBlob(text).sentiment.polarity
    flip_sign = polarity > 0 and 'hindi' in text
    return -polarity if flip_sign else polarity

def _sentiment_text(body):
    """Flatten a raw article body (string or list of strings) into lowercase text."""
    joined = " ".join(map(str, body)) if isinstance(body, list) else str(body)
    return joined.lower()

# Score sentiment on the lowercased RAW body, not on 'clean_text'. The cleaned
# text has had every stopword removed, Filipino and English, and "hindi" is
# one of the generated Filipino stopwords. The "hindi" negation rule inside
# simple_sentiment therefore could not fire as intended on cleaned text.
df['sentiment'] = df['body'].apply(_sentiment_text).apply(simple_sentiment)

# 1. Try strict parsing for Abante format (@): keep only the part before '@'
# NOTE(review): depending on the pandas version, to_datetime may infer a single
# format from the first value and coerce rows in other formats to NaT — confirm
# all outlets share a date format, or pass an explicit `format=`.
df['date_clean'] = pd.to_datetime(df['date'].str.split('@').str[0], errors='coerce')

# 2. Check for failures (NaT)
missing_mask = df['date_clean'].isna()
failed_count = missing_mask.sum()

if failed_count > 0:
    print(f"‚ö†Ô∏è Warning: {failed_count:,} articles had invalid date formats.")
    print("üõ†Ô∏è Applying Repair: Filling invalid dates with the median date of the dataset.")

    # 3. Fill invalid dates with the Median (Middle) date so they don't get deleted
    median_date = df['date_clean'].median()
    df.loc[missing_mask, 'date_clean'] = median_date

# Select Columns
df_final = df[[
    'title', 'date_clean', 'category', 'website',
    'topic_label', 'sentiment', 'personas', 'locations', 'url', 'clean_text'
]]

output_file = 'tala_final.parquet'
df_final.to_parquet(output_file)
print(f"\n‚úÖ PIPELINE SUCCESS. Saved to {output_file}")
print("Final Outlet Verification (Should match Raw):")
print(df_final['website'].value_counts())

In [None]:
# ==============================================
# C). EXPORT TO DRIVE
# ==============================================

from google.colab import drive
import shutil

# 1. Mount Google Drive so it is reachable under /content/drive
drive.mount('/content/drive')

# 2. Copy the pipeline output into the root of "My Drive"
source = '/content/tala_final.parquet'
destination = '/content/drive/MyDrive/tala_final.parquet'

try:
    shutil.copy(source, destination)
except FileNotFoundError:
    print("‚ùå Error: Could not find 'tala_final.parquet'. Did the previous step finish?")
else:
    # Only reached when the copy completed without raising.
    print(f"‚úÖ SUCCESS! File saved to your Google Drive at: {destination}")
    print("Go to drive.google.com to download it safely.")