# Preprocess


## Initialize

In [None]:
# Mount Google Drive so the notebook can read and write the project files.
from google.colab import drive
drive.mount('/content/drive')

# Project root on the mounted Drive and the dataset's base file name (without
# extension). Plain string literal: the original f-prefix had no placeholders.
base_directory = '/content/drive/MyDrive/Analisis-Sentimen-CNBC'
dataset_file_name = 'dataset'

## Cleaning & Tokenization

In [None]:
import pandas as pd
import re
from transformers import BertTokenizer

# Initialize the IndoBERT tokenizer (pretrained vocabulary loaded by name via
# BertTokenizer.from_pretrained).
tokenizer = BertTokenizer.from_pretrained('indobenchmark/indobert-large-p2')

# Read the dataset CSV from <base_directory>/Dataset/<dataset_file_name>.csv
df = pd.read_csv(f'{base_directory}/Dataset/{dataset_file_name}.csv')  # Adjust to your dataset path

def smart_lowercase(text):
    """Lowercase *text* token by token, preserving short all-caps tokens.

    The text is split on whitespace. A token is kept unchanged when
    ``str.isupper()`` is true for it and its length is between 2 and 6
    characters inclusive (presumably acronyms/tickers); every other token
    is lowercased. Tokens are re-joined with single spaces, so runs of
    whitespace collapse and leading/trailing whitespace disappears.
    """
    def _convert(word):
        # Only short, fully upper-case tokens keep their casing.
        keep_case = word.isupper() and 2 <= len(word) <= 6
        return word if keep_case else word.lower()

    return ' '.join(_convert(word) for word in text.split())

def preprocess(text):
    """Normalize a headline string before tokenization.

    Applies ``smart_lowercase`` and then a fixed sequence of regex rewrites
    that normalize number/currency notation, pad punctuation with spaces,
    drop characters outside a small allowed set, and collapse whitespace.
    The order of the substitutions matters: later rules rely on the output
    of earlier ones.

    Args:
        text: The raw headline (a ``str``).

    Returns:
        The cleaned, single-spaced string.
    """
    text = smart_lowercase(text)
    # "7.000-an" / "7.000 an" -> "7000-an". The trailing \b restricts the rule
    # to a standalone "an" suffix; without it the rule also consumed the start
    # of a following word ("2.000 anggota" became "2000-anggota").
    text = re.sub(r'(\d+)\.(\d+)[ -]?an\b', r'\1\2-an', text)
    # Decimal comma in a percentage -> decimal point: "5,25%" -> "5.25%".
    text = re.sub(r'(\d+),(\d+)%', r'\1.\2%', text)
    # Separate the dollar sign from "US"/"USD" (the "$" itself is removed by
    # the character filter further down, leaving the amount as its own token).
    text = re.sub(r'(\bUS|\bUSD)\$', r'\1 $ ', text, flags=re.IGNORECASE)
    # Rupiah amount with decimal comma and a t/m/k letter: "rp1,5t" -> "rp 1.5t".
    text = re.sub(r'(rp)(\d+),(\d+)\s*([tmk])', r'\1 \2.\3\4', text, flags=re.IGNORECASE)
    # Rupiah amount with decimal comma: "rp1,5" -> "rp 1.5".
    text = re.sub(r'(rp)(\d+),(\d+)', r'\1 \2.\3', text, flags=re.IGNORECASE)
    # Rupiah amount glued to the prefix: "rp100" -> "rp 100".
    text = re.sub(r'(rp)(\d+)', r'\1 \2', text, flags=re.IGNORECASE)
    # Any remaining comma between digits -> decimal point.
    text = re.sub(r'(\d+),(\d+)', r'\1.\2', text)
    # Split a lowercase t/m/k letter off a decimal number: "1.5t" -> "1.5 t".
    text = re.sub(r'(\d+\.\d+)([tmk])', r'\1 \2', text)
    # Pad punctuation with spaces when the character before it or the one
    # after it is a non-digit; punctuation between two digits ("1.5") is kept.
    text = re.sub(r'(?<=[^\d])([.,!?()":])|([.,!?()":])(?=[^\d])', r' \1\2 ', text)
    # Drop everything except ASCII letters, digits, . , ! ? ( ) % " : - and whitespace.
    text = re.sub(r'[^a-zA-Z0-9.,!?()%":\s-]', '', text)
    # Collapse whitespace runs and trim the ends.
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Preprocess and tokenize: clean every headline in the 'judul' column, then
# run the IndoBERT tokenizer over each cleaned headline.
texts_cleaned = [preprocess(title) for title in df['judul']]
tokens_cleaned = [tokenizer.tokenize(cleaned_title) for cleaned_title in texts_cleaned]

# Add the results to the dataframe: 'text' directly after 'judul',
# and 'tokens' directly after 'text'.
title_position = df.columns.get_loc('judul')
df.insert(title_position + 1, 'text', texts_cleaned)
df.insert(title_position + 2, 'tokens', tokens_cleaned)

# Save the result next to the source dataset
output_path = f'{base_directory}/Dataset/{dataset_file_name}-preprocessed.csv'
df.to_csv(output_path, index=False)
print("✅ File berhasil disimpan dengan tokenisasi IndoBERT di kolom `tokens`!")


# Split with Jaccard Similarity

In [None]:
# Shell command: delete pip's cache directory before installing.
!rm -rf ~/.cache/pip

# Shell command: install the `transformers` package, which this notebook
# imports BertTokenizer from.
!pip install transformers

In [None]:
from transformers import BertTokenizer
import pandas as pd
from sklearn.model_selection import StratifiedShuffleSplit
import os
from collections import Counter
from datetime import datetime
import time

# === 1. Load Dataset ===
# Reads <dataset_file_name>-preprocessed.csv; the cells below use its
# 'text', 'tokens' and 'sentimen' columns.
df = pd.read_csv(f'{base_directory}/Dataset/{dataset_file_name}-preprocessed.csv')

# === 2. Setup Split Ratio ===
test_size = 0.3  # Change to the desired test (validation) ratio
# Whole-number percentages used to name the output folder (e.g. "70-30").
# round() is used instead of int() because int() truncates and float products
# are inexact (0.29 * 100 == 28.999999999999996 would give 28). The train
# share is derived as the complement so the two always sum to 100.
validation_size = round(test_size * 100)
train_size = 100 - validation_size
n_splits = 100  # number of random split attempts

# === 3. Load Tokenizer IndoBERT ===
tokenizer = BertTokenizer.from_pretrained('indobenchmark/indobert-large-p2')

# === 4. Tokenisasi dan Jaccard Similarity ===
def get_token_set(token_lists):
    """Return the set of distinct tokens found across *token_lists*.

    Args:
        token_lists: Iterable whose elements are either the string form of a
            Python list of tokens (what the 'tokens' column becomes after a
            CSV round trip) or an already-parsed iterable of tokens.

    Returns:
        A ``set`` containing every token from every element.

    Raises:
        ValueError, SyntaxError: If a string element is not a valid Python
            literal.
    """
    # Local import: ast is not among the modules imported at the top of the cell.
    import ast

    tokens = set()
    for token_list in token_lists:
        if isinstance(token_list, str):
            # NOTE(security): this used to be eval(), which executes arbitrary
            # code found in the CSV. literal_eval only accepts Python literals.
            token_list = ast.literal_eval(token_list)
        tokens.update(token_list)
    return tokens

def jaccard_similarity(a, b):
    """Return the Jaccard similarity |a & b| / |a | b| of two sets.

    Returns 0 when both sets are empty (their union is empty), which avoids
    a division by zero.
    """
    union = a | b
    if not union:
        return 0
    return len(a & b) / len(union)

# === 5. Run Stratified Shuffle Split + keep the split with the highest similarity ===
# Each candidate split is scored by the Jaccard similarity between the token
# vocabulary of its training rows and that of its validation rows.
X = df['text']
y = df['sentimen']
sss = StratifiedShuffleSplit(n_splits=n_splits, test_size=test_size, random_state=42)

best_similarity = 0
best_split = None
best_iter = 0

print(f"🔁 Mulai proses {n_splits} kali Stratified Shuffle Split...\n")

for i, (train_idx, val_idx) in enumerate(sss.split(X, y), start=1):
    print(f"🔄 Iterasi {i}/{n_splits} ...", flush=True)
    start_time = time.time()

    train_tokens = get_token_set(df.iloc[train_idx]['tokens'])
    val_tokens = get_token_set(df.iloc[val_idx]['tokens'])
    sim_token = jaccard_similarity(train_tokens, val_tokens)

    duration = time.time() - start_time

    print(f"✅ Iterasi {i} selesai — Jaccard Similarity: {sim_token:.3f} — Waktu: {duration:.2f} detik\n", flush=True)

    # `best_split is None` makes sure the first split is always recorded, so
    # a run where every similarity is 0 still yields a usable split instead
    # of leaving best_split as None. Ties keep the earliest split.
    if best_split is None or sim_token > best_similarity:
        best_similarity = sim_token
        best_split = (train_idx, val_idx)
        best_iter = i

# === 6. Save the best split ===
best_train_idx, best_val_idx = best_split

# Select each partition's rows, renumber them from 0, and drop the 'tokens'
# column, which is not written to the split files.
train_data = df.iloc[best_train_idx].reset_index(drop=True).drop(columns=['tokens'])
validation_data = df.iloc[best_val_idx].reset_index(drop=True).drop(columns=['tokens'])

# Output folder is named after the train/validation percentages.
save_path = f'{base_directory}/Dataset/{train_size}-{validation_size}'
os.makedirs(save_path, exist_ok=True)

train_data.to_csv(f'{save_path}/train_data.csv', index=False)
validation_data.to_csv(f'{save_path}/validation_data.csv', index=False)

# === 7. Buat Laporan Split ===
def label_distribution(series):
    """Format the label counts of *series* as indented bullet lines.

    Returns one ``"  - <label> : <count>"`` line per distinct label, joined
    with newlines, in the order the labels are first encountered. An empty
    input yields an empty string.
    """
    tally = Counter(series)
    lines = []
    for name, total in tally.items():
        lines.append(f"  - {name} : {total}")
    return '\n'.join(lines)

# Time the report is generated, formatted as "YYYY-MM-DD HH:MM:SS"
# (datetime.now() without a tz argument, i.e. naive local time).
timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
# Plain-text summary of the chosen split: which iteration won, its Jaccard
# similarity, the row counts, and the 'sentimen' label counts per partition.
report = f"""
📊 Split Report

Waktu Eksekusi        : {timestamp}
Split Terbaik (Iterasi): {best_iter}
Jaccard Similarity     : {best_similarity:.3f}

Data Training          : {len(train_data)} rows
Data Validation        : {len(validation_data)} rows

Distribusi Label (Train):
{label_distribution(train_data['sentimen'])}

Distribusi Label (Validation):
{label_distribution(validation_data['sentimen'])}
"""

# Write the report into the split folder. strip() removes the leading and
# trailing newline that the triple-quoted literal starts and ends with.
with open(f'{save_path}/split_report.txt', 'w', encoding='utf-8') as f:
    f.write(report.strip())

# === 8. Print Summary ===
# Console recap: where the split files and the report were written, and the
# similarity score of the selected split.
print("✅ Split data terbaik ditemukan menggunakan StratifiedShuffleSplit!")
print(f"📁 Folder disimpan di     : {save_path}")
print(f"📄 Laporan split          : {save_path}/split_report.txt")
print(f"📈 Best Token Similarity  : {best_similarity:.3f}")
