# Data Cleaning and Sentiment Labeling for MBG Tweets

This notebook cleans tweets about the Makan Bergizi Gratis (MBG) program and labels their sentiment with an Indonesian RoBERTa sentiment classifier (`w11wo/indonesian-roberta-base-sentiment-classifier`).

In [None]:
# Import required libraries
import json
import pandas as pd
import re
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import pipeline
import torch
import numpy as np

## Load the Dataset

In [None]:
# Read the exported tweet collection from disk and parse it as JSON
with open('../data/mbg_sentiment_db.tweets_November.json', 'r', encoding='utf-8') as source_file:
    data = json.load(source_file)

# Report how many records were parsed, then pretty-print the first one
# (ensure_ascii=False keeps non-ASCII characters readable in the preview)
tweet_total = len(data)
print(f"Loaded {tweet_total} tweets")
print("Sample tweet:")
first_tweet_preview = json.dumps(data[0], indent=2, ensure_ascii=False)
print(first_tweet_preview)

## Convert to DataFrame for Easier Processing

In [None]:
# Flatten every nested tweet record into a single-level row, then build the DataFrame
def _flatten_tweet(record):
    """Return one flat row (dict) holding the fields of a nested tweet record used below."""
    row = {'_id': record['_id']}

    # Text fields; 'clean_text' is optional and falls back to an empty string
    content = record['content']
    row['text'] = content['text']
    row['clean_text'] = content.get('clean_text', '')

    # Authoring metadata; the creation time is nested under a '$date' key
    metadata = record['metadata']
    row['author_handle'] = metadata['author_handle']
    row['created_at'] = metadata['created_at']['$date']
    row['tweet_url'] = metadata['tweet_url']

    # Engagement counters are copied over under the same names
    metrics = record['metrics']
    for counter in ('reply_count', 'retweet_count', 'like_count'):
        row[counter] = metrics[counter]

    row['sentiment_analyzed'] = record['processing_status']['sentiment_analyzed']
    return row


df_list = [_flatten_tweet(record) for record in data]

df = pd.DataFrame(df_list)
print(f"DataFrame shape: {df.shape}")
df.head()

## Data Cleaning

In [None]:
def clean_tweet_text(text):
    """
    Normalize raw tweet text before sentiment classification.

    Steps, in order: URLs are deleted; every @mention is replaced by the
    placeholder ``[MENTION]`` and every #hashtag by ``[HASHTAG]``; each run of
    whitespace (including newlines) is collapsed to a single space; the result
    is stripped and lower-cased. Because lower-casing happens last, the
    placeholders appear as ``[mention]`` and ``[hashtag]`` in the output.

    Args:
        text: Raw tweet text. Any non-string value (for example ``None`` or a
            NaN coming from a missing field) is treated as empty text.

    Returns:
        The cleaned, lower-cased string; ``''`` for non-string input.
    """
    # Missing values would otherwise make re.sub raise a TypeError
    if not isinstance(text, str):
        return ''

    # Delete URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)

    # Replace user mentions (@username) with a placeholder token
    text = re.sub(r'@\w+', '[MENTION]', text)

    # Replace hashtags (#hashtag) with a placeholder token
    text = re.sub(r'#\w+', '[HASHTAG]', text)

    # Collapse runs of whitespace and newlines into single spaces
    text = re.sub(r'\s+', ' ', text)

    # Drop leading and trailing spaces, then lower-case everything
    return text.strip().lower()

# Apply cleaning to the text column
df['cleaned_text'] = df['text'].apply(clean_tweet_text)

# Preview up to three before/after pairs. head(3) simply yields fewer rows when
# the dataset is smaller, so a tiny dataset no longer raises IndexError here.
print("Original vs Cleaned text sample:")
for original_text, cleaned_text in zip(df['text'].head(3), df['cleaned_text'].head(3)):
    print(f"Original: {original_text[:100]}...")
    print(f"Cleaned:  {cleaned_text[:100]}...")
    print("---")

## Install and Import the Indonesian RoBERTa Sentiment Classifier

In [None]:
# Optional dependency install: uncomment the next line if transformers/torch are missing
# !pip install transformers torch

# Identifier of the Indonesian sentiment classification model to load
model_name = "w11wo/indonesian-roberta-base-sentiment-classifier"

# Fetch the tokenizer and the sequence-classification weights for that model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

# Device index handed to the pipeline: 0 when CUDA is available, otherwise -1
if torch.cuda.is_available():
    inference_device = 0
else:
    inference_device = -1

# Bundle tokenizer and model into a ready-to-call sentiment analysis pipeline
sentiment_pipeline = pipeline(
    "sentiment-analysis",
    model=model,
    tokenizer=tokenizer,
    device=inference_device,
)

## Apply Sentiment Labeling

In [None]:
# Function to classify sentiment for a single text
def classify_sentiment(text):
    """
    Classify one text with the module-level ``sentiment_pipeline``.

    Args:
        text: The (cleaned) tweet text to classify.

    Returns:
        A ``(label, score)`` tuple taken from the first pipeline result, or
        ``('ERROR', 0.0)`` when the pipeline raises; the failure is printed
        rather than propagated so one bad row does not abort a whole run.
    """
    try:
        # Truncate over-long inputs instead of letting them fail: texts longer
        # than the model's sequence limit would otherwise raise and be
        # labelled 'ERROR'. 512 is the usual limit for RoBERTa-base models.
        result = sentiment_pipeline(text, truncation=True, max_length=512)[0]
        return result['label'], result['score']
    except Exception as e:
        # str(text) keeps the handler itself from raising on non-string input
        print(f"Error processing text: {str(text)[:50]}..., Error: {str(e)}")
        return 'ERROR', 0.0

# Score the cleaned tweets with the classifier.
# Only the first `sample_size` rows are processed so a trial run stays cheap.
sample_size = min(len(df), 1000)  # Adjust this number based on your computational resources
print(f"Processing sentiment for {sample_size} tweets...")

# One (label, score) tuple per processed row
texts_to_score = df['cleaned_text'][:sample_size]
results = texts_to_score.apply(classify_sentiment)

# Split the tuples into two parallel sequences and store them on the processed rows
labels, scores = zip(*results)
last_row = sample_size - 1  # .loc slices are label-based and include the end label
df.loc[:last_row, 'sentiment_label'] = labels
df.loc[:last_row, 'sentiment_score'] = scores

# Label frequencies among the processed rows
print("Sentiment distribution:")
label_counts = df['sentiment_label'][:sample_size].value_counts()
print(label_counts)

# Preview the first few predictions next to their (truncated) text
print("\nSample predictions:")
preview_rows = df.head(min(5, sample_size))
for _, row in preview_rows.iterrows():
    print(f"Label: {row['sentiment_label']}, Score: {row['sentiment_score']:.3f}")
    print(f"Text: {row['cleaned_text'][:100]}...")
    print("---")

## Save the Labeled Dataset

In [None]:
# Update the original data with the sentiment labels
def _id_key(raw_id):
    """Return a hashable, canonical key for a tweet id.

    Ids are serialized to JSON text because they may not be hashable
    (presumably a dict such as {"$oid": ...} in a MongoDB export; verify
    against the input file).
    """
    return json.dumps(raw_id, sort_keys=True)


# Index the original records by id once, instead of rescanning `data` for every
# labelled row. setdefault keeps the FIRST position of a repeated id, matching
# a first-match linear search.
first_index_by_id = {}
for idx, tweet in enumerate(data):
    first_index_by_id.setdefault(_id_key(tweet['_id']), idx)

# Walk the labelled rows column-wise; tolist() yields plain Python values
labeled_rows = df.iloc[:sample_size]
for tweet_id, label, score in zip(
    labeled_rows['_id'].tolist(),
    labeled_rows['sentiment_label'].tolist(),
    labeled_rows['sentiment_score'].tolist(),
):
    idx = first_index_by_id.get(_id_key(tweet_id))
    if idx is None:
        # No record with this id in the original data: nothing to update
        continue
    data[idx]['processing_status']['sentiment_analyzed'] = True
    data[idx]['sentiment_analysis'] = {
        'label': label,
        'confidence_score': float(score)
    }

# Save the updated data to a new JSON file (the input file is left untouched)
output_filename = '../data/mbg_sentiment_db.tweets_November_labeled.json'
with open(output_filename, 'w', encoding='utf-8') as f:
    json.dump(data, f, ensure_ascii=False, indent=2)

print(f"Saved labeled data to {output_filename}")
print(f"Number of tweets analyzed: {sample_size}")
print(f"Total tweets in dataset: {len(data)}")

## Summary of Analysis

In [None]:
# Final report: dataset size, label counts and confidence-score statistics
print(f"Dataset Size: {len(df)} tweets")
print("Sentiment Distribution:")
if 'sentiment_label' not in df.columns:
    # The labelling step has not added its column to the frame
    print("Sentiment analysis hasn't been performed yet.")
else:
    print(df['sentiment_label'].value_counts())

    # Score statistics are printed only when the score column exists
    if 'sentiment_score' in df.columns:
        confidence = df['sentiment_score']
        print(f"\nAverage Confidence Score: {confidence.mean():.3f}")
        print(f"Min Confidence Score: {confidence.min():.3f}")
        print(f"Max Confidence Score: {confidence.max():.3f}")