In [16]:
import pandas as pd
import json

# Location of the JSONL input (one JSON document per line).
# NOTE(review): absolute, machine-specific path - consider a configurable data directory.
JSONL_PATH = '/Users/mraffyzeidan/Learning/TransKI/dev.01.jsonl'

# Parse the file line by line. Whitespace-only lines (for example a trailing
# blank line at the end of the file) are skipped, because json.loads raises
# JSONDecodeError on them.
data = []
with open(JSONL_PATH, 'r', encoding='utf-8') as f:
    for line in f:
        if line.strip():
            data.append(json.loads(line))

# Build the DataFrame from the list of parsed JSON records
df = pd.DataFrame(data)

# Function to flatten nested lists (recursively) and join them into a string
def flatten_and_join(paragraph):
    """Flatten an arbitrarily nested list of tokens into one space-separated string.

    Args:
        paragraph: A (possibly nested) list whose leaves are tokens, or a
            string that has already been flattened.

    Returns:
        str: Every leaf, in its original order, joined by single spaces.
        A string argument is returned unchanged, so applying the function
        a second time to its own output is harmless. Leaves that are not
        strings are converted with ``str()`` before joining.
    """
    # A bare string would otherwise be iterated character by character and
    # come back as "a b c"; this is what happens when a column that was
    # overwritten with its flattened text is passed through the function again.
    if isinstance(paragraph, str):
        return paragraph

    flat_list = []

    def flatten(nested_list):
        for item in nested_list:
            if isinstance(item, list):
                flatten(item)  # Recurse into sublist
            else:
                # str.join accepts only strings, so coerce any other leaf type
                flat_list.append(item if isinstance(item, str) else str(item))

    flatten(paragraph)
    return ' '.join(flat_list)

# Flatten the nested token lists: 'paragraphs' is written to a new 'text'
# column, while 'summary' is replaced by its flattened form.
df['text'] = df['paragraphs'].apply(flatten_and_join)
df['summary'] = df['summary'].apply(flatten_and_join)

# Keep only the two flattened columns
output_columns = ['text', 'summary']
df_summarized = df[output_columns]

# Print the first rows as a quick sanity check
preview = df_summarized.head()
print(preview)

                                                text  \
0  Ketua MPR Zulkifli Hasan menyesalkan kisruh ya...   
1  Suara.com - Cerita sekuel terbaru James Bond b...   
2  Menteri Pertanian Andi Amran Sulaiman mengatak...   
3  SPANYOL – Barcelona berhasil memboyong kemenan...   
4  Wamena ( ANTARA News ) - Pemerintah Kabupaten ...   

                                             summary  
0  Ketua MPR Zulkifli Hasan menyesalkan kisruh ya...  
1  Cerita sekuel terbaru James Bond bocor . Menur...  
2  Saat ini pemerintah terus meningkatkan pendapa...  
3  Barcelona berhasil memboyong kemenangan besar ...  
4  Pemerintah Kabupaten Jayawijaya , Papua , mend...  


In [17]:
# Persist the flattened text/summary pairs; the DataFrame index is not written.
summary_csv_name = 'SummaryINDO2.csv'
df_summarized.to_csv(summary_csv_name, index=False)

In [2]:
import torch
from transformers import AutoModel

# Fetch the pretrained IndoBERT-lite checkpoint
model = AutoModel.from_pretrained("indobenchmark/indobert-lite-base-p1")

# Coerce the probability `p` of every Dropout module to a Python float
dropout_layers = [m for m in model.modules() if isinstance(m, torch.nn.Dropout)]
for dropout in dropout_layers:
    dropout.p = float(dropout.p)

# Dummy input used for tracing: int64 tensor of shape (1, 512) with random
# values in [0, 1000), passed to the model as its first positional argument
example_input = torch.randint(0, 1000, (1, 512), dtype=torch.long)

# Export 1: TorchScript trace of the model (strict=False, as in the original,
# to be more permissive about the model's outputs)
traced_model = torch.jit.trace(model, example_input, strict=False)
traced_model.save("indobert_model_traced.pt")

# Export 2: the raw state dict
weights = model.state_dict()
torch.save(weights, "indobert_state_dict.pt")

# Export 3: the model directory written by save_pretrained
model.save_pretrained("./indobert-lite-base-p1")



In [3]:
import torch
import json
from transformers import AutoModel, AutoTokenizer, AutoConfig
import os

def debug_file_structure(file_path):
    """Print a short description of what ``torch.load`` finds in ``file_path``.

    For a dict, all keys are printed, followed by the type and ``shape``
    attribute ('N/A' when absent) of the first five entries. For an object
    exposing ``state_dict``, the first five state-dict keys are printed.
    Anything else is reported by type only. Any exception is caught and
    reported on stdout; the function always returns None.

    Args:
        file_path: Path handed to ``torch.load``.
    """
    print(f"üîç Debugging file: {file_path}")

    try:
        # NOTE(review): torch.load may unpickle arbitrary objects depending on
        # the PyTorch version and defaults - only point this at trusted files.
        loaded = torch.load(file_path, map_location='cpu')
        loaded_type = type(loaded)
        print(f"üìä File type: {loaded_type}")

        if isinstance(loaded, dict):
            all_keys = list(loaded.keys())
            print(f"üìÅ Dictionary keys: {all_keys}")
            # Detail only the first five entries
            for key in all_keys[:5]:
                entry = loaded[key]
                shape = getattr(entry, 'shape', 'N/A')
                print(f"   {key}: {type(entry)} - shape: {shape}")
            return

        if not hasattr(loaded, 'state_dict'):
            print(f"üìÅ Other type: {loaded_type}")
            return

        print("üìÅ Model object with state_dict")
        first_keys = list(loaded.state_dict().keys())[:5]
        print(f"   State dict keys: {first_keys}")

    except Exception as err:
        print(f"‚ùå Failed to load: {err}")

def convert_to_simple_format(input_path="indobert_state_dict.pt", output_path="indobert_clean.pt"):
    """Re-save a checkpoint as a plain ``{name: tensor}`` dictionary.

    The checkpoint at ``input_path`` is first described with
    ``debug_file_structure`` and then loaded. The state dict is taken from
    the ``'state_dict'`` or ``'model_state_dict'`` entry when the loaded
    object is a dict containing one of them, from the dict itself otherwise,
    or from ``.state_dict()`` when the loaded object exposes that method.
    Only entries whose value is a ``torch.Tensor`` are kept; the result is
    written to ``output_path`` with ``torch.save``.

    The original author's stated goal is a file that a Rust loader can read;
    nothing in this function verifies that.

    Args:
        input_path: Checkpoint to read. Defaults to the file name that was
            previously hard-coded.
        output_path: Destination of the cleaned state dict. Defaults to the
            file name that was previously hard-coded.

    Returns:
        bool: True when a non-empty cleaned state dict was written. False
        when the file structure is not recognised, when no tensor entries
        were found (nothing is written in that case), or when any exception
        occurred (the exception is printed, not raised).
    """
    # First, debug the current file
    debug_file_structure(input_path)

    print("\nüîÑ Attempting conversion...")

    try:
        # NOTE(review): torch.load may unpickle arbitrary objects depending on
        # the PyTorch version and defaults - only use it on trusted files.
        original_data = torch.load(input_path, map_location='cpu')

        # Extract state dict based on file structure
        if isinstance(original_data, dict):
            if 'state_dict' in original_data:
                state_dict = original_data['state_dict']
            elif 'model_state_dict' in original_data:
                state_dict = original_data['model_state_dict']
            else:
                # Assume it's already a state dict
                state_dict = original_data
        elif hasattr(original_data, 'state_dict'):
            state_dict = original_data.state_dict()
        else:
            print("‚ùå Unknown file structure")
            return False

        # Keep tensor entries only (None and any other value type are skipped)
        clean_state_dict = {}
        for key, value in state_dict.items():
            if isinstance(value, torch.Tensor):
                clean_state_dict[key] = value
            else:
                print(f"‚ö†Ô∏è  Skipping non-tensor key: {key}")

        print(f"‚úÖ Cleaned state dict with {len(clean_state_dict)} tensors")

        # An empty result means the input was not a usable state dict; report
        # failure instead of writing an empty file and claiming success, so
        # that callers can fall back to another strategy.
        if not clean_state_dict:
            print("‚ùå No tensors found in state dict; nothing saved")
            return False

        # Save as a simple state dict
        torch.save(clean_state_dict, output_path)
        print(f"üíæ Saved clean state dict: {output_path}")

        return True

    except Exception as e:
        print(f"‚ùå Conversion failed: {e}")
        return False

def download_fresh_model():
    """Fetch ``indobenchmark/indobert-base-p1`` and write its weights and config.

    The model's state dict is saved to ``indobert_fresh.pt`` and a small
    subset of its configuration to ``indobert_config_fresh.json``.

    NOTE(review): this model id differs from the "indobert-lite-base-p1"
    checkpoint loaded elsewhere in the notebook - confirm that is intended.

    Returns:
        bool: True when both files were written, False when any step raised
        (the exception is printed, not propagated).
    """
    print("\nüì• Downloading fresh IndoBERT model...")

    # Configuration attributes copied to the JSON file, in output order
    config_fields = (
        "vocab_size",
        "hidden_size",
        "num_hidden_layers",
        "num_attention_heads",
        "intermediate_size",
        "hidden_dropout_prob",
    )

    try:
        model = AutoModel.from_pretrained("indobenchmark/indobert-base-p1")

        # Weights: report the number of state-dict entries, then save them
        state_dict = model.state_dict()
        print(f"üìä Model layers: {len(state_dict)}")
        torch.save(state_dict, "indobert_fresh.pt")
        print("üíæ Saved fresh model: indobert_fresh.pt")

        # Config: keep only the selected attributes for reference
        config = model.config
        config_dict = {field: getattr(config, field) for field in config_fields}
        with open("indobert_config_fresh.json", "w") as f:
            json.dump(config_dict, f, indent=2)
        print("üíæ Saved fresh config: indobert_config_fresh.json")

    except Exception as err:
        print(f"‚ùå Download failed: {err}")
        return False

    return True

if __name__ == "__main__":
    print("üöÄ IndoBERT Model Converter")
    print("=" * 50)

    # Strategy 1: clean up the checkpoint that already exists on disk
    converted = convert_to_simple_format()
    if not converted:
        # Strategy 2: fall back to a fresh download
        print("\nüîÑ Conversion failed, trying fresh download...")
        downloaded = download_fresh_model()
        print(
            "\nüéâ Fresh download successful! Use 'indobert_fresh.pt'"
            if downloaded
            else "\nüí• All methods failed."
        )
    else:
        print("\nüéâ Conversion successful! Use 'indobert_clean.pt'")

üöÄ IndoBERT Model Converter
üîç Debugging file: indobert_state_dict.pt
üìä File type: <class 'collections.OrderedDict'>
üìÅ Dictionary keys: ['embeddings.word_embeddings.weight', 'embeddings.position_embeddings.weight', 'embeddings.token_type_embeddings.weight', 'embeddings.LayerNorm.weight', 'embeddings.LayerNorm.bias', 'encoder.embedding_hidden_mapping_in.weight', 'encoder.embedding_hidden_mapping_in.bias', 'encoder.albert_layer_groups.0.albert_layers.0.full_layer_layer_norm.weight', 'encoder.albert_layer_groups.0.albert_layers.0.full_layer_layer_norm.bias', 'encoder.albert_layer_groups.0.albert_layers.0.attention.query.weight', 'encoder.albert_layer_groups.0.albert_layers.0.attention.query.bias', 'encoder.albert_layer_groups.0.albert_layers.0.attention.key.weight', 'encoder.albert_layer_groups.0.albert_layers.0.attention.key.bias', 'encoder.albert_layer_groups.0.albert_layers.0.attention.value.weight', 'encoder.albert_layer_groups.0.albert_layers.0.attention.value.bias', 'encod

  data = torch.load(file_path, map_location='cpu')
  original_data = torch.load("indobert_state_dict.pt", map_location='cpu')


üíæ Saved clean state dict: indobert_clean.pt

üéâ Conversion successful! Use 'indobert_clean.pt'


In [2]:
import pandas as pd
# Read the exported summary CSV and keep only its first 150 rows (all columns).
# NOTE(review): absolute, machine-specific path.
summary_csv_path = '/Users/mraffyzeidan/Learning/TransKI/fine_tuned104/SummaryINDO2.csv'
data = pd.read_csv(summary_csv_path).iloc[:150, :]

In [4]:
# Write the truncated frame to disk without the index column.
# NOTE(review): the file name says "500", but `data` is cut to 150 rows where
# it is loaded - confirm which of the two is intended.
reduced_csv_name = 'Reduced500.csv'
data.to_csv(reduced_csv_name, index=False)

In [32]:
# Export the first 20 rows as the benchmark set.
# DataFrame.to_csv returns None when it is given a path, so its result must
# not be assigned back to `data`: doing so replaced the DataFrame with None
# and made this cell fail when run a second time.
benchmark = data.iloc[:20, :]
benchmark.to_csv("Benchmark.csv", index=False)