# 5G Network Operations Insights with Fine Tuning of EleutherAI/gpt-j-6B
## Project Overview
This project aims to deliver 5G network operations insights by fine-tuning a large language model (EleutherAI/gpt-j-6B) on network performance data.

In [None]:
# Install the packages listed in requirements.txt via the IPython %pip magic
%pip install -r requirements.txt

In [None]:
import lzma
import shutil
import pandas as pd
import os
from datasets import Dataset
from transformers import GPTJForCausalLM, AutoTokenizer, Trainer, TrainingArguments

# Backend switches for PyTorch, set before any model work starts.
# NOTE(review): per the PyTorch/CUDA docs these lift the MPS memory cap and
# make CUDA kernel launches synchronous (easier debugging, slower) - confirm
# both are still wanted for real training runs.
os.environ.update({
    "PYTORCH_MPS_HIGH_WATERMARK_RATIO": "0.0",
    "CUDA_LAUNCH_BLOCKING": "1",
})

# Decompress the bundled .xz archive into a plain CSV file
data_path = "data/5G_netops_data.csv"
with lzma.open('data/5G_netops_data.csv.xz', 'rb') as compressed, \
        open(data_path, 'wb') as extracted:
    shutil.copyfileobj(compressed, extracted)

# Read the extracted CSV into a DataFrame
data = pd.read_csv(data_path)

# Summarise the columns, then preview the first rows as the cell output
data.info()
data.head()

In [None]:
# Convert DataFrame to Hugging Face Dataset
# (the batched Dataset.map calls that follow operate on this object)
hf_dataset = Dataset.from_pandas(data)

# Define preprocessing function with actual column names
def preprocess_function(examples):
    input_texts = [
        " ".join([
            f"Date: {date}",
            f"Cell Availability: {cell_avail}",
            f"MTTR: {mttr}",
            f"Throughput: {throughput}",
            f"Latency: {latency}",
            f"Packet Loss Rate: {packet_loss}",
            f"Call Drop Rate: {call_drop}",
            f"Handover Success Rate: {handover_success}",
            f"Alarm Count: {alarm_count}",
            f"Critical Alarm Count: {critical_alarm}",
            f"Parameter Changes: {param_changes}",
            f"Successful Configuration Changes: {success_config}",
            f"Data Usage: {data_usage}",
            f"User Count: {user_count}",
            f"Signal Strength: {signal_strength}",
            f"Jitter: {jitter}",
            f"Connection Setup Success Rate: {connection_success}",
            f"Security Incidents: {security_incidents}",
            f"Authentication Failures: {auth_failures}"
        ]) for date, cell_avail, mttr, throughput, latency, packet_loss, call_drop, handover_success, alarm_count, critical_alarm, param_changes, success_config, data_usage, user_count, signal_strength, jitter, connection_success, security_incidents, auth_failures in zip(
            examples['Date'], 
            examples['Cell Availability (%)'], 
            examples['MTTR (hours)'], 
            examples['Throughput (Mbps)'], 
            examples['Latency (ms)'], 
            examples['Packet Loss Rate (%)'], 
            examples['Call Drop Rate (%)'], 
            examples['Handover Success Rate (%)'], 
            examples['Alarm Count'], 
            examples['Critical Alarm Count'], 
            examples['Parameter Changes'], 
            examples['Successful Configuration Changes (%)'], 
            examples['Data Usage (GB)'], 
            examples['User Count'], 
            examples['Signal Strength (dBm)'], 
            examples['Jitter (ms)'], 
            examples['Connection Setup Success Rate (%)'], 
            examples['Security Incidents'], 
            examples['Authentication Failures'])
    ]
    target_texts = [str(fault_rate) for fault_rate in examples['Fault Occurrence Rate']]
    
    return {'input_text': input_texts, 'target_text': target_texts}

# Apply preprocessing in batches; this adds the 'input_text' and 'target_text' columns.
# NOTE: despite its name, tokenized_dataset holds only text at this point.
tokenized_dataset = hf_dataset.map(preprocess_function, batched=True)


In [None]:
# Setup Model Structure
model_name = "EleutherAI/gpt-j-6B"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Use the end-of-sequence token for padding. No dedicated '[PAD]' token is
# added: a new token would need an extra, untrained embedding row, and it
# would never be used once pad_token points at EOS.
tokenizer.pad_token = tokenizer.eos_token

# Load the pretrained weights with their embedding matrix left untouched
model = GPTJForCausalLM.from_pretrained(model_name)

def tokenize_function(examples):
    """Tokenize prompt/target pairs into causal-LM training features.

    A decoder-only model is trained on a single sequence: the label for each
    position is a token of that same sequence (the model applies the
    one-position shift itself). Prompt and target are therefore joined into
    one text, and ``labels`` mirrors ``input_ids`` instead of being tokenized
    separately, which would leave labels and inputs unaligned.

    Args:
        examples: Batch mapping with 'input_text' and 'target_text', each a
            list of strings of equal length.

    Returns:
        dict with 'input_ids', 'attention_mask' and 'labels'; every sequence
        is padded or truncated to 512 tokens. Padded positions carry the
        label -100 so that they do not contribute to the loss.
    """
    # Terminate each example with EOS so the model learns where the answer ends
    eos = tokenizer.eos_token or ""
    full_texts = [
        f"{prompt} {target}{eos}"
        for prompt, target in zip(examples['input_text'], examples['target_text'])
    ]

    encoded = tokenizer(full_texts, padding="max_length", truncation=True, max_length=512)

    # Mask padding through the attention mask rather than by token id: the pad
    # token may share its id with EOS, and the real EOS must stay in the loss.
    labels = [
        [token_id if keep else -100 for token_id, keep in zip(ids, mask)]
        for ids, mask in zip(encoded['input_ids'], encoded['attention_mask'])
    ]

    return {
        'input_ids': encoded['input_ids'],
        'attention_mask': encoded['attention_mask'],
        'labels': labels
    }

# Fields the model consumes during training
columns = ['input_ids', 'attention_mask', 'labels']

# Tokenize the whole dataset in batches, then expose only the model fields
# as torch tensors
tokenized_dataset = tokenized_dataset.map(tokenize_function, batched=True)
tokenized_dataset.set_format(type='torch', columns=columns)

# Sanity check: show the first formatted example
print(tokenized_dataset[0])

In [5]:
# Hold out 10% of the examples so that evaluation has data to run on;
# Trainer.evaluate() needs an evaluation dataset. The seed keeps the split
# reproducible between runs.
split_dataset = tokenized_dataset.train_test_split(test_size=0.1, seed=42)

training_args = TrainingArguments(
    output_dir="./results",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=1,  # Keep the per-step memory footprint small
    per_device_eval_batch_size=1,   # Same constraint applies during evaluation
    gradient_accumulation_steps=8,  # Accumulate gradients over 8 steps
    save_steps=10_000,
    save_total_limit=2,
    logging_dir="./logs",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=split_dataset["train"],
    eval_dataset=split_dataset["test"],
    tokenizer=tokenizer,
)

trainer.train()

In [None]:
# Results
# Run evaluation with the trainer and print the metrics it returns.
# NOTE(review): evaluate() is called without arguments, so it relies on the
# Trainer having been given an eval_dataset - confirm one was supplied.
results = trainer.evaluate()
print(results)
