# 5G Network Operations Insights with Fine Tuning of T5-Small (This is the smallest version of T5.)
## Project Overview
Author: Fatih E. NAR<br>
This project aims to deliver a 5g network insight with fine tuning a network performant encoder-decoder TransformerNN<br>

In [1]:
%pip install -r requirements.txt

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [2]:
import lzma
import shutil
import pandas as pd
import os
import torch
import threading
import sys
import time
import gc
from datasets import Dataset
from transformers import T5ForConditionalGeneration, T5Tokenizer, Trainer, TrainingArguments, get_linear_schedule_with_warmup, Seq2SeqTrainer, Seq2SeqTrainingArguments
from peft import get_peft_model, LoraConfig, TaskType

# Save the model and tokenizer
model_save_path = "models/5g_oss_model"
model_name = "t5-small"

# Set TOKENIZERS_PARALLELISM to false to avoid warnings
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Clear GPU cache before starting
torch.cuda.empty_cache()
gc.collect()

# Cap memory usage to a specific size (e.g., 8 GB) for cuda
max_memory_gb = 8
max_memory_mb = max_memory_gb * 1024
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = f'max_split_size_mb:{max_memory_mb}'

# Check if any accelerator is available 
if torch.cuda.is_available():
    print("Using CUDA (NVIDIA GPU)")
    os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
    # Leverage multi-gpu if available
    device = torch.device("cuda")
    print("Using CUDA")
    # Clear GPU cache before starting
    torch.cuda.empty_cache()
# Check if MPS (Apple Silicon GPU) is available
elif torch.backends.mps.is_available():
    os.environ["PYTORCH_MPS_HIGH_WATERMARK_RATIO"] = "0.0"
    os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
    # Leverage multi-gpu if available
    device = torch.device("mps")
    print("Using MPS")
else:
    device = torch.device("cpu")
    print("Using CPU")

# Extract the .xz file
with lzma.open('data/5G_netops_data_1M.csv.xz', 'rb') as f_in:
    with open('data/5G_netops_data_1M.csv', 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)

# Load the synthetic telecom data
data_path = "data/5G_netops_data_1M.csv"
data = pd.read_csv(data_path)

# Display basic information about the full dataset
data.info()
data.head()

  from .autonotebook import tqdm as notebook_tqdm
Found Intel OpenMP ('libiomp') and LLVM OpenMP ('libomp') loaded at
the same time. Both libraries are known to be incompatible and this
can cause random crashes or deadlocks on Linux when loaded in the
same Python program.
Using threadpoolctl may cause crashes or deadlocks. For more
information and possible workarounds, please see
    https://github.com/joblib/threadpoolctl/blob/master/multiple_openmp.md

2024-06-03 09:25:23.776506: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Using MPS
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 27 columns):
 #   Column                                Non-Null Count    Dtype  
---  ------                                --------------    -----  
 0   Season                                1000000 non-null  object 
 1   Cell Availability (%)                 1000000 non-null  float64
 2   MTTR (hours)                          1000000 non-null  float64
 3   Throughput (Mbps)                     1000000 non-null  float64
 4   Latency (ms)                          1000000 non-null  float64
 5   Packet Loss Rate (%)                  1000000 non-null  float64
 6   Call Drop Rate (%)                    1000000 non-null  float64
 7   Handover Success Rate (%)             1000000 non-null  float64
 8   Alarm Count                           1000000 non-null  int64  
 9   Critical Alarm Count                  1000000 non-null  int64  
 10  Parameter Changes                     1000000

Unnamed: 0,Season,Cell Availability (%),MTTR (hours),Throughput (Mbps),Latency (ms),Packet Loss Rate (%),Call Drop Rate (%),Handover Success Rate (%),Alarm Count,Critical Alarm Count,...,Security Incidents,Authentication Failures,Temperature (°C),Humidity (%),Weather,Issue Reported,City,State,Zip,Fault Occurrence Rate
0,Fall,95.7495,5.81,18.736,92.32,1.63,1.64255,96.65,5,2,...,1,8,16.82,38.56,Clear,no,Port Jennifer,VA,43568,2
1,Winter,99.3145,7.91,-19.6452,61.249,1.2054,2.78315,95.64,6,1,...,4,0,38.52,53.13,Clouds,no,East John,WA,56449,1
2,Winter,92.2955,1.89,138.774,74.48,0.01,1.97045,91.9,4,1,...,4,7,19.21,55.48,Clear,yes,Andreview,PR,77788,2
3,Summer,97.421,1.58,-126.9951,46.8395,5.1783,0.183,97.49,4,0,...,2,7,24.83,87.41,Clouds,no,Stacybury,GU,21375,2
4,Summer,97.215,1.1,29.308,38.1225,3.2718,1.0408,92.47,3,0,...,1,9,37.49,61.02,Clouds,no,East Laurastad,AR,4893,1


In [3]:
# Fill NaN values and prepare input and target texts
# Ensure all NaN values are filled with empty strings
data = data.fillna('')

# Ensure 'Zip' column is treated as a string
data['Zip'] = data['Zip'].astype(str)

# Prepare the input_text and target_text columns
data['input_text'] = data.apply(lambda row: f"Season: {row['Season']} Cell Availability: {row['Cell Availability (%)']} MTTR: {row['MTTR (hours)']} Throughput: {row['Throughput (Mbps)']} Latency: {row['Latency (ms)']} Packet Loss Rate: {row['Packet Loss Rate (%)']} Call Drop Rate: {row['Call Drop Rate (%)']} Handover Success Rate: {row['Handover Success Rate (%)']} Alarm Count: {row['Alarm Count']} Critical Alarm Count: {row['Critical Alarm Count']} Parameter Changes: {row['Parameter Changes']} Successful Configuration Changes: {row['Successful Configuration Changes (%)']} Data Usage: {row['Data Usage (GB)']} User Count: {row['User Count']} Signal Strength: {row['Signal Strength (dBm)']} Jitter: {row['Jitter (ms)']} Connection Setup Success Rate: {row['Connection Setup Success Rate (%)']} Security Incidents: {row['Security Incidents']} Authentication Failures: {row['Authentication Failures']} Temperature: {row['Temperature (°C)']} Humidity: {row['Humidity (%)']} Weather: {row['Weather']} Issue Reported: {row['Issue Reported']} City: {row['City']} State: {row['State']} Zip: {row['Zip']}", axis=1)
data['target_text'] = data['Fault Occurrence Rate'].astype(str)


# Convert to HuggingFace Dataset
dataset = Dataset.from_pandas(data)

# Split the dataset into training and evaluation
train_test_split = dataset.train_test_split(test_size=0.2)
train_dataset = train_test_split['train']
eval_dataset = train_test_split['test']

# Check the loaded dataset
print(f"Training Dataset size: {len(train_dataset)}")
print(f"Evaluation Dataset size: {len(eval_dataset)}")
print(train_dataset[0])

Training Dataset size: 800000
Evaluation Dataset size: 200000
{'Season': 'Fall', 'Cell Availability (%)': 90.865, 'MTTR (hours)': 7.5, 'Throughput (Mbps)': -9.001500000000002, 'Latency (ms)': 68.77, 'Packet Loss Rate (%)': 3.6408, 'Call Drop Rate (%)': 2.0601, 'Handover Success Rate (%)': 96.74, 'Alarm Count': 11, 'Critical Alarm Count': 2, 'Parameter Changes': 12, 'Successful Configuration Changes (%)': 99.68, 'Data Usage (GB)': 31.68, 'User Count': 4873, 'Signal Strength (dBm)': -66.96, 'Jitter (ms)': 26.9, 'Connection Setup Success Rate (%)': 99.96, 'Security Incidents': 4, 'Authentication Failures': 8, 'Temperature (°C)': 34.51, 'Humidity (%)': 86.18, 'Weather': 'Clouds', 'Issue Reported': 'yes', 'City': 'Smithville', 'State': 'OK', 'Zip': '54923', 'Fault Occurrence Rate': 3, 'input_text': 'Season: Fall Cell Availability: 90.865 MTTR: 7.5 Throughput: -9.001500000000002 Latency: 68.77 Packet Loss Rate: 3.6408 Call Drop Rate: 2.0601 Handover Success Rate: 96.74 Alarm Count: 11 Critic

In [4]:
# Load the tokenizer from the pretrained model
tokenizer = T5Tokenizer.from_pretrained(model_name)
# Match Tokenizer to the model
tokenizer.add_tokens([f'<SPL_{i}' for i in range(0,28)])
# Add the pad token if it doesn't exist
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': tokenizer.eos_token})
else:
    tokenizer.pad_token = tokenizer.eos_token
#print len of tokenizer
print(len(tokenizer))

# Save the tokenizer
tokenizer.save_pretrained(model_save_path)
model = T5ForConditionalGeneration.from_pretrained(model_name)


# Tokenize datasets
def preprocess_function(examples):
    inputs = examples['input_text']
    targets = examples['target_text']
    model_inputs = tokenizer(inputs, max_length=512, padding='max_length', truncation=True)
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=128, padding='max_length', truncation=True)
    model_inputs['labels'] = labels['input_ids']
    return model_inputs

# Apply preprocessing
train_dataset = train_dataset.map(preprocess_function, batched=True)
eval_dataset = eval_dataset.map(preprocess_function, batched=True)

columns = ['input_ids', 'attention_mask', 'labels']
train_dataset.set_format(type='torch', columns=columns)
eval_dataset.set_format(type='torch', columns=columns)

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


32128


Map: 100%|██████████| 800000/800000 [10:55<00:00, 1220.84 examples/s]
Map: 100%|██████████| 200000/200000 [02:48<00:00, 1188.67 examples/s]


In [11]:
# Define PEFT/LoRA configuration
lora_config = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,
    inference_mode=False,
    r=4, # it was 2
    lora_alpha=32, # it was 16
    lora_dropout=0.1, # it was 0.05
    target_modules=['q', 'v', 'k', 'o']
)
model = get_peft_model(model, lora_config)
model.to(device)  # Language modeling head to GPU 
model.eval()

  warn("The installed version of bitsandbytes was compiled without GPU support. "


'NoneType' object has no attribute 'cadam32bit_grad_fp32'


PeftModelForSeq2SeqLM(
  (base_model): LoraModel(
    (model): T5ForConditionalGeneration(
      (shared): Embedding(32128, 512)
      (encoder): T5Stack(
        (embed_tokens): Embedding(32128, 512)
        (block): ModuleList(
          (0): T5Block(
            (layer): ModuleList(
              (0): T5LayerSelfAttention(
                (SelfAttention): T5Attention(
                  (q): lora.Linear(
                    (base_layer): Linear(in_features=512, out_features=512, bias=False)
                    (lora_dropout): ModuleDict(
                      (default): Dropout(p=0.1, inplace=False)
                    )
                    (lora_A): ModuleDict(
                      (default): Linear(in_features=512, out_features=4, bias=False)
                    )
                    (lora_B): ModuleDict(
                      (default): Linear(in_features=4, out_features=512, bias=False)
                    )
                    (lora_embedding_A): ParameterDict()
               

In [12]:
# Set training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",  # Output directory
    overwrite_output_dir=True,  # Overwrite the content of the output directory
    num_train_epochs=5,  # Number of training epochs
    per_device_train_batch_size=42,  # Batch size per device during training
    gradient_accumulation_steps=7,  # Accumulate gradients over multiple steps
    learning_rate=5e-5,  # Learning rate
    save_steps=2000,  # Save checkpoint every 2000 steps
    save_total_limit=2,  # Limit the total amount of checkpoints
    eval_strategy="steps",  # Evaluate during training at each `logging_steps`
    logging_steps=500,  # Log every 500 steps
    eval_steps=2000,  # Evaluate every 2000 steps
    load_best_model_at_end=True,  # Load the best model at the end of training
    metric_for_best_model="loss",  # Use loss to evaluate the best model
    predict_with_generate=True,  # Use generation for evaluation
    fp16=False,  # Disable mixed precision training for MPS
)

# Create Trainer instance
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
)

# Train the model
trainer.train()

  4%|▎         | 500/13605 [54:44<23:52:57,  6.56s/it]

{'loss': 0.3141, 'grad_norm': 0.018248245120048523, 'learning_rate': 4.816244027930908e-05, 'epoch': 0.18}


  7%|▋         | 1000/13605 [1:49:30<22:57:14,  6.56s/it]

{'loss': 0.0152, 'grad_norm': 0.006048723589628935, 'learning_rate': 4.6324880558618154e-05, 'epoch': 0.37}


 11%|█         | 1500/13605 [2:44:08<22:02:06,  6.55s/it]

{'loss': 0.0147, 'grad_norm': 0.005775829777121544, 'learning_rate': 4.448732083792723e-05, 'epoch': 0.55}


 15%|█▍        | 2000/13605 [3:38:47<21:07:24,  6.55s/it]

{'loss': 0.0145, 'grad_norm': 0.004081073682755232, 'learning_rate': 4.264976111723631e-05, 'epoch': 0.73}


                                                         


{'eval_loss': 0.014094140380620956, 'eval_runtime': 6014.8956, 'eval_samples_per_second': 33.251, 'eval_steps_per_second': 4.156, 'epoch': 0.73}


 18%|█▊        | 2500/13605 [6:14:01<20:11:35,  6.55s/it]    

{'loss': 0.0144, 'grad_norm': 0.0053463950753211975, 'learning_rate': 4.081220139654539e-05, 'epoch': 0.92}


 22%|██▏       | 3000/13605 [7:09:49<19:49:08,  6.73s/it]

{'loss': 0.0144, 'grad_norm': 0.005180933978408575, 'learning_rate': 3.897464167585447e-05, 'epoch': 1.1}


 26%|██▌       | 3500/13605 [8:06:23<18:57:57,  6.76s/it]

{'loss': 0.0143, 'grad_norm': 0.0043270946480333805, 'learning_rate': 3.713708195516354e-05, 'epoch': 1.29}


 29%|██▉       | 4000/13605 [9:02:37<17:57:42,  6.73s/it]

{'loss': 0.0143, 'grad_norm': 0.006634572520852089, 'learning_rate': 3.529952223447262e-05, 'epoch': 1.47}


                                                         
 29%|██▉       | 4000/13605 [9:50:02<17:57:42,  6.73s/it]

{'eval_loss': 0.014063416980206966, 'eval_runtime': 2845.3757, 'eval_samples_per_second': 70.289, 'eval_steps_per_second': 8.786, 'epoch': 1.47}


 33%|███▎      | 4500/13605 [10:46:14<17:01:21,  6.73s/it]  

{'loss': 0.0142, 'grad_norm': 0.0033459693659096956, 'learning_rate': 3.34619625137817e-05, 'epoch': 1.65}


 37%|███▋      | 5000/13605 [11:42:24<16:04:40,  6.73s/it]

{'loss': 0.0142, 'grad_norm': 0.004528353456407785, 'learning_rate': 3.162440279309078e-05, 'epoch': 1.84}


 40%|████      | 5500/13605 [12:38:39<14:44:16,  6.55s/it]

{'loss': 0.0142, 'grad_norm': 0.006115181837230921, 'learning_rate': 2.9786843072399855e-05, 'epoch': 2.02}


 44%|████▍     | 6000/13605 [13:33:15<13:50:16,  6.55s/it]

{'loss': 0.0141, 'grad_norm': 0.006036865524947643, 'learning_rate': 2.7949283351708934e-05, 'epoch': 2.2}


                                                          
 44%|████▍     | 6000/13605 [14:20:44<13:50:16,  6.55s/it]

{'eval_loss': 0.0139160742983222, 'eval_runtime': 2849.012, 'eval_samples_per_second': 70.2, 'eval_steps_per_second': 8.775, 'epoch': 2.2}


 48%|████▊     | 6500/13605 [15:15:21<12:55:54,  6.55s/it]   

{'loss': 0.0141, 'grad_norm': 0.006144976243376732, 'learning_rate': 2.611172363101801e-05, 'epoch': 2.39}


 51%|█████▏    | 7000/13605 [16:09:57<12:01:24,  6.55s/it]

{'loss': 0.0141, 'grad_norm': 0.00606174860149622, 'learning_rate': 2.4274163910327085e-05, 'epoch': 2.57}


 55%|█████▌    | 7500/13605 [17:04:34<11:06:35,  6.55s/it]

{'loss': 0.0141, 'grad_norm': 0.006004191003739834, 'learning_rate': 2.2436604189636164e-05, 'epoch': 2.76}


 59%|█████▉    | 8000/13605 [17:59:10<10:11:59,  6.55s/it]

{'loss': 0.014, 'grad_norm': 0.0038633786607533693, 'learning_rate': 2.059904446894524e-05, 'epoch': 2.94}


                                                          


{'eval_loss': 0.013824737630784512, 'eval_runtime': 2848.7985, 'eval_samples_per_second': 70.205, 'eval_steps_per_second': 8.776, 'epoch': 2.94}


 62%|██████▏   | 8500/13605 [19:41:20<9:23:15,  6.62s/it]    

{'loss': 0.014, 'grad_norm': 0.00508967787027359, 'learning_rate': 1.876148474825432e-05, 'epoch': 3.12}


 66%|██████▌   | 9000/13605 [20:36:00<8:23:24,  6.56s/it]

{'loss': 0.014, 'grad_norm': 0.003918828908354044, 'learning_rate': 1.6923925027563398e-05, 'epoch': 3.31}


 70%|██████▉   | 9500/13605 [21:30:39<7:27:43,  6.54s/it]

{'loss': 0.0386, 'grad_norm': nan, 'learning_rate': 1.5086365306872474e-05, 'epoch': 3.49}


 74%|███████▎  | 10000/13605 [22:25:12<6:33:19,  6.55s/it]

{'loss': 0.0, 'grad_norm': nan, 'learning_rate': 1.3248805586181551e-05, 'epoch': 3.67}


                                                          
 74%|███████▎  | 10000/13605 [23:14:03<6:33:19,  6.55s/it]

{'eval_loss': nan, 'eval_runtime': 2931.6979, 'eval_samples_per_second': 68.22, 'eval_steps_per_second': 8.527, 'epoch': 3.67}


 77%|███████▋  | 10500/13605 [24:08:42<5:38:53,  6.55s/it]   

{'loss': 0.0, 'grad_norm': nan, 'learning_rate': 1.1411245865490629e-05, 'epoch': 3.86}


 81%|████████  | 11000/13605 [25:03:41<4:53:05,  6.75s/it]

{'loss': 0.0, 'grad_norm': nan, 'learning_rate': 9.573686144799706e-06, 'epoch': 4.04}


 85%|████████▍ | 11500/13605 [25:59:59<3:59:10,  6.82s/it]

{'loss': 0.0, 'grad_norm': nan, 'learning_rate': 7.736126424108783e-06, 'epoch': 4.23}


 88%|████████▊ | 12000/13605 [26:56:47<3:02:12,  6.81s/it]

{'loss': 0.0, 'grad_norm': nan, 'learning_rate': 5.8985667034178614e-06, 'epoch': 4.41}


                                                          
 88%|████████▊ | 12000/13605 [27:54:35<3:02:12,  6.81s/it]

{'eval_loss': nan, 'eval_runtime': 3468.047, 'eval_samples_per_second': 57.669, 'eval_steps_per_second': 7.209, 'epoch': 4.41}


 92%|█████████▏| 12500/13605 [28:51:24<2:06:45,  6.88s/it]    

{'loss': 0.0, 'grad_norm': nan, 'learning_rate': 4.061006982726939e-06, 'epoch': 4.59}


 96%|█████████▌| 13000/13605 [29:48:11<1:08:41,  6.81s/it]

{'loss': 0.0, 'grad_norm': nan, 'learning_rate': 2.223447262036016e-06, 'epoch': 4.78}


 99%|█████████▉| 13500/13605 [30:44:59<11:56,  6.82s/it]  

{'loss': 0.0, 'grad_norm': nan, 'learning_rate': 3.858875413450937e-07, 'epoch': 4.96}


100%|██████████| 13605/13605 [30:56:55<00:00,  8.19s/it]

{'train_runtime': 111415.7272, 'train_samples_per_second': 35.902, 'train_steps_per_second': 0.122, 'train_loss': 0.02188537514240415, 'epoch': 5.0}





TrainOutput(global_step=13605, training_loss=0.02188537514240415, metrics={'train_runtime': 111415.7272, 'train_samples_per_second': 35.902, 'train_steps_per_second': 0.122, 'total_flos': 5.449646524763996e+17, 'train_loss': 0.02188537514240415, 'epoch': 4.999737505249895})

In [13]:
# Save the model and tokenizer
print(f"Tokenizer Final Size = {len(tokenizer)}")
print(f"Model Final Size = {model.get_input_embeddings().weight.shape[0]}")
model.save_pretrained(model_save_path)
tokenizer.save_pretrained(model_save_path)

print("Training complete and model saved.")

Tokenizer Final Size = 32128
Model Final Size = 32128
Training complete and model saved.




In [14]:
# Results
results = trainer.evaluate(eval_dataset)
print("Evaluation Results:", results)

100%|██████████| 25000/25000 [58:03<00:00,  7.18it/s]  

Evaluation Results: {'eval_loss': 0.013824737630784512, 'eval_runtime': 3483.8647, 'eval_samples_per_second': 57.408, 'eval_steps_per_second': 7.176, 'epoch': 4.999737505249895}



