# Old Code

In [13]:
import json
from sklearn.model_selection import train_test_split

# Define the file path to read the data
file_path = 'resource_allocation_se.txt'

# Define the instruction
instruction = ("Take a deep breath and work on this problem step-by-step. "
               "You are a mathematical tool to predict some model. Your job is to predict B for given A. "
               "The following is the dataset that you can use for the prediction.")

# Initialize a list to store formatted data
formatted_data = []

# Read and process the file. Each non-empty line is split into an input
# clause (text before "then") and an output clause (text after it).
with open(file_path, 'r', encoding='utf-8') as file:
    for line in file:
        line = line.strip()
        if not line:  # Skip empty lines
            continue

        # Split on the FIRST "then" only, so any later occurrence stays in
        # the output clause instead of being silently dropped.
        input_clause, separator, output_clause = line.partition("then")
        if not separator:
            # A line without "then" has no output clause; report and skip it
            # rather than crashing with an IndexError.
            print(f"Skipping line due to parsing error: {line}")
            continue

        # Create the data entry
        data_entry = {
            "instruction": instruction,
            "input": input_clause.strip(),
            "output": output_clause.strip()
        }

        # Append to the formatted data list
        formatted_data.append(data_entry)

# Split the data into train and validation sets (80% train, 20% validation).
# random_state is fixed so the split is reproducible.
train_data, val_data = train_test_split(formatted_data, test_size=0.2, random_state=42)

# Save the train data to a JSONL file (one JSON object per line)
train_file = 'train_dataset.jsonl'
with open(train_file, 'w', encoding='utf-8') as jsonl_file:
    for entry in train_data:
        jsonl_file.write(json.dumps(entry) + '\n')

# Save the validation data to a JSONL file (one JSON object per line)
val_file = 'val_dataset.jsonl'
with open(val_file, 'w', encoding='utf-8') as jsonl_file:
    for entry in val_data:
        jsonl_file.write(json.dumps(entry) + '\n')

print(f"Train and validation datasets have been saved as {train_file} and {val_file}.")


Train and validation datasets have been saved as train_dataset.jsonl and val_dataset.jsonl.


In [7]:
!pip install datasets transformers pyarrow==14.0.1



In [14]:
from datasets import load_dataset

# Read the train/validation JSONL files written above into one dataset dict
dataset_dict = load_dataset(
    'json',
    data_files={
        'train': 'train_dataset.jsonl',
        'validation': 'val_dataset.jsonl',
    },
)

# Publish both splits to the Hugging Face Hub as a public dataset repository
dataset_dict.push_to_hub(
    "tayyibsupercool/resource_allocation_telecom_spectral_efficiency_instruct",
    private=False,
)


Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/80 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/20 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/353 [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/tayyibsupercool/resource_allocation_telecom_spectral_efficiency_instruct/commit/e28a652dc5677b8fabfe164f897420fb60994e49', commit_message='Upload dataset', commit_description='', oid='e28a652dc5677b8fabfe164f897420fb60994e49', pr_url=None, pr_revision=None, pr_num=None)

In [16]:
from datasets import load_dataset

# Fetch the published dataset back from the Hugging Face Hub
dataset_dict = load_dataset(
    "tayyibsupercool/resource_allocation_telecom_spectral_efficiency_instruct"
)

# Show a single training example (index 0; change the index to inspect another row)
train_split = dataset_dict['train']
print(train_split[0])


Downloading readme:   0%|          | 0.00/471 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.37M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/343k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/79999 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/20000 [00:00<?, ? examples/s]

{'instruction': 'Take a deep breath and work on this problem step-by-step. You are a mathematical tool to predict some model. Your job is to predict B for given A. The following is the dataset that you can use for the prediction.', 'input': 'If A is -148, -331, -445, -149,', 'output': 'B is 100, 100.'}


# Upgraded code

In [6]:
# Prerequisite: run `huggingface-cli login` in a terminal (not in this notebook) before pushing to the Hub.

In [8]:
import json
from datasets import load_dataset, DatasetDict, Dataset

# Locations of the four query text files, all under one shared root:
# "ee" and "se" subfolders, each with a train and a validation file.
_QUERY_ROOT = "../../datasets/reproduced method/detailed"
ee_train_file = _QUERY_ROOT + "/ee/ee_train_query_text.txt"
ee_val_file = _QUERY_ROOT + "/ee/ee_val_query_text.txt"
se_train_file = _QUERY_ROOT + "/se/se_train_query_text.txt"
se_val_file = _QUERY_ROOT + "/se/se_val_query_text.txt"

# Prompt text attached to every record; the sentences are joined with single spaces
instruction = " ".join([
    "Take a deep breath and work on this problem step-by-step.",
    "You are a mathematical tool to predict some model. Your job is to predict B for given A.",
    "The following is the dataset that you can use for the prediction.",
])

# Function to load query text files and convert to instruction-based format
def convert_to_instruction_format(file_path, instruction_text=None):
    """Parse a query text file into a list of instruction-style records.

    Every non-empty line is split at its first ":" into a prefix and a query.
    The literal text "sample_index" is removed from the prefix to obtain the
    sample index, and the query is split at "then B is" into input and output
    text. Lines that cannot be split this way (no ":", or not exactly one
    "then B is") are reported on stdout and skipped.

    Args:
        file_path: Path of the text file to read (decoded as UTF-8).
        instruction_text: Instruction string stored in every record. When
            omitted, the module-level ``instruction`` is used, which keeps
            existing one-argument calls working unchanged.

    Returns:
        A list of dicts with the string-valued keys "instruction", "input",
        "output" and "sample_index", in file order.
    """
    if instruction_text is None:
        # Fall back to the notebook-level prompt, resolved at call time.
        instruction_text = instruction

    dataset_records = []
    # Explicit encoding so parsing does not depend on the platform default.
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:  # Skip empty lines
                continue

            # Only the two unpacking splits can fail with ValueError, so the
            # try block is limited to them.
            try:
                sample_index, query_text = line.split(":", 1)
                input_text, output_text = query_text.split("then B is")
            except ValueError:
                print(f"Skipping line due to parsing error: {line}")
                continue

            # Append formatted record
            dataset_records.append({
                "instruction": instruction_text,
                "input": input_text.strip(),
                "output": output_text.strip(),
                "sample_index": sample_index.replace("sample_index", "").strip()
            })
    return dataset_records

# Parse the four query files into record lists, in the order
# EE train, EE validation, SE train, SE validation.
ee_train_data, ee_val_data, se_train_data, se_val_data = map(
    convert_to_instruction_format,
    (ee_train_file, ee_val_file, se_train_file, se_val_file),
)

# Turn a sequence of row dicts into a column-oriented dict of lists
def format_for_dataset(records):
    """Transpose record dicts into a dict of equally long column lists.

    Args:
        records: Iterable of dicts, each containing the keys "instruction",
            "input", "output" and "sample_index". Other keys are ignored.

    Returns:
        A dict mapping each of those four keys to a list of the corresponding
        values, in the order the records were supplied.

    Raises:
        KeyError: If a record lacks one of the four keys.
    """
    field_names = ("instruction", "input", "output", "sample_index")
    columns = {name: [] for name in field_names}
    # Single pass over the records, so one-shot iterables work too.
    for row in records:
        for name in field_names:
            columns[name].append(row[name])
    return columns

# Assemble train/validation record lists into a Hugging Face DatasetDict
def _build_dataset_dict(train_records, val_records):
    """Return a DatasetDict with 'train' and 'validation' splits built from record lists."""
    return DatasetDict({
        'train': Dataset.from_dict(format_for_dataset(train_records)),
        'validation': Dataset.from_dict(format_for_dataset(val_records)),
    })

ee_dataset = _build_dataset_dict(ee_train_data, ee_val_data)
se_dataset = _build_dataset_dict(se_train_data, se_val_data)

# Publish both datasets to the Hugging Face Hub as public repositories
ee_dataset.push_to_hub(
    "tayyibsupercool/resource_allocation_telecom_energy_efficiency_instruct",
    private=False,
)
se_dataset.push_to_hub(
    "tayyibsupercool/resource_allocation_telecom_spectral_efficiency_instruct",
    private=False,
)

print("Datasets pushed successfully to Hugging Face Hub.")


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/90 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/10 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/509 [00:00<?, ?B/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/90 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/10 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/509 [00:00<?, ?B/s]

Datasets pushed successfully to Hugging Face Hub.
