# Setup for running customization notebooks both for fine-tuning and continued pre-training using Amazon Bedrock

In this notebook, we will create a set of roles and an S3 bucket that will be used by the other notebooks in this module.

## Setup
Install and import all the needed libraries and dependencies to complete this notebook.

<div class="alert alert-block alert-warning">
<b>Warning:</b> Please ignore error messages related to pip's dependency resolver.
</div>

In [1]:
!pip install --upgrade pip
%pip install --no-build-isolation --force-reinstall \
    "boto3>=1.28.57" \
    "awscli>=1.29.57" \
    "botocore>=1.31.57"
!pip install pandas
!pip install datasets

Collecting boto3>=1.28.57
  Downloading boto3-1.35.8-py3-none-any.whl.metadata (6.6 kB)
Collecting awscli>=1.29.57
  Downloading awscli-1.34.8-py3-none-any.whl.metadata (11 kB)
Collecting botocore>=1.31.57
  Downloading botocore-1.35.8-py3-none-any.whl.metadata (5.7 kB)
Collecting jmespath<2.0.0,>=0.7.1 (from boto3>=1.28.57)
  Using cached jmespath-1.0.1-py3-none-any.whl.metadata (7.6 kB)
Collecting s3transfer<0.11.0,>=0.10.0 (from boto3>=1.28.57)
  Using cached s3transfer-0.10.2-py3-none-any.whl.metadata (1.7 kB)
Collecting docutils<0.17,>=0.10 (from awscli>=1.29.57)
  Using cached docutils-0.16-py2.py3-none-any.whl.metadata (2.7 kB)
Collecting PyYAML<6.1,>=3.10 (from awscli>=1.29.57)
  Using cached PyYAML-6.0.2-cp312-cp312-macosx_11_0_arm64.whl.metadata (2.1 kB)
Collecting colorama<0.4.7,>=0.2.5 (from awscli>=1.29.57)
  Using cached colorama-0.4.6-py2.py3-none-any.whl.metadata (17 kB)
Collecting rsa<4.8,>=3.1.2 (from awscli>=1.29.57)
  Using cached rsa-4.7.2-py3-none-any.whl.metadata

In [2]:
# Restart the kernel so the packages installed above take effect.
# IPython.display is the public import location for HTML.
from IPython.display import HTML

# NOTE(review): the injected script relies on the `Jupyter` JavaScript global;
# presumably only the classic Notebook front end provides it — confirm, and
# restart the kernel manually in other front ends.
HTML("<script>Jupyter.notebook.kernel.restart()</script>")

In [3]:
import pandas as pd
from datasets import Dataset

# Load the CSV file (semicolon-separated)
csv_file_path = "sourcestraining.csv"  # Ensure this path points to your CSV file
df = pd.read_csv(csv_file_path, delimiter=';')

# Rename columns to match input/output for translation.
# The file must contain exactly two columns; otherwise pandas raises ValueError here.
df.columns = ['translation_input', 'translation_target']

# Convert the DataFrame to a Hugging Face Dataset
dataset = Dataset.from_pandas(df)

# Fixed seed so the shuffled splits are identical on every Restart & Run All
SPLIT_SEED = 42

# Hold out 20% of the rows for testing, then 10% of the remaining rows for validation
train_test_split = dataset.train_test_split(test_size=0.2, seed=SPLIT_SEED)
train_val_split = train_test_split['train'].train_test_split(test_size=0.1, seed=SPLIT_SEED)

train_dataset = train_val_split['train']
validation_dataset = train_val_split['test']
test_dataset = train_test_split['test']

# Display the size of each split
print(f"Training set size: {len(train_dataset)}")
print(f"Validation set size: {len(validation_dataset)}")
print(f"Test set size: {len(test_dataset)}")

  from .autonotebook import tqdm as notebook_tqdm


FileNotFoundError: [Errno 2] No such file or directory: 'sourcestraining.csv'

In [None]:
import os

# Output directory and the CSV file name used for each split
dataset_folder = "fine-tuning-datasets"
train_file_name = "train.csv"
validation_file_name = "validation.csv"
test_file_name = "test.csv"

# exist_ok=True keeps this cell re-runnable when the folder is already present
os.makedirs(dataset_folder, exist_ok=True)

# Write each split to its own CSV file, leaving out the pandas index column
for split, file_name in (
    (train_dataset, train_file_name),
    (validation_dataset, validation_file_name),
    (test_dataset, test_file_name),
):
    split.to_pandas().to_csv(os.path.join(dataset_folder, file_name), index=False)

# Report the absolute location of the saved files
abs_path = os.path.abspath(dataset_folder)
print(f"Datasets saved to {abs_path}")

## Storing variables to be used in other notebooks if needed. 

> Please make sure that the other notebooks on fine-tuning and continued pre-training use the same kernel that was used for 00_setup.ipynb.

In [None]:
# NOTE(review): these %store commands are commented out, and none of the
# variables they name are assigned in the cells visible here — uncomment a
# line only after the corresponding variable has been defined.
#%store role_arn
#%store bucket_name
#%store role_name
#%store policy_arn
#%store s3_train_uri
#%store s3_validation_uri
#%store s3_test_uri