In [1]:
!pip install google-cloud google-cloud-secret-manager

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting google-cloud
  Downloading google_cloud-0.34.0-py2.py3-none-any.whl (1.8 kB)
Collecting google-cloud-secret-manager
  Downloading google_cloud_secret_manager-2.16.1-py2.py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.7/116.7 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
Collecting grpc-google-iam-v1<1.0.0dev,>=0.12.4
  Downloading grpc_google_iam_v1-0.12.6-py2.py3-none-any.whl (26 kB)
Installing collected packages: google-cloud, grpc-google-iam-v1, google-cloud-secret-manager
Successfully installed google-cloud-0.34.0 google-cloud-secret-manager-2.16.1 grpc-google-iam-v1-0.12.6


In [2]:
from google.cloud import secretmanager
from google.colab import auth


def get_secret(secret_name, proj='aigamer-383501'):
    """Return the latest version of a Secret Manager secret as text.

    Args:
        secret_name: Identifier of the secret inside the project.
        proj: Google Cloud project id that owns the secret.

    Returns:
        The secret payload decoded as UTF-8.
    """
    # Colab authentication is (re)triggered on every call before the client is built.
    auth.authenticate_user()
    sm_client = secretmanager.SecretManagerServiceClient()
    version_path = f"projects/{proj}/secrets/{secret_name}/versions/latest"
    reply = sm_client.access_secret_version(request={"name": version_path})
    raw_bytes = reply.payload.data
    return raw_bytes.decode('UTF-8')


def get_aws_creds():
    """Fetch the AWS credential pair from the secret store.

    Returns:
        A ``(key, secret)`` tuple read from the 'aws-key' and 'aws-secret' secrets,
        looked up in that order.
    """
    secret_ids = ('aws-key', 'aws-secret')
    access_key, secret_key = (get_secret(secret_id) for secret_id in secret_ids)
    return access_key, secret_key


def get_github_creds(out_path='github-key'):
    """Write the 'github-key' secret to a local file readable only by the owner.

    Args:
        out_path: Destination file path; an existing file is overwritten.

    Returns:
        None. The secret is written to ``out_path`` as a side effect.
    """
    import os  # local import: `os` is not imported by this cell

    key_str = get_secret('github-key')
    # Create the file with 0600 from the start so the credential is never
    # world-readable, not even briefly (plain open() would honour the umask).
    fd = os.open(out_path, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600)
    with os.fdopen(fd, 'w') as f:
        f.write(key_str)
    # The mode passed to os.open is ignored when the file already existed,
    # so tighten permissions explicitly as well.
    os.chmod(out_path, 0o600)


In [3]:
# Clone the project repository into ./labs (relative to the current working directory).
# NOTE(review): re-running this cell fails once ./labs already exists.
!git clone https://github.com/yetanotheruseless/labs.git

Cloning into 'labs'...
remote: Enumerating objects: 89, done.[K
remote: Counting objects: 100% (89/89), done.[K
remote: Compressing objects: 100% (66/66), done.[K
remote: Total 89 (delta 31), reused 60 (delta 15), pack-reused 0[K
Unpacking objects: 100% (89/89), 57.96 KiB | 2.07 MiB/s, done.


In [4]:
# NOTE: `!cd` runs in a throwaway subshell, so this does NOT change the notebook's
# working directory (the following `!pwd` still reports /content). Later cells rely on
# paths relative to /content, so this line is effectively a no-op.
!cd labs/src

In [5]:
# Show the kernel's working directory; relative paths in later cells resolve against it.
!pwd

/content


In [6]:
# Pull the AWS key/secret pair from the secret store; used later for the S3 download.
aws_key, aws_secret = get_aws_creds()

In [7]:
import sys
# Put the cloned repo's src directory first on the import path.
# Presumably this is what makes the later `data.make_dataset` and
# `models.train_model` imports resolvable - verify against the repo layout.
sys.path.insert(0, '/content/labs/src')

In [8]:
# Install the repo's requirements. The `cd` only applies inside this one shell line,
# which is enough for pip to resolve requirements.txt (and its editable install of
# the repo itself) relative to ./labs.
!cd labs && pip install -r requirements.txt

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Obtaining file:///content/labs (from -r requirements.txt (line 2))
  Installing build dependencies ... [?25l[?25hdone
  Checking if build backend supports build_editable ... [?25l[?25hdone
  Getting requirements to build editable ... [?25l[?25hdone
  Preparing editable metadata (pyproject.toml) ... [?25l[?25hdone
Collecting pre-commit
  Downloading pre_commit-3.2.2-py2.py3-none-any.whl (202 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m202.7/202.7 kB[0m [31m20.0 MB/s[0m eta [36m0:00:00[0m
Collecting coverage
  Downloading coverage-7.2.3-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (227 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m227.5/227.5 kB[0m [31m28.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting awscli>=1.27.94
  Downloading awscli-1.27.111-py3-none-any.whl (4.0 MB)
[2K     

In [9]:
from data.make_dataset import *

In [10]:
books_prefix = 'projects/AI-RPG/interim/'
core_rulebook = books_prefix + "The_Strange_Consumer_PDF_Bookmarked_and_Linked_2015-06-23-text.tgz"
out_base = 'data'
!mkdir data


In [11]:
import boto3
import io
import tarfile

def download_and_uncompress_tgzs(aws_key, aws_secret, bucket_name, exclude_string, local_directory):
    """Download every ``.tgz`` object in an S3 bucket and extract it locally.

    Args:
        aws_key: AWS access key id.
        aws_secret: AWS secret access key.
        bucket_name: Name of the S3 bucket to scan.
        exclude_string: Objects whose key contains this substring are skipped.
            An empty string or ``None`` disables the exclusion.
        local_directory: Directory the archives are extracted into.

    Returns:
        None. Files are written under ``local_directory`` and one progress line
        is printed per extracted archive.
    """
    # Initialize a boto3 client with the provided AWS credentials
    s3_client = boto3.client(
        's3',
        aws_access_key_id=aws_key,
        aws_secret_access_key=aws_secret
    )

    # A single list_objects_v2 call returns at most 1000 keys, so walk all pages.
    paginator = s3_client.get_paginator('list_objects_v2')
    for page in paginator.paginate(Bucket=bucket_name):
        # 'Contents' is absent when a page (or the whole bucket) has no objects.
        for s3_object in page.get('Contents', []):
            s3_object_key = s3_object['Key']
            if not s3_object_key.endswith('.tgz'):
                continue
            # Guard on truthiness: `'' in key` is always True and would skip everything.
            if exclude_string and exclude_string in s3_object_key:
                continue

            # Download the .tgz file from the S3 bucket into memory
            s3_response = s3_client.get_object(Bucket=bucket_name, Key=s3_object_key)
            tgz_buffer = io.BytesIO(s3_response['Body'].read())

            # NOTE(review): extractall() trusts the member paths stored in the archive;
            # only point this at buckets whose contents you control.
            with tarfile.open(fileobj=tgz_buffer, mode="r:gz") as tar:
                tar.extractall(path=local_directory)

            print(f"Downloaded and uncompressed {s3_object_key} from bucket {bucket_name} to {local_directory}")



In [12]:
import os
import random
import shutil

def create_data_split(source_dir, output_dir, train_ratio=0.8, test_ratio=0.1, valid_ratio=0.1, seed=None):
    """Randomly copy the ``.txt`` files under ``source_dir`` into train/test/validation folders.

    Every ``.txt`` file found (recursively) is copied to
    ``output_dir/<split>/<split>_<parent dir name>_<file name>``.

    Args:
        source_dir: Directory tree to scan for ``.txt`` files.
        output_dir: Directory that receives the ``train``, ``test`` and
            ``validation`` sub-directories (created if missing).
        train_ratio: Fraction of files assigned to the train split.
        test_ratio: Fraction of files assigned to the test split.
        valid_ratio: Fraction of files assigned to the validation split
            (in practice: whatever remains after train and test).
        seed: Optional seed for a reproducible shuffle. ``None`` keeps using
            the module-level ``random`` state.

    Raises:
        AssertionError: If the three ratios do not sum to 1.
    """
    import math  # local import: the notebook's import cells do not provide math

    # Compare with a tolerance: e.g. 0.7 + 0.2 + 0.1 evaluates to
    # 0.9999999999999999 and would fail an exact `== 1` check.
    ratio_sum = train_ratio + test_ratio + valid_ratio
    assert math.isclose(ratio_sum, 1.0, rel_tol=1e-9, abs_tol=1e-9), "Ratios must sum to 1."

    split_names = ('train', 'test', 'validation')
    for split in split_names:
        os.makedirs(os.path.join(output_dir, split), exist_ok=True)

    # Sort first so the shuffle input does not depend on filesystem walk order;
    # together with `seed` this makes the split reproducible.
    all_files = sorted(
        os.path.join(root, name)
        for root, _dirs, files in os.walk(source_dir)
        for name in files
        if name.endswith('.txt')
    )

    rng = random if seed is None else random.Random(seed)
    rng.shuffle(all_files)

    train_end = int(len(all_files) * train_ratio)
    test_end = int(len(all_files) * (train_ratio + test_ratio))
    files_by_split = {
        'train': all_files[:train_end],
        'test': all_files[train_end:test_end],
        'validation': all_files[test_end:],
    }

    for split in split_names:
        for path in files_by_split[split]:
            parent_name = os.path.basename(os.path.dirname(path))
            # NOTE(review): two sources sharing both parent directory name and
            # file name map to the same target and overwrite each other.
            target_name = f"{split}_{parent_name}_{os.path.basename(path)}"
            shutil.copy(path, os.path.join(output_dir, split, target_name))


In [13]:
# Fetch every .tgz in the bucket whose key does not contain "-images" and unpack it into data/core.
download_and_uncompress_tgzs(aws_key, aws_secret, 'yetanotheruseless-data', "-images", 'data/core')

Downloaded and uncompressed projects/AI-RPG/interim/Alternate_Origins-2015-10-27-text.tgz from bucket yetanotheruseless-data to data/core
Downloaded and uncompressed projects/AI-RPG/interim/Cults_Factions_and_Syndicates_Glimmer-HyperlinkedBookmarked-2016-02-27-text.tgz from bucket yetanotheruseless-data to data/core
Downloaded and uncompressed projects/AI-RPG/interim/Cypher_System_Rulebook-HLBM-2015-07-06-text.tgz from bucket yetanotheruseless-data to data/core
Downloaded and uncompressed projects/AI-RPG/interim/Encyclopedia_of_Impossible_Things-2016-02-04-text.tgz from bucket yetanotheruseless-data to data/core
Downloaded and uncompressed projects/AI-RPG/interim/Impossible_Vehicles-2015-11-16-text.tgz from bucket yetanotheruseless-data to data/core
Downloaded and uncompressed projects/AI-RPG/interim/In_Translation_ST_Character_Options-HLBM-2015-02-23-text.tgz from bucket yetanotheruseless-data to data/core
Downloaded and uncompressed projects/AI-RPG/interim/Mastodon-BMHL-2015-09-19-te

In [14]:
# Split the extracted text files into data/training/{train,test,validation} using the default 80/10/10 ratios.
create_data_split('data/core', 'data/training')

In [15]:
# Fine-tuning stack: 8-bit loading (bitsandbytes), datasets, accelerate and LoRA support.
# NOTE(review): nothing here is version-pinned, and transformers/peft are built from the
# current tip of their repositories, so a re-run may pick up different code than the run
# recorded below.
!pip install -q bitsandbytes datasets accelerate loralib
!pip install -q git+https://github.com/huggingface/transformers.git@main git+https://github.com/huggingface/peft.git

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m468.7/468.7 kB[0m [31m34.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.2/212.2 kB[0m [31m26.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m132.9/132.9 kB[0m [31m17.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.5/110.5 kB[0m [31m16.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Building wheel for transformers (pyproject.toml) ... [?25l[?25hdone
  Building wheel for peft (pyproject.toml) ... [?25l[?25hdone


In [16]:
from models.train_model import *


Welcome to bitsandbytes. For bug reports, please run

python -m bitsandbytes

 and submit this information together with your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
bin /usr/local/lib/python3.9/dist-packages/bitsandbytes/libbitsandbytes_cuda118.so
CUDA SETUP: CUDA runtime path found: /usr/local/cuda/lib64/libcudart.so.11.0
CUDA SETUP: Highest compute capability among GPUs detected: 8.0
CUDA SETUP: Detected CUDA version 118
CUDA SETUP: Loading binary /usr/local/lib/python3.9/dist-packages/bitsandbytes/libbitsandbytes_cuda118.so...


  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
Either way, this might cause trouble in the future:
If you get `CUDA error: invalid device function` errors, the above might be the cause and the solution is to make sure only one ['libcudart.so', 'libcudart.so.11.0', 'libcudart.so.12.0'] in the paths that we search based on your env.
  warn(msg)


In [17]:
def get_model_and_tokenizers(model_name="facebook/opt-6.7b", torch_dtype=torch.float32):
    """Load a causal LM in 8-bit, freeze it, and wrap it with LoRA adapters.

    Args:
        model_name: Checkpoint name or path passed to ``from_pretrained``.
        torch_dtype: When ``torch.float16`` the loaded model is additionally
            passed through ``.half()``; any other value leaves it unchanged.

    Returns:
        A ``(model, tokenizer)`` tuple where ``model`` is the LoRA-wrapped model
        returned by ``get_peft_model``.
    """
    # NOTE(review): this only restricts visible GPUs if CUDA has not been
    # initialised yet in this process - confirm it has the intended effect.
    os.environ["CUDA_VISIBLE_DEVICES"] = "0"
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        load_in_8bit=True,
        device_map='auto',
    )
    model = model.half() if torch_dtype == torch.float16 else model
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    for param in model.parameters():
        param.requires_grad = False  # freeze the model - train adapters later
        # This check must run for EVERY parameter; previously it sat outside the
        # loop and therefore only touched the last parameter.
        if param.ndim == 1:
            # cast the small parameters (e.g. layernorm) to fp32 for stability
            param.data = param.data.to(torch.float32)

    model.gradient_checkpointing_enable()  # reduce number of stored activations
    model.enable_input_require_grads()
    # CastOutputToFloat comes from the star-import above; presumably it upcasts
    # the LM head output to fp32 - verify in models.train_model.
    model.lm_head = CastOutputToFloat(model.lm_head)

    # LoRA adapters on the attention query/value projections only.
    config = LoraConfig(
        r=16,
        lora_alpha=32,
        target_modules=["q_proj", "v_proj"],
        lora_dropout=0.05,
        bias="none",
        task_type="CAUSAL_LM"
    )
    model = get_peft_model(model, config)
    print_trainable_parameters(model)
    return model, tokenizer

In [18]:
model, tokenizer = get_model_and_tokenizers()

Downloading (…)lve/main/config.json:   0%|          | 0.00/651 [00:00<?, ?B/s]



Downloading (…)model.bin.index.json:   0%|          | 0.00/41.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading (…)l-00001-of-00002.bin:   0%|          | 0.00/9.96G [00:00<?, ?B/s]

Downloading (…)l-00002-of-00002.bin:   0%|          | 0.00/3.36G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading (…)neration_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/685 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/441 [00:00<?, ?B/s]

trainable params: 8388608 || all params: 6666862592 || trainable%: 0.12582542214183376


In [19]:
import transformers
from datasets import load_dataset
# Load the train/test/validation text folders and tokenize the 'text' column in batches.
def _tokenize_batch(samples):
    return tokenizer(samples['text'])


data = load_dataset("data/training")
data = data.map(_tokenize_batch, batched=True)

# Effective batch size is 4 x 4 = 16 sequences per optimizer step.
# NOTE(review): the loss logged for this run does not show a clear downward trend
# (step 1: 3.97, run average: 4.51) - the learning rate / step budget may deserve a second look.
training_args = transformers.TrainingArguments(
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    warmup_steps=100,
    max_steps=500,
    learning_rate=2e-6,
    fp16=True,
    tf32=True,
    logging_steps=1,
    output_dir='outputs',
)
# mlm=False selects causal (next-token) language-modeling batches.
lm_collator = transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False)

trainer = transformers.Trainer(
    model=model,
    train_dataset=data['train'],
    args=training_args,
    data_collator=lm_collator,
)
# Disabled for training to avoid warnings; turn it back on before generating text.
model.config.use_cache = False
trainer.train()

Resolving data files:   0%|          | 0/2320 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/290 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/290 [00:00<?, ?it/s]

Downloading and preparing dataset text/training to /root/.cache/huggingface/datasets/text/training-03f14aba2524a26e/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Dataset text downloaded and prepared to /root/.cache/huggingface/datasets/text/training-03f14aba2524a26e/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

Map:   0%|          | 0/80088 [00:00<?, ? examples/s]

Map:   0%|          | 0/11052 [00:00<?, ? examples/s]

Map:   0%|          | 0/9072 [00:00<?, ? examples/s]



<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
1,3.9733
2,4.613
3,5.5064
4,4.2481
5,5.0281
6,4.8627
7,4.4009
8,4.5123
9,4.9654
10,5.08


TrainOutput(global_step=500, training_loss=4.512078310012817, metrics={'train_runtime': 1523.0495, 'train_samples_per_second': 5.253, 'train_steps_per_second': 0.328, 'total_flos': 1.658470967083008e+16, 'train_loss': 4.512078310012817, 'epoch': 0.1})

In [None]:
from IPython.display import Markdown as md


# Switch to inference mode: training leaves the model in train() mode, which keeps the
# LoRA dropout active, and the training cell turned the KV cache off.
model.eval()
model.config.use_cache = True

batch = tokenizer("What kind of recursions could my character in the game 'The Strange' come from?", return_tensors='pt')
with torch.cuda.amp.autocast():
  output_tokens = model.generate(**batch, min_length=100, max_new_tokens=400, repetition_penalty=2.0)

# Render the decoded continuation as Markdown (last expression -> cell output).
md('\n\n' + tokenizer.decode(output_tokens[0], skip_special_tokens=True))