In [1]:
import boto3
import sagemaker

SAGEMAKER_ROLE = 'arn:aws:iam::483285841698:role/service-role/AmazonSageMaker-ExecutionRole-20200612T174829'
BUCKET = "cci-data-science-devaccount"
PROFILE_NAME = "dsa"
REGION = "us-east-1"

boto_session = boto3.Session(
    region_name=REGION,
    profile_name=PROFILE_NAME,
)

sagemaker_client = boto_session.client(
    service_name="sagemaker"
)

sagemaker_runtime_client = boto_session.client(
    service_name="sagemaker-runtime",
)

s3_client = boto_session.client(
    service_name="s3",
)

sagemaker_session = sagemaker.Session(
    boto_session = boto_session,
    sagemaker_client = sagemaker_client,
    sagemaker_runtime_client = sagemaker_runtime_client,
    default_bucket = BUCKET
)

In [2]:
import pandas as pd
from io import StringIO, BytesIO
from itertools import chain

from datasets import load_dataset, Dataset
import evaluate
from transformers import AutoModelForMaskedLM, AutoTokenizer, DataCollatorForLanguageModeling

In [70]:
# load datasets
ds = load_dataset("bigcode/the-stack", data_dir="data/yaml", streaming=True, split="train")

train_list, test_list = [], []
counter = 0

for sample in iter(ds):
    if '.circleci/config.yml' in sample['max_stars_repo_path']:
        if counter <= 10000:
            train_list.append(sample['content'])
        elif counter <= 20000:
            test_list.append(sample['content'])
        else:
            break
        counter += 1

In [71]:
train_list[:5]

['cache_version_keys: &cache_version_keys\n  CACHE_VERSION_OF_PROJECT_DEPS: v2\n  CACHE_VERSION_OF_DANGER_CACHE: v1\n\ncache_keys:\n  gradle_cache:\n    primary: &primary_key_of_gradle_cache gradle-cache-{{ checksum "~/CACHE_VERSION_OF_PROJECT_DEPS" }}}-{{ checksum "~/project_hash.txt" }}\n    keys: &all_keys_of_gradle_cache\n      - *primary_key_of_gradle_cache\n      - gradle-cache-{{ checksum "~/CACHE_VERSION_OF_PROJECT_DEPS" }}-\n  danger_cache:\n    primary: &primary_key_of_danger_cache danger-cache-{{ checksum "~/CACHE_VERSION_OF_DANGER_CACHE" }}-{{ checksum "~/danger_cache" }}\n    keys: &all_keys_of_danger_cache\n      - *primary_key_of_danger_cache\n      - danger-cache-{{ checksum "~/CACHE_VERSION_OF_DANGER_CACHE" }}-\n\ndocker_env:\n  android_defaults: &android_defaults\n    working_directory: ~/conference-app-2019\n    docker:\n      - image: circleci/android:api-28-alpha\n    environment:\n      <<: *cache_version_keys\n      JAVA_OPTS: "-Xmx1024m"\n      GRADLE_OPTS: \'-D

In [72]:
test_list[:5]

['shared_configs:\n  simple_job_steps: &simple_job_steps\n    - checkout\n    - run:\n        name: Run tests\n        command: |\n          make deps test\n\n\n# Use the latest 2.1 version of CircleCI pipeline process engine. See: https://circleci.com/docs/2.0/configuration-reference\nversion: 2.1\njobs:\n  build-1-12:\n    working_directory: ~/repo\n    docker:\n      - image: circleci/golang:1.12\n    environment:\n      GO111MODULE: "on"\n    steps: *simple_job_steps\n\n  build-1-13:\n    working_directory: ~/repo\n    docker:\n      - image: circleci/golang:1.13\n    steps: *simple_job_steps\n\n  build-1-14:\n    working_directory: ~/repo\n    docker:\n      - image: circleci/golang:1.14\n    steps: *simple_job_steps\n\n  build-1-15:\n    working_directory: ~/repo\n    docker:\n      - image: circleci/golang:1.15\n    environment:\n      GO111MODULE: "on"\n    steps:\n      - checkout\n      - restore_cache:\n          keys:\n            - go-mod-v4-{{ checksum "go.sum" }}\n      

In [83]:
df_train = pd.DataFrame()
df_train['content'] = train_list

df_test = pd.DataFrame()
df_test['content'] = test_list

In [85]:
csv_buffer = StringIO()
df_train.to_csv(csv_buffer, index=False)
s3_client.put_object(
    Bucket=BUCKET,
    Body=csv_buffer.getvalue(),
    Key=f'hackweek-2023/train/train.csv'
)

csv_buffer = StringIO()
df_test.to_csv(csv_buffer, index=False)
s3_client.put_object(
    Bucket=BUCKET,
    Body=csv_buffer.getvalue(),
    Key=f'hackweek-2023/test/test.csv'
)

{'ResponseMetadata': {'RequestId': 'N2211F02SA7XGERK',
  'HostId': 'zHVavKRhzctY1HgohdE8QjTN2kJHGE/0aD2bxLHG7yTMbTxjGZNqtk0Xm2UGBf0upoc8hhaP86g=',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': 'zHVavKRhzctY1HgohdE8QjTN2kJHGE/0aD2bxLHG7yTMbTxjGZNqtk0Xm2UGBf0upoc8hhaP86g=',
   'x-amz-request-id': 'N2211F02SA7XGERK',
   'date': 'Tue, 13 Jun 2023 20:46:34 GMT',
   'x-amz-server-side-encryption': 'AES256',
   'etag': '"3b0d13b22a235f38a1953bdd644ab50f"',
   'server': 'AmazonS3',
   'content-length': '0'},
  'RetryAttempts': 0},
 'ETag': '"3b0d13b22a235f38a1953bdd644ab50f"',
 'ServerSideEncryption': 'AES256'}

In [4]:
obj = s3_client.get_object(
    Bucket=BUCKET,
    Key=f'hackweek-2023/train/train.csv'
)
df_train_from_s3 = pd.read_csv(BytesIO(obj['Body'].read()))

obj = s3_client.get_object(
    Bucket=BUCKET,
    Key=f'hackweek-2023/test/test.csv'
)
df_test_from_s3 = pd.read_csv(BytesIO(obj['Body'].read()))

In [6]:
df_train_from_s3

Unnamed: 0,content
0,cache_version_keys: &cache_version_keys\n CAC...
1,version: 2.1\n\norbs:\n rn: react-native-comm...
2,version: 2.1\njobs:\n python_steps:\n dock...
3,# Javascript Node CircleCI 2.0 configuration f...
4,version: 2.1\n\ncommands:\n tests:\n ...
...,...
9996,# CircleCI Pre-Built Docker Images\n# - https:...
9997,version: 2\n\njobs:\n deploy:\n docker:\n ...
9998,jobs:\n test:\n docker:\n - image: no...
9999,version: 2.1\njobs:\n build:\n docker:\n ...


In [7]:
df_test_from_s3

Unnamed: 0,content
0,shared_configs:\n simple_job_steps: &simple_j...
1,\nversion: 2.1\n\ncommands:\n greeting:\n ...
2,version: 2.1\n\ncommands:\n docker_hub_login:...
3,orbs: # declare what orbs we are going to use\...
4,version: 2.1\norbs:\n node: circleci/node@4.1...
...,...
9995,"version: ""2.1""\ncontext: pypi\nworkflows:\n b..."
9996,version: 2.1\n# This CircleCI orb doesn't seem...
9997,version: 2\nbuild:\n machine:\n java: orac...
9998,---\ndefaults:\n defaults: &defaults\n wor...


In [8]:
train_dataset = Dataset.from_dict({"content": df_train_from_s3.values.tolist()})

train_dataset

Dataset({
    features: ['content'],
    num_rows: 10001
})

In [9]:
list(train_dataset.features)

['content']

In [1]:
import boto3
import sagemaker

SAGEMAKER_ROLE = 'arn:aws:iam::483285841698:role/service-role/AmazonSageMaker-ExecutionRole-20200612T174829'
BUCKET = "cci-data-science-devaccount"
PROFILE_NAME = "dsa"
REGION = "us-east-1"

boto_session = boto3.Session(
    region_name=REGION,
    profile_name=PROFILE_NAME,
)

sagemaker_client = boto_session.client(
    service_name="sagemaker"
)

sagemaker_runtime_client = boto_session.client(
    service_name="sagemaker-runtime",
)

s3_client = boto_session.client(
    service_name="s3",
)

sagemaker_session = sagemaker.Session(
    boto_session = boto_session,
    sagemaker_client = sagemaker_client,
    sagemaker_runtime_client = sagemaker_runtime_client,
    default_bucket = BUCKET
)

In [2]:
from sagemaker.huggingface import HuggingFace

hyperparameters = {
	'model_name':'huggingface/CodeBERTa-small-v1',
	'max_samples':10
	# add your remaining hyperparameters
	# more info here https://github.com/huggingface/transformers/tree/v4.26.0/examples/pytorch/language-modeling
}

# creates Hugging Face estimator
huggingface_estimator = HuggingFace(
	entry_point='train.py',
	source_dir='./scripts',
	# instance_type='ml.p3.2xlarge',
 	instance_type='ml.g4dn.2xlarge',
	instance_count=1,
	role=SAGEMAKER_ROLE,
	sagemaker_session=sagemaker_session,
	transformers_version='4.26',
	pytorch_version='1.13',
	py_version='py39',
	hyperparameters = hyperparameters
)

training_input_path = 's3://cci-data-science-devaccount/hackweek-2023/train'
test_input_path = 's3://cci-data-science-devaccount/hackweek-2023/test'

In [3]:
# starting the train job with our uploaded datasets as input
huggingface_estimator.fit({'train': training_input_path, 'test': test_input_path})

Using provided s3_resource


INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.
INFO:sagemaker:Creating training-job with name: huggingface-pytorch-training-2023-06-14-19-16-30-209


2023-06-14 19:16:31 Starting - Starting the training job...
2023-06-14 19:16:47 Starting - Preparing the instances for training......
2023-06-14 19:18:01 Downloading - Downloading input data......
2023-06-14 19:18:57 Training - Downloading the training image.........
2023-06-14 19:20:32 Training - Training image download completed. Training in progress....bash: cannot set terminal process group (-1): Inappropriate ioctl for device
bash: no job control in this shell
2023-06-14 19:20:58,700 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training
2023-06-14 19:20:58,720 sagemaker-training-toolkit INFO     No Neurons detected (normal if no neurons installed)
2023-06-14 19:20:58,731 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.
2023-06-14 19:20:58,733 sagemaker_pytorch_container.training INFO     Invoking user training script.
2023-06-14 19:20:58,981 sagemaker-training-toolkit INFO     Installing dependencies from

In [4]:
from sagemaker.huggingface import HuggingFaceModel

env = {'HF_TASK': 'fill-mask'}

huggingface_model = HuggingFaceModel(
   model_data="s3://cci-data-science-devaccount/huggingface-pytorch-training-2023-06-14-19-16-30-209/output/model.tar.gz",
   role=SAGEMAKER_ROLE,
   env=env,
   sagemaker_session=sagemaker_session,
   transformers_version="4.26",
   pytorch_version="1.13",
   py_version='py39',
)

predictor = huggingface_model.deploy(1, "ml.g4dn.xlarge")

INFO:sagemaker:Creating model with name: huggingface-pytorch-inference-2023-06-14-19-23-24-560
INFO:sagemaker:Creating endpoint-config with name huggingface-pytorch-inference-2023-06-14-19-23-25-375
INFO:sagemaker:Creating endpoint with name huggingface-pytorch-inference-2023-06-14-19-23-25-375


--

In [12]:
fillmask_input = {"inputs": """
version: 2.1

# Define the jobs we want to run for this project
<mask>:
  build:
    docker:
      - image: cimg/base:2023.03
    steps:
      - checkout
      - run: echo "this is the build job"
  test:
    docker:
      - image: cimg/base:2023.03
    steps:
      - checkout
      - run: echo "this is the test job"

# Orchestrate our job run sequence
workflows:
  build_and_test:
    jobs:
      - build
      - test  
"""}

predictor.predict(fillmask_input)

ModelError: An error occurred (ModelError) when calling the InvokeEndpoint operation: Received client error (400) from primary with message "{
  "code": 400,
  "type": "InternalServerException",
  "message": "/opt/ml/model does not appear to have a file named config.json. Checkout \u0027https://huggingface.co//opt/ml/model/None\u0027 for available files."
}
". See https://us-east-1.console.aws.amazon.com/cloudwatch/home?region=us-east-1#logEventViewer:group=/aws/sagemaker/Endpoints/huggingface-pytorch-inference-2023-06-14-18-30-26-554 in account 483285841698 for more information.