In [1]:
!pip install --upgrade sagemaker
!pip install gensim

Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com
Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com


In [2]:
import sagemaker
from sagemaker import get_execution_role
from sagemaker.amazon.amazon_estimator import get_image_uri
from sagemaker.estimator import Estimator
from sagemaker.session import s3_input
import boto3

In [3]:
# SageMaker 실행 역할 가져오기
role = get_execution_role()

# SageMaker 세션 생성
sagemaker_session = sagemaker.Session()


In [4]:
# BlazingText 학습을 위한 데이터 경로
region_name = boto3.Session().region_name
train_data = 's3://dothis-ai-test/data/video_data_20230414.txt'
s3_output_location = 's3://dothis-ai-test/models'
container = sagemaker.amazon.amazon_estimator.get_image_uri(region_name, "blazingtext", "latest")
# 유효한 trainingJobName으로 변경
training_job_name = 'BlazingText-job-skipgram'
print('Using SageMaker BlazingText container: {} ({})'.format(container, region_name))

train_use_spot_instances = True
train_max_run = 3600
train_max_wait = 3600 if train_use_spot_instances else None



bt_model = sagemaker.estimator.Estimator(container,
                                         role, 
                                         train_instance_count=1, # 학습 인스턴스의 수입니다. 병렬 학습을 위해 여러 인스턴스를 사용할 수 있습니다.
                                         train_instance_type='ml.m5.xlarge', # 학습 인스턴스의 유형입니다. 인스턴스 유형은 학습 작업에 사용되는 리소스의 크기와 성
                                         train_volume_size=5, # 학습 인스턴스의 EBS 볼륨 크기입니다. 학습 데이터 및 모델 아티팩트를 저장하기 위해 필요한 디스크 공간을 지정합니다.
                                         input_mode='File', # 입력 데이터의 형식을 지정하는 옵션입니다. 여기서는 "File"을 사용하여 파일 형식의 데이터를 지정합니다.
                                         output_path=s3_output_location, # 학습된 모델 아티팩트의 저장 위치입니다. 학습 작업이 완료된 후, 모델 아티팩트는 지정된 S3 경로에 저장됩니다.
                                         train_max_run=train_max_run, # 학습 작업의 최대 실행 시간(초)입니다. 지정된 시간을 초과하면 학습 작업이 중지됩니다.
                                         sagemaker_session=sagemaker_session,
                                         train_use_spot_instances=train_use_spot_instances,
                                         train_max_wait=train_max_wait,
                                         base_job_name=training_job_name
                                        )

bt_model.set_hyperparameters(mode="skipgram",
                             epochs=5,
                             min_count=5,
                             sampling_threshold=0.0001,
                             learning_rate=0.05,
                             window_size=5,
                             vector_dim=100,
                             negative_samples=5,
                             batch_size=11, #  = (2*window_size + 1) (Preferred. Used only if mode is batch_skipgram)
                             evaluation=False,# Perform similarity evaluation on WS-353 dataset at the end of training
                             subwords=False) # Subword embedding learning is not supported by batch_skipgram

The method get_image_uri has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
Defaulting to the only supported framework/algorithm version: 1. Ignoring framework/algorithm version: latest.
train_instance_count has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
train_instance_type has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
train_max_run has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
train_use_spot_instances has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
train_max_wait has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
train_volume_size has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


Using SageMaker BlazingText container: 306986355934.dkr.ecr.ap-northeast-2.amazonaws.com/blazingtext:1 (ap-northeast-2)


In [5]:
train_data = sagemaker.session.s3_input(train_data, distribution='FullyReplicated', 
                        content_type='text/plain', s3_data_type='S3Prefix')
data_channels = {'train': train_data}

The class sagemaker.session.s3_input has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


In [6]:
bt_model.fit(inputs=data_channels, logs=True)

INFO:sagemaker:Creating training-job with name: BlazingText-job-skipgram-2023-06-13-02-41-22-038


2023-06-13 02:41:22 Starting - Starting the training job......
2023-06-13 02:41:58 Starting - Preparing the instances for training...
2023-06-13 02:42:49 Downloading - Downloading input data...
2023-06-13 02:43:20 Training - Training image download completed. Training in progress.....[34mArguments: train[0m
[34m[06/13/2023 02:43:32 INFO 140048191964992] nvidia-smi took: 0.05026364326477051 secs to identify 0 gpus[0m
[34m[06/13/2023 02:43:32 INFO 140048191964992] Running single machine CPU BlazingText training using skipgram mode.[0m
[34mNumber of CPU sockets found in instance is  1[0m
[34m[06/13/2023 02:43:32 INFO 140048191964992] Processing /opt/ml/input/data/train/video_data_20230414.txt . File size: 433.05553245544434 MB[0m
[34mRead 10M words[0m
[34mRead 20M words[0m
[34mRead 30M words[0m
[34mRead 40M words[0m
[34mRead 45M words[0m
[34mNumber of words:  349278[0m
[34m##### Alpha: 0.0490  Progress: 2.00%  Million Words/sec: 0.41 #####[0m
[34m##### Alpha: 0.04

In [19]:
bt_model.model_data

's3://dothis-ai-test/models/BlazingText-job-skipgram-2023-06-13-02-41-22-038/output/model.tar.gz'

In [7]:
bucket = 'dothis-ai-test'  # Replace with your own bucket name if needed

s3 = boto3.resource('s3')
key = bt_model.model_data[bt_model.model_data.find("/", 5)+1:]
s3.Bucket(bucket).download_file(key, 'model.tar.gz')

INFO:botocore.credentials:Found credentials from IAM Role: BaseNotebookInstanceEc2InstanceRole


In [8]:
!tar -xvzf model.tar.gz

vectors.txt
vectors.bin


In [9]:
from gensim.models import KeyedVectors
word_vectors = KeyedVectors.load_word2vec_format('vectors.txt', binary=False, encoding='utf-8')
word_vectors.most_similar(positive=['서울', '도쿄'], negative=['한국'])

INFO:gensim.models.keyedvectors:loading projection weights from vectors.txt
INFO:gensim.utils:KeyedVectors lifecycle event {'msg': 'loaded (349278, 100) matrix of type float32 from vectors.txt', 'binary': False, 'encoding': 'utf-8', 'datetime': '2023-06-13T02:55:22.213410', 'gensim': '4.3.1', 'python': '3.10.8 | packaged by conda-forge | (main, Nov 22 2022, 08:26:04) [GCC 10.4.0]', 'platform': 'Linux-5.10.157-139.675.amzn2.x86_64-x86_64-with-glibc2.26', 'event': 'load_word2vec_format'}


[('엠갤러리', 0.6206796169281006),
 ('소피텔', 0.5910489559173584),
 ('서울벚꽃', 0.5787157416343689),
 ('에비뉴엘', 0.5724790096282959),
 ('인터컨', 0.5693978667259216),
 ('코엑스몰', 0.5510305762290955),
 ('서울불꽃축제', 0.5491561889648438),
 ('긴자', 0.5473495721817017),
 ('합정역', 0.5470298528671265),
 ('서울세계불꽃축제', 0.5457656979560852)]

In [10]:
word_vectors.most_similar('택시')

[('우버택시', 0.8518080711364746),
 ('택시티비', 0.8410633206367493),
 ('막까파TV', 0.8402019143104553),
 ('막가파', 0.8248759508132935),
 ('승차난', 0.822329044342041),
 ('부제해제', 0.8217628598213196),
 ('카카오t블루', 0.8184545040130615),
 ('사납금', 0.8119112849235535),
 ('막까파', 0.8114938735961914),
 ('막가파TV', 0.8083483576774597)]

In [26]:
key

'models/BlazingText-job-skipgram-2023-06-13-02-41-22-038/output/model.tar.gz'

In [28]:
bucket = 'dothis-ai-test'  # Replace with your own bucket name if needed
s3 = boto3.resource('s3')
key = 'models/BlazingText-job-skipgram-2023-06-13-02-09-19-137/output/model.tar.gz'
s3.Bucket(bucket).download_file(key, 'model.tar.gz')

In [29]:
!tar -xvzf model.tar.gz

vectors.bin
vectors.txt


In [30]:
from gensim.models import KeyedVectors
word_vectors = KeyedVectors.load_word2vec_format('vectors.txt', binary=False, encoding='utf-8')
word_vectors.most_similar(positive=['서울', '도쿄'], negative=['한국'])

INFO:gensim.models.keyedvectors:loading projection weights from vectors.txt
INFO:gensim.utils:KeyedVectors lifecycle event {'msg': 'loaded (349278, 100) matrix of type float32 from vectors.txt', 'binary': False, 'encoding': 'utf-8', 'datetime': '2023-06-13T03:08:43.433846', 'gensim': '4.3.1', 'python': '3.10.8 | packaged by conda-forge | (main, Nov 22 2022, 08:26:04) [GCC 10.4.0]', 'platform': 'Linux-5.10.157-139.675.amzn2.x86_64-x86_64-with-glibc2.26', 'event': 'load_word2vec_format'}


[('잠실역', 0.5522750020027161),
 ('사당역', 0.5393338799476624),
 ('빈드', 0.5191420316696167),
 ('신사역', 0.5098044872283936),
 ('잠실', 0.5079963803291321),
 ('서울숲', 0.5060157179832458),
 ('우이동', 0.5046820044517517),
 ('문정동', 0.5037183165550232),
 ('상계동', 0.5033244490623474),
 ('올림픽공원', 0.5030790567398071)]

In [31]:
word_vectors.most_similar('택시')

[('택시호출', 0.7801095247268677),
 ('택시비', 0.7335453033447266),
 ('기본요금', 0.7140587568283081),
 ('카카오택시', 0.7060292363166809),
 ('막까파TV', 0.6872467398643494),
 ('마카롱택시', 0.6766425371170044),
 ('막가파', 0.6697238683700562),
 ('택시기사', 0.6683712601661682),
 ('타다금지법', 0.6601734757423401),
 ('할증', 0.6584334969520569)]