In [1]:
!pip install --upgrade sagemaker
!pip install gensim

Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com
Collecting sagemaker
  Downloading sagemaker-2.165.0.tar.gz (803 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m803.4/803.4 kB[0m [31m20.9 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting attrs<24,>=23.1.0 (from sagemaker)
  Using cached attrs-23.1.0-py3-none-any.whl (61 kB)
Collecting PyYAML==6.0 (from sagemaker)
  Using cached PyYAML-6.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (682 kB)
Building wheels for collected packages: sagemaker
  Building wheel for sagemaker (setup.py) ... [?25ldone
[?25h  Created wheel for sagemaker: filename=sagemaker-2.165.0-py2.py3-none-any.whl size=1082430 sha256=a64ee89e45a43d3d6f3c66917089eae5cf2d253b8d97c90648177c8c99e9faf6
  Stored in directory: /home/ec2-user/.cache/pip/wheels/a1/fe/a8/22f3ba84480fbe8002da7043e9c7f2ad73e2d0949bf9

Successfully installed gensim-4.3.1 smart-open-6.3.0


In [2]:
import sagemaker
from sagemaker import get_execution_role
from sagemaker.amazon.amazon_estimator import get_image_uri
from sagemaker.estimator import Estimator
from sagemaker.session import s3_input
import boto3

In [3]:
# SageMaker 실행 역할 가져오기
role = get_execution_role()

# SageMaker 세션 생성
sagemaker_session = sagemaker.Session()


In [4]:
# BlazingText 학습을 위한 데이터 경로
region_name = boto3.Session().region_name
train_data = 's3://dothis-ai-test/data/video_data_20230414.txt'
s3_output_location = 's3://dothis-ai-test/models'
container = sagemaker.amazon.amazon_estimator.get_image_uri(region_name, "blazingtext", "latest")
# 유효한 trainingJobName으로 변경
training_job_name = 'BlazingText-cbow-job'
print('Using SageMaker BlazingText container: {} ({})'.format(container, region_name))

train_use_spot_instances = True
train_max_run = 3600
train_max_wait = 3600 if train_use_spot_instances else None



bt_model = sagemaker.estimator.Estimator(container,
                                         role, 
                                         train_instance_count=1, # 학습 인스턴스의 수입니다. 병렬 학습을 위해 여러 인스턴스를 사용할 수 있습니다.
                                         train_instance_type='ml.m5.xlarge', # 학습 인스턴스의 유형입니다. 인스턴스 유형은 학습 작업에 사용되는 리소스의 크기와 성
                                         train_volume_size=5, # 학습 인스턴스의 EBS 볼륨 크기입니다. 학습 데이터 및 모델 아티팩트를 저장하기 위해 필요한 디스크 공간을 지정합니다.
                                         input_mode='File', # 입력 데이터의 형식을 지정하는 옵션입니다. 여기서는 "File"을 사용하여 파일 형식의 데이터를 지정합니다.
                                         output_path=s3_output_location, # 학습된 모델 아티팩트의 저장 위치입니다. 학습 작업이 완료된 후, 모델 아티팩트는 지정된 S3 경로에 저장됩니다.
                                         train_max_run=train_max_run, # 학습 작업의 최대 실행 시간(초)입니다. 지정된 시간을 초과하면 학습 작업이 중지됩니다.
                                         sagemaker_session=sagemaker_session,
                                         train_use_spot_instances=train_use_spot_instances,
                                         train_max_wait=train_max_wait,
                                         base_job_name=training_job_name
                                        )

bt_model.set_hyperparameters(mode="cbow",
                             epochs=5,
                             min_count=5,
                             sampling_threshold=0.0001,
                             learning_rate=0.05,
                             window_size=5,
                             vector_dim=100,
                             negative_samples=5,
                             batch_size=11, #  = (2*window_size + 1) (Preferred. Used only if mode is batch_skipgram)
                             evaluation=False,# Perform similarity evaluation on WS-353 dataset at the end of training
                             subwords=False) # Subword embedding learning is not supported by batch_skipgram

The method get_image_uri has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
Defaulting to the only supported framework/algorithm version: 1. Ignoring framework/algorithm version: latest.
train_instance_count has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
train_instance_type has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
train_max_run has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
train_use_spot_instances has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
train_max_wait has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
train_volume_size has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


Using SageMaker BlazingText container: 306986355934.dkr.ecr.ap-northeast-2.amazonaws.com/blazingtext:1 (ap-northeast-2)


In [5]:
train_data = sagemaker.session.s3_input(train_data, distribution='FullyReplicated', 
                        content_type='text/plain', s3_data_type='S3Prefix')
data_channels = {'train': train_data}

The class sagemaker.session.s3_input has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


In [6]:
bt_model.fit(inputs=data_channels, logs=True)

INFO:sagemaker:Creating training-job with name: BlazingText-cbow-job-2023-06-16-06-54-56-454


2023-06-16 06:54:56 Starting - Starting the training job...
2023-06-16 06:55:12 Starting - Preparing the instances for training......
2023-06-16 06:56:07 Downloading - Downloading input data...
2023-06-16 06:56:38 Training - Training image download completed. Training in progress..[34mArguments: train[0m
[34m[06/16/2023 06:56:48 INFO 139965656786752] nvidia-smi took: 0.07536935806274414 secs to identify 0 gpus[0m
[34m[06/16/2023 06:56:48 INFO 139965656786752] Running single machine CPU BlazingText training using cbow mode.[0m
[34mNumber of CPU sockets found in instance is  1[0m
[34m[06/16/2023 06:56:48 INFO 139965656786752] Processing /opt/ml/input/data/train/video_data_20230414.txt . File size: 433.05553245544434 MB[0m
[34mRead 10M words[0m
[34mRead 20M words[0m
[34mRead 30M words[0m
[34mRead 40M words[0m
[34mRead 45M words[0m
[34mNumber of words:  349278[0m
[34m##### Alpha: 0.0490  Progress: 2.02%  Million Words/sec: 1.14 #####[0m
[34m##### Alpha: 0.0465  Pro

In [7]:
bt_model.model_data

's3://dothis-ai-test/models/BlazingText-cbow-job-2023-06-16-06-54-56-454/output/model.tar.gz'

In [12]:
bucket = 'dothis-ai-test'  # Replace with your own bucket name if needed
key = bt_model.model_data[bt_model.model_data.find("/", 5)+1:]

# S3에 액세스할 수 있는 클라이언트 생성
s3 = boto3.client('s3')
s3.download_file(bucket, key, 'models/related/model.tar.gz')

In [13]:
!tar -xvzf models/related/model.tar.gz

vectors.bin
vectors.txt


In [17]:
from gensim.models import KeyedVectors
word_vectors = KeyedVectors.load_word2vec_format('models/related/vectors.txt', binary=False, encoding='utf-8')

INFO:gensim.models.keyedvectors:loading projection weights from models/related/vectors.txt
INFO:gensim.utils:KeyedVectors lifecycle event {'msg': 'loaded (349278, 100) matrix of type float32 from models/related/vectors.txt', 'binary': False, 'encoding': 'utf-8', 'datetime': '2023-06-16T07:12:44.730442', 'gensim': '4.3.1', 'python': '3.10.10 | packaged by conda-forge | (main, Mar 24 2023, 20:08:06) [GCC 11.3.0]', 'platform': 'Linux-5.10.178-162.673.amzn2.x86_64-x86_64-with-glibc2.26', 'event': 'load_word2vec_format'}


In [39]:
word = "소셜네트워크"

# 키워드의 앞글자로 시작하는 것만
[i[0] for i in word_vectors.most_similar(word, topn=10) if i[0].startswith(word[0])]

['소셜미디어']