In [2]:
!pip install google-api-python-client

Defaulting to user installation because normal site-packages is not writeable
Collecting google-api-python-client
  Downloading google_api_python_client-2.169.0-py3-none-any.whl.metadata (6.7 kB)
Collecting httplib2<1.0.0,>=0.19.0 (from google-api-python-client)
  Downloading httplib2-0.22.0-py3-none-any.whl.metadata (2.6 kB)
Collecting google-auth!=2.24.0,!=2.25.0,<3.0.0,>=1.32.0 (from google-api-python-client)
  Downloading google_auth-2.39.0-py2.py3-none-any.whl.metadata (6.2 kB)
Collecting google-auth-httplib2<1.0.0,>=0.2.0 (from google-api-python-client)
  Downloading google_auth_httplib2-0.2.0-py2.py3-none-any.whl.metadata (2.2 kB)
Collecting google-api-core!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.0,<3.0.0,>=1.31.5 (from google-api-python-client)
  Downloading google_api_core-2.24.2-py3-none-any.whl.metadata (3.0 kB)
Collecting uritemplate<5,>=3.0.1 (from google-api-python-client)
  Downloading uritemplate-4.1.1-py2.py3-none-any.whl.metadata (2.9 kB)
Collecting googleapis-common-protos<2.0.



In [4]:
!pip install tqdm

Defaulting to user installation because normal site-packages is not writeable
Collecting tqdm
  Downloading tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Downloading tqdm-4.67.1-py3-none-any.whl (78 kB)
Installing collected packages: tqdm
Successfully installed tqdm-4.67.1




In [3]:
import googleapiclient.discovery
from googleapiclient.errors import HttpError

import pandas as pd
import time
import tqdm

In [6]:
# YouTube API key
API_KEY = "AIzaSyAL7DBUEkBaAYiiya7iXYFNrxMiQjJwBBE" # YOUR-API-KEY
youtube = googleapiclient.discovery.build("youtube", "v3", developerKey=API_KEY)

In [7]:
# Get video ids for query. Youtube API allows only up to 50 videos
# 검색어(query))로 유튜브에서 최대 max_results만큼의 동영상 ID를 가져옴.

def get_video_ids(query, max_results=100):
    video_ids = []
    results_per_page = 50  # YouTube API maxResults 
    pages = (max_results + results_per_page - 1) // results_per_page  # calculate #pages
    next_page_token = None
    
    for _ in range(pages): # call api as many times as #pages
        try:
            request = youtube.search().list(
                q=query,
                part="snippet",
                maxResults=results_per_page,
                type="video",
                pageToken=next_page_token
            )
            response = request.execute()      

            # Only check if 'id' key exists and 'videoId' is accessible
            for item in response['items']:
                if isinstance(item, dict) and 'id' in item and 'videoId' in item['id']:
                    video_ids.append(item['id']['videoId'])

            next_page_token = response.get('nextPageToken')
            if not next_page_token:
                break

        except HttpError as e:
            error_reason = e.resp.get('reason')
            if error_reason == 'quotaExceeded':
                print("Quota exceeded. Saving collected data...")
                save_data_to_csv(video_comments)
                exit()
            else:
                print(f"An error occurred: {e}")
                
    return video_ids[:max_results]

In [8]:
# Get comments for 1 video. Youtube API allows only up to 100 comments per video
# 해당 video_id의 최상위 댓글들을 가져오기.


def get_top_korean_comments(video_id, max_results=100):
    comments = []
    try:
        request = youtube.commentThreads().list(
            part="snippet",
            videoId=video_id,
            maxResults=max_results,
            textFormat="plainText"
        )
        response = request.execute()

        for item in response['items']:
            comment = item['snippet']['topLevelComment']['snippet']['textDisplay']
            comments.append(comment)  
                
    except HttpError as e:
        error_reason = e.resp.get('reason')
        if error_reason == 'commentsDisabled':
            print(f"Comments are disabled for video {video_id}. Skipping.")
        elif error_reason == 'quotaExceeded':
            print("Quota exceeded. Saving collected data...")
            save_data_to_csv(video_comments)
            exit()
        else:
            print(f"An error occurred: {e}")
    
    return comments

In [13]:
# Make data to dataframe
# video_comments looks like: {"4DUYBXdUYzA": ["와 재밌다", "재미없다", ]}
# video_id: [댓글 리스트] 형태의 데이터를 DataFrame으로 만들어 CSV 파일로 저장.

def save_data_to_csv(video_comments):    
    
    data = {"Video_ID": [], "Comment": []}
    
    for video_id, comments in video_comments.items():
        for comment in comments:
            data["Video_ID"].append(video_id)
            data["Comment"].append(comment)

    df = pd.DataFrame(data)
    
    # Export to CSV 
    df.to_csv("data-files/youtube_comments.csv", index=False)

In [14]:
participants = ["랩/힙합","빅나티", "애쉬아일랜드", "릴러말즈", "한요한", "정진우", "기리보이", "자이언티", "쏠",
 "로꼬", "죠지", "앰비션뮤직", "VMC","딥플로우", "던밀스", "넉살", "CAMO", "TOIL"]

In [12]:
video_comments = {}
# Ex: {"4DUYBXdUYzA": ["와 재밌다", "재미없다", ]}

start = time.time()
query_baisic = "랩/힙합"

for participant in tqdm.tqdm(participants):
    query = query_baisic + " " + participant 

    try:
        video_ids = get_video_ids(query, max_results=50) 

        for video_id in video_ids:
            comments = get_top_korean_comments(video_id)
            video_comments[video_id] = comments
    except HttpError as e:
        if e.resp.get('reason') == 'quotaExceeded':
            print("Quota exceeded. Saving collected data...")
            save_data_to_csv(video_comments)
            exit()

    end = time.time()    
    print(f"{end - start}s for query: {query}")    

save_data_to_csv(video_comments)

  6%|▌         | 1/18 [00:14<04:09, 14.67s/it]

14.735228061676025s for query: 랩/힙합 랩/힙합
An error occurred: <HttpError 403 when requesting https://youtube.googleapis.com/youtube/v3/commentThreads?part=snippet&videoId=-cYLGLT_2N4&maxResults=100&textFormat=plainText&key=AIzaSyAL7DBUEkBaAYiiya7iXYFNrxMiQjJwBBE&alt=json returned "The video identified by the <code><a href="/youtube/v3/docs/commentThreads/list#videoId">videoId</a></code> parameter has disabled comments.". Details: "[{'message': 'The video identified by the <code><a href="/youtube/v3/docs/commentThreads/list#videoId">videoId</a></code> parameter has disabled comments.', 'domain': 'youtube.commentThread', 'reason': 'commentsDisabled', 'location': 'videoId', 'locationType': 'parameter'}]">


 11%|█         | 2/18 [00:26<03:29, 13.06s/it]

26.673197031021118s for query: 랩/힙합 빅나티


 17%|█▋        | 3/18 [00:39<03:11, 12.80s/it]

39.15550780296326s for query: 랩/힙합 애쉬아일랜드


 22%|██▏       | 4/18 [00:51<02:57, 12.70s/it]

51.70840644836426s for query: 랩/힙합 릴러말즈
An error occurred: <HttpError 403 when requesting https://youtube.googleapis.com/youtube/v3/commentThreads?part=snippet&videoId=JXmypr49UZk&maxResults=100&textFormat=plainText&key=AIzaSyAL7DBUEkBaAYiiya7iXYFNrxMiQjJwBBE&alt=json returned "The video identified by the <code><a href="/youtube/v3/docs/commentThreads/list#videoId">videoId</a></code> parameter has disabled comments.". Details: "[{'message': 'The video identified by the <code><a href="/youtube/v3/docs/commentThreads/list#videoId">videoId</a></code> parameter has disabled comments.', 'domain': 'youtube.commentThread', 'reason': 'commentsDisabled', 'location': 'videoId', 'locationType': 'parameter'}]">


 28%|██▊       | 5/18 [01:03<02:40, 12.36s/it]

63.44963073730469s for query: 랩/힙합 한요한
An error occurred: <HttpError 403 when requesting https://youtube.googleapis.com/youtube/v3/commentThreads?part=snippet&videoId=K31xK2A3luw&maxResults=100&textFormat=plainText&key=AIzaSyAL7DBUEkBaAYiiya7iXYFNrxMiQjJwBBE&alt=json returned "The video identified by the <code><a href="/youtube/v3/docs/commentThreads/list#videoId">videoId</a></code> parameter has disabled comments.". Details: "[{'message': 'The video identified by the <code><a href="/youtube/v3/docs/commentThreads/list#videoId">videoId</a></code> parameter has disabled comments.', 'domain': 'youtube.commentThread', 'reason': 'commentsDisabled', 'location': 'videoId', 'locationType': 'parameter'}]">
An error occurred: <HttpError 403 when requesting https://youtube.googleapis.com/youtube/v3/commentThreads?part=snippet&videoId=5PUPEcu2o44&maxResults=100&textFormat=plainText&key=AIzaSyAL7DBUEkBaAYiiya7iXYFNrxMiQjJwBBE&alt=json returned "The video identified by the <code><a href="/youtube/v

 33%|███▎      | 6/18 [01:11<02:11, 10.97s/it]

An error occurred: <HttpError 403 when requesting https://youtube.googleapis.com/youtube/v3/commentThreads?part=snippet&videoId=MO2ooi28O44&maxResults=100&textFormat=plainText&key=AIzaSyAL7DBUEkBaAYiiya7iXYFNrxMiQjJwBBE&alt=json returned "The video identified by the <code><a href="/youtube/v3/docs/commentThreads/list#videoId">videoId</a></code> parameter has disabled comments.". Details: "[{'message': 'The video identified by the <code><a href="/youtube/v3/docs/commentThreads/list#videoId">videoId</a></code> parameter has disabled comments.', 'domain': 'youtube.commentThread', 'reason': 'commentsDisabled', 'location': 'videoId', 'locationType': 'parameter'}]">
An error occurred: <HttpError 403 when requesting https://youtube.googleapis.com/youtube/v3/commentThreads?part=snippet&videoId=YIMlHQBhNlg&maxResults=100&textFormat=plainText&key=AIzaSyAL7DBUEkBaAYiiya7iXYFNrxMiQjJwBBE&alt=json returned "The video identified by the <code><a href="/youtube/v3/docs/commentThreads/list#videoId">vid

 39%|███▉      | 7/18 [01:23<02:05, 11.37s/it]

83.93239521980286s for query: 랩/힙합 기리보이
An error occurred: <HttpError 403 when requesting https://youtube.googleapis.com/youtube/v3/commentThreads?part=snippet&videoId=QzLFcfCEm-I&maxResults=100&textFormat=plainText&key=AIzaSyAL7DBUEkBaAYiiya7iXYFNrxMiQjJwBBE&alt=json returned "The video identified by the <code><a href="/youtube/v3/docs/commentThreads/list#videoId">videoId</a></code> parameter has disabled comments.". Details: "[{'message': 'The video identified by the <code><a href="/youtube/v3/docs/commentThreads/list#videoId">videoId</a></code> parameter has disabled comments.', 'domain': 'youtube.commentThread', 'reason': 'commentsDisabled', 'location': 'videoId', 'locationType': 'parameter'}]">


 44%|████▍     | 8/18 [01:36<01:58, 11.88s/it]

96.89785933494568s for query: 랩/힙합 자이언티


 50%|█████     | 9/18 [01:47<01:44, 11.56s/it]

107.75288653373718s for query: 랩/힙합 쏠


 56%|█████▌    | 10/18 [01:58<01:31, 11.41s/it]

118.83516597747803s for query: 랩/힙합 로꼬


 61%|██████    | 11/18 [02:09<01:18, 11.25s/it]

129.7018027305603s for query: 랩/힙합 죠지


 67%|██████▋   | 12/18 [02:22<01:09, 11.59s/it]

142.09296655654907s for query: 랩/힙합 앰비션뮤직


 72%|███████▏  | 13/18 [02:33<00:57, 11.58s/it]

153.6302888393402s for query: 랩/힙합 VMC


 78%|███████▊  | 14/18 [02:43<00:44, 11.15s/it]

163.7895634174347s for query: 랩/힙합 딥플로우


 83%|████████▎ | 15/18 [02:55<00:34, 11.46s/it]

175.96963739395142s for query: 랩/힙합 던밀스


 89%|████████▉ | 16/18 [03:08<00:23, 11.69s/it]

188.17973852157593s for query: 랩/힙합 넉살


 94%|█████████▍| 17/18 [03:20<00:11, 11.78s/it]

200.19695210456848s for query: 랩/힙합 CAMO
An error occurred: <HttpError 403 when requesting https://youtube.googleapis.com/youtube/v3/commentThreads?part=snippet&videoId=wZgsxnYqHps&maxResults=100&textFormat=plainText&key=AIzaSyAL7DBUEkBaAYiiya7iXYFNrxMiQjJwBBE&alt=json returned "The video identified by the <code><a href="/youtube/v3/docs/commentThreads/list#videoId">videoId</a></code> parameter has disabled comments.". Details: "[{'message': 'The video identified by the <code><a href="/youtube/v3/docs/commentThreads/list#videoId">videoId</a></code> parameter has disabled comments.', 'domain': 'youtube.commentThread', 'reason': 'commentsDisabled', 'location': 'videoId', 'locationType': 'parameter'}]">


100%|██████████| 18/18 [03:30<00:00, 11.72s/it]

210.97791957855225s for query: 랩/힙합 TOIL





In [4]:
comments = pd.read_csv("data-files/youtube_comments.csv")

In [5]:
comments.head()

Unnamed: 0,Video_ID,Comment
0,VtJ_2SbYoIM,0:00:01 Paul Blanco - Summer\r\n0:03:17 김승민 - ...
1,VtJ_2SbYoIM,외국에서 듣는 노래 너무 좋아
2,VtJ_2SbYoIM,paul Blanco
3,VtJ_2SbYoIM,밴쿠버ㄹㅈㄷ
4,VtJ_2SbYoIM,ㅡ


In [17]:
pip install konlpy

Defaulting to user installation because normal site-packages is not writeable
Collecting konlpy
  Downloading konlpy-0.6.0-py2.py3-none-any.whl.metadata (1.9 kB)
Collecting JPype1>=0.7.0 (from konlpy)
  Downloading jpype1-1.5.2-cp311-cp311-win_amd64.whl.metadata (5.0 kB)
Collecting lxml>=4.1.0 (from konlpy)
  Downloading lxml-5.4.0-cp311-cp311-win_amd64.whl.metadata (3.6 kB)
Downloading konlpy-0.6.0-py2.py3-none-any.whl (19.4 MB)
   ---------------------------------------- 0.0/19.4 MB ? eta -:--:--
   ---- ----------------------------------- 2.1/19.4 MB 10.7 MB/s eta 0:00:02
   --------- ------------------------------ 4.5/19.4 MB 11.2 MB/s eta 0:00:02
   ------------ --------------------------- 6.0/19.4 MB 10.0 MB/s eta 0:00:02
   ----------------- ---------------------- 8.4/19.4 MB 10.0 MB/s eta 0:00:02
   --------------------- ------------------ 10.5/19.4 MB 10.4 MB/s eta 0:00:01
   ------------------------- -------------- 12.6/19.4 MB 10.2 MB/s eta 0:00:01
   -----------------------

In [None]:
# from konlpy.tag import Okt
# okt = Okt()

In [None]:
# # NULL check
# print(comments.isnull().values.any()) # => True

# comments = comments.dropna(how = 'any') # drop rows with null values

# print(comments.isnull().values.any()) # => False

True
False


In [None]:
# stopwords = ['의','가','이','은','들','는','좀','잘','걍','과','도','를','으로','자','에','와','한','하다']

# tokenized_data = []

# # merged_df['text'] => comments['Comment']
# for sentence in tqdm.tqdm(comments['Comment']): 
#     sentence = str(sentence).strip()
    
#     if not sentence:  # 빈 문자열이면 건너뛰기
#         continue
        
#     tokenized_sentence = okt.morphs(sentence, stem=True) # 토큰화
#     stopwords_removed_sentence = [word for word in tokenized_sentence 
#                                   if not word in stopwords # 조건1
#                                      and len(word) >= 2 # 조건2   
#                                      and word.isalpha()]  # 한글이나 영어 
    
#     if stopwords_removed_sentence:  # 빈 리스트가 아니라면 추가
#         tokenized_data.append(stopwords_removed_sentence)


  0%|          | 0/35848 [00:00<?, ?it/s]

100%|██████████| 35848/35848 [02:22<00:00, 251.79it/s]


In [None]:
# len(tokenized_data)

33402

In [None]:
!pip install gensim

In [None]:
# tokenized_data

[['Paul',
  'Blanco',
  'Summer',
  '김승민',
  '기쁨',
  '벤틀리',
  '끄다',
  '거야',
  '애쉬',
  '아일랜드',
  '악몽',
  'TOIL',
  '검정색',
  '하트',
  'Skinny',
  'Brown',
  'TOIL',
  'Don',
  'flex',
  'on',
  'me',
  '서동현',
  'vancouver',
  '재다',
  '안다'],
 ['외국', '에서', '듣다', '노래', '너무', '좋다'],
 ['paul', 'Blanco'],
 ['밴쿠버', 'ㄹㅈㄷ'],
 ['운전', '매번', '호강'],
 ['플레이', '노동요', '최고'],
 ['어떻다', '좋아하다', '노래', '담다', '두다'],
 ['넴새', '나다'],
 ['플리', '훔치다', '보다', '똑같다'],
 ['오다', '플리', '진짜', '최애'],
 ['쌍용', '고등학교', '이현준', 'ㅣㅂ'],
 ['안녕하다',
  '인의',
  '초등학교',
  '채린',
  '이다',
  '달기',
  '숙제',
  '위해',
  '댓글',
  '있다',
  '좋다',
  '노래',
  '들려주다',
  '감사하다'],
 ['우연히',
  '되다',
  '채널',
  '인데',
  '음악',
  '담기다',
  '감정',
  '노력',
  '너무',
  '오다',
  '닿다',
  '감동',
  '받다',
  '만들다',
  '노래',
  '채널',
  '운영',
  '인데',
  '시간',
  '되다',
  '들다',
  '함께',
  '음악',
  '이야기',
  '나누다',
  '계속',
  '응원'],
 ['이건', '진짜', '취향', '저격', '주인', '센스', '미치다', '감성', '터지다', '힙합', '최고'],
 ['그때',
  '그만하다',
  '끄다',
  '물다',
  '잡다',
  '진짜',
  '후회되다',
  '상황',
  '생각',
  '하고',
  '매번'

In [None]:
# from gensim.models import Word2Vec

# model = Word2Vec(sentences = tokenized_data, vector_size = 100, window = 5, min_count = 5, workers = 4, sg = 0)

In [None]:
# model.wv.vectors.shape

(5386, 100)

In [None]:
# print(model.wv.most_similar("감성"))

[('이노', 0.9982504844665527), ('ㅎㅎ', 0.996608555316925), ('중독', 0.996231198310852), ('충격', 0.9956544637680054), ('생각나다', 0.9955595135688782), ('음원', 0.9952946901321411), ('모으다', 0.9952518343925476), ('ㅜㅜ', 0.9948887825012207), ('스타일', 0.9947054982185364), ('연말', 0.9946438074111938)]


In [None]:
# Word2Vec 모델을 .bin 형태가 아니라 텍스트 포맷으로 저장

# model.wv.save_word2vec_format("data-files/ko_w2v")

In [None]:
# Gensim에서 제공하는 스크립트로, Word2Vec 데이터를 TensorBoard에서 Embedding Projector로 볼 수 있도록 .tsv 파일 2개로 변환

# ko_w2v_tensor.tsv (vectors)
# ko_w2v_metadata.tsv (단어 목록)

# !python -m gensim.scripts.word2vec2tensor --input outputs/word2vec/ko_w2v --output outputs/word2vec/ko_w2v
# !python -m gensim.scripts.word2vec2tensor --input ko_w2v --output ko_w2v

# 경로에 맞게 실행문장
# !python -m gensim.scripts.word2vec2tensor --input data-files/ko_w2v --output data-files/ko_w2v


2025-05-05 11:24:48,018 - word2vec2tensor - INFO - running C:\Users\tq100\AppData\Roaming\Python\Python311\site-packages\gensim\scripts\word2vec2tensor.py --input data-files/ko_w2v --output data-files/ko_w2v
2025-05-05 11:24:48,018 - keyedvectors - INFO - loading projection weights from data-files/ko_w2v
2025-05-05 11:24:48,894 - utils - INFO - KeyedVectors lifecycle event {'msg': 'loaded (5386, 100) matrix of type float32 from data-files/ko_w2v', 'binary': False, 'encoding': 'utf8', 'datetime': '2025-05-05T11:24:48.783354', 'gensim': '4.3.3', 'python': '3.11.11 | packaged by Anaconda, Inc. | (main, Dec 11 2024, 16:34:19) [MSC v.1929 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.26100-SP0', 'event': 'load_word2vec_format'}
2025-05-05 11:24:49,481 - word2vec2tensor - INFO - 2D tensor file saved to data-files/ko_w2v_tensor.tsv
2025-05-05 11:24:49,482 - word2vec2tensor - INFO - Tensor metadata file saved to data-files/ko_w2v_metadata.tsv
2025-05-05 11:24:49,482 - word2vec2tensor - INFO -