## 유튜브 댓글 크롤링

In [1]:
import googleapiclient.discovery
from googleapiclient.errors import HttpError

import pandas as pd
import time
import tqdm

In [2]:
# YouTube API key
API_KEY = "AIzaSyAL7DBUEkBaAYiiya7iXYFNrxMiQjJwBBE" # YOUR-API-KEY
youtube = googleapiclient.discovery.build("youtube", "v3", developerKey=API_KEY)

In [3]:
# Get video ids for query. Youtube API allows only up to 50 videos
# 검색어(query))로 유튜브에서 최대 max_results만큼의 동영상 ID를 가져옴.

def get_video_ids(query, max_results=100):
    video_ids = []
    results_per_page = 50  # YouTube API maxResults 
    pages = (max_results + results_per_page - 1) // results_per_page  # calculate #pages
    next_page_token = None
    
    for _ in range(pages): # call api as many times as #pages
        try:
            request = youtube.search().list(
                q=query,
                part="snippet",
                maxResults=results_per_page,
                type="video",
                pageToken=next_page_token
            )
            response = request.execute()      

            # Only check if 'id' key exists and 'videoId' is accessible
            for item in response['items']:
                if isinstance(item, dict) and 'id' in item and 'videoId' in item['id']:
                    video_ids.append(item['id']['videoId'])

            next_page_token = response.get('nextPageToken')
            if not next_page_token:
                break

        except HttpError as e:
            error_reason = e.resp.get('reason')
            if error_reason == 'quotaExceeded':
                print("Quota exceeded. Saving collected data...")
                save_data_to_csv(video_comments)
                exit()
            else:
                print(f"An error occurred: {e}")
                
    return video_ids[:max_results]

In [4]:
# Get comments for 1 video. Youtube API allows only up to 100 comments per video
# 해당 video_id의 최상위 댓글들을 가져오기.


def get_top_korean_comments(video_id, max_results=100):
    comments = []
    try:
        request = youtube.commentThreads().list(
            part="snippet",
            videoId=video_id,
            maxResults=max_results,
            textFormat="plainText"
        )
        response = request.execute()

        for item in response['items']:
            comment = item['snippet']['topLevelComment']['snippet']['textDisplay']
            comments.append(comment)  
                
    except HttpError as e:
        error_reason = e.resp.get('reason')
        if error_reason == 'commentsDisabled':
            print(f"Comments are disabled for video {video_id}. Skipping.")
        elif error_reason == 'quotaExceeded':
            print("Quota exceeded. Saving collected data...")
            save_data_to_csv(video_comments)
            exit()
        else:
            print(f"An error occurred: {e}")
    
    return comments

In [5]:
# Make data to dataframe
# video_comments looks like: {"4DUYBXdUYzA": ["와 재밌다", "재미없다", ]}
# video_id: [댓글 리스트] 형태의 데이터를 DataFrame으로 만들어 CSV 파일로 저장.

def save_data_to_csv(video_comments):    
    
    data = {"Video_ID": [], "Comment": []}
    
    for video_id, comments in video_comments.items():
        for comment in comments:
            data["Video_ID"].append(video_id)
            data["Comment"].append(comment)

    df = pd.DataFrame(data)
    
    # Export to CSV 
    df.to_csv("youtube_comments.csv", index=False)

In [6]:
participants = ["랩/힙합","빅나티", "애쉬아일랜드", "릴러말즈", "한요한", "정진우", "기리보이", "자이언티", "쏠",
 "로꼬", "죠지", "앰비션뮤직", "VMC","딥플로우", "던밀스", "넉살"]

In [7]:
video_comments = {}
# Ex: {"4DUYBXdUYzA": ["와 재밌다", "재미없다", ]}

start = time.time()
query_baisic = "랩/힙합"

for participant in tqdm.tqdm(participants):
    query = query_baisic + " " + participant 

    try:
        video_ids = get_video_ids(query, max_results=50) 

        for video_id in video_ids:
            comments = get_top_korean_comments(video_id)
            video_comments[video_id] = comments
    except HttpError as e:
        if e.resp.get('reason') == 'quotaExceeded':
            print("Quota exceeded. Saving collected data...")
            save_data_to_csv(video_comments)
            exit()

    end = time.time()    
    print(f"{end - start}s for query: {query}")    

save_data_to_csv(video_comments)

  6%|▋         | 1/16 [00:08<02:02,  8.19s/it]

8.207509756088257s for query: 랩/힙합 랩/힙합
An error occurred: <HttpError 403 when requesting https://youtube.googleapis.com/youtube/v3/commentThreads?part=snippet&videoId=-cYLGLT_2N4&maxResults=100&textFormat=plainText&key=AIzaSyAL7DBUEkBaAYiiya7iXYFNrxMiQjJwBBE&alt=json returned "The video identified by the <code><a href="/youtube/v3/docs/commentThreads/list#videoId">videoId</a></code> parameter has disabled comments.". Details: "[{'message': 'The video identified by the <code><a href="/youtube/v3/docs/commentThreads/list#videoId">videoId</a></code> parameter has disabled comments.', 'domain': 'youtube.commentThread', 'reason': 'commentsDisabled', 'location': 'videoId', 'locationType': 'parameter'}]">


 12%|█▎        | 2/16 [00:15<01:43,  7.38s/it]

15.026161193847656s for query: 랩/힙합 빅나티


 19%|█▉        | 3/16 [00:21<01:32,  7.11s/it]

21.80440092086792s for query: 랩/힙합 애쉬아일랜드


 25%|██▌       | 4/16 [00:28<01:21,  6.77s/it]

28.06779932975769s for query: 랩/힙합 릴러말즈
An error occurred: <HttpError 403 when requesting https://youtube.googleapis.com/youtube/v3/commentThreads?part=snippet&videoId=JXmypr49UZk&maxResults=100&textFormat=plainText&key=AIzaSyAL7DBUEkBaAYiiya7iXYFNrxMiQjJwBBE&alt=json returned "The video identified by the <code><a href="/youtube/v3/docs/commentThreads/list#videoId">videoId</a></code> parameter has disabled comments.". Details: "[{'message': 'The video identified by the <code><a href="/youtube/v3/docs/commentThreads/list#videoId">videoId</a></code> parameter has disabled comments.', 'domain': 'youtube.commentThread', 'reason': 'commentsDisabled', 'location': 'videoId', 'locationType': 'parameter'}]">


 31%|███▏      | 5/16 [00:33<01:08,  6.21s/it]

33.28658676147461s for query: 랩/힙합 한요한
An error occurred: <HttpError 403 when requesting https://youtube.googleapis.com/youtube/v3/commentThreads?part=snippet&videoId=K31xK2A3luw&maxResults=100&textFormat=plainText&key=AIzaSyAL7DBUEkBaAYiiya7iXYFNrxMiQjJwBBE&alt=json returned "The video identified by the <code><a href="/youtube/v3/docs/commentThreads/list#videoId">videoId</a></code> parameter has disabled comments.". Details: "[{'message': 'The video identified by the <code><a href="/youtube/v3/docs/commentThreads/list#videoId">videoId</a></code> parameter has disabled comments.', 'domain': 'youtube.commentThread', 'reason': 'commentsDisabled', 'location': 'videoId', 'locationType': 'parameter'}]">
An error occurred: <HttpError 403 when requesting https://youtube.googleapis.com/youtube/v3/commentThreads?part=snippet&videoId=5PUPEcu2o44&maxResults=100&textFormat=plainText&key=AIzaSyAL7DBUEkBaAYiiya7iXYFNrxMiQjJwBBE&alt=json returned "The video identified by the <code><a href="/youtube/v

 38%|███▊      | 6/16 [00:36<00:52,  5.29s/it]

36.77244973182678s for query: 랩/힙합 정진우


 44%|████▍     | 7/16 [00:42<00:49,  5.52s/it]

42.76108169555664s for query: 랩/힙합 기리보이


 50%|█████     | 8/16 [00:49<00:46,  5.77s/it]

49.06302452087402s for query: 랩/힙합 자이언티


 56%|█████▋    | 9/16 [00:53<00:37,  5.41s/it]

53.69961214065552s for query: 랩/힙합 쏠


 62%|██████▎   | 10/16 [00:59<00:33,  5.63s/it]

59.815402030944824s for query: 랩/힙합 로꼬
An error occurred: <HttpError 403 when requesting https://youtube.googleapis.com/youtube/v3/commentThreads?part=snippet&videoId=X81kPPqtLWI&maxResults=100&textFormat=plainText&key=AIzaSyAL7DBUEkBaAYiiya7iXYFNrxMiQjJwBBE&alt=json returned "The request cannot be completed because you have exceeded your <a href="/youtube/v3/getting-started#quota">quota</a>.". Details: "[{'message': 'The request cannot be completed because you have exceeded your <a href="/youtube/v3/getting-started#quota">quota</a>.', 'domain': 'youtube.quota', 'reason': 'quotaExceeded'}]">
An error occurred: <HttpError 403 when requesting https://youtube.googleapis.com/youtube/v3/commentThreads?part=snippet&videoId=h7_KH6w_Z5o&maxResults=100&textFormat=plainText&key=AIzaSyAL7DBUEkBaAYiiya7iXYFNrxMiQjJwBBE&alt=json returned "The request cannot be completed because you have exceeded your <a href="/youtube/v3/getting-started#quota">quota</a>.". Details: "[{'message': 'The request cannot

 69%|██████▉   | 11/16 [01:03<00:25,  5.11s/it]

An error occurred: <HttpError 403 when requesting https://youtube.googleapis.com/youtube/v3/commentThreads?part=snippet&videoId=_8gk6_TNfNo&maxResults=100&textFormat=plainText&key=AIzaSyAL7DBUEkBaAYiiya7iXYFNrxMiQjJwBBE&alt=json returned "The request cannot be completed because you have exceeded your <a href="/youtube/v3/getting-started#quota">quota</a>.". Details: "[{'message': 'The request cannot be completed because you have exceeded your <a href="/youtube/v3/getting-started#quota">quota</a>.', 'domain': 'youtube.quota', 'reason': 'quotaExceeded'}]">
An error occurred: <HttpError 403 when requesting https://youtube.googleapis.com/youtube/v3/commentThreads?part=snippet&videoId=OGGSi4JYeUQ&maxResults=100&textFormat=plainText&key=AIzaSyAL7DBUEkBaAYiiya7iXYFNrxMiQjJwBBE&alt=json returned "The request cannot be completed because you have exceeded your <a href="/youtube/v3/getting-started#quota">quota</a>.". Details: "[{'message': 'The request cannot be completed because you have exceeded

100%|██████████| 16/16 [01:06<00:00,  4.13s/it]

An error occurred: <HttpError 403 when requesting https://youtube.googleapis.com/youtube/v3/commentThreads?part=snippet&videoId=ZpNOs2UQkwI&maxResults=100&textFormat=plainText&key=AIzaSyAL7DBUEkBaAYiiya7iXYFNrxMiQjJwBBE&alt=json returned "The request cannot be completed because you have exceeded your <a href="/youtube/v3/getting-started#quota">quota</a>.". Details: "[{'message': 'The request cannot be completed because you have exceeded your <a href="/youtube/v3/getting-started#quota">quota</a>.', 'domain': 'youtube.quota', 'reason': 'quotaExceeded'}]">
An error occurred: <HttpError 403 when requesting https://youtube.googleapis.com/youtube/v3/commentThreads?part=snippet&videoId=Fff0VL_KvwI&maxResults=100&textFormat=plainText&key=AIzaSyAL7DBUEkBaAYiiya7iXYFNrxMiQjJwBBE&alt=json returned "The request cannot be completed because you have exceeded your <a href="/youtube/v3/getting-started#quota">quota</a>.". Details: "[{'message': 'The request cannot be completed because you have exceeded




## Merge youtube_comments with movie_rating_dataset

In [8]:
comments = pd.read_csv("youtube_comments.csv")

In [9]:
comments.head()

Unnamed: 0,Video_ID,Comment
0,VtJ_2SbYoIM,0:00:01 Paul Blanco - Summer\r\n0:03:17 김승민 - ...
1,VtJ_2SbYoIM,ㅡ
2,VtJ_2SbYoIM,"운전 시, 매번 귀 호강."
3,VtJ_2SbYoIM,일하면서 1일1플레이 하고있습니다. 노동요로 최고임
4,VtJ_2SbYoIM,어떻게 내가 좋아하는 노래들만 담아둔거지


## SKIP

In [10]:
# import urllib.request
# # download naver movie ratings dataset
# urllib.request.urlretrieve("https://raw.githubusercontent.com/e9t/nsmc/master/ratings.txt", filename="ratings.txt")

In [11]:
# movie_data = pd.read_table('ratings.txt')
# movie_data.head()

In [12]:
# comments.head()

In [13]:
# print(f"movie data length: {len(movie_data)}")
# print(f"comments data length: {len(comments)}")

In [14]:
# # Merge two dataset because number of Comments dataset is not big enough to train word vectors.
# df1_text = movie_data[['document']].rename(columns={'document': 'text'})
# df2_text = comments[['Comment']].rename(columns={'Comment': 'text'})

# # merge movie_data and yt_comments_data
# merged_df = pd.concat([df1_text, df2_text], ignore_index=True)
# merged_df

In [15]:
# NULL check
# print(merged_df.isnull().values.any())

In [16]:
# merged_df = merged_df.dropna(how = 'any') # drop rows with null values
# print(merged_df.isnull().values.any()) 

In [17]:
# print(len(merged_df)) 

In [18]:
# remove all characters other than Hangeul
# merged_df['text'] = merged_df['text'].str.replace("[^ㄱ-ㅎㅏ-ㅣ가-힣 ]","", regex=True)

In [19]:
# SKIP END

In [20]:
pip install konlpy

Collecting konlpy
  Downloading konlpy-0.6.0-py2.py3-none-any.whl.metadata (1.9 kB)
Collecting JPype1>=0.7.0 (from konlpy)
  Downloading jpype1-1.5.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.9 kB)
Downloading konlpy-0.6.0-py2.py3-none-any.whl (19.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.4/19.4 MB[0m [31m39.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading jpype1-1.5.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (494 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m494.1/494.1 kB[0m [31m21.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: JPype1, konlpy
Successfully installed JPype1-1.5.2 konlpy-0.6.0
Note: you may need to restart the kernel to use updated packages.


In [21]:
from konlpy.tag import Okt
okt = Okt()

In [22]:
# NULL check
print(comments.isnull().values.any()) # => True

comments = comments.dropna(how = 'any') # drop rows with null values

print(comments.isnull().values.any()) # => False

True
False


In [23]:
stopwords = ['의','가','이','은','들','는','좀','잘','걍','과','도','를','으로','자','에','와','한','하다']

tokenized_data = []

# merged_df['text'] => comments['Comment']
for sentence in tqdm.tqdm(comments['Comment']): 
    sentence = str(sentence).strip()
    
    if not sentence:  # 빈 문자열이면 건너뛰기
        continue
        
    tokenized_sentence = okt.morphs(sentence, stem=True) # 토큰화
    stopwords_removed_sentence = [word for word in tokenized_sentence 
                                  if not word in stopwords # 조건1
                                     and len(word) >= 2 # 조건2   
                                     and word.isalpha()]  # 한글이나 영어 
    
    if stopwords_removed_sentence:  # 빈 리스트가 아니라면 추가
        tokenized_data.append(stopwords_removed_sentence)


100%|██████████| 23813/23813 [01:05<00:00, 361.85it/s]


In [24]:
pip install gensim

Collecting scipy<1.14.0,>=1.7.0 (from gensim)
  Downloading scipy-1.13.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.6/60.6 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
Downloading scipy-1.13.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (38.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m38.6/38.6 MB[0m [31m40.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: scipy
  Attempting uninstall: scipy
    Found existing installation: scipy 1.14.1
    Uninstalling scipy-1.14.1:
      Successfully uninstalled scipy-1.14.1
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
cesium 0.12.3 requires numpy<3.0,>=2.0, but you have numpy 1.26.4 which is incompatible.
libpysal 4.9.2 requires packaging>=22, but 

In [25]:
from gensim.models import Word2Vec

model = Word2Vec(sentences = tokenized_data, vector_size = 100, window = 5, min_count = 5, workers = 4, sg = 0)

In [26]:
model.wv.vectors.shape

(3961, 100)

In [27]:
print(model.wv.most_similar("애쉬"))

[('부분', 0.9958155155181885), ('파트', 0.9951147437095642), ('넘다', 0.9949182868003845), ('듣기', 0.9948691129684448), ('부르다', 0.9948614239692688), ('라이브', 0.9948403835296631), ('로꼬', 0.9946848154067993), ('ㄹㅇ', 0.9945330619812012), ('선곡', 0.9943381547927856), ('멜로디', 0.9942612648010254)]


In [28]:
print(model.wv.most_similar("아일랜드"))

[('윤진영', 0.9981279969215393), ('이번', 0.9978144764900208), ('엄청', 0.9977579712867737), ('아티스트', 0.997726559638977), ('느끼다', 0.9977185726165771), ('어렵다', 0.9976742267608643), ('예요', 0.9975926280021667), ('최고', 0.9975568652153015), ('조합', 0.997468113899231), ('모든', 0.9974402785301208)]


In [29]:
print(model.wv.most_similar("빅나티"))

[('ㅅㅂ', 0.998960018157959), ('깔다', 0.9989535808563232), ('ㅋㅋㅋㅋㅋㅋ', 0.998914897441864), ('무슨', 0.9989140629768372), ('잘생기다', 0.9988647103309631), ('박다', 0.9988424181938171), ('릴러', 0.9988407492637634), ('뭐라다', 0.998818039894104), ('니까', 0.9988167881965637), ('솔직하다', 0.9987940192222595)]


## Save W2V model

In [30]:
model.wv.save_word2vec_format('ko_w2v')

In [31]:
!python -m gensim.scripts.word2vec2tensor --input ko_w2v --output ko_w2v

## Visualization for embedding

In [32]:
## Go to https://projector.tensorflow.org/
## and load ko_w2v_tensor.tsv and ko_w2v_metadata.tsv