In [69]:
import googleapiclient.discovery
from googleapiclient.errors import HttpError

import pandas as pd
import time
import tqdm

In [70]:
# YouTube API key
API_KEY = "your-api-key"
youtube = googleapiclient.discovery.build("youtube", "v3", developerKey=API_KEY)

In [71]:
# Get video ids for query. Youtube API allows only up to 50 videos

def get_video_ids(query, max_results=100):
    video_ids = []
    results_per_page = 50  # YouTube API maxResults 
    pages = (max_results + results_per_page - 1) // results_per_page  # calculate #pages
    next_page_token = None
    
    for _ in range(pages): # call api as many times as #pages
        try:
            request = youtube.search().list(
                q=query,
                part="snippet",
                maxResults=results_per_page,
                type="video",
                pageToken=next_page_token
            )
            response = request.execute()      

            # Only check if 'id' key exists and 'videoId' is accessible
            for item in response['items']:
                if isinstance(item, dict) and 'id' in item and 'videoId' in item['id']:
                    video_ids.append(item['id']['videoId'])

            next_page_token = response.get('nextPageToken')
            if not next_page_token:
                break

        except HttpError as e:
            error_reason = e.resp.get('reason')
            if error_reason == 'quotaExceeded':
                print("Quota exceeded. Saving collected data...")
                save_data_to_csv(video_comments)
                exit()
            else:
                print(f"An error occurred: {e}")
                
    return video_ids[:max_results]

In [72]:
# Get comments for 1 video. Youtube API allows only up to 100 comments per video
def get_top_korean_comments(video_id, max_results=100):
    comments = []
    try:
        request = youtube.commentThreads().list(
            part="snippet",
            videoId=video_id,
            maxResults=max_results,
            textFormat="plainText"
        )
        response = request.execute()

        for item in response['items']:
            comment = item['snippet']['topLevelComment']['snippet']['textDisplay']
            comments.append(comment)  
                
    except HttpError as e:
        error_reason = e.resp.get('reason')
        if error_reason == 'commentsDisabled':
            print(f"Comments are disabled for video {video_id}. Skipping.")
        else:
            print(f"An error occurred: {e}")
    
    return comments

In [73]:
# Make data to dataframe
# video_comments looks like: {"4DUYBXdUYzA": ["와 재밌다", "재미없다", ]}
def save_data_to_csv(video_comments):    
    
    data = {"Video_ID": [], "Comment": []}
    
    for video_id, comments in video_comments.items():
        for comment in comments:
            data["Video_ID"].append(video_id)
            data["Comment"].append(comment)

    df = pd.DataFrame(data)
    
    # Export to CSV 
    df.to_csv("youtube_comments.csv", index=False)

In [74]:
participants = ["흑백요리사","백종원","안성재","에드워드 리","나폴리 맛피아","트리플스타","요리하는 돌아이","최현석","장호준","여경래","안유성","정지선","최강록","조은주","오세득","파브리치오 페라리","이영숙","선경 롱게스트","김도윤","박준우"]

In [75]:
video_comments = {}

start = time.time()
query_baisic = "흑백요리사"

for participant in tqdm.tqdm(participants):
    query = query_baisic + " " + participant 

    try:
        video_ids = get_video_ids(query, max_results=50) 

        for video_id in video_ids:
            comments = get_top_korean_comments(video_id)
            video_comments[video_id] = comments
    except HttpError as e:
        if e.resp.get('reason') == 'quotaExceeded':
            print("Quota exceeded. Saving collected data...")
            save_data_to_csv(video_comments)
            exit()

    end = time.time()    
    print(f"{end - start}s for query: {query}")    

save_data_to_csv(video_comments)

  5%|▌         | 1/20 [00:05<01:47,  5.65s/it]

5.652115106582642s for query: 흑백요리사 흑백요리사


 10%|█         | 2/20 [00:11<01:43,  5.77s/it]

11.513113737106323s for query: 흑백요리사 백종원


 15%|█▌        | 3/20 [00:16<01:33,  5.52s/it]

16.741732597351074s for query: 흑백요리사 안성재


 20%|██        | 4/20 [00:21<01:24,  5.28s/it]

21.63775873184204s for query: 흑백요리사 에드워드 리


 25%|██▌       | 5/20 [00:26<01:17,  5.17s/it]

26.627090454101562s for query: 흑백요리사 나폴리 맛피아


 30%|███       | 6/20 [00:31<01:13,  5.21s/it]

31.922846794128418s for query: 흑백요리사 트리플스타


 35%|███▌      | 7/20 [00:36<01:06,  5.11s/it]

36.815624952316284s for query: 흑백요리사 요리하는 돌아이


 40%|████      | 8/20 [00:42<01:02,  5.19s/it]

42.1934540271759s for query: 흑백요리사 최현석


 45%|████▌     | 9/20 [00:46<00:54,  4.99s/it]

46.732492208480835s for query: 흑백요리사 장호준


 50%|█████     | 10/20 [00:51<00:47,  4.80s/it]

51.094573736190796s for query: 흑백요리사 여경래


 55%|█████▌    | 11/20 [00:56<00:46,  5.13s/it]

56.98702597618103s for query: 흑백요리사 안유성


 60%|██████    | 12/20 [01:01<00:40,  5.08s/it]

61.93606638908386s for query: 흑백요리사 정지선


 65%|██████▌   | 13/20 [01:06<00:35,  5.01s/it]

66.79475927352905s for query: 흑백요리사 최강록


 70%|███████   | 14/20 [01:11<00:28,  4.81s/it]

71.14198756217957s for query: 흑백요리사 조은주


 75%|███████▌  | 15/20 [01:15<00:23,  4.66s/it]

75.44952511787415s for query: 흑백요리사 오세득


 80%|████████  | 16/20 [01:19<00:18,  4.57s/it]

79.80513143539429s for query: 흑백요리사 파브리치오 페라리


 85%|████████▌ | 17/20 [01:24<00:13,  4.55s/it]

84.32754349708557s for query: 흑백요리사 이영숙
An error occurred: <HttpError 403 when requesting https://youtube.googleapis.com/youtube/v3/commentThreads?part=snippet&videoId=_yOU-oKKSXg&maxResults=100&textFormat=plainText&key=AIzaSyAisAdMFODoW7sWAbLQn9L1e6-BintM3yc&alt=json returned "The video identified by the <code><a href="/youtube/v3/docs/commentThreads/list#videoId">videoId</a></code> parameter has disabled comments.". Details: "[{'message': 'The video identified by the <code><a href="/youtube/v3/docs/commentThreads/list#videoId">videoId</a></code> parameter has disabled comments.', 'domain': 'youtube.commentThread', 'reason': 'commentsDisabled', 'location': 'videoId', 'locationType': 'parameter'}]">


 90%|█████████ | 18/20 [01:28<00:08,  4.44s/it]

88.48794603347778s for query: 흑백요리사 선경 롱게스트


 95%|█████████▌| 19/20 [01:32<00:04,  4.34s/it]

92.62154984474182s for query: 흑백요리사 김도윤


100%|██████████| 20/20 [01:36<00:00,  4.81s/it]

96.20503854751587s for query: 흑백요리사 박준우





## Merge youtube_comments with movie_rating_dataset

In [76]:
comments = pd.read_csv("youtube_comments.csv")

In [77]:
comments.head()

Unnamed: 0,Video_ID,Comment
0,3ZUL9il_beI,"《흑백요리사: 요리 계급 전쟁》, 지금 넷플릭스에서 시청하세요: https://ww..."
1,3ZUL9il_beI,흑백요리사 2기다리고 있다
2,3ZUL9il_beI,"이미 셰프로써 이룰걸 다 이룬 베테랑과 이제 막 이름을 알려야 하는 패기있는 루키,..."
3,3ZUL9il_beI,나폴리 뭐시기는 그냥 흑이 백을 이기는 구도연출을 위해서 우승시켜준거고 ㄹㅇ우승자는...
4,3ZUL9il_beI,나폴리가 두부지옥을 안해서 인정을 못받는거.


SKIP

In [78]:
import urllib.request
# download naver movie ratings dataset
urllib.request.urlretrieve("https://raw.githubusercontent.com/e9t/nsmc/master/ratings.txt", filename="ratings.txt")

('ratings.txt', <http.client.HTTPMessage at 0x7bb2acb87700>)

In [79]:
movie_data = pd.read_table('ratings.txt')
movie_data.head()

Unnamed: 0,id,document,label
0,8112052,어릴때보고 지금다시봐도 재밌어요ㅋㅋ,1
1,8132799,"디자인을 배우는 학생으로, 외국디자이너와 그들이 일군 전통을 통해 발전해가는 문화산...",1
2,4655635,폴리스스토리 시리즈는 1부터 뉴까지 버릴께 하나도 없음.. 최고.,1
3,9251303,와.. 연기가 진짜 개쩔구나.. 지루할거라고 생각했는데 몰입해서 봤다.. 그래 이런...,1
4,10067386,안개 자욱한 밤하늘에 떠 있는 초승달 같은 영화.,1


In [80]:
comments.head()

Unnamed: 0,Video_ID,Comment
0,3ZUL9il_beI,"《흑백요리사: 요리 계급 전쟁》, 지금 넷플릭스에서 시청하세요: https://ww..."
1,3ZUL9il_beI,흑백요리사 2기다리고 있다
2,3ZUL9il_beI,"이미 셰프로써 이룰걸 다 이룬 베테랑과 이제 막 이름을 알려야 하는 패기있는 루키,..."
3,3ZUL9il_beI,나폴리 뭐시기는 그냥 흑이 백을 이기는 구도연출을 위해서 우승시켜준거고 ㄹㅇ우승자는...
4,3ZUL9il_beI,나폴리가 두부지옥을 안해서 인정을 못받는거.


In [81]:
print(f"movie data length: {len(movie_data)}")
print(f"comments data length: {len(comments)}")

movie data length: 200000
comments data length: 44422


In [82]:
# Merge two dataset because number of Comments dataset is not big enough to train word vectors.
df1_text = movie_data[['document']].rename(columns={'document': 'text'})
df2_text = comments[['Comment']].rename(columns={'Comment': 'text'})

# merge movie_data and yt_comments_data
merged_df = pd.concat([df1_text, df2_text], ignore_index=True)
merged_df

Unnamed: 0,text
0,어릴때보고 지금다시봐도 재밌어요ㅋㅋ
1,"디자인을 배우는 학생으로, 외국디자이너와 그들이 일군 전통을 통해 발전해가는 문화산..."
2,폴리스스토리 시리즈는 1부터 뉴까지 버릴께 하나도 없음.. 최고.
3,와.. 연기가 진짜 개쩔구나.. 지루할거라고 생각했는데 몰입해서 봤다.. 그래 이런...
4,안개 자욱한 밤하늘에 떠 있는 초승달 같은 영화.
...,...
244417,이때는 다들 슬림했네\n정형돈씨 날씬해서 깜놀
244418,진짜 정형돈이 너무 잘 살린다..\n정형돈 있을 때가 냉부 전성기였다.
244419,5:30 항돈이형 ㅋㅋㅋㅋㅋㅋㅋㅋ1젓부터 12젓ㅋㄱㄱㅋㅋㅋ
244420,냉부로 기가맥힌 타이밍으로 조회수뽑네 ㅋㅋ 그래 얼마나좋아 너들도좋고 우리들도좋고


In [83]:
# NULL check
print(merged_df.isnull().values.any())

True


In [84]:
merged_df = merged_df.dropna(how = 'any') # drop rows with null values
print(merged_df.isnull().values.any()) 

False


In [85]:
print(len(merged_df)) 

244413


In [86]:
# remove all characters other than Hangeul
merged_df['text'] = merged_df['text'].str.replace("[^ㄱ-ㅎㅏ-ㅣ가-힣 ]","", regex=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_df['text'] = merged_df['text'].str.replace("[^ㄱ-ㅎㅏ-ㅣ가-힣 ]","", regex=True)


In [87]:
pip install konlpy

Note: you may need to restart the kernel to use updated packages.


In [88]:
from konlpy.tag import Okt
okt = Okt()

In [89]:
# NULL check
print(comments.isnull().values.any()) 

comments = comments.dropna(how = 'any') # drop rows with null values

print(comments.isnull().values.any()) 

True
False


In [90]:
stopwords = ['의','가','이','은','들','는','좀','잘','걍','과','도','를','으로','자','에','와','한','하다']

tokenized_data = []

for sentence in tqdm.tqdm(comments['Comment']):
    sentence = str(sentence).strip()

    if not sentence:
        continue
    
    tokenized_sentence = okt.morphs(sentence, stem=True) # 토큰화
    stopwords_removed_sentence = [word for word in tokenized_sentence 
                                  if not word in stopwords #조건1
                                     and len(word) >=2 #조건2
                                     and word.isalpha()] # 한글이나 영어
    if stopwords_removed_sentence: #빈 리스트가 아니라면 추가
        tokenized_data.append(stopwords_removed_sentence)

100%|██████████| 44421/44421 [02:16<00:00, 324.50it/s]


In [91]:
!pip install gensim



In [92]:
from gensim.models import Word2Vec

model = Word2Vec(sentences = tokenized_data, vector_size = 100, window = 5, min_count = 5, workers = 4, sg = 0)

In [93]:
model.wv.vectors.shape

(6689, 100)

In [94]:
print(model.wv.most_similar("백종원"))

[('기준', 0.9172585010528564), ('성재', 0.8973420858383179), ('램지', 0.8878208994865417), ('한테', 0.8754184246063232), ('안성', 0.8749454021453857), ('의원', 0.8524170517921448), ('재는', 0.8455175161361694), ('고든', 0.8427581191062927), ('블라인드', 0.8383055329322815), ('평가', 0.8348464369773865)]


In [95]:
print(model.wv.most_similar("최현석"))

[('정지선', 0.9522685408592224), ('성재', 0.9319412708282471), ('여경', 0.9225386381149292), ('셰프', 0.9061894416809082), ('이영숙', 0.8923342823982239), ('나소', 0.8763349652290344), ('이랑', 0.8722212910652161), ('헤드', 0.8715230822563171), ('쉐프', 0.8640704154968262), ('제자', 0.8597797751426697)]


## Save W2V model

In [99]:
model.wv.save_word2vec_format('ko_w2v')

In [100]:
!python -m gensim.scripts.word2vec2tensor --input ko_w2v --output ko_w2v

## Visualization for embedding

In [98]:
## Go to https://projector.tensorflow.org/
## and load ko_w2v_tensor.tsv and ko_w2v_metadata.tsv