## 1. 패키지 로드

In [32]:
# 라이브러리 import
import os
import pandas as pd
import numpy as np
import math

# 셀레늄
from selenium.webdriver.common.alert import Alert # 경고 처리
from selenium import webdriver  # 라이브러리(모듈) 가져오라
# pip install chromedriver_autoinstaller
from selenium.webdriver.common.keys import Keys
from selenium.webdriver import ActionChains as AC # 상호작용 자동화
import chromedriver_autoinstaller

# tqdm : for문 진행상황 체크
from tqdm import tqdm, tqdm_notebook
from tqdm.notebook import tqdm

# 정규표현식(regular expression) : 문자(알파벳,한글), 숫자, 특수기호 정제 및 추출
import re
from time import sleep
import time

# 워닝 무시
import warnings
warnings.filterwarnings('ignore')

## 2. text data 로드

In [37]:
# Load the actor roster. 'utf-8-sig' drops a leading byte-order mark if
# the CSV has one. At this point the `age` column holds four-digit years
# (e.g. 1970); it is converted further down in the notebook.
actor_df = pd.read_csv(
    './csv_data/Actor_name_age_sex.csv',
    encoding='utf-8-sig',
)
actor_df

Unnamed: 0,Name,age,Sex
0,감우성,1970,남
1,강경준,1983,남
2,강기영,1983,남
3,강남길,1958,남
4,강동원,1981,남
...,...,...,...
1532,황은정,1980,여
1533,황정민,1969,여
1534,황정서,1984,여
1535,황정음,1984,여


In [38]:
# Pull the Name column out as a plain Python list (one search term per
# actor) and preview the first 100 entries.
actor_name_list = actor_df['Name'].tolist()
actor_name_list[:100]

['감우성',
 '강경준',
 '강기영',
 '강남길',
 '강동원',
 '강성훈',
 '강은탁',
 '강이석',
 '강정우',
 '강지섭',
 '강훈',
 '고수',
 '고우림',
 '고윤',
 '고주원',
 '고창석',
 '공형진',
 '구교환',
 '곽도원',
 '곽동연',
 '곽정욱',
 '구자성',
 '권오중',
 '권해성',
 '권해효',
 '권혁수',
 '권현상',
 '권화운',
 '김강훈',
 '김광규',
 '김기방',
 '김기천',
 '김남길',
 '김도윤',
 '김동완',
 '김동욱',
 '김동희',
 '김동희',
 '김명곤',
 '김명민',
 '엘',
 '김무생',
 '김민규',
 '김민기',
 '김민석',
 '김민수',
 '김민식',
 '김민재',
 '김민종',
 '김병세',
 '김산호',
 '김상중',
 '김상호',
 '김선호',
 '김성수',
 '김성오',
 '김수로',
 '김수현',
 '김승우',
 '김시후',
 '김영광',
 '김영대',
 '김영준',
 '김영철',
 '김영훈',
 '김용건',
 '김윤석',
 '김일우',
 '김재영',
 '김재원',
 '김정학',
 '김정현',
 '김종구',
 '김주혁',
 '김준',
 '김지석',
 '김지훈',
 '김진규',
 '김진수',
 '김진우',
 '김진엽',
 '김찬우',
 '김창완',
 '김형묵',
 '김형민',
 '김흥수',
 '김희원',
 '나인우',
 '나윤찬',
 '남궁민',
 '남궁원',
 '남다름',
 '남주혁',
 '노종현',
 '노주현',
 '노형욱',
 '도상우',
 '독고영재',
 '류경수',
 '류담']

## 3. 크롤링

In [42]:
# Chrome option setup
options = webdriver.ChromeOptions()
options.add_argument("--window-size=800,1200")  # browser window size
options.add_argument('user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.75 Safari/537.36')
# Run without a visible window. The `options.headless = True` setter is
# deprecated and later removed in Selenium 4, where the assignment just
# creates an unused attribute; the command-line switch works on every
# Selenium release.
options.add_argument('--headless')

# Download a chromedriver matching the installed Chrome (if needed) and
# keep the path of the driver binary.
chrome_path = chromedriver_autoinstaller.install()

In [43]:
# Start Chrome with the options configured above.
# chromedriver_autoinstaller.install() has already put the driver's
# directory on PATH, so no driver path is passed here; passing it as the
# first positional argument is rejected by current Selenium 4 releases.
driver = webdriver.Chrome(options=options)

In [44]:
# Search Naver for every actor and record the URL of the profile photo.
# Produces `data_list`: one {'Name': ..., 'photo': ...} dict per actor,
# in the same order as `actor_name_list`; `photo` is NaN when the search
# result page has no profile image.
from urllib.parse import urlencode

from selenium.common.exceptions import (
    NoSuchElementException,
    StaleElementReferenceException,
)
from selenium.webdriver.common.by import By

# CSS path of the profile image on the search result page.
PHOTO_SELECTOR = (
    'div.cm_content_area._cm_content_area_profile > div > '
    'div.detail_info > a > img'
)

data_list = []

# `tqdm` is the notebook progress bar imported at the top of the file;
# `tqdm_notebook` is deprecated in favour of it.
for name in tqdm(actor_name_list):
    # urlencode percent-encodes the Korean text and turns spaces into '+',
    # so a name containing '&', '#' or a space cannot corrupt the query.
    query_string = urlencode(
        {'where': 'nexearch', 'query': f'배우 {name} 프로필'}
    )
    driver.get(f"https://search.naver.com/search.naver?{query_string}")
    time.sleep(1)  # fixed pause after each page load

    try:
        # find_element(By..., ...) replaces find_element_by_css_selector,
        # which was removed in Selenium 4.3.
        photo = driver.find_element(By.CSS_SELECTOR, PHOTO_SELECTOR)
        photo_url = photo.get_attribute('src')
    except (NoSuchElementException, StaleElementReferenceException):
        # No profile image for this actor: keep the row, mark it missing.
        photo_url = np.nan

    data_list.append({'Name': name, 'photo': photo_url})

  0%|          | 0/1537 [00:00<?, ?it/s]

In [45]:
# End the whole WebDriver session. close() only closes the current
# window and leaves the chromedriver process running; quit() closes all
# windows and shuts the driver down.
driver.quit()

In [46]:
# Turn the list of per-actor dicts collected by the crawl loop into a
# table, one row per dict.
result = pd.DataFrame(data=data_list)

In [47]:
# Display the crawl result table (notebook renders the last expression).
result

Unnamed: 0,Name,photo
0,감우성,https://search.pstatic.net/common?type=b&size=...
1,강경준,https://search.pstatic.net/common?type=b&size=...
2,강기영,https://search.pstatic.net/common?type=b&size=...
3,강남길,https://search.pstatic.net/common?type=b&size=...
4,강동원,https://search.pstatic.net/common?type=b&size=...
...,...,...
1532,황은정,
1533,황정민,https://search.pstatic.net/common?type=b&size=...
1534,황정서,
1535,황정음,https://search.pstatic.net/common?type=b&size=...


In [48]:
# Display the original actor table again for comparison with `result`.
actor_df

Unnamed: 0,Name,age,Sex
0,감우성,1970,남
1,강경준,1983,남
2,강기영,1983,남
3,강남길,1958,남
4,강동원,1981,남
...,...,...,...
1532,황은정,1980,여
1533,황정민,1969,여
1534,황정서,1984,여
1535,황정음,1984,여


In [53]:
# Attach the crawled photo URLs to the actor table as a new column.
# Column-wise concat aligns rows on the index; both frames were built in
# the same actor order, so row i of each refers to the same actor.
photo_column = result[['photo']]
result_df = pd.concat([actor_df, photo_column], axis=1)

In [55]:
# Print column dtypes and non-null counts; the `photo` non-null count
# shows how many actors got a photo URL.
result_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1537 entries, 0 to 1536
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Name    1537 non-null   object
 1   age     1537 non-null   int64 
 2   Sex     1537 non-null   object
 3   photo   970 non-null    object
dtypes: int64(1), object(3)
memory usage: 48.2+ KB


In [57]:
# Replace each year stored in `age` with 2022 minus that year.
# NOTE: not idempotent; running this cell a second time restores the
# original values.
result_df['age'] = 2022 - result_df['age']

In [58]:
# Display the merged table after the `age` conversion.
result_df

Unnamed: 0,Name,age,Sex,photo
0,감우성,52,남,https://search.pstatic.net/common?type=b&size=...
1,강경준,39,남,https://search.pstatic.net/common?type=b&size=...
2,강기영,39,남,https://search.pstatic.net/common?type=b&size=...
3,강남길,64,남,https://search.pstatic.net/common?type=b&size=...
4,강동원,41,남,https://search.pstatic.net/common?type=b&size=...
...,...,...,...,...
1532,황은정,42,여,
1533,황정민,53,여,https://search.pstatic.net/common?type=b&size=...
1534,황정서,38,여,
1535,황정음,38,여,https://search.pstatic.net/common?type=b&size=...


In [59]:
# Write the final table to an Excel workbook, leaving out the row index.
output_path = './csv_data/Actor_image.xlsx'
result_df.to_excel(output_path, index=False)