# AI기법과 활용 - Week 04
멀티쓰레딩 기법으로 데이터를 빠르게 수집하는 방법을 배웁니다.
____

In [None]:
import requests
from bs4 import BeautifulSoup



## 1. 영화 리스트 불러오기

In [None]:
def get_movie_list(page=1, timeout=10):
    """Return absolute detail-page URLs found on one page of the movie ranking.

    Args:
        page: Page number substituted into the ranking URL's ``page`` query
            parameter.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        A list of URL strings, one for each ranking-table row whose first
        anchor carries an ``href``, in table order.

    Raises:
        requests.RequestException: The request failed, timed out, or came
            back with an HTTP error status.
        ValueError: The response contains no ``table.list_ranking`` element.
    """
    MOVIE_URL_PREFIX = "https://movie.naver.com"
    url = "https://movie.naver.com/movie/sdb/rank/rmovie.naver?sel=pnt&page={}".format(page)
    # Without a timeout a stalled connection would block the caller forever.
    res = requests.get(url, timeout=timeout)
    res.raise_for_status()
    soup = BeautifulSoup(res.text, 'html.parser')
    table = soup.find("table", attrs={"class": "list_ranking"})
    if table is None:
        # Fail with a readable message instead of an AttributeError on None.
        raise ValueError("ranking table not found on page {}".format(page))
    movie_list = []
    for row in table.find_all("tr"):
        anchor = row.find("a")
        # Rows that carry no linked anchor are skipped.
        if anchor and anchor.has_attr('href'):
            movie_list.append(MOVIE_URL_PREFIX + anchor['href'])
    return movie_list

In [None]:
# Collect the movie detail-page URLs listed on ranking page 1.
movie_list = get_movie_list(page=1)

## 2. 개별 영화의 정보 불러오기

In [None]:
def get_movie_info(url, timeout=10):
    """Fetch a movie detail page and extract its title, story, poster and year.

    Each field is extracted independently, so an element missing from the
    page only blanks that one field rather than every field after it.

    Args:
        url: URL of the movie detail page.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        A ``(title, story, poster, year)`` tuple of strings. A field that
        cannot be located in the page is returned as ``""``.

    Raises:
        requests.RequestException: The request failed or timed out.
    """
    # Without a timeout a stalled connection would block a worker forever.
    res = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(res.text, 'html.parser')

    def _extract(getter):
        # A missing element makes find() return None, which surfaces as
        # AttributeError/TypeError on the next access; a missing attribute
        # raises KeyError. Any other exception is a real bug and propagates.
        try:
            return getter()
        except (AttributeError, TypeError, KeyError):
            return ""

    title = _extract(lambda: soup.find("h3", attrs={"class": "h_movie"}).find('a').text)
    story = _extract(lambda: soup.find("p", attrs={"class": "con_tx"}).text)
    poster = _extract(lambda: soup.find("div", attrs={"class": "poster"}).find("img")['src'])
    # The year is taken as the last comma-separated piece of the sub-title.
    year = _extract(
        lambda: soup.find("strong", attrs={"class": "h_movie2"}).text.split(",")[-1].strip()
    )
    return title, story, poster, year

In [None]:
# Try the extractor on a single movie detail page; the returned tuple is the
# cell's displayed value.
url = "https://movie.naver.com/movie/bi/mi/basic.naver?code=22082"
get_movie_info(url)

In [None]:
import time

## 3. 이전 방식으로 스크래핑

In [None]:

# Sequential baseline: every movie detail page is fetched one request at a time.
movies = []
start = time.time()
for i in range(1,2):  # only page 1 is visited

    print("현재 페이지:",i)
    try:
        movie_list = get_movie_list(page=i)
        for movie_url in movie_list:
            title, story, poster, year = get_movie_info(movie_url)
            # Pages whose title could not be extracted are skipped.
            if title:
                # Store this movie's own URL (movie_url); the global `url`
                # belongs to another cell and would be the same stale value
                # for every record.
                movie = {"title":title, "story":story, "poster":poster, "year":year, "url":movie_url}
                movies.append(movie)
    except Exception as e:
        # Report the failure and stop paging; the summary below is skipped.
        print(e)
        break
    print("수집된 영화:",len(movies))

    # `end` holds the seconds elapsed since `start`, not a timestamp.
    end = time.time() - start
    print("소요시간:{} 초".format(round(end)))



## 4. 멀티 쓰레딩


### 4-1. 일반적인 순차 request

In [None]:
def get_url(url):
    """Send a GET request to ``url`` and return the ``requests`` response object."""
    response = requests.get(url)
    return response

# Ten copies of the same echo endpoint, requested strictly one after another
# to obtain a baseline timing.
list_of_urls = ["https://postman-echo.com/get?foo1=bar1&foo2=bar2"]*10
start = time.time()
for url in list_of_urls:
    # Each request has to finish before the next one is sent.
    print(get_url(url))
# `end` holds the seconds elapsed since `start`, not a timestamp.
end = time.time() - start
print("소요시간:{} 초".format(round(end)))

### 4-2. 멀티쓰레딩을 활용한 request

In [None]:
import requests
from concurrent.futures import ThreadPoolExecutor

def get_url(url):
    """Send a GET request to ``url`` and return the ``requests`` response object."""
    response = requests.get(url)
    return response

# Same ten echo requests as the sequential run, now issued through a thread pool.
list_of_urls = ["https://postman-echo.com/get?foo1=bar1&foo2=bar2"]*10
start = time.time()

# Up to 10 worker threads send the requests; map() returns the responses in
# the order of list_of_urls, and list() waits for all of them.
with ThreadPoolExecutor(max_workers=10) as pool:
    response_list = list(pool.map(get_url,list_of_urls))

for response in response_list:
    print(response)
# `end` holds the seconds elapsed since `start` (printing included).
end = time.time() - start
print("소요시간:{} 초".format(round(end)))

## 5. 멀티쓰레딩을 활용한 스크래핑

In [None]:
from concurrent.futures import ThreadPoolExecutor

# Threaded scrape: ranking pages are still walked in order, but the detail
# pages linked from each ranking page are downloaded concurrently.
movies = []
start = time.time()

# One pool serves every page instead of being rebuilt on each iteration.
with ThreadPoolExecutor(max_workers=10) as pool:
    for i in range(1, 100):  # pages 1 through 99
        print("현재 페이지:",i)
        try:
            movie_list = get_movie_list(page=i)
            # map() returns results in the order of movie_list, so zip()
            # pairs each info tuple with the URL it was fetched from.
            movie_info_list = list(pool.map(get_movie_info,movie_list))
            for (title, story, poster, year), url in zip(movie_info_list,movie_list):
                # Pages whose title could not be extracted are skipped.
                if title:
                    movie = {"title":title, "story":story, "poster":poster, "year":year, "url":url}
                    movies.append(movie)
        except Exception as e:
            # Report the failure and stop paging; the summary below is skipped.
            print(e)
            break
        print("수집된 영화:",len(movies))
        # `end` holds the seconds elapsed since `start`, not a timestamp.
        end = time.time() - start
        print("소요시간:{} 초".format(round(end)))


## 6. 데이터 저장

In [None]:
import pandas as pd

In [None]:
# Build a table with one row per collected movie dict.
df = pd.DataFrame(movies)

In [None]:
# Remove duplicated rows, then rows that contain a missing value; both calls
# modify df in place.
df.drop_duplicates(inplace=True)
df.dropna(inplace=True)

In [None]:
# Flatten each story to a single line by turning carriage returns, tabs and
# newlines into spaces. The substitution must go through the .str accessor:
# a plain Series.replace() only swaps cells whose whole value equals the
# pattern, so it would leave tabs and newlines inside the text untouched.
df['story'] = df['story'].str.replace(r"[\r\t\n]", " ", regex=True)
df['story'] = df['story'].astype('U')



In [None]:
# Save the table to movies.csv as tab-separated text, without the index column.
df.to_csv("movies.csv",sep="\t", index=False)