# `Train` 데이터 전체 추출

In [1]:
from bs4 import BeautifulSoup
from tqdm import tqdm
import pandas as pd
import time
import requests
import pickle


In [2]:
# Load the musical ID values used for the train crawl.
# NOTE: pickle.load can execute arbitrary code embedded in the file, so this
# must only ever be pointed at a pickle produced by this project.
with open("train_musical_id.pkl", "rb") as file:
    train = pickle.load(file)

In [3]:
# Number of IDs loaded into `train` (displayed as the cell output).
len(train)

20449

In [11]:
# URL template of the detail page; `{}` is filled with the musical ID.
# NOTE(review): this template uses http while synopsis_url uses https —
# confirm whether the difference is intentional.
detail_url = "http://www.playdb.co.kr/playdb/playdbDetail.asp?sReqPlayno={}"

# URL template of the description/synopsis tab; `{}` is filled with the musical ID.
synopsis_url = "https://www.playdb.co.kr/playdb/playdbDetail_Content.asp?TabKind=2&PlayNo={}"
# Example of a filled-in URL: https://www.playdb.co.kr/playdb/playdbDetail_Content.asp?TabKind=2&PlayNo=192992

# List that collects one dict of scraped fields per musical.
data_list = []

In [12]:
# Crawl every ID in `train`: fetch the detail page and the synopsis page,
# extract the fields below, collect one dict per musical in `data_list`,
# and finally write everything to ./train.csv.

# Seconds to wait for each HTTP response. Without a timeout a single stalled
# connection would block the whole (many-hour) crawl indefinitely.
REQUEST_TIMEOUT = 30


def _labelled_cell_text(soup, label):
    """Return the stripped text of the <td> that follows <img alt=label>.

    Returns None when the page has no such label icon. Raises AttributeError
    when the icon exists but no <td> follows it.
    """
    icon = soup.find("img", alt=label)
    if icon is None:
        return None
    return icon.find_next("td").get_text(strip=True)


for i in tqdm(train):
    # Start from a row that carries only the ID, so a page that fails is still
    # identifiable in the CSV (and can be re-crawled) instead of being an
    # anonymous empty row.
    temp = {'musical_id': i}

    try:
        # Detail page, then description/synopsis page. Fetching inside the
        # try keeps one network failure from aborting the whole run.
        r1 = requests.get(detail_url.format(i), timeout=REQUEST_TIMEOUT)
        r2 = requests.get(synopsis_url.format(i), timeout=REQUEST_TIMEOUT)

        soup1 = BeautifulSoup(r1.text, 'html.parser')
        soup2 = BeautifulSoup(r2.text, 'html.parser')

        # Title and poster are mandatory: a missing tag raises AttributeError
        # and the page is reported below.
        title = soup1.find("span", class_="title").get_text(strip=True)
        poster = soup1.find("div", class_="pddetail").find("h2").find("img").get("src")

        # Optional fields stay None when their label icon is absent.
        genre = _labelled_cell_text(soup1, "세부장르")
        if genre is not None:
            genre = genre.replace("뮤지컬>", "").replace("|", "")
            if genre == "창작창작":
                genre = "창작"

        date = _labelled_cell_text(soup1, "일시")
        location = _labelled_cell_text(soup1, "장소")
        cast = _labelled_cell_text(soup1, "출연")
        age_rating = _labelled_cell_text(soup1, "관람등급")
        running_time = _labelled_cell_text(soup1, "관람시간")

        # Description and synopsis come from the second page. A <td> that
        # contains the "TabSynopsis" anchor supplies the synopsis; any other
        # <td> with a nested td.news supplies the description. Later matches
        # overwrite earlier ones.
        describe = None
        synopsis = None
        for td_element in soup2.find_all("td"):
            news = td_element.find("td", class_="news")
            if td_element.find("a", {"name": "TabSynopsis"}) is not None:
                synopsis = news.get_text(strip=True)
            elif news is not None:
                describe = news.get_text(strip=True)

        temp = {
            'musical_id': i,
            'title': title,
            'poster_url': poster,
            'genre': genre,
            'date': date,
            'location': location,
            'actors': cast,
            'age_rating': age_rating,
            'running_time': running_time,
            'describe': describe,
            'synopsis': synopsis,
        }
    except AttributeError as ae:
        # An expected tag was missing from the page.
        print(f"Expected error : {ae}")

    except Exception as e:
        # Anything else, including network errors and timeouts.
        print(f"Other error : {e}")

    data_list.append(temp)
    # Pause between musicals to avoid hammering the site.
    time.sleep(2)

df = pd.DataFrame(data_list)

df.to_csv("./train.csv", encoding="utf-8-sig", index=False)

100%|██████████| 20449/20449 [14:06:47<00:00,  2.48s/it]  


In [17]:
# df

## `Present`, `Future` 데이터와 `train` 데이터 중복 제거

In [48]:
# Load the IDs of musicals that are currently being performed.
# NOTE: pickle.load can execute arbitrary code embedded in the file, so this
# must only ever be pointed at a pickle produced by this project.
with open("present_musical_id.pkl", "rb") as file:
    present = pickle.load(file)

In [37]:
# Number of IDs in `present` (displayed as the cell output).
len(present)

90

In [44]:
# Check `present` for IDs that already exist in `train` and drop them.
#
# The filtered list is built first and assigned back afterwards: removing
# items from a list while iterating over it makes the iterator skip the
# element that follows each removal, so consecutive duplicates would survive.
train_ids = set(train)  # one-time set for O(1) membership tests

kept = []
for i in present:
    if i in train_ids:
        print(f"{i} is duplicated")
        print(f"{i} is removed")
    else:
        kept.append(i)

# Slice assignment updates the existing `present` list object in place.
present[:] = kept


In [49]:
# Number of IDs in `present` (displayed as the cell output).
len(present)

86

In [47]:
# Save `present` back to its pickle file, overwriting the previous contents.
with open("present_musical_id.pkl", "wb") as file:
    pickle.dump(present, file)

In [50]:
# Load the IDs of upcoming (scheduled) shows.
# NOTE: pickle.load can execute arbitrary code embedded in the file, so this
# must only ever be pointed at a pickle produced by this project.
with open("future_musical_id.pkl", "rb") as file:
    future = pickle.load(file)

In [51]:
# Number of IDs in `future` (displayed as the cell output).
len(future)

396

In [55]:
# Check `future` for IDs that already exist in `train` and drop them.
#
# Two fixes over the previous version:
#   * duplicates are removed from `future` itself; the old code called
#     `present.remove(i)`, which either raised ValueError or deleted the ID
#     from the wrong list;
#   * the filtered list is built first instead of removing while iterating,
#     which would skip the element following each removal.
train_ids = set(train)  # one-time set for O(1) membership tests

kept = []
for i in future:
    if i in train_ids:
        print(f"{i} is duplicated")
        print(f"{i} is removed")
    else:
        kept.append(i)

# Slice assignment updates the existing `future` list object in place.
future[:] = kept


In [57]:
# Report every ID in `future` that also appears in `present`.
# This pass only prints the overlap; nothing is removed from either list.

for i in future:
    if i not in present:
        continue
    print(f"{i} is duplicated")


- future는 중복값이 없으므로 그대로 유지

# `Present` 데이터 전체 추출 - 현재 공연 중인 뮤지컬

In [60]:
# Load the IDs of musicals that are currently being performed.
# NOTE: pickle.load can execute arbitrary code embedded in the file, so this
# must only ever be pointed at a pickle produced by this project.
with open("present_musical_id.pkl", "rb") as file:
    present = pickle.load(file)

In [61]:
# Number of IDs in `present` (displayed as the cell output).
len(present)

86

In [63]:
# URL template of the detail page; `{}` is filled with the musical ID.
# NOTE(review): this template uses http while synopsis_url uses https —
# confirm whether the difference is intentional.
detail_url = "http://www.playdb.co.kr/playdb/playdbDetail.asp?sReqPlayno={}"

# URL template of the description/synopsis tab; `{}` is filled with the musical ID.
synopsis_url = "https://www.playdb.co.kr/playdb/playdbDetail_Content.asp?TabKind=2&PlayNo={}"
# Example of a filled-in URL: https://www.playdb.co.kr/playdb/playdbDetail_Content.asp?TabKind=2&PlayNo=192992

# List that collects one dict of scraped fields per musical.
present_data_list = []

In [65]:
# Crawl every ID in `present`: fetch the detail page and the synopsis page,
# extract the fields below, collect one dict per musical in
# `present_data_list`, and finally write everything to ./present.csv.

# Seconds to wait for each HTTP response. Without a timeout a single stalled
# connection would block the whole crawl indefinitely.
REQUEST_TIMEOUT = 30


def _labelled_cell_text(soup, label):
    """Return the stripped text of the <td> that follows <img alt=label>.

    Returns None when the page has no such label icon. Raises AttributeError
    when the icon exists but no <td> follows it.
    """
    icon = soup.find("img", alt=label)
    if icon is None:
        return None
    return icon.find_next("td").get_text(strip=True)


for i in tqdm(present):
    # Start from a row that carries only the ID, so a page that fails is still
    # identifiable in the CSV (and can be re-crawled) instead of being an
    # anonymous empty row.
    temp = {'musical_id': i}

    try:
        # Detail page, then description/synopsis page. Fetching inside the
        # try keeps one network failure from aborting the whole run.
        r1 = requests.get(detail_url.format(i), timeout=REQUEST_TIMEOUT)
        r2 = requests.get(synopsis_url.format(i), timeout=REQUEST_TIMEOUT)

        soup1 = BeautifulSoup(r1.text, 'html.parser')
        soup2 = BeautifulSoup(r2.text, 'html.parser')

        # Title and poster are mandatory: a missing tag raises AttributeError
        # and the page is reported below.
        title = soup1.find("span", class_="title").get_text(strip=True)
        poster = soup1.find("div", class_="pddetail").find("h2").find("img").get("src")

        # Optional fields stay None when their label icon is absent.
        genre = _labelled_cell_text(soup1, "세부장르")
        if genre is not None:
            genre = genre.replace("뮤지컬>", "").replace("|", "")
            if genre == "창작창작":
                genre = "창작"

        date = _labelled_cell_text(soup1, "일시")
        location = _labelled_cell_text(soup1, "장소")
        cast = _labelled_cell_text(soup1, "출연")
        age_rating = _labelled_cell_text(soup1, "관람등급")
        running_time = _labelled_cell_text(soup1, "관람시간")

        # Description and synopsis come from the second page. A <td> that
        # contains the "TabSynopsis" anchor supplies the synopsis; any other
        # <td> with a nested td.news supplies the description. Later matches
        # overwrite earlier ones.
        describe = None
        synopsis = None
        for td_element in soup2.find_all("td"):
            news = td_element.find("td", class_="news")
            if td_element.find("a", {"name": "TabSynopsis"}) is not None:
                synopsis = news.get_text(strip=True)
            elif news is not None:
                describe = news.get_text(strip=True)

        temp = {
            'musical_id': i,
            'title': title,
            'poster_url': poster,
            'genre': genre,
            'date': date,
            'location': location,
            'actors': cast,
            'age_rating': age_rating,
            'running_time': running_time,
            'describe': describe,
            'synopsis': synopsis,
        }
    except AttributeError as ae:
        # An expected tag was missing from the page.
        print(f"Expected error : {ae}")

    except Exception as e:
        # Anything else, including network errors and timeouts.
        print(f"Other error : {e}")

    present_data_list.append(temp)
    # Pause between musicals to avoid hammering the site.
    time.sleep(2)

df = pd.DataFrame(present_data_list)

df.to_csv("./present.csv", encoding="utf-8-sig", index=False)

100%|██████████| 86/86 [03:34<00:00,  2.50s/it]


# `Future` 데이터 추출 - 공연 예정인 뮤지컬

In [66]:
# Load the IDs of upcoming (scheduled) shows.
# NOTE: pickle.load can execute arbitrary code embedded in the file, so this
# must only ever be pointed at a pickle produced by this project.
with open("future_musical_id.pkl", "rb") as file:
    future = pickle.load(file)

In [67]:
# Number of IDs in `future` (displayed as the cell output).
len(future)

396

In [68]:
# URL template of the detail page; `{}` is filled with the musical ID.
# NOTE(review): this template uses http while synopsis_url uses https —
# confirm whether the difference is intentional.
detail_url = "http://www.playdb.co.kr/playdb/playdbDetail.asp?sReqPlayno={}"

# URL template of the description/synopsis tab; `{}` is filled with the musical ID.
synopsis_url = "https://www.playdb.co.kr/playdb/playdbDetail_Content.asp?TabKind=2&PlayNo={}"
# Example of a filled-in URL: https://www.playdb.co.kr/playdb/playdbDetail_Content.asp?TabKind=2&PlayNo=192992

# List that collects one dict of scraped fields per musical.
future_data_list = []

In [69]:
# Crawl every ID in `future`: fetch the detail page and the synopsis page,
# extract the fields below, collect one dict per musical in
# `future_data_list`, and finally write everything to ./future.csv.

# Seconds to wait for each HTTP response. Without a timeout a single stalled
# connection would block the whole crawl indefinitely.
REQUEST_TIMEOUT = 30


def _labelled_cell_text(soup, label):
    """Return the stripped text of the <td> that follows <img alt=label>.

    Returns None when the page has no such label icon. Raises AttributeError
    when the icon exists but no <td> follows it.
    """
    icon = soup.find("img", alt=label)
    if icon is None:
        return None
    return icon.find_next("td").get_text(strip=True)


for i in tqdm(future):
    # Start from a row that carries only the ID, so a page that fails is still
    # identifiable in the CSV (and can be re-crawled) instead of being an
    # anonymous empty row.
    temp = {'musical_id': i}

    try:
        # Detail page, then description/synopsis page. Fetching inside the
        # try keeps one network failure from aborting the whole run.
        r1 = requests.get(detail_url.format(i), timeout=REQUEST_TIMEOUT)
        r2 = requests.get(synopsis_url.format(i), timeout=REQUEST_TIMEOUT)

        soup1 = BeautifulSoup(r1.text, 'html.parser')
        soup2 = BeautifulSoup(r2.text, 'html.parser')

        # Title and poster are mandatory: a missing tag raises AttributeError
        # and the page is reported below.
        title = soup1.find("span", class_="title").get_text(strip=True)
        poster = soup1.find("div", class_="pddetail").find("h2").find("img").get("src")

        # Optional fields stay None when their label icon is absent.
        genre = _labelled_cell_text(soup1, "세부장르")
        if genre is not None:
            genre = genre.replace("뮤지컬>", "").replace("|", "")
            if genre == "창작창작":
                genre = "창작"

        date = _labelled_cell_text(soup1, "일시")
        location = _labelled_cell_text(soup1, "장소")
        cast = _labelled_cell_text(soup1, "출연")
        age_rating = _labelled_cell_text(soup1, "관람등급")
        running_time = _labelled_cell_text(soup1, "관람시간")

        # Description and synopsis come from the second page. A <td> that
        # contains the "TabSynopsis" anchor supplies the synopsis; any other
        # <td> with a nested td.news supplies the description. Later matches
        # overwrite earlier ones.
        describe = None
        synopsis = None
        for td_element in soup2.find_all("td"):
            news = td_element.find("td", class_="news")
            if td_element.find("a", {"name": "TabSynopsis"}) is not None:
                synopsis = news.get_text(strip=True)
            elif news is not None:
                describe = news.get_text(strip=True)

        temp = {
            'musical_id': i,
            'title': title,
            'poster_url': poster,
            'genre': genre,
            'date': date,
            'location': location,
            'actors': cast,
            'age_rating': age_rating,
            'running_time': running_time,
            'describe': describe,
            'synopsis': synopsis,
        }
    except AttributeError as ae:
        # An expected tag was missing from the page.
        print(f"Expected error : {ae}")

    except Exception as e:
        # Anything else, including network errors and timeouts.
        print(f"Other error : {e}")

    future_data_list.append(temp)
    # Pause between musicals to avoid hammering the site.
    time.sleep(2)

df = pd.DataFrame(future_data_list)

df.to_csv("./future.csv", encoding="utf-8-sig", index=False)

100%|██████████| 396/396 [16:23<00:00,  2.48s/it]
