## 필요 라이브러리 설치

!pip install BeautifulSoup4

!pip install pandas

## 기본 html 파싱

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [None]:
# Download the MuscleWiki exercise directory page.
# The timeout keeps the cell from hanging indefinitely on a stalled
# connection, and raise_for_status() fails loudly on an HTTP error instead of
# silently parsing an error page into an empty/garbage row list.
res = requests.get('https://musclewiki.com/Directory', timeout=30)
res.raise_for_status()
soup = BeautifulSoup(res.content, 'html.parser')

# Every <tr> element on the page; the extraction cell below classifies each
# row as a group header (text contains 'Exercises') or an exercise row.
lst = soup.find_all('tr')

## 필요 데이터 추출

In [None]:
# Parallel accumulators: muscle[k] is the name of group k, and each of the
# other four lists holds one inner list per group with that group's rows.
# `muscle` has to be initialised in this cell too; without it the first header
# row raises NameError when the cell runs in a fresh kernel.
muscle = []
exercise = []
equipment = []
difficulty = []
link = []

for item in lst:
    # Non-empty text fragments of the row, in document order.
    txt = [tmp for tmp in item.get_text().split('\n') if tmp]
    if 'Exercises' in item.get_text():
        # Header row: drop the last 10 characters (presumably the
        # ' Exercises' suffix) and open a fresh bucket for the new group.
        muscle.append(txt[0].strip()[:-10])
        exercise.append([])
        equipment.append([])
        difficulty.append([])
        link.append([])

    else:
        # Exercise row: first fragment is the name, the last two are read as
        # equipment and difficulty; the row's first <a> gives the detail page.
        exercise[-1].append(txt[0].strip())
        equipment[-1].append(txt[-2].strip())
        difficulty[-1].append(txt[-1].strip())
        tmp = 'https://musclewiki.com' + item.find('a')['href']
        link[-1].append(tmp)

In [3]:
# Parallel accumulators: muscle[k] names group k; the other four lists each
# hold one inner list per group, filled from that group's exercise rows.
muscle, exercise, equipment, difficulty, link = [], [], [], [], []

for row in lst:
    row_text = row.get_text()
    # Non-empty text fragments of the row, in document order.
    cells = [piece for piece in row_text.split('\n') if piece]

    if 'Exercises' not in row_text:
        # Exercise row: append to the buckets of the most recent group.
        exercise[-1].append(cells[0].strip())
        equipment[-1].append(cells[-2].strip())
        difficulty[-1].append(cells[-1].strip())
        href = row.find('a')['href']
        link[-1].append('https://musclewiki.com' + href)
        continue

    # Header row: strip the last 10 characters from the title to get the group
    # name, then open an empty bucket in every per-group list.
    muscle.append(cells[0].strip()[:-10])
    for bucket in (exercise, equipment, difficulty, link):
        bucket.append([])

## 데이터프레임 생성

In [4]:
# Build one DataFrame per muscle group from the parallel per-group lists.
df_lst = []
column_names = ['exercise', 'equipment', 'difficulty', 'link']

for group_name, *group_fields in zip(muscle, exercise, equipment, difficulty, link):
    # The four lists form four labelled rows; transposing turns each field
    # into a column with one row per exercise.
    tmp_df = pd.DataFrame(group_fields, index=column_names).transpose()
    tmp_df['muscle'] = group_name
    df_lst.append(tmp_df)

## 데이터프레임에 이미지 및 운동 방법 추가

In [5]:
def _format_steps(raw_text):
    """Return the non-empty lines of *raw_text* numbered as '1. ...', '2. ...'."""
    lines = [line for line in raw_text.split('\n') if line]
    return '\n'.join(
        str(number) + '. ' + line for number, line in enumerate(lines, start=1)
    )


def _scrape_exercise_page(url):
    """Fetch one exercise page and return (img_link, video_link, step).

    Any part that cannot be found on the page is reported as 'no data'.
    Network errors from requests are left to propagate to the caller.
    """
    page = requests.get(url, timeout=30)
    page_soup = BeautifulSoup(page.content, 'html.parser')

    # Treat the page as missing on an HTTP 404 as well as when the rendered
    # text mentions '404' (the original heuristic, kept as-is).
    if page.status_code == 404 or '404' in page_soup.get_text():
        return 'no data', 'no data', 'no data'

    img_link = 'no data'
    video_link = 'no data'
    grid = page_soup.find(class_='exercise-images-grid')
    if grid is not None:
        img_tag = grid.find('img')
        anchor = grid.find('a')
        # Guard each tag separately so a grid without <img> or <a> does not crash.
        if img_tag is not None and img_tag.get('src'):
            img_link = 'https://www.musclewiki.com' + img_tag['src']
        if anchor is not None and anchor.get('href'):
            video_link = anchor['href']

    # The steps list can be absent just like the image grid.
    steps_tag = page_soup.find(class_='steps-list')
    step = _format_steps(steps_tag.get_text()) if steps_tag is not None else ''

    return img_link, video_link, step or 'no data'


# Add img_link / video_link / step columns to every per-muscle DataFrame.
# The page address is held in `url` rather than `link`, so the module-level
# `link` list built by the extraction cell is not overwritten.
for df in df_lst:
    for i, url in zip(df.index, df['link'].tolist()):
        img_link, video_link, step = _scrape_exercise_page(url)
        df.loc[i, 'img_link'] = img_link
        df.loc[i, 'video_link'] = video_link
        df.loc[i, 'step'] = step

## 난이도로 구분

In [6]:
# Split every per-muscle frame into 'Beginner' rows and all remaining rows.
# beginner[k] and master[k] both come from df_lst[k].
beginner, master = [], []

for df in df_lst:
    is_beginner = df['difficulty'] == 'Beginner'
    beginner.append(df.loc[is_beginner])
    master.append(df.loc[~is_beginner])

In [7]:
import os

# Write one beginner CSV and one non-beginner CSV per muscle group.
# Iterate over the groups themselves: range(len(df)) counted the rows of
# whichever frame `df` last referred to, which either overran `muscle`
# (IndexError) or silently skipped groups.
os.makedirs('data', exist_ok=True)  # to_csv does not create missing directories

for name, b_df, m_df in zip(muscle, beginner, master):
    b_filename = "eng_Beginner_" + name
    m_filename = "eng_Master_" + name

    # beginner[k] / master[k] are already filtered by difficulty, so they are
    # written as they are.
    b_df.to_csv('data/' + b_filename + '.csv', index=False)
    m_df.to_csv('data/' + m_filename + '.csv', index=False)

## 데이터 한글로 변환
- googletrans 라이브러리 사용

!pip install googletrans==4.0.0-rc1

In [8]:
import os
import googletrans
import time

# Collect the files in ./data/ whose names start with 'eng' and create the
# translator used by the translation loop.
path = './data/'
files = [name for name in os.listdir(path) if name.startswith('eng')]
translator = googletrans.Translator()

In [9]:
# Translate the 'step' column of every English CSV to Korean and save the
# result next to it under a 'kor' name.
for file in files:
    df = pd.read_csv(path + file)
    # Replace only the first 'eng' (the prefix selected by the file filter) so
    # an 'eng' occurring later in the name is left untouched.
    filename = file.replace('eng', 'kor', 1)

    for i, step in zip(df.index, df['step'].tolist()):
        # Skip the placeholder and any non-text cell (an empty CSV cell is
        # read back as NaN, which the translator cannot handle).
        if not isinstance(step, str) or step == 'no data':
            continue
        # Pause only before an actual request to the translation service.
        time.sleep(0.5)
        trans = translator.translate(step, dest='ko', src='en')
        df.loc[i, 'step'] = trans.text

    # utf-8-sig writes a BOM (commonly needed for Excel to detect UTF-8).
    df.to_csv(path + filename, encoding='utf-8-sig', index=False)