In [25]:
from bs4 import BeautifulSoup
import requests
import uuid

# 한 건의 대화에 대한 정보를 담는 객체입니다.
class Conversation:
    # 질문(Question), 응답(Answer) 두 변수로 구성됩니다.
    def __init__(self, contentName, contentType, question, answer):
        if len(question) > 0 and question[0] == ':':
            question = question[1:]
        if len(answer) > 0 and answer[0] == ':':
            answer = answer[1:]
        while len(question) > 0 and question[0] == ' ':
            question = question[1:]
        while len(answer) > 0 and answer[0] == ':':
            answer = answer[1:]
        self.contentName = contentName
        self.contentType = contentType
        self.question = question
        self.answer = answer
        
    def __str__(self):
        return "질문: " + self.question + "\n답변: " + self.answer + "\n"

# 한 건의 영어 대화 주제 링크를 담는 객체입니다.
class Subject:
    def __init__(self, number, title, link):
        self.number = number
        self.title = title
        self.link = link
        
    def __str__(self):
        return "번호: " + self.number + "\n제목: " + self.title + "\n답변: " + self.link + "\n"
    
# 모든 영어 대화 주제를 추출하는 함수입니다.
def get_subjects(number):
    subjects = []
    
    req = requests.get('http://www.elllo.org/video/' + number)
    html = req.text
    
    soup = BeautifulSoup(html, 'html.parser')
    divs = soup.findAll('div', {"class": "mobilelist"})
    
    for subject in divs:
        tags = subject.findAll('a')
        
        for tag in tags:
            title = tag.text
            if title == '':
                continue
            link = tag['href']
            s = Subject(number, title, link)
            # print(s) # 추출된 주제를 출력합니다.
            subjects.append(s)
            
    return subjects

# 특정 대화 주제의 대화 내용을 크롤링합니다.
def get_conversation(title, number, url):
    conversations = []
    
    req = requests.get('http://www.elllo.org/video/' + number + "/" + url)
    html = req.text
    
    soup = BeautifulSoup(html, 'html.parser')
    if soup.text.find("transcript") != -1: # 스크립트 자체가 페이지에 없으면 취소
        return conversations
    if len(soup.findAll('div', {"class": "transcript"})) < 1:
        return conversations
    sayings = soup.findAll('div', {"class": "transcript"})[0]
    strongs = sayings.findAll('strong') # strong에 들어간 사람 이름 정보 확인

    idx = 0
    while True:
        if idx + 1 >= len(strongs):
            break
        question = strongs[idx].nextSibling # 사람 이름 다음에 메시지가 등장
        answer = strongs[idx + 1].nextSibling
        idx = idx + 1
        if question == None or answer == None:
            continue
        c = Conversation(number + str(uuid.uuid4()), idx, str(question), str(answer))
        print(c) # 추출된 대화 내용을 출력합니다.
        conversations.append(c)
        
    return conversations

In [26]:
subjects = []

# 모든 영어 대화 주제 링크를 얻어냅니다.
for i in range(0, 20):
    print("(" + str(i) + " / " + str(19) + ")")
    subjects.extend(get_subjects(str(1001 + (i * 25))))

print("전체 대화 주제의 개수는 " + str(len(subjects)) + "개 입니다.")

(0 / 19)
(1 / 19)
(2 / 19)
(3 / 19)
(4 / 19)
(5 / 19)
(6 / 19)
(7 / 19)
(8 / 19)
(9 / 19)
(10 / 19)
(11 / 19)
(12 / 19)
(13 / 19)
(14 / 19)
(15 / 19)
(16 / 19)
(17 / 19)
(18 / 19)
(19 / 19)
전체 대화 주제의 개수는 268개 입니다.


In [27]:
conversations = []

idx = 0
# 모든 대화 내용을 얻어냅니다.
for subject in subjects:
    print("(" + str(idx) + "/" + str(len(subjects)) + ") 현재 주제: " + subject.title + "\n")
    conversations.extend(get_conversation(subject.title, subject.number, subject.link))
    idx = idx + 1
    
print("전체 대화 주제의 개수는 " + str(len(conversations)) + "개 입니다.")

(0/268) 현재 주제: Do teachers praise students too much?

(1/268) 현재 주제: Do you prefer to read online or offline?

(2/268) 현재 주제: Do you read fewer books because of the internet?

(3/268) 현재 주제: What is being threatened in your country?

질문: Hi, I'm Jingwei.
답변:  I'm Joseph.

질문: I'm Joseph.
답변:  We're from Singapore and this is for elllo.org. The question today is what is being threatened in our country?

질문: We're from Singapore and this is for elllo.org. The question today is what is being threatened in our country?
답변:  I assume this refers to animals, and as of now I really can't think of any. The Singapore is doing a pretty good job, but in the past â¦

질문: I assume this refers to animals, and as of now I really can't think of any. The Singapore is doing a pretty good job, but in the past â¦
답변:  We had tigers ...

질문: We had tigers ...
답변:  But they're all gone now, except for at the zoo. We had quite a few species of snakes like grass snakes, but we don't see them around much now

(20/268) 현재 주제: Why you should stay in shape?

(21/268) 현재 주제: How do people show affection in your country?

(22/268) 현재 주제: Are you touchy-feely?

(23/268) 현재 주제: Are you a tree hugger?

(24/268) 현재 주제: What is your country's biggest environmental problem?

(25/268) 현재 주제: What do you like shopping for? 

(26/268) 현재 주제: Do you tend to hold onto things?

질문: Hi, everyone I'm Danny from England.
답변:  And I'm Alex from Australia. And were going to be talking about whether or not we hoard things. Now Danny do you hoard things?

질문: And I'm Alex from Australia. And were going to be talking about whether or not we hoard things. Now Danny do you hoard things?
답변:  Oh, all the time. I have so many papers everywhere, so many books. All my receipts, just anything that I've bought that's special to me. All of them in one big box.

질문: Oh, all the time. I have so many papers everywhere, so many books. All my receipts, just anything that I've bought that's special to me. All of them in one big b

(44/268) 현재 주제: Do you believe in conspiracies?

질문: Hi, this is Paul from South Korea. 
답변:  And I'm Hanna from Australia.

질문: And I'm Hanna from Australia.
답변:  So Hanna, do you believe in conspiracies?

질문: So Hanna, do you believe in conspiracies?
답변:  Conspiracies? I like to read about conspiracies, and I really like watching all those conspiracy theory videos on youtube, but I'm not really convinced to be honest. Yeah, I don't really believe it's true. I don't know. It's really difficult because there's never enough information

질문: Conspiracies? I like to read about conspiracies, and I really like watching all those conspiracy theory videos on youtube, but I'm not really convinced to be honest. Yeah, I don't really believe it's true. I don't know. It's really difficult because there's never enough information
답변: Yeah, all these rumors people are talking about.

질문: Yeah, all these rumors people are talking about.
답변:  Yeah, exactly, a lot of it just seems like a rumor to m

(4

(61/268) 현재 주제: Would you ever want to be a lawyer?

(62/268) 현재 주제: Do you believe in the justice system?

(63/268) 현재 주제: Have you been to Argentina?

(64/268) 현재 주제: Are you good at managing your time?

(65/268) 현재 주제: What do you think of homeschooling?

(66/268) 현재 주제: Have you raised an animal?

(67/268) 현재 주제: Did you ever go to camp?

(68/268) 현재 주제: What renewable energies do you use?

(69/268) 현재 주제: Do you think we will ever stop using oil?

(70/268) 현재 주제: What are the main parties in your country?

(71/268) 현재 주제: What is controversial in your country?

(72/268) 현재 주제: How do you feel about guns?

(73/268) 현재 주제: Does your country have good healthcare?

(74/268) 현재 주제: What's your favorite national holiday?

(75/268) 현재 주제: What career would you like to have?

(76/268) 현재 주제: What is your dream job?

(77/268) 현재 주제: Do people drive fast in your country?

(78/268) 현재 주제: Are people safe drivers in my country?

(79/268) 현재 주제: What do you like to eat for an energy boost?

(8

(128/268) 현재 주제: What kind of movies do you like?

(129/268) 현재 주제: Best town for weekend holidays?

(130/268) 현재 주제: What is my country's best cities?

(131/268) 현재 주제: Do you take naps?

질문: Hi, my name is Win and I'm from Vietnam.
답변:  Hello, my name is Michael and I'm from Norway.

질문: Hello, my name is Michael and I'm from Norway.
답변:  So, my question is do I take naps?  Well, I do take naps as a habit because in my country for my 12 years of education in Vietnam I have to take nap as compulsory, at least one hour a day. I'm, well, scientifically, I've read that if you take nap it's really good for your brain as it helps you refresh from your half-a-day activity. So, what do you think? Do you take naps?

질문: So, my question is do I take naps?  Well, I do take naps as a habit because in my country for my 12 years of education in Vietnam I have to take nap as compulsory, at least one hour a day. I'm, well, scientifically, I've read that if you take nap it's really good for your brai

(148/268) 현재 주제: Does your country have good food?

(149/268) 현재 주제: Is it cheap where you live?

(150/268) 현재 주제: Would you rather work for a large or small company?

질문: Hello, this is the video for ello.org. Iâm  from Thailand.
답변:  And Iâm Micky from Nigeria.

질문: And Iâm Micky from Nigeria.
답변:  Uhm, the question is about âWould you rather work for a large or small company?â Would you?

질문: Uhm, the question is about âWould you rather work for a large or small company?â Would you?
답변:  Yeah, I donât know, but I think I would rather work for a large company.

질문: Yeah, I donât know, but I think I would rather work for a large company.
답변:  Uhm, why?

질문: Uhm, why?
답변:  Because working for a large company gives you room to build yourself. Have large friends, have large organization to work with because youâre kind of like reckon with and all that.

질문: Because working for a large company gives you room to build yourself. Have large friends, have large organizatio

(196/268) 현재 주제: What is your city like?

(197/268) 현재 주제: What is your favorite movie?

(198/268) 현재 주제: What movie disappointed you?

(199/268) 현재 주제: How do you feel about the  zoo?

(200/268) 현재 주제: What do you eat for breakfast?

(201/268) 현재 주제: What foods do you love?

(202/268) 현재 주제: Were you naughty as a child?

(203/268) 현재 주제: Are you good with children?

질문: Hi. I'm Silvia. I'm from Guatemala.
답변:  And I'm Diego. I'm from Mexico. 

질문: And I'm Diego. I'm from Mexico. 
답변:  Diego, you're like a very fun person to be around. So I'm wondering, how good are you with children?

질문: Diego, you're like a very fun person to be around. So I'm wondering, how good are you with children?
답변:  Actually, I think I'm quite good with children. I love kids, maybe it's because I'm a kid as well here. And I like to play with kids. I think it's really hilarious. They definitely make it exhausted and tired  but I do enjoy playing with kids. And actually sometimes in the local orphanage, when i

(221/268) 현재 주제: When do you like to dress up?

질문: Hello
답변:  Youâre tuning to ello.org. Iâm Micky from Nigeria and..

질문: Youâre tuning to ello.org. Iâm Micky from Nigeria and..
답변:  Iâm Gaew from Thailand.

질문: Iâm Gaew from Thailand.
답변:  So weâre talking about today is dressing up. So Gaew, what do you think about the topic?

질문: So weâre talking about today is dressing up. So Gaew, what do you think about the topic?
답변:  When do you like to dress up?

질문: When do you like to dress up?
답변:  Yes.

질문: Yes.
답변:  For me, I usually dress up whenever I feel like to.

질문: For me, I usually dress up whenever I feel like to.
답변:  Wow, that is nice.

질문: Wow, that is nice.
답변:  Yeah.

질문: Yeah.
답변:  For me, I totally feel like anytime Iâm going out is good for me to dress up. Thatâs what I feel.

질문: For me, I totally feel like anytime Iâm going out is good for me to dress up. Thatâs what I feel.
답변:  So you dress up everytime you go out?

질문: So you dress up everyt

(233/268) 현재 주제: What is an unhealthy food you enjoy?

(234/268) 현재 주제: What kind of music do you like?

(235/268) 현재 주제: What is your favorite musical instrument?

(236/268) 현재 주제: What is a special skill you wish you had?

(237/268) 현재 주제: What is your best quality?

(238/268) 현재 주제: What country would you like to vist?

(239/268) 현재 주제: What animals do you like?

(240/268) 현재 주제: Do you prefer cats or dogs?

(241/268) 현재 주제: Are you addicted to your phone?

(242/268) 현재 주제: Who is the last person who called you?

(243/268) 현재 주제: What is the best gift you have received?

(244/268) 현재 주제: Do you prefer to give gifts or get gifts?

(245/268) 현재 주제: Do people kiss in public in your country?

(246/268) 현재 주제: What is your favorite cuisine?

(247/268) 현재 주제: Do you spend a lot on shopping?

(248/268) 현재 주제: How do you feel about fashion magazines?

질문: Hello! My name is Aisle and Iâm from Lithuania and this is for elllo.org.
답변:  And I can Chris from Belgium. The topic of today is fash

(252/268) 현재 주제: Do you like beauty pageants?

(253/268) 현재 주제: Do you worry about your looks?

(254/268) 현재 주제: Are you good with children?

질문: Hi! This is Hanna, from Australia. 
답변:  And this is Paul, from South Korea. 

질문: And this is Paul, from South Korea. 
답변:  So, Paul! Are you good with children? 

질문: So, Paul! Are you good with children? 
답변:  I think I'm good with children. 

질문: I think I'm good with children. 
답변:  Oh, really? 

질문: Oh, really? 
답변:  I really enjoy hanging out with them. Some people say I'm a bit childish, that's why I get along with kids so well. 

질문: I really enjoy hanging out with them. Some people say I'm a bit childish, that's why I get along with kids so well. 
답변:  Oh, really? 

질문: Oh, really? 
답변:  Oh yeah, I do enjoy hanging out with them. I mean, the way they play... 

질문: Oh yeah, I do enjoy hanging out with them. I mean, the way they play... 
답변:  Yeah? 

질문: Yeah? 
답변:  Like throwing balls at each other, and like all the sports and all th

(263/268) 현재 주제: What business would you like to start?

(264/268) 현재 주제: What is the best thing about your country?

(265/268) 현재 주제: What is the best thing about your country?

(266/268) 현재 주제: What are you cooking?

질문: Hey I say, what are you doing?					       
답변:   I'm making some lunch. We are very busy now because we have to study for exams so I'm making something very quick. It's just some meat with salad and potatoes that are even not peeled if you look at it. Actually, I don't have myself eating potatoes in so hot climate because it's very hard to digest but Krustoff likes it a lot so we make everything together and then we'll see who eats what. I kind of like this food, it's enough for a day.

(267/268) 현재 주제: What are your favorite things?

전체 대화 주제의 개수는 402개 입니다.


In [28]:
import xlsxwriter

# 엑셀 워크 북 및 워크 시트를 생성합니다.
workbook = xlsxwriter.Workbook('Conversation Data 2.xlsx')
worksheet = workbook.add_worksheet()

row = 0
col = 0

# 모든 대화 내용을 엑셀로 저장합니다.
for c in conversations:
    worksheet.write(row, col, c.contentName)
    worksheet.write(row, col + 1, c.contentType)
    worksheet.write(row, col + 2, c.question)
    worksheet.write(row, col + 3, c.answer)
    row = row + 1
    
workbook.close()

In [5]:
# 내보내기 이후에 엑셀에서 공백이 있거나 None으로 처리된 것들은 제거합니다.