# 크롤링 프로젝트

- 본 프로젝트는 FAST CAMPUS "데이터 사이언스 취업완성 SCHOOL"의 박두진 강사님 지도하에 손형진이 진행하였습니다.

## 1. 데이터 수집 목적 및 동기

- 한국프로야구의 관중 수, 즉 티켓 판매 수익은 프로구단의 가장 큰 수입원 가운데 하나이다. 따라서 직·간접적인 지표를 바탕으로 프로야구 관중 수를 예측할 수 있다면, 그에 맞는 관중 유치 전략을 세우는 데 도움을 줄 수 있다.
- 특히, 프로야구 팀의 성적은 경기장으로 향하는 팬의 마음을 움직이게 하는 결정적인 요소 가운데 하나이다. 이에 따라, 본 프로젝트에서는 관중 수를 예측하기 위해 반드시 필요한 일자별 팀의 성적을 KBO 웹사이트로부터 크롤링하고자 한다.

## 2. 데이터 수집 계획

![process](images/process.png)

## 3. 코드 시행

In [73]:
%%writefile requirements.txt

# 아래 패키지는 본 크롤링을 위해 사용된 패키지입니다.
# (json은 파이썬 표준 라이브러리이므로 별도 설치가 필요 없습니다.)

requests
scrapy
pandas
pymongo

Overwriting requirements.txt


### 모듈 작성

In [29]:
%%writefile crawkbo.py
import requests
import pandas as pd

def kboresults(year, month, team):
    """Download KBO game results and return them as a tidy DataFrame.

    One request is sent to the KBO schedule web service for every
    (year, month, team) combination.  The HTML fragments in the response
    cells are stripped and the result text is split into away team, home
    team and the home team's outcome.

    Args:
        year: Iterable of season identifiers (sent as ``seasonId``).
        month: Iterable of month strings (sent as ``gameMonth``).
        team: Iterable of team identifiers (sent as ``teamId``).

    Returns:
        pandas.DataFrame with the columns ``dates`` (``YYYY/MM/DD``),
        ``parks``, ``away``, ``home``, ``homewin``, ``etcs`` and ``times``,
        de-duplicated on ``(dates, parks)``.  When nothing was collected an
        empty frame with the same columns is returned.

    Raises:
        requests.RequestException: If a request fails or times out.
        ValueError: If a response body is not valid JSON.
    """
    columns = ["dates", "parks", "away", "home", "homewin", "etcs", "times"]
    url_template = (
        "https://www.koreabaseball.com/ws/Schedule.asmx/GetScheduleList"
        "?leId=1&srIdList=0%2C9&seasonId={}&gameMonth={}&teamId={}"
    )
    # Applied in this exact order: the tagged score spans must be rewritten
    # before the generic <span>/<em> tags are stripped.
    result_markup = (
        ('<span class="win">', ' win '),
        ('<span class="lose">', ' lose '),
        ('<span class="same">', ' same '),
        ('</span><span>', ' '),
        ('</span></em><span>', ' '),
        ('<span>', ''),
        ('</span>', ''),
        ('<em>', ''),
        ('</em>', ''),
    )

    game_dic = []
    for y in year:
        for m in month:
            for t in team:
                response = requests.get(url_template.format(y, m, t), timeout=30)
                rows = response.json().get("rows") or []

                for entry in rows:
                    try:
                        cells = entry['row']
                        record = {
                            "year": y,
                            "dates": cells[0]['Text'],    # date
                            "times": cells[1]['Text'],    # start time
                            "results": cells[2]['Text'],  # game result
                            "parks": cells[7]['Text'],    # ballpark
                            "etcs": cells[8]['Text'],     # remarks
                        }
                    except (KeyError, IndexError, TypeError):
                        # Skip only the row that lacks the expected cells;
                        # the remaining rows of the month are still usable.
                        continue
                    game_dic.append(record)

    if not game_dic:
        return pd.DataFrame(columns=columns)

    game = pd.DataFrame(game_dic)

    # NOTE(review): the <b> tags are replaced by a space, so times keep
    # surrounding whitespace -- kept as-is for backward compatibility.
    game["times"] = (
        game["times"]
        .str.replace("<b>", " ", regex=False)
        .str.replace("</b>", " ", regex=False)
    )

    results = game["results"]
    for old, new in result_markup:
        results = results.str.replace(old, new, regex=False)

    # A played game reads "AWAY <res> <score> vs <res> <score> HOME", i.e.
    # seven tokens; reindex so shorter rows yield missing values instead of
    # a KeyError when no row has that many tokens.
    tokens = pd.DataFrame([text.split(" ") for text in results])
    tokens = tokens.reindex(columns=range(7))
    game["away"] = tokens[0]
    game["homewin"] = tokens[4]
    game["home"] = tokens[6]

    # Raw dates look like "MM.DD(weekday)"; rebuild them as "YYYY/MM/DD".
    raw_dates = game["dates"]
    game["dates"] = (
        game["year"].astype(str) + "/" + raw_dates.str[:2] + "/" + raw_dates.str[3:5]
    ).str.replace(".", "/", regex=False)

    game = game[columns]
    # Every game is returned once per participating team, so the same
    # (date, ballpark) pair shows up more than once.
    game = game.drop_duplicates(subset=["dates", "parks"], keep="first")
    return game.reset_index(drop=True)

Overwriting crawkbo.py


### 모듈 시행

In [1]:
import crawkbo

In [2]:
# Crawl parameters: every (year, month, team) combination is requested once.
# Seasons, sent to the schedule service as ``seasonId``.
year = ["2017", "2018", "2019"]
# Zero-padded months, sent as ``gameMonth``.
month = ["03","04","05", "06", "07", "08", "09", "10"]
# Team identifiers, sent as ``teamId``.
team = ["SK", "HH", "WO", "HT", "SS", "LT", "OB", "LG","KT","NC"]

In [3]:
game = crawkbo.kboresults(year, month, team)

In [4]:
game.head()

Unnamed: 0,dates,parks,away,home,homewin,etcs,times
0,2017/03/31,문학,KT,SK,lose,-,19:00
1,2017/03/31,잠실,한화,두산,win,-,19:00
2,2017/03/31,고척,LG,넥센,lose,-,19:00
3,2017/03/31,대구,KIA,삼성,lose,-,19:00
4,2017/03/31,마산,롯데,NC,win,-,19:00


**전처리전**
![df_origin](images/df_origin.png)

### 스크래피(scrapy)

![scrapy_process](images/scrapy_process.png)

#### items.py

- 수집 대상: 연도(year), 날짜(dates), 시간(times), 결과(results), 구장(parks), 비고(etcs)

In [37]:
%%writefile crawler/crawler/items.py
import scrapy

class CrawlerItem(scrapy.Item):
    """Fields collected for one scraped KBO game row."""
    year = scrapy.Field()  # season (year)
    dates = scrapy.Field()  # game date
    times = scrapy.Field()  # start time
    results = scrapy.Field()  # game result text
    parks = scrapy.Field()  # ballpark
    etcs = scrapy.Field()  # remarks

Overwriting crawler/crawler/items.py


#### spider.py

In [50]:
%%writefile crawler/crawler/spiders/spider.py
import scrapy
import requests
import json
from crawler.items import CrawlerItem

class Spider(scrapy.Spider):
    """Spider that collects KBO game rows from the schedule web service."""

    name = "KBO"

    def start_requests(self):
        """Yield one schedule request per (year, month, team) combination.

        The season is attached to each request through ``meta``.  Keeping it
        in an instance attribute is unsafe: a callback may run after the
        loop has already advanced to another year.
        """
        year = ["2017", "2018", "2019"]
        month = ["03", "04", "05", "06", "07", "08", "09", "10"]
        team = ["SK", "HH", "WO", "HT", "SS", "LT", "OB", "LG", "KT", "NC"]
        for y in year:
            for m in month:
                for t in team:
                    url = "https://www.koreabaseball.com/ws/Schedule.asmx/GetScheduleList?leId=1&srIdList=0%2C9&seasonId={}&gameMonth={}&teamId={}".format(y, m, t)
                    yield scrapy.Request(url, callback=self.match_parse, meta={"year": y})

    def match_parse(self, response):
        """Turn the JSON rows of one schedule response into items.

        Args:
            response: Response for a request created by ``start_requests``;
                its ``meta["year"]`` holds the requested season.

        Yields:
            CrawlerItem: One new item per row that has the expected cells.
        """
        year = response.meta["year"]
        # Parse the body once instead of once per extracted cell.
        rows = json.loads(response.body).get("rows") or []

        for entry in rows:
            try:
                cells = entry['row']
                # A fresh item per row: re-using one mutable item would let
                # later rows overwrite items that were already yielded.
                item = CrawlerItem()
                item["year"] = year
                item["dates"] = cells[0]['Text']  # date
                time = cells[1]['Text']  # start time
                item["times"] = time.replace("<b>", "").replace("</b>", "")
                result = cells[2]['Text']  # game result
                item["results"] = result.replace('<span class="win">',' win ').replace('<span class="lose">',' lose ').replace('<span class="same">',' same ').replace('</span><span>',' ').replace('</span></em><span>',' ').replace('<span>','').replace('</span>','').replace('<em>','').replace('</em>','')
                item["parks"] = cells[7]['Text']  # ballpark
                item["etcs"] = cells[8]['Text']  # remarks
            except (KeyError, IndexError, TypeError, AttributeError):
                # Skip only the row that lacks the expected cells.
                continue
            yield item

Overwriting crawler/crawler/spiders/spider.py


#### mongodb.py

In [34]:
%%writefile crawler/crawler/mongodb.py

import pymongo

# Shared MongoDB handle for the crawler: database "KBO", collection "game".
# The port must follow the host directly ("host:port"); a slash in between
# would make ":27017/" the database part of the URI.
client = pymongo.MongoClient('mongodb://52.79.169.68:27017/')
db = client.KBO
collection = db.game

Overwriting crawler/crawler/mongodb.py


#### pipelines.py

In [44]:
%%writefile crawler/crawler/pipelines.py
from .mongodb import collection

class CrawlerPipeline(object):
    """Item pipeline that stores every scraped game row in MongoDB."""

    # Item fields copied into the stored document.
    FIELDS = ("year", "dates", "times", "results", "parks", "etcs")

    def process_item(self, item, spider):
        """Insert ``item`` into the MongoDB collection and pass it on.

        Args:
            item: Scraped item; must provide every name in ``FIELDS``.
            spider: The spider that produced the item (unused).

        Returns:
            The unchanged ``item`` so later pipeline stages still receive it.

        Raises:
            KeyError: If ``item`` lacks one of the expected fields.
        """
        # Copy into a plain dict: the insert adds an "_id" key to the
        # document it is given, and the item itself should stay untouched.
        data = {field: item[field] for field in self.FIELDS}

        # insert_one replaces Collection.insert, which is deprecated and
        # absent from PyMongo 4.
        collection.insert_one(data)

        return item

Overwriting crawler/crawler/pipelines.py


#### settings.py

In [50]:
!echo "ITEM_PIPELINES = {"  >> crawler/crawler/settings.py

In [51]:
!echo "   'crawler.pipelines.CrawlerPipeline': 300," >> crawler/crawler/settings.py

In [52]:
!echo "}" >> crawler/crawler/settings.py

In [1]:
!tail -n 5 crawler/crawler/settings.py

#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

ITEM_PIPELINES = {
        'crawler.pipelines.CrawlerPipeline':300,
        }


#### run.sh

In [10]:
%%writefile run.sh
#!/bin/sh
# Run the KBO spider from inside the Scrapy project directory and export
# the scraped items to KBO_results.csv.
# Abort when the project directory is missing so that scrapy is not started
# from the wrong place.
cd crawler || exit 1
scrapy crawl KBO -o KBO_results.csv

Overwriting run.sh


In [None]:
!./run.sh

In [69]:
import pandas as pd

In [70]:
df = pd.read_csv("crawler/KBO_results.csv")
df.head(5)

Unnamed: 0,dates,etcs,parks,results,times,year
0,03.31(금),-,대구,KIA win 7 vs lose 2 삼성,19:00,2017
1,03.31(금),-,대구,KIA win 7 vs lose 2 삼성,19:00,2017
2,03.31(금),-,잠실,한화 lose 0 vs win 3 두산,19:00,2017
3,03.31(금),-,고척,LG win 2 vs lose 1 넥센,19:00,2017
4,03.31(금),-,고척,LG win 2 vs lose 1 넥센,19:00,2017


In [71]:
# Post-process the exported CSV into one row per game.
# A result reads "AWAY <res> <score> vs <res> <score> HOME"; split it into
# whitespace-separated tokens (token 4 is the home side's outcome).
results_split = pd.DataFrame([x.split(' ') for x in df["results"]])
# Raw dates look like "MM.DD(weekday)"; split off the weekday part.
dates_split = pd.DataFrame([x.split('(') for x in df["dates"]])
df["away"] = results_split[0]
df["homewin"] = results_split[4]
df["home"] = results_split[6]
# regex=False: ")" and "." must be treated as literal characters.
df["weekday"] = dates_split[1].str.replace(")", "", regex=False)
df = df.drop(["results"], axis=1)
# Rebuild the date as "YYYY/MM/DD"; year is read back from the CSV as a number.
df["dates"] = df["year"].astype(str) + '/' + df["dates"].str[:2] + "/" + df["dates"].str[3:5]
df = df.drop(["year"], axis=1)
df = df[["dates", "parks", "away", "home", "homewin", "etcs", "times"]]
df['dates'] = df['dates'].str.replace(".", "/", regex=False)
# The same game is exported more than once (see the duplicated rows above);
# keep a single row per (date, ballpark).
df = df.drop_duplicates(subset=['dates', 'parks'], keep='first')
df.reset_index(drop=True, inplace=True)

In [72]:
df.head()

Unnamed: 0,dates,parks,away,home,homewin,etcs,times
0,2017/03/31,대구,KIA,삼성,lose,-,19:00
1,2017/03/31,잠실,한화,두산,win,-,19:00
2,2017/03/31,고척,LG,넥센,lose,-,19:00
3,2017/03/31,마산,롯데,NC,win,-,19:00
4,2017/03/31,문학,KT,SK,lose,-,19:00


### pymongo 데이터 불러오기

In [17]:
import pymongo

In [18]:
client = pymongo.MongoClient('mongodb:___')
client

MongoClient('52.79.169.68', 27017)

In [20]:
result = client.KBO.game.find()

In [22]:
pd.DataFrame(result).tail()

Unnamed: 0,_id,dates,etcs,parks,results,times
912,5df278b1301c9c3aaa4c7142,07.23(목),-,잠실,롯데 win 6 vs lose 4 두산,18:30
913,5df278b1301c9c3aaa4c7143,07.28(화),-,대전,두산 win 7 vs lose 2 한화,18:30
914,5df278b1301c9c3aaa4c7144,07.29(수),-,대전,두산 lose 3 vs win 7 한화,18:30
915,5df278b1301c9c3aaa4c7145,07.30(목),-,대전,두산 win 2 vs lose 1 한화,18:30
916,5df278b1301c9c3aaa4c7146,07.31(금),-,잠실,SK lose 4 vs win 6 두산,18:30


from .mongodb import collection

## 4. 회고
- 스크래피를 통해 전처리 과정을 모두 담지 못했다는 점
- 파이몽고로 NoSQL 쿼리를 많이 써보지 못했다는 점