# 試合日程のスクレイピング

## 日程・結果のページからデータを抽出する。

<ul><li>HTMLからテーブルを見つける</li><li>行、セルを順番に処理</li><li>JSONやiCal形式のファイルに出力</li></ul>

## なでしこリーグの日程・結果のページ

In [259]:
# Root of the 2015 season pages; every entry of urlList is relative to it.
urlBase = 'http://www.nadeshikoleague.jp/2015/'

# Schedule/result pages, grouped by the first path segment.
urlList = [
    'nadeshiko1/match/index.php',
    'nadeshiko1/match/index2.php',
    'nadeshiko2/match/index.php',
    'nadeshiko2/match/index2.php',
    'nadeshiko2/match/index3.php',
    'east/match/index.php',
    'east/match/index2.php',
    'west/match/index.php',
    'west/match/index2.php',
]
print(urlList)

['nadeshiko1/match/index.php', 'nadeshiko1/match/index2.php', 'nadeshiko2/match/index.php', 'nadeshiko2/match/index2.php', 'nadeshiko2/match/index3.php', 'east/match/index.php', 'east/match/index2.php', 'west/match/index.php', 'west/match/index2.php']


In [313]:
import urllib2
from bs4 import BeautifulSoup
import json

from datetime import datetime
from datetime import timedelta
import uuid

# Abbreviated team name -> replacement name (looked up by convertTeamName).
teamNames = {
    'NORD': 'ノルディーア',
    '大和S': 'シルフィード',
    '日テレ': 'ベレーザ',
    '浦和': '浦和レッズ',
    'I神戸': 'レオネッサ',
    '新潟L': 'アルビレックス',
    'AS埼玉': 'エルフェン',
    'ノジマ': 'ノジマステラ',
    'Ａハリマ': 'ハリマアルビオン',
    'アンジュ': 'アンジュビオレ',
    '福岡AN': 'アンクラス',
    'S世田谷': 'スフィーダ',
}

def covertString(navStr):
    """Return *navStr* as a UTF-8 encoded byte string.

    navStr is normally a BeautifulSoup NavigableString.  Callers pass
    ``tag.string``, which BeautifulSoup sets to None when the tag has more
    than one child; None is mapped to an empty byte string so the literal
    text "None" never ends up in the scraped data.
    """
    if navStr is None:
        return b''
    # type(u'') is the interpreter's text type (``unicode`` on Python 2),
    # which avoids a hard dependency on the Python 2-only builtin name.
    return type(u'')(navStr).encode('utf-8')

def outputToJson(fn, data):
    """Serialize *data* as JSON into the file named *fn*.

    The output is indented by one space per level and non-ASCII characters
    are written as-is (``ensure_ascii=False``).  An existing file is
    overwritten.
    """
    # The context manager closes the file even when json.dump raises
    # (for example on a value that is not JSON-serializable).
    with open(fn, 'w') as fp:
        json.dump(data, fp, indent=1, ensure_ascii=False)

def writeEventField(fp, key, val):
    """Write a single ``key:val`` line, terminated by a newline, to *fp*."""
    line = '{0}:{1}\n'.format(key, val)
    fp.write(line)

def convertTeamName(name):
    """Return the teamNames replacement for *name*, or *name* itself if none exists."""
    return teamNames.get(name, name)
    
def outputEvent(fp, data):
    """Write one game to *fp* as an iCalendar VEVENT.

    data is a dict describing the game.  'gameNo', 'left', 'result' and
    'right' are required (a missing one raises KeyError); 'date',
    'location' and 'url' are optional.  The event lasts 110 minutes from
    the kick-off time.
    """
    writeEventField(fp, 'BEGIN', 'VEVENT')
    # writeEventField adds the ':' separator itself, so the key must not
    # carry one (the previous 'UID:' key produced "UID::<hex>").
    writeEventField(fp, 'UID', uuid.uuid1().hex)

    # The trailing "Z" declares the value as UTC, so it has to be taken from
    # the UTC clock rather than from local time.
    stamp = datetime.utcnow().strftime("%Y%m%dT%H%M%SZ")
    writeEventField(fp, 'CREATED', stamp)
    writeEventField(fp, 'DTSTAMP', stamp)

    writeEventField(fp, 'SEQUENCE', '10')
    writeEventField(fp, 'TRANSP', 'OPAQUE')

    if 'date' in data:
        # Only the first five characters (MM/DD) and the last six (" HH:MM")
        # are parsed; whatever sits in between is ignored.  The value has no
        # year, so the season year is filled in afterwards.
        s = data['date']
        start = datetime.strptime(s[:5] + s[-6:], '%m/%d %H:%M')
        start = start.replace(year=2015)
    else:
        # No kick-off time known: fall back to the current local time.
        start = datetime.now()

    writeEventField(fp, 'DTSTART;TZID=Asia/Tokyo', start.strftime('%Y%m%dT%H%M%S'))

    end = start + timedelta(minutes=110)
    writeEventField(fp, 'DTEND;TZID=Asia/Tokyo', end.strftime('%Y%m%dT%H%M%S'))

    summary = "{0} {1} {2} {3}".format(data['gameNo'], convertTeamName(data['left']), data['result'], convertTeamName(data['right']))
    writeEventField(fp, 'SUMMARY', summary)
    writeEventField(fp, 'LOCATION', data.get('location', '---'))

    if 'url' in data:
        writeEventField(fp, 'URL;VALUE=URI', data['url'])
    writeEventField(fp, 'END', 'VEVENT')

def outputICalHeader(fp):
    """Write the VCALENDAR opening lines and the Asia/Tokyo VTIMEZONE to *fp*.

    Every line is terminated by a single newline.  The matching
    ``END:VCALENDAR`` is not written here.
    """
    # Kept as a tuple of individual lines so that no stray trailing
    # whitespace can slip in: a padded "TZID:Asia/Tokyo   " does not match
    # the "TZID=Asia/Tokyo" parameter used on the events.
    headerLines = (
        'BEGIN:VCALENDAR',
        'VERSION:2.0',
        'METHOD:PUBLISH',
        'PRODID:-PYTHON_SCRAPER',
        'X-WR-CALNAME:Nadeshiko',
        'X-WR-TIMEZONE:Asia/Tokyo',
        'CALSCALE:GREGORIAN',
        'BEGIN:VTIMEZONE',
        'TZID:Asia/Tokyo',
        'BEGIN:DAYLIGHT',
        'TZOFFSETFROM:+0900',
        'DTSTART:19500507T020000',
        'TZNAME:JDT',
        'TZOFFSETTO:+1000',
        'END:DAYLIGHT',
        'BEGIN:STANDARD',
        'TZOFFSETFROM:+1000',
        'DTSTART:19510908T020000',
        'TZNAME:JST',
        'TZOFFSETTO:+0900',
        'END:STANDARD',
        'END:VTIMEZONE',
    )
    for line in headerLines:
        fp.write(line + '\n')

def outputToICalendar(dataList):
    """Write all games in *dataList* to ``schedule_nadeshiko.ics``.

    The file is created in the current working directory and overwritten if
    it already exists.  Each element of dataList is passed to outputEvent.
    """
    # The context manager closes the file even if an event fails to
    # serialize (e.g. outputEvent raising KeyError on an incomplete game).
    with open('schedule_nadeshiko.ics', 'w') as fp:
        outputICalHeader(fp)

        for game in dataList:
            outputEvent(fp, game)

        fp.write('END:VCALENDAR')
    
def _parseGameRow(tr, gameNo):
    """Build one game dict from the <td> cells of a schedule-table row.

    Returns None for a row without any <td> cell.  gameNo, when not None,
    is stored under 'gameNo'.
    """
    cells = tr.find_all('td')
    if not cells:
        # No data cells: nothing that could describe a game.
        return None

    game = {}
    if gameNo is not None:
        game['gameNo'] = gameNo

    for td in cells:
        classList = td.get('class')
        if not classList:
            # A cell without a class is the venue.
            game['location'] = covertString(td.string)
            continue

        kind = classList[0]
        if kind == 'result':
            # score
            game['result'] = covertString(td.string)
        elif kind == 'date':
            # date and kick-off time
            game['date'] = covertString(td.string)
        elif kind == 'record':
            # Official record: the link target is the first single-quoted
            # string inside the anchor's onclick attribute.
            record = td.find('a')
            onclick = record.get('onclick') if record is not None else None
            if onclick:
                parts = onclick.split("'")
                if len(parts) > 1:
                    game['url'] = parts[1]
        elif kind == 'teamName' and len(classList) > 1:
            # Team name; the second class becomes the key
            # ('left' / 'right' are the ones outputEvent reads).
            game[classList[1]] = covertString(td.string)
    return game


def scrapeFromUrl(url):
    """Download the schedule page at *url* and return its games as a list of dicts.

    Only direct children of the element with id ``gameSchedule`` are
    examined: an <h3> sets the round label ('gameNo') for the rows that
    follow, and every direct <tr> of a <table> that has <td> cells becomes
    one game dict.  Returns an empty list when the page has no
    ``gameSchedule`` element.
    """
    # Imported here because the file-level import only brings in
    # BeautifulSoup from bs4, leaving Tag undefined.
    from bs4 import Tag

    html = urllib2.urlopen(url).read()
    soup = BeautifulSoup(html)

    dataList = []
    div = soup.find(id='gameSchedule')
    if div is None:
        return dataList

    # Stays None until the first <h3> is seen, so a table that precedes any
    # heading no longer hits an unbound local.
    gameNo = None
    for e in div.children:
        if not isinstance(e, Tag):
            continue
        if e.name == 'h3':
            # round heading
            gameNo = covertString(e.string)
        elif e.name == 'table':
            for tr in e.find_all('tr', recursive=False):
                game = _parseGameRow(tr, gameNo)
                if game is not None:
                    dataList.append(game)
    return dataList

if __name__ == '__main__':
    # Scrape every schedule page and pool the games into one list.
    dataList = []
    for pagePath in urlList:
        pageUrl = urlBase + pagePath
        print(pageUrl)
        dataList += scrapeFromUrl(pageUrl)

    # Write the iCalendar file; the commented call below is the JSON alternative.
    outputToICalendar(dataList)
    #outputToJson('schedule_challenge_east2.json', dataList)

    print(len(dataList))

http://www.nadeshikoleague.jp/2015/nadeshiko1/match/index.php
http://www.nadeshikoleague.jp/2015/nadeshiko1/match/index2.php
http://www.nadeshikoleague.jp/2015/nadeshiko2/match/index.php
http://www.nadeshikoleague.jp/2015/nadeshiko2/match/index2.php
http://www.nadeshikoleague.jp/2015/nadeshiko2/match/index3.php
http://www.nadeshikoleague.jp/2015/east/match/index.php
http://www.nadeshikoleague.jp/2015/east/match/index2.php
http://www.nadeshikoleague.jp/2015/west/match/index.php
http://www.nadeshikoleague.jp/2015/west/match/index2.php
315


### Jリーグの試合

In [243]:
import urllib2
import BeautifulSoup
import sys

# Team id (sent as the team_ids query parameter) -> name used for the CSV file.
idList = {
    14: "Sapporo",
    2: "Chiba",
    23: "Fukuoka",
    29: "Yamagata",
    94: "Mito",
    40: "Tochigi",
    35: "Gunma",
    4: "Verdy",
    34: "YokohamaFC",
    46: "Matsumoto",
    41: "Toyama",
    39: "Gifu",
    24: "Kyoto",
    9: "Gamba",
    18: "Kobe",
    44: "Tottori",
    42: "Okayama",
    36: "Tokushima",
    37: "Ehime",
    43: "Kitakyusyu",
    47: "Nagasaki",
    38: "Kumamoto",
}

# (season year, competition id) pairs; a (2013, 348) entry is currently left out.
yearParams = [(2014, 373)]

# Base search URL; team/year/competition parameters are appended to it.
# NOTE(review): it already carries competition_ids=348 while the driver
# appends another competition_ids value - confirm which one the server honours.
url = ('https://data.j-league.or.jp/SFMS01/search'
       '?competition_frame_ids=2&competition_ids=348'
       '&home_away_select=0&tv_relay_station_name=')

def getGameResult(fp, url):
    """Fetch the page at *url* and write its result table to *fp* as comma-separated lines.

    The table is located by the class ``table-base00 search-table``; nothing
    is written when it is absent.  Each <tr> that contains <td> cells
    becomes one output line with the cells separated by commas.  Inside a
    cell, a link contributes its href followed by its text, and plain text
    is written with commas removed.
    """
    html = urllib2.urlopen(url).read()
    soup = BeautifulSoup.BeautifulSoup(html)

    table = soup.find('table', attrs={'class':'table-base00 search-table'})
    if table is None:
        return

    for row in table.findAll('tr'):
        cells = row.findAll('td')
        if not cells:
            # Rows without <td> cells produce no output line.
            continue

        for i, cell in enumerate(cells):
            if i > 0:
                fp.write(',')
            for c in cell.contents:
                if isinstance(c, BeautifulSoup.Tag):
                    href = c.get('href')
                    if href is None:
                        # Not a link (no href): skip instead of raising KeyError.
                        continue
                    fp.write(href.encode('utf-8'))  # a href
                    # Link text; '-' is turned into a column separator.
                    # An empty link still gets its column so lines stay aligned.
                    label = c.contents[0] if c.contents else u''
                    fp.write(',' + label.replace('-', ',').encode('utf-8'))
                elif not c.isspace():
                    text = c.strip().replace(',', '')
                    if text == 'vs':
                        # "vs" stands in for a result: emit two empty columns.
                        fp.write(',,')
                    elif text != "&nbsp;":
                        fp.write(text.encode('utf-8'))
        fp.write('\n')

if __name__ == '__main__':
    yp = (2014, 373)  # (season year, competition id)
    num = 14          # team id, a key of idList

    # Destination CSV: one directory per year, one file per team.
    outDir = '/Users/takahiro/Desktop/J2Result/{0}/'.format(yp[0])
    filepath = outDir + idList[num] + '.csv'
    print(filepath)

    # Query-string suffix selecting the team, year and competition.
    query = dict(team=num, year=yp[0], compe=yp[1])
    params = '&team_ids={team}&competition_years={year}&competition_ids={compe}'.format(**query)
    # The fetch itself is currently disabled:
    #getGameResult(sys.stdout, url+params)

/Users/takahiro/Desktop/J2Result/2014/Sapporo.csv


In [59]:
# Probe: fetch one search page and report whether the result table is missing.
url = ('https://data.j-league.or.jp/SFMS01/search'
       '?competition_years=2014&competition_frame_ids=1&competition_ids=372'
       '&team_ids=14&home_away_select=0&tv_relay_station_name=')
html = urllib2.urlopen(url).read()
soup = BeautifulSoup.BeautifulSoup(html)
# Same table lookup that getGameResult performs.
table = soup.find('table', attrs={'class':'table-base00 search-table'})
print(table is None)

True
