# なでしこリーグ試合日程のスクレイピング

## 日程・結果のページからデータを抽出しiCalendar形式のファイルに出力する。


### 日程・結果のページのURL

In [51]:
urlBase = 'http://www.nadeshikoleague.jp/2016/match/'
urlList = {
    'nadeshiko_div1':('index.php', 'index_1.php'), # なでしこリーグ1部
    'nadeshiko_div2':('nadeshiko2.php', 'nadeshiko2_1.php'), # なでしこリーグ2部
    'nadeshiko_cup1':('nadeshiko_cup1_1.php', 'nadeshiko_cup1_2.php'), # リーグカップ1部 A,B
    'nadeshiko_cup2':('nadeshiko_cup2_1.php', 'nadeshiko_cup2_2.php'), # リーグカップ2部 A,B
    'challenge_east':['challenge_east.php'],
    'challenge_west':['challenge_west.php']
}
print(urlList)

{'nadeshiko_cup2': ('nadeshiko_cup2_1.php', 'nadeshiko_cup2_2.php'), 'nadeshiko_cup1': ('nadeshiko_cup1_1.php', 'nadeshiko_cup1_2.php'), 'nadeshiko_div1': ('index.php', 'index_1.php'), 'nadeshiko_div2': ('nadeshiko2.php', 'nadeshiko2_1.php'), 'challenge_west': ['challenge_west.php'], 'challenge_east': ['challenge_east.php']}


In [56]:
import urllib2
from bs4 import BeautifulSoup
from bs4 import element
import json

from datetime import datetime
from datetime import timedelta
import uuid
import os
import sys
import time

# 省略名を変換する辞書
teamNames = {
    'I神戸':'レオネッサ',
    'コノミヤ':'スペランツァ',
    '新潟L':'アルビレックス',
    '日テレ':'ベレーザ',
    '浦和':'レッズ',
    
    'ノジマ':'ノジマステラ',
    'ちふれ':'エルフェン',
    'S世田谷':'スフィーダ',
    'ニッパツ':'シーガルズ',
    'アンジュ':'アンジュビオレ',
    'Ａハリマ':'ハリマアルビオン',
    
    'NORD':'ノルディーア',
    '大和S':'シルフィード',
    
    '静産磐田':'磐田ボニータ',
    'ac福島':'アカデミー福島',
    '名古屋':'NGU名古屋',
    '福岡AN':'アンクラス'
}

thisYear = 2016

def covertString(navStr):
    # NavigableStringを変換する
    return unicode(navStr).encode('utf-8')

def writeEventField(fp, key, val):
    fp.write('{0}:{1}\n'.format(key, val))

def convertTeamName(name):
    if name in teamNames:
        return teamNames[name]
    return name
    
def outputEvent(fp, data):
    writeEventField(fp, 'BEGIN','VEVENT')
    writeEventField(fp, 'UID:',uuid.uuid1().hex)
    
    dt = datetime.now()
    dtstr = dt.strftime("%Y%m%dT%H%M%SZ")
    writeEventField(fp, 'CREATED', dtstr)
    writeEventField(fp, 'DTSTAMP', dtstr)
    
    writeEventField(fp, 'SEQUENCE','10')
    writeEventField(fp, 'TRANSP','OPAQUE')
    
    if 'date' in data:
        s = data['date']
        if s is not None:
            try:
                daytm = "{0} {1}".format(s['day'], s['time'])
                #print(daytm)
                dt = datetime.strptime(daytm,'%m/%d %H:%M')
                dt = dt.replace(year = thisYear)
                
                dts = dt.strftime('%Y%m%dT%H%M%S')
                writeEventField(fp, 'DTSTART;TZID=Asia/Tokyo', dts)

                dt = dt + timedelta(0,60*120)
                dte = dt.strftime('%Y%m%dT%H%M%S')
                writeEventField(fp, 'DTEND;TZID=Asia/Tokyo', dte)
            except:
                print("Error: {0}".format(s))
                
    #summary = "{0} {1} {2} {3}".format(data['gameNo'], data['left'], data['result'], data['right'])
    summary = "{0} {1} {2} {3}".format(data['gameNo'], convertTeamName(data['left']), data['result'], convertTeamName(data['right']))

    writeEventField(fp, 'SUMMARY', summary)
    writeEventField(fp, 'LOCATION', data.get('location','---'))
    # writeEventField(fp, 'DESCRIPTION', data.get('date','---'))
    
    if 'url' in data:
        writeEventField(fp, 'URL;VALUE=URI', data['url'])
    writeEventField(fp, 'END','VEVENT')

def outputICalHeader(fp, calname):
    fp.write('BEGIN:VCALENDAR\n')
    fp.write('VERSION:2.0\n')
    fp.write('METHOD:PUBLISH\n')
    fp.write('PRODID:-PYTHON_SCRAPER\n')
    fp.write("X-WR-CALNAME:{0}\n".format(calname))
    fp.write('X-WR-TIMEZONE:Asia/Tokyo\n')
    fp.write('CALSCALE:GREGORIAN\n')
    
    vtimezone = '''BEGIN:VTIMEZONE
TZID:Asia/Tokyo   
BEGIN:DAYLIGHT
TZOFFSETFROM:+0900
DTSTART:19500507T020000
TZNAME:JDT
TZOFFSETTO:+1000
END:DAYLIGHT        
BEGIN:STANDARD
TZOFFSETFROM:+1000
DTSTART:19510908T020000
TZNAME:JST
TZOFFSETTO:+0900
END:STANDARD
END:VTIMEZONE
'''
    fp.write(vtimezone)

def outputToICalendar(fp, calname, dataList):
    outputICalHeader(fp, calname)

    for game in dataList:
        outputEvent(fp, game)

    fp.write('END:VCALENDAR')

def scrapeFromUrl(url):
    '''
    urlのhtmlページを取得して解析する
    
    '''
    html = urllib2.urlopen(url).read()
    soup = BeautifulSoup(html,"lxml")

    gameList = list()
    game = dict()
    
    div = soup.find(id='gameSchedule')
    for e in div.children:
        if isinstance(e, element.Tag):
            if e.name == 'h3':
                # 第N節
                gameNo = covertString(e.string)
                game['gameNo'] = gameNo
            elif e.name == 'table':
                for tr in e.find_all('tr', recursive=False):
                    for td in tr.find_all('td'):
                        classList = td.get('class')
                        if classList is None:
                            # 会場
                            game['location'] = covertString(td.string)
                        elif classList[0] == 'result':
                            # スコア
                            game['result'] = covertString(td.string)
                        elif classList[0] == 'date':
                            # 日付
                            if len(td.contents) == 3:
                                day = covertString(td.contents[0])[:-9]
                                i = day.find("（")
                                if i > 0:
                                    day = day[:i]
                                if len(day) == 0:
                                    day = None
                                    
                                time = covertString(td.contents[2])
                                if time == "未定":
                                    time = None
                                date = {'day':day, 'time':time}
                            else:
                                date = None
                            game['date'] = date
                        elif classList[0] == 'record':
                            # 公式記録
                            record = td.find('a')
                            if record is not None:
                                s = record.get('onclick')
                                l = s.split("'")
                                game['url'] = l[1]
                        elif classList[0] == 'teamName':
                            # チーム名 'left', 'right'
                            game[classList[1]] = covertString(td.string)
                    gameList.append(game)

                    game = {'gameNo': gameNo}
    return gameList

if __name__ == '__main__':
    for f, v in urlList.items():
        dataList = list()
        for u in v:
            print(f,u)
            print(urlBase +u)
            dataList.extend(scrapeFromUrl(urlBase+u))
    
        path = os.path.join(os.getcwd(), f +'.ics')
        print(path)
        with open(path,'w') as fp:
            outputToICalendar(fp, f, dataList) # iCalendarに出力

('nadeshiko_cup2', 'nadeshiko_cup2_1.php')
http://www.nadeshikoleague.jp/2016/match/nadeshiko_cup2_1.php
('nadeshiko_cup2', 'nadeshiko_cup2_2.php')
http://www.nadeshikoleague.jp/2016/match/nadeshiko_cup2_2.php
/Users/takahiro/Documents/ipython/html-web-map/nadeshiko_cup2.ics
Error: {'day': '7/16', 'time': None}
Error: {'day': '8/11', 'time': None}
Error: {'day': '8/7', 'time': None}
('nadeshiko_cup1', 'nadeshiko_cup1_1.php')
http://www.nadeshikoleague.jp/2016/match/nadeshiko_cup1_1.php
('nadeshiko_cup1', 'nadeshiko_cup1_2.php')
http://www.nadeshikoleague.jp/2016/match/nadeshiko_cup1_2.php
/Users/takahiro/Documents/ipython/html-web-map/nadeshiko_cup1.ics
('nadeshiko_div1', 'index.php')
http://www.nadeshikoleague.jp/2016/match/index.php
('nadeshiko_div1', 'index_1.php')
http://www.nadeshikoleague.jp/2016/match/index_1.php
/Users/takahiro/Documents/ipython/html-web-map/nadeshiko_div1.ics
Error: {'day': None, 'time': '13:00'}
Error: {'day': '9/25', 'time': None}
('nadeshiko_div2', 'nadeshi

In [4]:
import os
import time

path = 'schedule_nadeshiko.ics'
if os.path.exists(path):
    # ファイル修正時刻
    eps = os.path.getmtime(path)
    t = time.gmtime(eps)
    
t

time.struct_time(tm_year=2016, tm_mon=1, tm_mday=27, tm_hour=12, tm_min=51, tm_sec=48, tm_wday=2, tm_yday=27, tm_isdst=0)

## 関東大学リーグ

In [5]:
urlBase = 'http://www.juwfa-kanto.com/'
url = 'schedule.cgi'
divisions = [u'1部', u'2部', u'3部']

def scrapeFromUrl(soup):

    dataList = []
    
    article_section = soup.find(id='article-section')
    for h5 in article_section.find_all('h5'):
        if h5.string in divisions:
            table = h5.find_next_sibling('table')
            if table:
                for tr in table.find_all('tr'):
                    gameInfo = [h5.string.encode('utf-8')]
                    for td in tr.find_all('td'):
                        # 月日,時間,HOME,結果,AWAY,会場,PDF,捕捉
                        if td.string == u'\u25cf':
                            # PDF
                            pdfLink = td.find('a')
                            linkStr = pdfLink.get('href')
                            gameInfo.append(urlBase+linkStr)
                        else:
                            if td.string:
                                gameInfo.append(td.string.encode('utf-8'))
                    if len(gameInfo) > 1:
                        dataList.append(gameInfo)
    return dataList

def outputToCSV(filename, dataList):
    import csv
    
    with open(filename,'w') as fp:
        writer = csv.writer(fp)
        for game in dataList:
            writer.writerow(game)

def writeEventField(fp, key, val):
    fp.write('{0}:{1}\n'.format(key, val))
    
def outputEvent(fp, data):
    writeEventField(fp, 'BEGIN','VEVENT')
    writeEventField(fp, 'UID:',uuid.uuid1().hex)
    
    dt = datetime.now()
    dtstr = dt.strftime("%Y%m%dT%H%M%SZ")
    writeEventField(fp, 'CREATED', dtstr)
    writeEventField(fp, 'DTSTAMP', dtstr)
    
    writeEventField(fp, 'SEQUENCE','10')
    writeEventField(fp, 'TRANSP','OPAQUE')
    
    #日時
    if data[2] == '未定':
        dt = datetime.strptime(data[1],'%Y/%m/%d')
        dts = dt.strftime('%Y%m%d')
        writeEventField(fp, 'DTSTART;VALUE=DATE', dts)
    else:
        s = "{0} {1}".format(data[1],data[2])
        dt = datetime.strptime(s,'%Y/%m/%d %H:%M')

        dts = dt.strftime('%Y%m%dT%H%M%S')
        writeEventField(fp, 'DTSTART;TZID=Asia/Tokyo', dts)

        dt = dt + timedelta(0,60*110)
        dte = dt.strftime('%Y%m%dT%H%M%S')
        writeEventField(fp, 'DTEND;TZID=Asia/Tokyo', dte)
    
    summary = "{0} {1} {2}".format(data[3], data[4], data[5])
    writeEventField(fp, 'SUMMARY', summary)
    
    if len(data) >= 7:
        writeEventField(fp, 'LOCATION', data[6])
    
    if len(data) == 8:
        writeEventField(fp, 'URL;VALUE=URI', data[7])
    writeEventField(fp, 'END','VEVENT')
    
def outputToICalendar(path, dataList):
    fp = open(path,'w')

    #outputICalHeader(fp)
    
    for game in dataList:
        outputEvent(fp, game)
        
    fp.write('END:VCALENDAR')
    fp.close()
    
if __name__ == '__main__':
    html = urllib2.urlopen(urlBase +url).read()
    soup = BeautifulSoup(html,"lxml")

    dataList = scrapeFromUrl(soup)

    outputToICalendar('juwfa_kanto.ics', dataList)
    #outputToCSV('juwfa-kanto.csv', dataList)
    
    print(len(dataList))
    

149
