# 展開済みのLZHファイルを削除するコマンド

# In [31]:
import os
import glob
import subprocess


# Years whose sub-directories ("./2010" ... "./2020") are scanned.
year_range = range(2010, 2021)

for year in year_range:
    # Collect the .lzh archives stored in this year's directory.
    year_lzh_files = glob.glob("./{}/*.lzh".format(year))

    # Delete every archive that was found.
    # os.remove() is used instead of piping an "rm ..." string through a
    # shell: it works for paths containing spaces or shell metacharacters,
    # completes before the loop continues, and leaves no unclosed pipe.
    for lzh_path in year_lzh_files:
        try:
            os.remove(lzh_path)
        except OSError as err:
            # Best effort, like the original "rm": report and keep going.
            print("could not remove {}: {}".format(lzh_path, err))


# TXTをCSVに変換するスクリプト

# In [39]:
import re
import collections
import csv

class BoartRaceDataGenerator:
    """Convert a race-result text file into a flat CSV file.

    The input text is split into "race groups": each group starts at a line
    matching ``\\d\\dKBGN`` and ends at the next line matching ``\\d\\dKEND``.
    Every group with the regular layout is parsed and one CSV row is written
    per result line.

    Typical use::

        gen = BoartRaceDataGenerator(path)
        gen.set_terminate_list()
        gen.set_racegroup_info()
    """

    # Number of lines (start marker included, end marker excluded) of a race
    # group with the regular layout; groups of any other length are skipped.
    _NORMAL_GROUP_LINE_COUNT = 279

    # Number of result lines read below each separator line.
    _RESULT_ROWS_PER_RACE = 6

    # A line starting with this text marks the top of a race result table.
    _SEPARATOR_PREFIX = "-------------"

    # "yyyy/ m/ d" style date; blanks around the slashes are tolerated.
    _DATE_PATTERN = re.compile(r'(\d+)\s*/\s*(\d+)\s*/\s*(\d+)')

    _CSV_HEADER = ["開催年",
                   "開催月",
                   "開催日",
                   "開催場所番号",
                   "レースタイトル",
                   "x日目",
                   "レース名",
                   "レース回数",
                   "進入固定",
                   "天気",
                   "風向",
                   "風速",
                   "波高",
                   "順位",
                   "コース",
                   "選手番号",
                   "選手名",
                   "ボート番号",
                   "モータ番号",
                   "展示タイム"]

    def __init__(self, target_file):
        """Remember the input path and initialise the (still empty) state.

        Args:
            target_file: Path of the UTF-8 text file to convert.
        """
        # Text file to read.
        self.text_file = target_file

        # All lines of the text file (line terminators kept).
        self.text_data_list = []

        # [start_line, end_line] index pairs, one per race group.
        self.terminate_list = []

        # The lines of each race group, sliced out via terminate_list.
        self.racegroup_info_list = []

        # Number of lines describing one race (not used by the methods of
        # this class; kept for callers that read it).
        self.race_description_line = 21

    def _is_racegroup_start_terminate_line(self, linestr):
        """Return True if ``linestr`` begins with a ``\\d\\dKBGN`` marker."""
        return re.match(r'\d\dKBGN', linestr) is not None

    def _is_racegroup_end_terminate_line(self, linestr):
        """Return True if ``linestr`` begins with a ``\\d\\dKEND`` marker."""
        return re.match(r'\d\dKEND', linestr) is not None

    def set_data_list(self):
        """Load every line of the text file into ``self.text_data_list``."""
        with open(self.text_file, mode='rt', encoding='utf-8') as f:
            self.text_data_list = list(f)

    def set_terminate_list(self):
        """Locate the race groups and store their line-index pairs.

        (Re)loads the file, then fills ``self.terminate_list`` with
        ``[start, end]`` pairs where ``start`` is the index of a start
        marker and ``end`` the index of the next end marker.

        ``None`` (not ``0``) represents "no group open", so a start marker
        on the very first line of the file is paired correctly. An end
        marker without a preceding start marker is ignored; if two start
        markers appear in a row, the later one wins.
        """
        self.set_data_list()

        pairs = []
        start_num = None
        for line_num, line in enumerate(self.text_data_list):
            if self._is_racegroup_start_terminate_line(line):
                start_num = line_num
            elif (start_num is not None
                  and self._is_racegroup_end_terminate_line(line)):
                pairs.append([start_num, line_num])
                start_num = None
        self.terminate_list = pairs

    def _parse_race_group(self, group):
        """Turn the lines of one regular race group into CSV rows.

        Line offsets are relative to the start-marker line (offset 0), whose
        first two characters are the venue number. Notes carried over from
        the original author about the layout:
          * offset 5: title
          * offset 7: day number of the meeting, date, venue name
          * offsets 12-23: payout information
          * offset 27: weather, wind direction, wind force, wave
          * offsets 30-35: results of race 1
          * offsets 37-45: dividends and popularity

        Args:
            group: Lines of one race group (start marker first).

        Returns:
            A list of rows in the column order of ``_CSV_HEADER``.

        Raises:
            ValueError: If no ``yyyy/mm/dd`` date is found at offset 7 or a
                numeric field cannot be converted.
            IndexError: If a line has fewer fields than expected.
        """
        place_no = int(group[0].strip()[0:2])
        group_title = group[5].strip()

        # First character of the second whitespace-separated token.
        meeting_day = int(group[7].split()[1][0:1])

        # Take the date with a pattern rather than by deleting characters,
        # so the line terminator and surrounding text never end up in the
        # day field.
        date_match = self._DATE_PATTERN.search(group[7])
        if date_match is None:
            raise ValueError(
                "no date found in line: {!r}".format(group[7]))
        yyyy, mm, dd = date_match.groups()

        rows = []
        for info_num, info in enumerate(group):
            # A separator line is the anchor for one race.
            if not info.startswith(self._SEPARATOR_PREFIX):
                continue

            # The race description sits two lines above the separator.
            raceinfo = group[info_num - 2].replace("\u3000", "").split()
            race_no = int(raceinfo[0].replace("R", ""))
            race_title = raceinfo[1]
            fixed_entry_flg = 1 if raceinfo[2] == "進入固定" else 0

            # Conditions are read from the end of the description line.
            wave = int(raceinfo[-1].replace("cm", ""))
            wind_speed = raceinfo[-3]
            wind_direction = raceinfo[-4]
            weather = raceinfo[-6]

            # The result lines directly below the separator.
            for offset in range(1, self._RESULT_ROWS_PER_RACE + 1):
                result = group[info_num + offset].replace("\u3000", "").split()
                rows.append([yyyy,
                             mm,
                             dd,
                             place_no,
                             group_title,
                             meeting_day,
                             race_title,
                             race_no,
                             fixed_entry_flg,
                             weather,
                             wind_direction,
                             wind_speed,
                             wave,
                             result[0],   # finishing position
                             result[1],   # course
                             result[2],   # racer number
                             result[3],   # racer name
                             result[4],   # boat number
                             result[5],   # motor number
                             result[6]])  # exhibition time
        return rows

    def _write_csv(self, rows, output_file, encoding):
        """Write the header and ``rows`` to ``output_file``."""
        # newline='' is what the csv module expects; without it every row
        # ends in "\r\r\n" on platforms that translate "\n".
        with open(output_file, 'w', newline='', encoding=encoding) as f:
            writer = csv.writer(f)
            writer.writerow(self._CSV_HEADER)
            writer.writerows(rows)

    def set_racegroup_info(self, output_file='create.csv', encoding=None):
        """Parse every regular race group and write the result as CSV.

        ``set_terminate_list()`` must have been called first. Groups whose
        line count differs from the regular layout are skipped.

        Args:
            output_file: Path of the CSV file to create (default
                ``'create.csv'`` in the current directory).
            encoding: Text encoding of the CSV file; ``None`` keeps the
                platform default.
        """
        self.racegroup_info_list = [
            self.text_data_list[terminate[0]:terminate[1]]
            for terminate in self.terminate_list
        ]

        csv_data_list = []
        for group in self.racegroup_info_list:
            if len(group) == self._NORMAL_GROUP_LINE_COUNT:
                csv_data_list.extend(self._parse_race_group(group))

        self._write_csv(csv_data_list, output_file, encoding)

            
#         array = []
#         for group in self.racegroup_info_list:
#             array.append(len(group))
#         array = collections.Counter(array)
#         print(array)

        
                    
                    
# Run the conversion on one input file: find the race groups first, then
# parse them and write the CSV.
source_txt = "/Users/eiichiroyoshioka/git/boatrace_ml/2019/utf8_all_2019.txt"
sample = BoartRaceDataGenerator(source_txt)
sample.set_terminate_list()
sample.set_racegroup_info()