In [1]:
#使用ライブラリ
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold

import statsmodels.api as sm
import numpy as np

In [2]:
### Load data (the pre-joined match data)
train = pd.read_csv(filepath_or_buffer='../j_league/new_data/__train_new.csv')
test = pd.read_csv(filepath_or_buffer='../j_league/new_data/test_new.csv')

# Ranking data. Its own 'stage' column is dropped before the merge in the next cell.
# `columns=` is used because pandas 2.0 no longer accepts the axis as a second
# positional argument of DataFrame.drop.
rank_data = pd.read_csv(filepath_or_buffer='../j_league/new_data/ranking_data.csv').drop(columns='stage')

In [3]:
# Attach the ranking columns to every match by pairing the match id and the
# home team with the ranking table's id and team.
# The join is an inner join (the pandas default, spelled out here), so a match
# without a ranking row is dropped from the result.
train = train.merge(rank_data, how='inner', left_on=['id', 'home'], right_on=['id', 'team'])
test = test.merge(rank_data, how='inner', left_on=['id', 'home'], right_on=['id', 'team'])

In [4]:
## Outlier removal
# Drop matches whose attendance (y) is 0. The explicit .copy() makes the filtered
# frame an independent object, so the column assignments below write to `train`
# itself instead of triggering SettingWithCopyWarning.
train = train[train.y != 0].copy()

# Rain flag: 1 when the weather text contains '雨' (rain), otherwise 0.
# na=False treats a missing weather value as "no match"; without it the mask
# would contain NaN and could not be used as a boolean indexer.
train['rain_flag'] = train.weather.str.contains('雨', na=False).astype('int64')
test['rain_flag'] = test.weather.str.contains('雨', na=False).astype('int64')

# Sunny flag: 1 when the weather text contains '晴' (sunny), otherwise 0.
train['sunny_flag'] = train.weather.str.contains('晴', na=False).astype('int64')
test['sunny_flag'] = test.weather.str.contains('晴', na=False).astype('int64')

# Number of TV broadcasters: the number of full-width slashes '／' in `tv`, plus one.
train['tv_num'] = train.tv.str.count('／') + 1
test['tv_num'] = test.tv.str.count('／') + 1

In [5]:
# Encode the weekday character found in `gameday` as an integer label:
# 月=1, 火=2, 水=3, 木=4, 金=5, 土=6, 日=7. Rows matching none of them stay 0.
# The characters are applied in this order, so when a string contains more
# than one of them the later character wins.
for frame in (train, test):
    frame['day_label'] = 0
    for label, kanji in enumerate('月火水木金土日', start=1):
        frame.loc[frame.gameday.str.contains(kanji), 'day_label'] = label

### 使用変数の作成

In [6]:
# Variable selection. The explanatory variables are listed once and shared by
# train and test so the two design matrices are built from the same columns.
feature_cols = ['stage',
                'temperature',
                'capa',
                'rain_flag',
                'sunny_flag',
                'day_label',
                'stadium',
                'home',
                'rank_last',
                'tv_num']

# One-hot encode the string columns (stage, stadium, home); train additionally
# carries the target y.
# dtype='uint8' pins the indicator dtype: pandas >= 2.0 would otherwise return
# bool columns, and a frame mixing bool with numeric columns becomes
# object-typed when converted to an array for the regression.
lm_train = pd.get_dummies(train[['id', 'y'] + feature_cols], dtype='uint8')
lm_test = pd.get_dummies(test[['id'] + feature_cols], dtype='uint8')

# Drop the indicator columns that exist only in train (home teams / stadiums
# that never appear in test). They are computed from the data rather than
# listed by hand, so the cell does not depend on exact column spellings.
train_only_cols = [col for col in lm_train.columns
                   if col != 'y' and col not in lm_test.columns]
lm_train = lm_train.drop(columns=train_only_cols)

# The model is later applied to the test frame column by column, so both frames
# must expose identical features in identical order (apart from the target).
assert list(lm_train.columns.drop('y')) == list(lm_test.columns), \
    'train and test feature columns differ after one-hot encoding'

In [7]:
## Split by stage (J1 / J2)
# A boolean mask selects the rows of one stage and the other stage's indicator
# column is removed. The selected stage's own indicator stays in the frame as
# an all-ones column.
# `columns=` is used because pandas 2.0 no longer accepts the axis as a second
# positional argument of DataFrame.drop.
# Training data
lm_train_j1 = lm_train[lm_train['stage_Ｊ１'] == 1].drop(columns='stage_Ｊ２')
lm_train_j2 = lm_train[lm_train['stage_Ｊ２'] == 1].drop(columns='stage_Ｊ１')
# Test data
lm_test_j1 = lm_test[lm_test['stage_Ｊ１'] == 1].drop(columns='stage_Ｊ２')
lm_test_j2 = lm_test[lm_test['stage_Ｊ２'] == 1].drop(columns='stage_Ｊ１')

### 重回帰分析

In [8]:
## K-fold averaged prediction
# Number of folds and the splitter (fixed seed for reproducibility)
K = 5
kf = KFold(n_splits=K, shuffle=True, random_state=17)


def predict_with_kfold_ols(train_df, test_df, splitter):
    """Fit one OLS model per training split and average their test predictions.

    For every split produced by ``splitter`` an OLS model is fitted on the
    training part of ``train_df`` (target ``y``, regressors = every column
    except ``y`` and ``id``) and used to predict ``test_df``. The held-out
    part of each split is not used or scored.

    Parameters
    ----------
    train_df : DataFrame with ``id``, ``y`` and the feature columns.
    test_df : DataFrame with ``id`` and the same feature columns.
    splitter : object with a ``split(X)`` method yielding (train, held-out)
        positional index pairs, e.g. ``KFold``.

    Returns
    -------
    (numpy.ndarray, list)
        The mean prediction for each row of ``test_df`` (in row order) and the
        fitted results, one per split.
    """
    test_features = test_df.drop(columns='id')
    pred_sum = np.zeros(len(test_df))
    fitted = []
    for fit_index, _ in splitter.split(train_df):
        fold = train_df.iloc[fit_index]
        result = sm.OLS(fold['y'], fold.drop(columns=['y', 'id'])).fit()
        # Accumulate plain arrays so the running sum stays an ndarray
        # regardless of what type predict() returns.
        pred_sum += np.asarray(result.predict(test_features))
        fitted.append(result)
    return pred_sum / len(fitted), fitted


# J1
pred_j1, models_j1 = predict_with_kfold_ols(lm_train_j1, lm_test_j1, kf)
lm_j1 = models_j1[-1]  # model of the last split, kept for inspection

# J2
pred_j2, models_j2 = predict_with_kfold_ols(lm_train_j2, lm_test_j2, kf)
lm_j2 = models_j2[-1]  # model of the last split, kept for inspection

In [9]:
# Show the regression summary (uncomment to inspect the model fitted on the last J1 split)
#lm_j1.summary()

### submit形式に整える

In [10]:
# Pair each prediction with its match id. New names are used instead of
# overwriting pred_j1 / pred_j2, so the cell can be re-run safely.
submit_j1 = pd.DataFrame({'id': lm_test_j1['id'], 'y': pred_j1})
submit_j2 = pd.DataFrame({'id': lm_test_j2['id'], 'y': pred_j2})

out = pd.concat([submit_j1, submit_j2]).sort_values('id')

# Cap every prediction at the stadium capacity.
# The capacity is looked up by match id because `out` is sorted by id while
# `test` is not, so the rows of the two frames do not line up by position.
# The write goes through out.loc so it changes `out` itself; assigning to
# out.iloc[i].y would only modify a temporary row.
capa_by_id = test.drop_duplicates('id').set_index('id')['capa']
capa_for_out = out['id'].map(capa_by_id)
over_capa = out['y'] > capa_for_out
out.loc[over_capa, 'y'] = capa_for_out[over_capa]

# Write the CSV (no header, no index)
out_csv_name = 'submit/20200820_5_submit_lm.csv'
out.to_csv(out_csv_name, header=False, index=False)

In [11]:
# Check that the written CSV has the expected submission format by comparing
# it with the sample submission file.
# Both files are read with pandas' default header inference, so the first line
# of each file becomes its column labels. The output was written without a
# header; the sample is presumably header-less as well (TODO confirm).
submit_sample = pd.read_csv(filepath_or_buffer='../../data/JLeague/sample_submit.csv')
submit_out = pd.read_csv(filepath_or_buffer=out_csv_name)

# Same number of rows and the same index
assert len(submit_sample.index.values) == len(submit_out.index.values), \
    'row count differs from the sample submission'
assert (submit_sample.index.values == submit_out.index.values).all()

# The first column holds the ids: same first id (read as the column label)
# and the same ids in the same order below it
assert submit_sample.columns[0] == submit_out.columns[0]
assert (submit_out.iloc[:, 0] == submit_sample.iloc[:, 0]).all()

# No header row was written: the label 'y' must not appear among the columns.
# (Testing only columns[0] could never catch this, since 'y' would be the
# label of the second column.)
assert 'y' not in submit_out.columns, 'the CSV was written with a header row'