In [1]:
import numpy as np
# 訓練データと検証データの分割
from sklearn.model_selection import train_test_split
# データを扱う
import pandas as pd
# グラフ描画
from matplotlib import pyplot as plt
%matplotlib inline
import seaborn as sns

In [36]:
# 重回帰
from sklearn.linear_model import LinearRegression as LR
# 評価関数
from sklearn.metrics import mean_squared_error as MSE

In [88]:
class TrainDataset():
    def __init__(self):        
        
        # 学習データ
        data = pd.read_csv('train.csv')
        data_add = pd.read_csv('train_add.csv')
        data_new = pd.concat([data, data_add])
        
        # スタジアム
        self.stadium = pd.read_csv('stadium.csv')
        data_all = pd.merge(data_new, self.stadium, left_on='stadium', right_on='name', how='left') 
        
        # data_allのnameカラムを削除して、data_allに代入してください。
        data_all = data_all.drop(columns=['name'])
        
        # 不正データの補正 (レコード削除のパターンもあるため、ファイル読み込み直後に処理する)
        self.df = self.correction(data_all.copy())
        
        # 目的変数の外れ値は、評価データに存在しないため、init内で処理する
        self.df = self.df[self.df['y'] > 0]
        self.df['y_capa'] = self.df['y'] / self.df['capa']
        
        # 説明変数と目的変数に分割する
        df_x = self.df.drop(['y', 'y_capa'], axis=1)
        self.df_y = self.df.loc[:,['y']]
        self.df_y_capa = self.df.loc[:,['y_capa']]
        
        
        # 説明変数の量的データ、質的データ分割
        self.x_cate , self.x_cont, self.x_id = self.DataChange(df_x.copy())
        self.x_desc = self.x_cont.describe()
        
        self.x_cate_conv = self.x_cate.copy()
        self.x_cont_conv = self.x_cont.copy()
        
        self.DataConv()
        
        print(self.x_cate_conv.info())
        print(self.x_cont_conv.info())
        
    def correction(self, df):

        df['month'] = df['gameday'].apply(self.get_month)
        df['week'] = df['gameday'].apply(self.get_week)
        df['match_num'] = df['match'].apply(self.get_match)
        df['hour'] = df['time'].apply(self.get_hour)
        df['tv_num'] = df['tv'].apply(self.get_num)
        df = df.replace('ザスパ草津','ザスパクサツ群馬')
        df = df.replace('岐阜メモリアルセンター長良川球技メドウ','岐阜メモリアルセンター長良川競技場')
        
        #print('null check')
        #print(df.isnull().sum())
        #print('-------------------------')
        
        return df
    
    def DataChange(self, df_x):
        categ_cols = ['stage' ,'month', 'gameday', 'tv', 'week']
        contin_cols = ['year', 'match_num' ,'tv_num']
        index_cols = ['id', 'capa']

        ####################################
        ##  データ分割
        ##  説明変数と目的変数に分ける。
        ##  説明変数はカテゴリデータと連続データに分ける。
        ####################################
        x_cate = df_x[categ_cols].copy()
        x_cont = df_x[contin_cols].copy()
        x_id = df_x[index_cols].copy()
        x_id = x_id.astype('int64')
        
        return x_cate , x_cont  ,x_id
    
    
    # 開催日の月を取り出す
    def get_month(self, x):
        return int(x[0:2])
    
    # 開催日の曜日を取り出す
    def get_week(self, x):
        return x[6:7]
    
    # 開催日の月を取り出す
    def get_hour(self, x):
        return int(x[0:2])
    
    # 第〇節の値を取り出して数値化する
    def get_match(self, x):
        return int(x[x.find('第') + 1: x.find('節')])
    
    # 湿度を数値化する
    def get_humidity(self, x):
        return float(x[:-1])/100
    
    # 放送するテレビ局の数を数える
    def get_num(self, x):
        return len(x.split('／'))

    def dropCol(self):
        
        self.x_cate_conv = self.x_cate_conv.drop(columns=['gameday', 'tv'  ], axis=1)
        # application_typeは使ってみる
        
        self.x_cont_conv = self.x_cont_conv.drop(columns=['year'], axis=1)
        # credit_scoreは使ってみる（差があるかは要確認）
        
    def stageConv(self):
        
        col = 'stage'
        
        self.x_cate_conv[col + '_1'] = 0

        row_index = self.x_cate.index[self.x_cate[col]=='Ｊ１']        
        self.x_cate_conv.loc[row_index, col + '_1'] = 1
        
        # ALL ゼロがＪ２なので処理しない
        row_index = self.x_cate.index[self.x_cate[col]=='Ｊ２']
        
        self.x_cate_conv = self.x_cate_conv.drop(columns=col ,axis=1)
        
        
    def monthConv(self):
        
        col = 'month'
        
        self.x_cate_conv[col + '_1'] = 0

        row_index = self.x_cate.index[self.x_cate[col]=='12']
        self.x_cate_conv.loc[row_index, col + '_1'] = 1
        
        # 12月だけ観客数が多いので、その他は０にする
        row_index = self.x_cate.index[self.x_cate[col]=='03']
        row_index = self.x_cate.index[self.x_cate[col]=='04']
        row_index = self.x_cate.index[self.x_cate[col]=='05']
        row_index = self.x_cate.index[self.x_cate[col]=='06']
        row_index = self.x_cate.index[self.x_cate[col]=='07']
        row_index = self.x_cate.index[self.x_cate[col]=='08']
        row_index = self.x_cate.index[self.x_cate[col]=='09']
        row_index = self.x_cate.index[self.x_cate[col]=='10']
        row_index = self.x_cate.index[self.x_cate[col]=='11']
        
        self.x_cate_conv = self.x_cate_conv.drop(columns=col ,axis=1)
    
    def weekConv(self):
        
        col = 'week'
        
        self.x_cate_conv[col + '_1'] = 0

        row_index = self.x_cate.index[self.x_cate[col]=='土']
        self.x_cate_conv.loc[row_index, col + '_1'] = 1
        
        # 土曜日だけ観客数が多いので、その他は０にする
        row_index = self.x_cate.index[self.x_cate[col]=='日']
        row_index = self.x_cate.index[self.x_cate[col]=='月']
        row_index = self.x_cate.index[self.x_cate[col]=='火']
        row_index = self.x_cate.index[self.x_cate[col]=='水']
        row_index = self.x_cate.index[self.x_cate[col]=='木']
        row_index = self.x_cate.index[self.x_cate[col]=='金']
        
        self.x_cate_conv = self.x_cate_conv.drop(columns=col ,axis=1)
    
    
    def train_Split(self):
        
        # 訓練データと検証データに分割
        x_train_cate, x_test_cate ,y_train, y_test = train_test_split(self.x_cate_conv, self.df_y, test_size=0.2, random_state=3) 
        
        # カテゴリデータと同じ行数分数値データを抽出する (id も訓練データと検証データに分ける)
        row_index = x_train_cate.index.values
        x_train_cont = self.x_cont_conv.loc[row_index, :]
        self.X_train_id = self.x_id.loc[row_index, :]
        self.X_train_id.reset_index(drop=True, inplace=True)
        y_train_capa = self.df_y_capa.loc[row_index, :]
        
        ###　検証データ
        row_index = x_test_cate.index.values
        x_test_cont = self.x_cont_conv.loc[row_index, :]
        self.X_test_id = self.x_id.loc[row_index, :]
        self.X_test_id.reset_index(drop=True, inplace=True)
        y_test_capa = self.df_y_capa.loc[row_index, :]
        
        x_train = pd.concat([x_train_cate , x_train_cont] , axis=1)
        x_test = pd.concat([x_test_cate , x_test_cont] , axis=1)
        
        self.X_train = np.array(x_train)
        self.X_test = np.array(x_test)
        self.Y_train = np.array(y_train)
        self.Y_test = np.array(y_test)
        self.Y_train_capa = np.array(y_train_capa)
        self.Y_test_capa = np.array(y_test_capa)
        
        self.coti_size = self.x_cont.shape[1]
        self.in_size  = self.X_train.shape[1]
        self.out_size = self.Y_train.shape[1]  
    
    def DataConv(self):
        
        self.stageConv()
        #self.termConv()
        self.monthConv()
        self.weekConv()
        self.dropCol()        
        self.train_Split()
        

In [89]:
train_dataset = TrainDataset()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1952 entries, 0 to 1952
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype
---  ------   --------------  -----
 0   stage_1  1952 non-null   int64
 1   month_1  1952 non-null   int64
 2   week_1   1952 non-null   int64
dtypes: int64(3)
memory usage: 141.0 KB
None
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1952 entries, 0 to 1952
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype
---  ------     --------------  -----
 0   match_num  1952 non-null   int64
 1   tv_num     1952 non-null   int64
dtypes: int64(2)
memory usage: 125.8 KB
None


  res_values = method(rvalues)


In [90]:
train_dataset.x_cate_conv

Unnamed: 0,stage_1,month_1,week_1
0,1,0,1
1,1,0,1
2,1,0,1
3,1,0,1
4,1,0,1
...,...,...,...
1948,0,0,0
1949,0,0,0
1950,0,0,1
1951,0,0,0


In [91]:
train_dataset.X_train

array([[ 1,  0,  0, 28,  2],
       [ 0,  0,  1, 20,  2],
       [ 1,  0,  1, 11,  4],
       ...,
       [ 0,  0,  0, 34,  3],
       [ 0,  0,  1, 21,  2],
       [ 0,  0,  0, 42,  3]], dtype=int64)

### 訓練データの読み込み

In [92]:
class TestDataset(TrainDataset):
    def __init__(self, x_desc):
        
        # 学習データ
        data = pd.read_csv('test.csv')

        # スタジアム
        self.stadium = pd.read_csv('stadium.csv')
        data_all = pd.merge(data, self.stadium, left_on='stadium', right_on='name', how='left') 
        
        # data_allのnameカラムを削除して、data_allに代入してください。
        data_all = data_all.drop(columns=['name'])
        
        # 不正データの補正 (レコード削除のパターンもあるため、ファイル読み込み直後に処理する)
        data_all = self.correction(data_all.copy())
        
        # 目的変数の格納
        self.x_cate , self.x_cont, self.x_id = self.DataChange(data_all.copy())
        self.x_desc = x_desc
        
        self.x_cate_conv = self.x_cate.copy()
        self.x_cont_conv = self.x_cont.copy()
        
        # データの加工（標準化や対数化など）
        #self.std_scale = 0
        self.max_scale = 0
        #self.dobule_scale = 0
        #self.polynomialFlg = 0
        #self.binSplitFlg = 0
        #self.logFlg  = 0
        self.DataConv()
        
        #self.NpToPy()
        #self.num = len(self.x_cate)
        self.num = len(self.x_cate_conv)
        self.coti_size = self.x_cont_conv.shape[1]

    def __getitem__(self, index):
        return self.X_vat[index]
    
    def __len__(self):
        return self.num
    
    def train_Split(self):
        
        #x_temp = self.x_cont.copy()
        tmp_cate = np.array(self.x_cate_conv)
        tmp_cont = np.array(self.x_cont_conv)
        
        # numpy の値を torch の値に変換する
        self.X_vat_cate = tmp_cate
        self.X_vat_cont = tmp_cont
 


In [93]:
test_dataset = TestDataset(train_dataset.x_desc)

  res_values = method(rvalues)


# モデル定義

In [94]:
lr = LR()
lr.fit(train_dataset.X_train, train_dataset.Y_train_capa)

LinearRegression()

# 予測する

### 訓練データ

In [95]:
print(train_dataset.X_train_id.head(20))

       id   capa
0   15087  26530
1   15357  22563
2   14934  63700
3   15467  20588
4   14442  15589
5   14699  15589
6   14399  21292
7   16214  15100
8   14625  20000
9   15354  15454
10  14031  19694
11  15040  20281
12  15705  26530
13  16084  15100
14  14127  20000
15  15209  20000
16  15774  24490
17  15105  24490
18  14539  19637
19  15576  25250


In [101]:
# 予測
y_pred_train = lr.predict(train_dataset.X_train)
df_pred_train = pd.DataFrame(y_pred_train , columns=['vat_tmp'])

# capa を結合して観客数を求める
df_pred_train = pd.concat([train_dataset.X_train_id , df_pred_train] , axis=1)
df_pred_train['vat'] = df_pred_train['vat_tmp'] * df_pred_train['capa']

y_pred_train = np.round(np.array(df_pred_train['vat']))
y_pred_train = y_pred_train.reshape(-1,1)
print(y_pred_train)
print(train_dataset.Y_train)

[[15046.]
 [ 7709.]
 [37065.]
 ...
 [ 5310.]
 [ 7077.]
 [ 5894.]]
[[14388]
 [ 5318]
 [46649]
 ...
 [ 3706]
 [10573]
 [ 3560]]


In [102]:
rmse_train = np.sqrt(MSE(train_dataset.Y_train, y_pred_train))
print(rmse_train)

5459.2424916208865


### 検証データ

In [103]:
y_pred_test = lr.predict(train_dataset.X_test)
df_pred_test = pd.DataFrame(y_pred_test , columns=['vat_tmp'])

# capa を結合して観客数を求める
df_pred_test = pd.concat([train_dataset.X_test_id , df_pred_test] , axis=1)
df_pred_test['vat'] = df_pred_test['vat_tmp'] * df_pred_test['capa']

y_pred_test = np.round(np.array(df_pred_test['vat']))
y_pred_test = y_pred_test.reshape(-1,1)

In [104]:
rmse_test = np.sqrt(MSE(train_dataset.Y_test, y_pred_test))
print(rmse_test)

4798.119240444562
