In [1]:
import numpy as np
# 訓練データと検証データの分割
from sklearn.model_selection import train_test_split
# データを扱う
import pandas as pd
# グラフ描画
from matplotlib import pyplot as plt
%matplotlib inline
import seaborn as sns

In [2]:
# 回帰分析
from sklearn.linear_model import LinearRegression as LR
# 評価関数（平均二乗誤差）
from sklearn.metrics import mean_squared_error as MSE

# 学習データ

In [34]:
class TrainDataset():
    def __init__(self):
        super().__init__()
        
        # CSVファイル読み込み（訓練データ）
        df = pd.read_csv('train.tsv' , sep='\t')
        
        # 不正データの補正 (レコード削除のパターンもあるため、ファイル読み込み直後に処理する)
        df = self.correction(df)

        # 説明変数と目的変数に分割する
        df_x = df.drop(['mpg'], axis=1)
        self.df_y = df.loc[:,['mpg']]
        
        # 説明変数の量的データ、質的データ分割
        self.x_cate , self.x_cont = self.DataChange(df_x.copy())
        self.x_desc = self.x_cont.describe()
        
        self.x_cate_conv = self.x_cate.copy()
        self.x_cont_conv = self.x_cont.copy()
        
        # データの加工（標準化や対数化など）
        #self.std_scale = 0
        self.max_scale = 0
        #self.dobule_scale = 0
        #self.polynomialFlg = 0
        #self.binSplitFlg = 0
        self.logFlg = 0
        self.DataConv()
        
        #self.NpToPy()
        #self.num = len(self.X_train)
        self.num = len(self.x_cont)
        self.coti_size = self.x_cont.shape[1]
        
        
    
    def correction(self, df):
        # horsepowerの不正データに平均を格納する
        drop_index = df.index[df['horsepower']=='?']
        print('horsepower index : ', drop_index)
        df.loc[drop_index,'horsepower'] = df.drop(drop_index)['horsepower'].astype('float32').mean()
        df['horsepower'] = df['horsepower'].astype('float64')
        
        return df
    
    def DataChange(self, df_x):
        categ_cols = ['cylinders', 'model year', 'origin', 'car name']
        contin_cols = ['displacement', 'horsepower', 'weight', 'acceleration']

        ####################################
        ##  データ分割
        ##  説明変数と目的変数に分ける。
        ##  説明変数はカテゴリデータと連続データに分ける。
        ####################################
        x_cate = df_x[categ_cols].copy()
        x_cont = df_x[contin_cols].copy()
        
        return x_cate , x_cont

    def train_Split(self):
        
        #tmp = self.x_cont[['displacement_log', 'horsepower_log', 'weight_log', 'acceleration_log']]
        tmp = self.x_cont_conv[['horsepower_log', 'weight_log', 'acceleration_log']]
        
        x_temp = pd.concat([self.x_cate_conv , tmp] , axis=1)
        #x_temp = self.x_cont.copy()
        x_temp = np.array(x_temp)
        y_temp = np.array(self.df_y)
        
        # 訓練データと検証データに分割
        x_train,x_test,y_train,y_test=train_test_split(x_temp, y_temp, test_size=0.2, random_state=3) 
        
        # 
        self.X_train = x_train
        self.Y_train = y_train
        self.X_test = x_test
        self.Y_test = y_test
        
        self.coti_size = self.x_cont.shape[1]
        self.in_size  = self.X_train.shape[1]
        self.out_size = self.Y_train.shape[1]
    
    def log_Scale(self):
        if self.logFlg == 0:
            
            #tmp = np.array(self.x_cont[['displacement']].copy())
            #tmp = tmp*2/100
            #tmp = tmp.round()
            #tmp = tmp/2*10
            #self.x_cont['displacement_log']= np.log(tmp)
            self.x_cont_conv['horsepower_log']= np.log1p(self.x_cont_conv['horsepower'])
            self.x_cont_conv['weight_log']= np.log(self.x_cont_conv['weight'])
            self.x_cont_conv['acceleration_log']= np.log(self.x_cont_conv['acceleration'])
            
            self.logFlg = 1
    
    def displacementConv(self):
        
        col = 'displacement'
        
        displacement_tmp = np.array(self.x_cont[[col]].copy())
        displacement_tmp = displacement_tmp*2/100
        displacement_tmp = displacement_tmp.round()
        displacement_tmp = displacement_tmp/2*10

        
        self.x_cate_conv[col + '_1'] = 0
        self.x_cate_conv[col + '_2'] = 0     
        self.x_cate_conv[col + '_3'] = 0
        self.x_cate_conv[col + '_4'] = 0     
        self.x_cate_conv[col + '_5'] = 0
        self.x_cate_conv[col + '_6'] = 0     
        
        i = 0
        for tmp in displacement_tmp:
        
            if tmp == 10:
                self.x_cate_conv.loc[i , col + '_1'] = 1
            elif tmp == 15:
                self.x_cate_conv.loc[i , col + '_2'] = 1
            elif tmp == 20:
                self.x_cate_conv.loc[i , col + '_3'] = 1
            elif tmp == 25:
                self.x_cate_conv.loc[i , col + '_4'] = 1
            elif tmp == 30:
                self.x_cate_conv.loc[i , col + '_5'] = 1
            elif tmp == 35:
                self.x_cate_conv.loc[i , col + '_6'] = 1
            elif tmp > 35:
                self.x_cate_conv.loc[i , col + '_6'] = 1
            i += 1
        
        self.x_cont_conv = self.x_cont_conv.drop(columns=col ,axis=1)
    
    def cylindersConv(self):
        # cylindersの値は、３気筒を４気筒へ、５気筒を６気筒とみなして量的データとして処理する
        self.x_cate_conv.loc[self.x_cate['cylinders']==3, 'cylinders'] = 4
        self.x_cate_conv.loc[self.x_cate['cylinders']==5, 'cylinders'] = 6
    
    def originConv(self):
        
        col = 'origin'
        
        self.x_cate_conv[col + '_1'] = 0
        self.x_cate_conv[col + '_2'] = 0

        row_index = self.x_cate.index[self.x_cate[col]==1]
        self.x_cate_conv.loc[row_index, col + '_1'] = 1
        
        row_index = self.x_cate.index[self.x_cate[col]==2]
        self.x_cate_conv.loc[row_index, col + '_2'] = 1
        
        # ALL ゼロが３なので処理しない
        row_index = self.x_cate.index[self.x_cate[col]==3]

        self.x_cate_conv = self.x_cate_conv.drop(columns=col ,axis=1)
        
    def ModelYearConv(self):
        
        col = 'model year'
        
        self.x_cate_conv[col + '_1'] = 0
        self.x_cate_conv[col + '_2'] = 0
        self.x_cate_conv[col + '_3'] = 0
        self.x_cate_conv[col + '_4'] = 0
        
        self.x_cate_conv[col + '_5'] = 0
        self.x_cate_conv[col + '_6'] = 0
        #self.x_cate_conv[col + '_7'] = 0
        #self.x_cate_conv[col + '_8'] = 0
        
        #self.x_cate_conv[col + '_9'] = 0
        #self.x_cate_conv[col + '_10'] = 0
        #self.x_cate_conv[col + '_11'] = 0
        #self.x_cate_conv[col + '_12'] = 0
        
        
        row_index = self.x_cate.index[self.x_cate[col]==70]
        self.x_cate_conv.loc[row_index, col + '_1'] = 1
        row_index = self.x_cate.index[self.x_cate[col]==71]
        self.x_cate_conv.loc[row_index, col + '_1'] = 1
        row_index = self.x_cate.index[self.x_cate[col]==72]
        self.x_cate_conv.loc[row_index, col + '_2'] = 1
        
        row_index = self.x_cate.index[self.x_cate[col]==73]
        self.x_cate_conv.loc[row_index, col + '_2'] = 1
        row_index = self.x_cate.index[self.x_cate[col]==74]
        self.x_cate_conv.loc[row_index, col + '_3'] = 1
        row_index = self.x_cate.index[self.x_cate[col]==75]
        self.x_cate_conv.loc[row_index, col + '_3'] = 1
        
        row_index = self.x_cate.index[self.x_cate[col]==76]
        self.x_cate_conv.loc[row_index, col + '_4'] = 1
        row_index = self.x_cate.index[self.x_cate[col]==77]
        self.x_cate_conv.loc[row_index, col + '_4'] = 1
        row_index = self.x_cate.index[self.x_cate[col]==78]
        self.x_cate_conv.loc[row_index, col + '_5'] = 1
        
        row_index = self.x_cate.index[self.x_cate[col]==79]
        self.x_cate_conv.loc[row_index, col + '_5'] = 1
        row_index = self.x_cate.index[self.x_cate[col]==80]
        self.x_cate_conv.loc[row_index, col + '_6'] = 1
        row_index = self.x_cate.index[self.x_cate[col]==81]
        self.x_cate_conv.loc[row_index, col + '_6'] = 1
        
        # ALL　ゼロが82なので、処理しない
        row_index = self.x_cate.index[self.x_cate[col]==82]
        
        self.x_cate_conv = self.x_cate_conv.drop(columns=col ,axis=1)
    
    def CarNameConv(self):    
        col = 'brand'
        col2 = 'car name'
        self.x_cate[col] = self.x_cate[col2]
        
        spl = np.array(self.x_cate[col2].str.find(' '),dtype=np.int32).reshape(-1,1)
        for i in  range(len(self.x_cate)):
            #print(self.x_cate['brand'][i] [ : spl[i][0]] )
            self.x_cate[col][i] = self.x_cate[col][i] [ : spl[i][0]]
        

        self.x_cate_conv[col + '_1'] = 0
        self.x_cate_conv[col + '_2'] = 0
        self.x_cate_conv[col + '_3'] = 0
        
        #self.x_cate_conv[col + '_4'] = 0
        #self.x_cate_conv[col + '_5'] = 0
        #self.x_cate_conv[col + '_6'] = 0
        #self.x_cate_conv[col + '_7'] = 0
        #self.x_cate_conv[col + '_8'] = 0
        #self.x_cate_conv[col + '_9'] = 0

         
        # ゼネラルモーターズグループ 1
        num='_1'
        row_index = self.x_cate.index[self.x_cate[col]=='chevrolet']
        self.x_cate_conv.loc[row_index, col + num] = 1
        row_index = self.x_cate.index[self.x_cate[col]=='chevy']
        self.x_cate_conv.loc[row_index, col + num] = 1
        row_index = self.x_cate.index[self.x_cate[col]=='chevroelt']
        self.x_cate_conv.loc[row_index, col + num] = 1
        row_index = self.x_cate.index[self.x_cate[col]=='buick']
        self.x_cate_conv.loc[row_index, col + num] = 1
        row_index = self.x_cate.index[self.x_cate[col]=='pontiac']
        self.x_cate_conv.loc[row_index, col + num] = 1
        row_index = self.x_cate.index[self.x_cate[col]=='oldsmobile']
        self.x_cate_conv.loc[row_index, col + num] = 1
        
        # トヨタ  2
        num='_2'
        row_index = self.x_cate.index[self.x_cate[col]=='toyota']
        self.x_cate_conv.loc[row_index, col + num] = 1
        row_index = self.x_cate.index[self.x_cate[col]=='toyouta']
        self.x_cate_conv.loc[row_index, col + num] = 1

        # フォルクスワーゲン  3
        num='_3'
        row_index = self.x_cate.index[self.x_cate[col]=='volkswagen']
        self.x_cate_conv.loc[row_index, col + num] = 1
        row_index = self.x_cate.index[self.x_cate[col]=='vokswagen']
        self.x_cate_conv.loc[row_index, col + num] = 1
        row_index = self.x_cate.index[self.x_cate[col]=='vw']
        self.x_cate_conv.loc[row_index, col + num] = 1
        
        # フォードグループ  4
        num='_1'
        row_index = self.x_cate.index[self.x_cate[col]=='ford']
        self.x_cate_conv.loc[row_index, col + num] = 1
        row_index = self.x_cate.index[self.x_cate[col]=='mercury']
        self.x_cate_conv.loc[row_index, col + num] = 1
        row_index = self.x_cate.index[self.x_cate[col]=='capri']
        self.x_cate_conv.loc[row_index, col + num] = 1
        
        # ダッジ   5
        num='_1'
        row_index = self.x_cate.index[self.x_cate[col]=='dodge']
        self.x_cate_conv.loc[row_index, col + num] = 1
        
        # アメリカン・モーターズ 6
        num='_1'
        row_index = self.x_cate.index[self.x_cate[col]=='amc']
        self.x_cate_conv.loc[row_index, col + num] = 1
        
        # プリズム 7
        num='_1'
        row_index = self.x_cate.index[self.x_cate[col]=='plymouth']
        self.x_cate_conv.loc[row_index, col + num] = 1
        
        # その他日本車（ホンダ、マツダ、スバル、ダットサン（日産） 8
        num='_2'
        row_index = self.x_cate.index[self.x_cate[col]=='honda']
        self.x_cate_conv.loc[row_index, col + num] = 1
        row_index = self.x_cate.index[self.x_cate[col]=='subar']
        self.x_cate_conv.loc[row_index, col + num] = 1
        row_index = self.x_cate.index[self.x_cate[col]=='subaru']
        self.x_cate_conv.loc[row_index, col + num] = 1
        row_index = self.x_cate.index[self.x_cate[col]=='mazda']
        self.x_cate_conv.loc[row_index, col + num] = 1
        row_index = self.x_cate.index[self.x_cate[col]=='datsun']
        self.x_cate_conv.loc[row_index, col + num] = 1
        
        # その他　ドイツ車  9
        num='_3'
        row_index = self.x_cate.index[self.x_cate[col]=='opel']
        self.x_cate_conv.loc[row_index, col + num] = 1
        row_index = self.x_cate.index[self.x_cate[col]=='audi']
        self.x_cate_conv.loc[row_index, col + num] = 1
        row_index = self.x_cate.index[self.x_cate[col]=='bmw']
        self.x_cate_conv.loc[row_index, col + num] = 1
        row_index = self.x_cate.index[self.x_cate[col]=='mercedes-benz']
        self.x_cate_conv.loc[row_index, col + num] = 1
        
        self.x_cate_conv = self.x_cate_conv.drop(columns=col2 ,axis=1)
        
    def MaxScale(self):
        roun=0
        
        if (self.max_scale == 0):
            col='horsepower'
            maxval=self.x_desc.loc['max',col]
            tmp_=self.x_cont_conv.loc[:,col]/maxval*100
            self.x_cont_conv.loc[:,col]=(tmp_.round(roun))

            col='weight'
            maxval=self.x_desc.loc['max',col]
            tmp_=self.x_cont_conv.loc[:,col]/maxval*100
            self.x_cont_conv.loc[:,col]=(tmp_.round(roun))
            
            col='acceleration'
            maxval=self.x_desc.loc['max',col]
            tmp_=self.x_cont_conv.loc[:,col]/maxval*100
            self.x_cont_conv.loc[:,col]=(tmp_.round(roun))
            
            self.max_scale = 1
            
    def DataConv(self):
        self.MaxScale()
        self.log_Scale()
        self.cylindersConv()
        self.originConv()
        self.ModelYearConv()
        self.CarNameConv()
        self.displacementConv()
        self.train_Split()
        

In [35]:
train_dataset = TrainDataset()

horsepower index :  Int64Index([24, 113, 145, 175], dtype='int64')


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [36]:
train_dataset.x_cate_conv

Unnamed: 0,cylinders,origin_1,origin_2,model year_1,model year_2,model year_3,model year_4,model year_5,model year_6,brand_1,brand_2,brand_3,displacement_1,displacement_2,displacement_3,displacement_4,displacement_5,displacement_6
0,4,1,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0
1,4,0,1,0,0,0,0,1,0,0,0,1,1,0,0,0,0,0
2,6,0,0,0,0,0,1,0,0,0,1,0,0,1,0,0,0,0
3,4,1,0,0,0,1,0,0,0,1,0,0,1,0,0,0,0,0
4,4,0,0,0,0,0,0,0,1,0,1,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
194,4,0,0,0,0,0,0,0,1,0,1,0,1,0,0,0,0,0
195,8,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,1,0
196,8,1,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0
197,4,0,1,0,0,0,0,0,1,0,0,1,1,0,0,0,0,0


In [37]:
train_dataset.x_desc

Unnamed: 0,displacement,horsepower,weight,acceleration
count,199.0,199.0,199.0,199.0
mean,183.311558,101.297436,2883.839196,15.647739
std,98.400457,35.201207,819.76687,2.701885
min,71.0,46.0,1613.0,8.5
25%,98.0,75.0,2217.5,14.0
50%,140.0,92.0,2702.0,15.5
75%,250.0,112.5,3426.5,17.15
max,454.0,220.0,5140.0,23.7


In [38]:
train_dataset.x_cate['brand'].value_counts()

chevrolet        27
ford             20
dodge            18
amc              16
datsun           14
toyota           13
plymouth         13
volkswagen       10
buick             8
honda             6
mercury           6
pontiac           5
vw                4
oldsmobile        4
fiat              4
renault           3
mazda             3
opel              3
saab              2
mercedes-benz     2
volvo             2
peugeot           2
bmw               2
audi              2
chevy             2
subaru            1
chevroelt         1
hi                1
triumph           1
capri             1
subar             1
toyouta           1
vokswagen         1
Name: brand, dtype: int64

# 評価データ

In [39]:
class TestDataset(TrainDataset):
    def __init__(self, x_desc):
        
        # CSVファイル読み込み（訓練データ）
        df = pd.read_csv('test.tsv', sep='\t' )
        
        # 不正データの補正 (レコード削除のパターンもあるため、ファイル読み込み直後に処理する)
        df = self.correction(df)
        
        # 目的変数の格納
        self.x_cate , self.x_cont = self.DataChange(df.copy())
        self.x_desc = x_desc
        
        self.x_cate_conv = self.x_cate.copy()
        self.x_cont_conv = self.x_cont.copy()
        
        # データの加工（標準化や対数化など）
        #self.std_scale = 0
        self.max_scale = 0
        #self.dobule_scale = 0
        #self.polynomialFlg = 0
        #self.binSplitFlg = 0
        self.logFlg  = 0
        self.DataConv()
        
        #self.NpToPy()
        #self.num = len(self.x_cate)
        self.num = len(self.x_cont)
        self.coti_size = self.x_cont.shape[1]

    def __getitem__(self, index):
        return self.X_test[index]
    
    def __len__(self):
        return self.num
    
    def train_Split(self):
        
        #tmp = self.x_cont[['displacement_log', 'horsepower_log', 'weight_log', 'acceleration_log']]
        tmp = self.x_cont_conv[['horsepower_log', 'weight_log', 'acceleration_log']]
        x_temp = pd.concat([self.x_cate_conv , tmp] , axis=1)
        
        #x_temp = self.x_cont.copy()
        x_temp = np.array(x_temp)
        
        # numpy の値を torch の値に変換する
        self.X_vat = x_temp



In [40]:
test_dataset = TestDataset(train_dataset.x_desc)

horsepower index :  Int64Index([70, 112], dtype='int64')


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [41]:
test_dataset.x_cate_conv

Unnamed: 0,cylinders,origin_1,origin_2,model year_1,model year_2,model year_3,model year_4,model year_5,model year_6,brand_1,brand_2,brand_3,displacement_1,displacement_2,displacement_3,displacement_4,displacement_5,displacement_6
0,6,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0
1,6,1,0,0,0,0,1,0,0,1,0,0,0,0,0,1,0,0
2,4,0,0,0,0,0,0,0,1,0,1,0,1,0,0,0,0,0
3,6,1,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0
4,4,0,0,0,1,0,0,0,0,0,1,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
194,4,0,1,0,1,0,0,0,0,0,0,1,1,0,0,0,0,0
195,4,1,0,0,0,0,0,0,1,1,0,0,0,1,0,0,0,0
196,4,1,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0
197,8,1,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1


# モデル定義

In [42]:
model = LR()
model.fit(train_dataset.X_train, train_dataset.Y_train)

LinearRegression()

# 予測する

In [43]:
# 予測値の算出
y_pred_train = model.predict(train_dataset.X_train)
y_pred_test = model.predict(train_dataset.X_test)

# MSEの算出
mse_train = MSE(train_dataset.Y_train, y_pred_train)
mse_test = MSE(train_dataset.Y_test, y_pred_test)

# RMSEの算出
rmse_train = np.sqrt(mse_train)
rmse_test = np.sqrt(mse_test)

# RMSEの表示
print(rmse_train)
print(rmse_test)

2.426560222149238
2.909167374830151
