In [12]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelBinarizer
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
import warnings
warnings.simplefilter('ignore')

df = pd.read_csv("train.csv") 
df_test = pd.read_csv("test.csv") 

#df = df.drop(["Country","City"], axis=1)
#df_test = df_test.drop(["Country","City"], axis=1)

df['date'] = pd.to_datetime({'year': df['year'], 'month': df['month'], 'day': df['day']})
df_test['date'] = pd.to_datetime({'year': df_test['year'], 'month': df_test['month'], 'day': df_test['day']})

df = df.drop(["year","month","day"], axis=1)
df_test = df_test.drop(["year","month","day"], axis=1)

#df = df.sort_values(['date', 'Country','City']).reset_index(drop=True)
#df_test = df_test.sort_values([ 'date','Country','City']).reset_index(drop=True)

df_id = df.copy()
df_test_id = df_test.copy()

df = df.drop(["date","id"], axis=1)
df_test = df_test.drop(["date","id"], axis=1)


In [13]:
from sklearn import base
from sklearn.model_selection import KFold

class KFoldTargetEncoderTrain_mean(base.BaseEstimator,
                               base.TransformerMixin):
    """How to use.
    targetc = KFoldTargetEncoderTrain('Feature','Target',n_fold=5)
    new_train = targetc.fit_transform(train)
    """
    def __init__(self,colnames,targetName,
                  n_fold=5, verbosity=True,
                  discardOriginal_col=False):
        self.colnames = colnames
        self.targetName = targetName
        self.n_fold = n_fold
        self.verbosity = verbosity
        self.discardOriginal_col = discardOriginal_col

    def fit(self, X, y=None):
        return self

    def transform(self,X):        
        assert(type(self.targetName) == str)
        assert(type(self.colnames) == str)
        assert(self.colnames in X.columns)
        assert(self.targetName in X.columns)       

        mean_of_target = X[self.targetName].mean()
        kf = KFold(n_splits = self.n_fold,
                   shuffle = False)        
        col_mean_name = self.colnames + '_' + 'Kfold_Target_Enc_mean'
        X[col_mean_name] = np.nan       

        for tr_ind, val_ind in kf.split(X):
            X_tr, X_val = X.iloc[tr_ind], X.iloc[val_ind]
            X.loc[X.index[val_ind], col_mean_name] = X_val[self.colnames].map(X_tr.groupby(self.colnames)[self.targetName].mean())
            X[col_mean_name].fillna(mean_of_target, inplace = True)  # nanになってしまったところは平均値で埋める --(1)

        if self.verbosity:            
            encoded_feature = X[col_mean_name].values
            print('Correlation between the new feature, {} and, {} is {}.'.format(col_mean_name,self.targetName, 
                                                                                  np.corrcoef(X[self.targetName].values,encoded_feature)[0][1]))
        if self.discardOriginal_col:
            X = X.drop(self.targetName, axis=1)
        return X
class KFoldTargetEncoderTrain_median(base.BaseEstimator,
                               base.TransformerMixin):
    """How to use.
    targetc = KFoldTargetEncoderTrain('Feature','Target',n_fold=5)
    new_train = targetc.fit_transform(train)
    """
    def __init__(self,colnames,targetName,
                  n_fold=5, verbosity=True,
                  discardOriginal_col=False):
        self.colnames = colnames
        self.targetName = targetName
        self.n_fold = n_fold
        self.verbosity = verbosity
        self.discardOriginal_col = discardOriginal_col

    def fit(self, X, y=None):
        return self

    def transform(self,X):        
        assert(type(self.targetName) == str)
        assert(type(self.colnames) == str)
        assert(self.colnames in X.columns)
        assert(self.targetName in X.columns)       

        mean_of_target = X[self.targetName].median()
        kf = KFold(n_splits = self.n_fold,
                   shuffle = False)        
        col_mean_name = self.colnames + '_' + 'Kfold_Target_Enc_median'
        X[col_mean_name] = np.nan       

        for tr_ind, val_ind in kf.split(X):
            X_tr, X_val = X.iloc[tr_ind], X.iloc[val_ind]
            X.loc[X.index[val_ind], col_mean_name] = X_val[self.colnames].map(X_tr.groupby(self.colnames)[self.targetName].median())
            X[col_mean_name].fillna(mean_of_target, inplace = True)  # nanになってしまったところは平均値で埋める --(1)

        if self.verbosity:            
            encoded_feature = X[col_mean_name].values
            print('Correlation between the new feature, {} and, {} is {}.'.format(col_mean_name,self.targetName, 
                                                                                  np.corrcoef(X[self.targetName].values,encoded_feature)[0][1]))
        if self.discardOriginal_col:
            X = X.drop(self.targetName, axis=1)
        return X

class KFoldTargetEncoderTrain_std(base.BaseEstimator,
                               base.TransformerMixin):
    """How to use.
    targetc = KFoldTargetEncoderTrain('Feature','Target',n_fold=5)
    new_train = targetc.fit_transform(train)
    """
    def __init__(self,colnames,targetName,
                  n_fold=5, verbosity=True,
                  discardOriginal_col=False):
        self.colnames = colnames
        self.targetName = targetName
        self.n_fold = n_fold
        self.verbosity = verbosity
        self.discardOriginal_col = discardOriginal_col

    def fit(self, X, y=None):
        return self

    def transform(self,X):        
        assert(type(self.targetName) == str)
        assert(type(self.colnames) == str)
        assert(self.colnames in X.columns)
        assert(self.targetName in X.columns)       

        mean_of_target = X[self.targetName].std()
        kf = KFold(n_splits = self.n_fold,
                   shuffle = False)        
        col_mean_name = self.colnames + '_' + 'Kfold_Target_Enc_std'
        X[col_mean_name] = np.nan       

        for tr_ind, val_ind in kf.split(X):
            X_tr, X_val = X.iloc[tr_ind], X.iloc[val_ind]
            X.loc[X.index[val_ind], col_mean_name] = X_val[self.colnames].map(X_tr.groupby(self.colnames)[self.targetName].std())
            X[col_mean_name].fillna(mean_of_target, inplace = True)  # nanになってしまったところは平均値で埋める --(1)

        if self.verbosity:            
            encoded_feature = X[col_mean_name].values
            print('Correlation between the new feature, {} and, {} is {}.'.format(col_mean_name,self.targetName, 
                                                                                  np.corrcoef(X[self.targetName].values,encoded_feature)[0][1]))
        if self.discardOriginal_col:
            X = X.drop(self.targetName, axis=1)
        return X



class TargetEncoderTest_mean(base.BaseEstimator, base.TransformerMixin):
    """How to use.
    test_targetc = TargetEncoderTest(new_train,
                                      'Feature',
                                      'Feature_Kfold_Target_Enc')
    new_test = test_targetc.fit_transform(test)
    """

    def __init__(self,train,colNames,encodedName):

        self.train = train
        self.colNames = colNames
        self.encodedName = encodedName

    def fit(self, X, y=None):
        return self

    def transform(self,X):       
        mean =  self.train[[self.colNames, self.encodedName]].groupby(self.colNames).mean().reset_index() 

        dd = {}
        for index, row in mean.iterrows():
            dd[row[self.colNames]] = row[self.encodedName]
            X[self.encodedName] = X[self.colNames]
        X = X.replace({self.encodedName: dd})
        return X
class TargetEncoderTest_median(base.BaseEstimator, base.TransformerMixin):
    """How to use.
    test_targetc = TargetEncoderTest(new_train,
                                      'Feature',
                                      'Feature_Kfold_Target_Enc')
    new_test = test_targetc.fit_transform(test)
    """

    def __init__(self,train,colNames,encodedName):

        self.train = train
        self.colNames = colNames
        self.encodedName = encodedName

    def fit(self, X, y=None):
        return self

    def transform(self,X):       
        mean =  self.train[[self.colNames, self.encodedName]].groupby(self.colNames).median().reset_index() 

        dd = {}
        for index, row in mean.iterrows():
            dd[row[self.colNames]] = row[self.encodedName]
            X[self.encodedName] = X[self.colNames]
        X = X.replace({self.encodedName: dd})
        return X
class TargetEncoderTest_std(base.BaseEstimator, base.TransformerMixin):
    """How to use.
    test_targetc = TargetEncoderTest(new_train,
                                      'Feature',
                                      'Feature_Kfold_Target_Enc')
    new_test = test_targetc.fit_transform(test)
    """

    def __init__(self,train,colNames,encodedName):

        self.train = train
        self.colNames = colNames
        self.encodedName = encodedName

    def fit(self, X, y=None):
        return self

    def transform(self,X):       
        mean =  self.train[[self.colNames, self.encodedName]].groupby(self.colNames).std().reset_index() 

        dd = {}
        for index, row in mean.iterrows():
            dd[row[self.colNames]] = row[self.encodedName]
            X[self.encodedName] = X[self.colNames]
        X = X.replace({self.encodedName: dd})
        return X

In [14]:
targetc = KFoldTargetEncoderTrain_mean('Country','pm25_mid',n_fold=5)
df = targetc.fit_transform(df)

#targetc = KFoldTargetEncoderTrain_median('Country','pm25_mid',n_fold=5)
#df = targetc.fit_transform(df)

#targetc = KFoldTargetEncoderTrain_std('Country','pm25_mid',n_fold=5)
#df = targetc.fit_transform(df)


test_targetc = TargetEncoderTest_mean(df, 'Country', 'Country_Kfold_Target_Enc_mean')
df_test = test_targetc.fit_transform(df_test)

#test_targetc = TargetEncoderTest_median(df, 'Country', 'Country_Kfold_Target_Enc_median')
#df_test = test_targetc.fit_transform(df_test)

#test_targetc = TargetEncoderTest_std(df, 'Country', 'Country_Kfold_Target_Enc_std')
#df_test = test_targetc.fit_transform(df_test)

le = LabelEncoder()
df["Country"] = le.fit_transform(df["Country"])
df["City"] = le.fit_transform(df["City"])
df_test["Country"] = le.fit_transform(df_test["Country"])
df_test["City"] = le.fit_transform(df_test["City"])


df

Correlation between the new feature, Country_Kfold_Target_Enc_mean and, pm25_mid is 0.37994396434787336.


Unnamed: 0,Country,City,lat,lon,co_cnt,co_min,co_mid,co_max,co_var,o3_cnt,...,ws_mid,ws_max,ws_var,dew_cnt,dew_min,dew_mid,dew_max,dew_var,pm25_mid,Country_Kfold_Target_Enc_mean
0,0,19,-27.46794,153.02809,38,0.749,2.590,2.633,0.850,29,...,1.088,3.101,1.983,17,7.671,10.358,15.112,13.424,19.901,37.860553
1,0,39,-12.46113,130.84185,47,2.594,3.181,4.828,1.208,49,...,3.473,7.396,10.411,62,21.324,23.813,24.221,2.021,13.741,37.860553
2,0,117,-37.81400,144.96332,17,1.190,1.197,2.200,0.248,123,...,2.107,8.089,15.719,22,10.309,13.133,15.422,6.355,25.918,37.860553
3,0,140,-32.92953,151.78010,63,4.586,11.044,14.802,24.186,90,...,0.503,3.592,2.485,116,7.146,10.685,13.344,9.417,174.370,37.860553
4,0,153,-31.95224,115.86140,47,4.689,8.681,11.100,10.011,83,...,0.755,3.396,1.937,93,1.091,3.277,12.272,4.109,167.063,37.860553
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195936,28,83,30.33218,-81.65565,12,0.694,0.995,1.301,0.090,26,...,2.710,6.125,3.757,12,16.774,22.679,26.058,13.252,16.150,45.038729
195937,28,106,36.17497,-115.13722,14,0.528,1.256,3.226,1.743,8,...,2.974,6.861,8.354,12,10.432,14.741,15.827,7.078,16.895,45.038729
195938,28,120,43.03890,-87.90647,171,1.975,6.627,6.639,5.293,112,...,1.087,2.578,0.612,26,2.049,3.531,6.686,5.286,86.299,45.038729
195939,29,68,21.02450,105.84117,31,2.613,2.704,8.767,4.317,108,...,3.058,6.005,6.085,51,1.922,7.443,7.716,4.642,36.523,53.625862


  
# 中央値のカラム
mid = ["co_mid","o3_mid","so2_mid","no2_mid","temperature_mid","humidity_mid","pressure_mid","ws_mid","dew_mid"]

# 観測回数のカラム
cnt = ["co_cnt","o3_cnt","so2_cnt","no2_cnt","temperature_cnt","humidity_cnt","pressure_cnt","ws_cnt","dew_cnt"]

# 最小値のカラム
mini = ["co_min","o3_min","so2_min","no2_min","temperature_min","humidity_min","pressure_min","ws_min","dew_min"]

# 最大値のカラム
maxx = ["co_max","o3_max","so2_max","no2_max","temperature_max","humidity_max","pressure_max","ws_max","dew_max"]

for q in range(9):
    i = mid[q]
    l = cnt[q]
    k = mini[q]
    t = maxx[q]
    col = i +"_"+ l
    col_q = t +"_"+ k
    df[col] = df[i] * df[l]
    df_test[col] = df_test[i] * df_test[l]
    df[col_q] = df[t] - df[k]
    df_test[col_q] = df_test[t] - df_test[k]
         
for x in mid:
    for y in mid:
        if x != y:
            wa = x +"+"+ y
            seki = x +"*"+ y
            df[wa] = df[i] + df[i]
            df[seki] = df[i] * df[i]
            df_test[wa] = df_test[i] + df_test[i]
            df_test[seki] = df_test[i] * df_test[i]

n = 5
for col in df.columns:
    if col != "pm25_mid":
        for i in range(1,n):
            name = col + str(i)
            df[name] = df[col].shift(i)
            df_test[name] = df_test[col].shift(i)          

#df = df.iloc[n-1:,:].reset_index(drop=True)      
#df_test = df_test.iloc[n-1:,:].reset_index(drop=True)



print("finish")

In [15]:
# 目的変数と説明変数の定義
X_data = df.drop(columns=['pm25_mid'])
y_data = df['pm25_mid']

# train_test_splitのインポート
from sklearn.model_selection import train_test_split

# shuffle=Falseとすることで時系列が混ざるのを防止

# 学習データおよび検証データと、評価データに80:20の割合で2分割する
X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.20, shuffle=False)


print("finish")

finish


In [16]:
import numpy as np
import pandas as pd

# グラフ描画用
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
sns.set_style('darkgrid')

import lightgbm as lgb

from sklearn.datasets import make_classification
from sklearn.metrics import roc_auc_score, roc_curve, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from catboost import Pool, CatBoostRegressor

In [18]:
def catboost(X_train, Y_train, X_valid, Y_valid, X_test):
    # objectの列番号を取得
    categorical_features_indices = np.where(X_train.dtypes==np.object)[0]
    lgb_train = Pool(X_train, Y_train, cat_features=categorical_features_indices)
    lgb_valid = Pool(X_valid, Y_valid, cat_features=categorical_features_indices)
    model = CatBoostRegressor(eval_metric='RMSE',
                            loss_function='RMSE',
                            num_boost_round=10000,
                            logging_level='Silent',
                            random_seed=2022)
    model.fit(lgb_train, 
            eval_set=lgb_valid,
            early_stopping_rounds=10,
            verbose=True,
            use_best_model=True)

  # 検証データに対する予測値を求める
    va_pred = model.predict(X_valid)

    mse = mean_squared_error(Y_valid, va_pred)
    rmse = np.sqrt(mse) # RSME = √MSEの算出
    eval_metric = rmse

    print(f"eval's rmse: {eval_metric}")

    #テストデータに対する予測値を求める
    te_pred = np.array(model.predict(X_test))

    return va_pred, te_pred, model

va_pred3, te_pred3, model = catboost(X_train, y_train, X_test, y_test, df_test)

score = np.sqrt(mean_squared_error(y_test, va_pred3))
print(f"Validation RMSE score : {score:.4f}")
print()
print(f"Test data best socre : {19.1648:.4f}")

va_pred3 = pd.Series(va_pred3)
va_pred3.to_csv("CatBoost_test.csv",index=None,header=None)
y_test.to_csv("y_test.csv",index=None,header=None)

eval's rmse: 19.468701992718582
Validation RMSE score : 19.4687

Test data best socre : 19.1648


AttributeError: 'numpy.ndarray' object has no attribute 'to_csv'

In [None]:
sample = pd.read_csv("submit_sample.csv",header=None) 

#pred = model.predict(df_test)
#pred = pd.Series(pred)
pred_df = pd.DataFrame(te_pred3,columns=['pred'])

pred_df["id"] = df_test_id["id"]
pred_df = pred_df.sort_values(['id']).reset_index(drop=True)
sample[1] = pred_df["pred"]
sample.to_csv("Catboost_sabmission.csv",index=None,header=None)
sample