solafune:夜間光データから土地価格を予測 BaseLine(by mst8823)

https://zenn.dev/mst8823/articles/cd40cb971f702e の写経

In [1]:
# Mount Google Drive at /content/drive so files stored there are reachable
# from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib_venn import venn2
#from ptitprince import RainCloud

from pandas_profiling import ProfileReport
%matplotlib inline

# Print the current working directory of the runtime.
print(os.getcwd())

/content


In [3]:
from lightgbm import LGBMModel
from sklearn import model_selection
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from tqdm import tqdm

In [4]:
# Directories on the mounted Drive: where the input CSVs are read from and
# where outputs are meant to be written.
INPUT_DIR = '/content/drive/MyDrive/Colab Notebooks/Solafune/夜間光データから土地価格を予測/data/inputs/'
OUTPUT_DIR = '/content/drive/MyDrive/Colab Notebooks/Solafune/夜間光データから土地価格を予測/data/outputs/'
NB_NAME = 'nb006'  # notebook tag; not used by the cells visible here

# Make sure the output directory exists (an existing directory is fine).
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Read the three input CSVs, in this order: training set, evaluation set,
# upload template.
train, test, submission = (
    pd.read_csv(os.path.join(INPUT_DIR, csv_name))
    for csv_name in ('TrainDataSet.csv', 'EvaluationData.csv', 'UploadFileTemplate.csv')
)

In [28]:
def max_min(x):
    """Return the spread of ``x``: its maximum minus its minimum.

    ``x`` must provide ``max()`` and ``min()`` methods (e.g. a pandas Series).
    The function name is reused as a label when this is passed as an
    aggregation callable, so it must stay as is.
    """
    largest = x.max()
    smallest = x.min()
    return largest - smallest


def q75_q25(x):
    """Return the 0.75 quantile of ``x`` minus its 0.25 quantile.

    ``x`` must provide a ``quantile(q)`` method (e.g. a pandas Series).
    Background on quantiles: https://www.kousotu.com/lect_math/d02.php
    """
    upper_quartile = x.quantile(0.75)
    lower_quartile = x.quantile(0.25)
    return upper_quartile - lower_quartile


def aggregation(input_df, group_key, group_values, agg_methods):
    """Compute per-group aggregates and broadcast them back onto every row.

    For every (method, column) pair, ``column`` is aggregated within the
    groups of ``group_key`` and the group's value is attached to each row
    belonging to that group.

    Args:
        input_df: DataFrame containing ``group_key`` and every column listed
            in ``group_values``.
        group_key: Name of the column to group by.
        group_values: List of column names to aggregate.
        agg_methods: Aggregations accepted by ``GroupBy.agg``: strings such
            as ``"mean"`` or callables. For a callable, its ``__name__`` is
            used in the generated column name.

    Returns:
        DataFrame with one row per row of ``input_df`` (same order, default
        integer index) and one column per pair, named
        ``agg_{method}_{col}_grpby_{group_key}``. ``group_key`` is dropped.
    """
    per_group_frames = []
    for agg_method in agg_methods:
        # The label depends only on the method, so resolve it once per method.
        agg_method_name = agg_method.__name__ if callable(agg_method) else agg_method
        for col in group_values:
            new_col = f"agg_{agg_method_name}_{col}_grpby_{group_key}"
            df_agg = input_df[[col, group_key]].groupby(group_key)[[col]].agg(agg_method)
            df_agg.columns = [new_col]
            per_group_frames.append(df_agg)

    # All frames share the group-key index; line them up side by side and
    # turn the key back into a column so it can be merged on.
    _df = pd.concat(per_group_frames, axis=1).reset_index()
    # Left merge keeps one output row per input row, in input order.
    output_df = pd.merge(input_df[[group_key]], _df, on=group_key, how='left')
    return output_df.drop(group_key, axis=1)


def diff_aggregation(input_df, group_key, group_values, num_diffs):
    """Compute within-group differences for several lags.

    Rows are differenced in their existing order inside each group, so the
    caller decides the ordering (e.g. by sorting beforehand).

    Args:
        input_df: DataFrame containing ``group_key`` and ``group_values``.
        group_key: Name of the column to group by.
        group_values: List of column names to difference.
        num_diffs: Iterable of lags; each is passed as ``periods`` to
            ``GroupBy.diff``.

    Returns:
        DataFrame aligned to ``input_df``'s index with one column per
        (lag, column) pair named ``diff={lag}_{col}_grpby_{group_key}``.
        Rows without a value ``lag`` steps earlier in the group are NaN.
    """
    # The grouping does not depend on the lag, so build it once.
    grouped = input_df.groupby(group_key)[group_values]
    dfs = []
    for nd in num_diffs:
        _df = grouped.diff(nd)
        _df.columns = [f'diff={nd}_{col}_grpby_{group_key}' for col in group_values]
        dfs.append(_df)
    output_df = pd.concat(dfs, axis=1)
    return output_df


def shift_aggregation(input_df, group_key, group_values, num_shifts):
    """Compute within-group shifted (lagged) copies of columns for several lags.

    Rows are shifted in their existing order inside each group, so the
    caller decides the ordering (e.g. by sorting beforehand).

    Args:
        input_df: DataFrame containing ``group_key`` and ``group_values``.
        group_key: Name of the column to group by.
        group_values: List of column names to shift.
        num_shifts: Iterable of lags; each is passed as ``periods`` to
            ``GroupBy.shift``.

    Returns:
        DataFrame aligned to ``input_df``'s index with one column per
        (lag, column) pair named ``shift={lag}_{col}_grpby_{group_key}``.
        Rows without a value ``lag`` steps earlier in the group are NaN.
    """
    # The grouping does not depend on the lag, so build it once.
    grouped = input_df.groupby(group_key)[group_values]
    dfs = []
    for ns in num_shifts:
        _df = grouped.shift(ns)
        _df.columns = [f'shift={ns}_{col}_grpby_{group_key}' for col in group_values]
        dfs.append(_df)
    output_df = pd.concat(dfs, axis=1)
    return output_df


# Backward-compatible alias for the original, misspelled function name.
shit_aggregation = shift_aggregation

### 特徴量の関数

In [32]:
# Features used as-is
def get_raw_features(input_df):
    """Return a copy of the columns that are used without any transformation.

    Args:
        input_df: DataFrame containing ``MeanLight``, ``SumLight`` and ``Year``.

    Returns:
        A new DataFrame holding exactly those three columns, in that order.
        It is a copy, so modifying it does not touch ``input_df``.
    """
    cols = ["MeanLight", "SumLight", "Year"]
    return input_df[cols].copy()


# Backward-compatible alias for the original, misspelled function name.
get_raw_geatures = get_raw_features

# Area-like quantity
def get_area_feature(input_df):
    """Build a one-column frame ``Area`` = SumLight / (MeanLight + 1e-3).

    Args:
        input_df: DataFrame containing ``SumLight`` and ``MeanLight``.

    Returns:
        DataFrame with the single column ``Area``, sharing ``input_df``'s index.
    """
    # 1e-3 is 0.001 ("e" is the power-of-ten exponent); adding it keeps the
    # denominator non-zero when MeanLight is 0.
    denominator = input_df["MeanLight"] + 1e-3
    area = input_df["SumLight"] / denominator
    return pd.DataFrame({"Area": area})

# Aggregate features keyed on PlaceID
def get_agg_place_id_features(input_df):
    """Aggregate MeanLight, SumLight and Area within each ``PlaceID``.

    The Area column from ``get_area_feature`` is appended to ``input_df``
    first, then ``aggregation`` is applied with min, max, median, mean, std,
    ``max_min`` and ``q75_q25``.

    Args:
        input_df: DataFrame containing ``PlaceID``, ``MeanLight`` and ``SumLight``.

    Returns:
        Whatever ``aggregation`` returns for these settings.
    """
    with_area = pd.concat([input_df, get_area_feature(input_df)], axis=1)
    return aggregation(
        with_area,
        group_key="PlaceID",
        group_values=["MeanLight", "SumLight", "Area"],
        agg_methods=["min", "max", "median", "mean", "std", max_min, q75_q25],
    )

# Aggregate features keyed on Year
def get_agg_year_features(input_df):
    """Aggregate MeanLight, SumLight and Area within each ``Year``.

    The Area column from ``get_area_feature`` is appended to ``input_df``
    first, then ``aggregation`` is applied with min, max, median, mean, std,
    ``max_min`` and ``q75_q25``.

    Args:
        input_df: DataFrame containing ``Year``, ``MeanLight`` and ``SumLight``.

    Returns:
        Whatever ``aggregation`` returns for these settings.
    """
    with_area = pd.concat([input_df, get_area_feature(input_df)], axis=1)
    return aggregation(
        with_area,
        group_key="Year",
        group_values=["MeanLight", "SumLight", "Area"],
        agg_methods=["min", "max", "median", "mean", "std", max_min, q75_q25],
    )


# Within-group differences keyed on PlaceID
def get_diff_agg_place_id_features(input_df):
    """Difference MeanLight and SumLight within each ``PlaceID`` for lags 1..21.

    Args:
        input_df: DataFrame containing ``PlaceID``, ``MeanLight`` and ``SumLight``.

    Returns:
        Whatever ``diff_aggregation`` returns for these settings.
    """
    lags = list(range(1, 22))  # 1, 2, ..., 21
    return diff_aggregation(
        input_df,
        group_key="PlaceID",
        group_values=["MeanLight", "SumLight"],
        num_diffs=lags,
    )


# Within-group shifts keyed on PlaceID
def get_shift_agg_place_id_features(input_df):
    """Shift MeanLight and SumLight within each ``PlaceID`` for lags 1..21.

    Args:
        input_df: DataFrame containing ``PlaceID``, ``MeanLight`` and ``SumLight``.

    Returns:
        Whatever ``shift_aggregation`` returns for these settings.
    """
    lags = list(range(1, 22))  # 1, 2, ..., 21
    return shift_aggregation(
        input_df,
        group_key="PlaceID",
        group_values=["MeanLight", "SumLight"],
        num_shifts=lags,
    )


# Helper: standardize a pivot table and project it with PCA.
def _scaled_pca_frame(pivot_df, n_components, label, random_state=2021):
    """Return the PCA projection of ``pivot_df`` as a labelled DataFrame.

    Missing cells are filled with 0, columns are standardized with
    ``StandardScaler``, and the result is reduced to ``n_components``
    dimensions. Output columns are named ``PlaceID_{label}_PCA={i:03}`` and
    the rows keep ``pivot_df``'s index.
    """
    scaled = StandardScaler().fit_transform(pivot_df.fillna(0))
    pca = PCA(n_components=n_components, random_state=random_state)
    projected = pca.fit_transform(scaled)
    columns = [f"PlaceID_{label}_PCA={i:03}" for i in range(projected.shape[1])]
    return pd.DataFrame(projected, index=pivot_df.index, columns=columns)


# Features built from pivot tables
def get_place_id_vecs_features(input_df):
    """Build per-PlaceID time-series vectors and their PCA embeddings.

    For each of Area, MeanLight and SumLight a PlaceID x Year pivot table is
    built (columns prefixed ``Area=``, ``MeanLight=``, ``SumLight=``). The
    three pivots side by side are reduced to 64 PCA components and each
    single pivot to 16 components. Everything is then broadcast back to the
    rows of ``input_df`` via ``PlaceID``.

    Args:
        input_df: DataFrame containing ``PlaceID``, ``Year``, ``MeanLight``
            and ``SumLight``.

    Returns:
        DataFrame with one row per row of ``input_df`` holding the pivot
        columns followed by the ``PlaceID_all_PCA=*``, ``PlaceID_Area_PCA=*``,
        ``PlaceID_MeanLight_PCA=*`` and ``PlaceID_SumLight_PCA=*`` columns.
        ``PlaceID`` itself is dropped.

    Note:
        scikit-learn's PCA rejects ``n_components`` larger than
        min(n_samples, n_features), so the input needs enough distinct
        PlaceIDs and Years for 64 (combined) and 16 (per-variable) components.
    """
    _input_df = pd.concat([input_df, get_area_feature(input_df)], axis=1)

    # One PlaceID x Year pivot per variable; the prefix records which variable
    # a column came from so the combined frame has unique column names.
    pivots = {
        name: pd.pivot_table(
            _input_df, index="PlaceID", columns="Year", values=name
        ).add_prefix(f"{name}=")
        for name in ("Area", "MeanLight", "SumLight")
    }
    all_df = pd.concat(list(pivots.values()), axis=1)

    # PCA on the combined vectors, then on each variable separately.
    # All four decompositions use the helper's default seed (2021).
    pca_frames = [_scaled_pca_frame(all_df, 64, "all")]
    pca_frames += [
        _scaled_pca_frame(pivot_df, 16, name) for name, pivot_df in pivots.items()
    ]

    df = pd.concat([all_df] + pca_frames, axis=1)
    # Broadcast the per-PlaceID features back to every input row.
    output_df = pd.merge(_input_df[["PlaceID"]], df, left_on="PlaceID", right_index=True, how="left")
    return output_df.drop("PlaceID", axis=1)




In [41]:
# Scratch check: rebuild the pivot tables and the Area PCA on `train`
# to inspect the intermediate frames.
_input_df = pd.concat([train, get_area_feature(train)], axis=1)
# Pivot tables: one row per PlaceID, one column per Year, prefixed with the
# variable they come from.
area_df = pd.pivot_table(_input_df, index="PlaceID", columns="Year", values="Area").add_prefix("Area=")
mean_light_df = pd.pivot_table(_input_df, index="PlaceID", columns="Year", values="MeanLight").add_prefix("MeanLight=")
sum_light_df = pd.pivot_table(_input_df, index="PlaceID", columns="Year", values="SumLight").add_prefix("SumLight=")
all_df = pd.concat([area_df, mean_light_df, sum_light_df], axis=1)
# PCA Area
sc_area_df = StandardScaler().fit_transform(area_df.fillna(0))
pca = PCA(n_components=16, random_state=2021)
# No index or column labels are attached here, so rows and columns are
# numbered positionally rather than by PlaceID.
pca_area_df = pd.DataFrame(pca.fit_transform(sc_area_df))
pca_area_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,-1.263557,-0.067712,0.012977,0.002371,-0.001660,-0.008274,-0.000497,-0.002201,-0.008297,0.002505,-0.001252,-0.000694,0.001752,0.004184,-0.006480,-0.003301
1,-0.646690,-0.061638,0.019567,0.000664,-0.003096,-0.006261,-0.002435,0.002321,0.001923,-0.000676,0.002085,-0.001586,0.001062,-0.001197,0.003378,0.000961
2,0.000171,-0.062446,0.021201,0.001061,-0.002902,-0.005682,-0.002246,0.002276,0.002021,-0.000706,0.001958,-0.001482,0.000940,-0.001232,0.003337,0.000988
3,-3.163352,-0.063038,0.009856,0.000515,-0.002765,-0.009486,-0.001757,-0.000550,-0.005211,0.001539,0.000272,-0.001334,0.001916,0.002501,-0.003061,-0.001974
4,-3.747968,-0.057746,0.011735,-0.001232,-0.004033,-0.009027,-0.003335,0.002541,0.001462,-0.000524,0.002708,-0.002084,0.001642,-0.001039,0.003573,0.000833
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1013,1.361424,-0.070904,0.019565,0.003985,-0.000836,-0.005895,0.000264,-0.002405,-0.007873,0.002408,-0.001751,-0.000232,0.001250,0.004054,-0.006691,-0.003180
1014,-3.555084,-0.055707,0.013902,-0.001808,-0.004520,-0.008366,-0.003986,0.004051,0.004870,-0.001582,0.003827,-0.002383,0.001415,-0.002833,0.006860,0.002256
1015,-3.457652,-0.002711,-0.050183,-0.014729,0.049350,0.037334,-0.018395,-0.070872,-0.012846,0.008436,-0.012710,0.022437,-0.023497,0.015346,-0.043317,-0.021256
1016,9.824207,-0.067890,0.050994,0.005004,-0.001632,0.004576,-0.001472,0.006170,0.013662,-0.004273,0.003520,-0.000849,-0.001478,-0.007080,0.012578,0.005664


In [43]:
# Display the PlaceID x Year pivot of the Area feature built in the previous cell.
area_df

Year,Area=1992,Area=1993,Area=1994,Area=1995,Area=1996,Area=1997,Area=1998,Area=1999,Area=2000,Area=2001,Area=2002,Area=2003,Area=2004,Area=2005,Area=2006,Area=2007,Area=2008,Area=2009,Area=2010,Area=2011,Area=2012,Area=2013
PlaceID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1,135.987488,135.984863,135.987156,135.987397,135.986940,135.985921,135.983875,135.984355,135.987286,135.985884,135.985799,135.979565,135.985571,135.982473,135.984503,135.986076,135.985391,133.986113,133.989459,133.985294,135.985884,135.985859
2,167.996563,167.996010,167.996652,167.996513,167.996524,167.996299,167.996380,167.996450,167.996514,167.996545,167.996575,167.995918,167.996437,167.996195,167.996484,167.996701,167.996603,168.996545,168.996864,168.996443,167.996817,167.996714
4,201.994037,201.993148,201.994347,201.994242,201.994269,201.993922,201.994122,201.994064,201.994574,201.994721,201.994679,201.992936,201.994382,201.993610,201.994321,201.994796,201.994695,202.994603,202.995427,202.994740,201.995109,201.994938
8,35.998893,35.998846,35.998971,35.998945,35.998937,35.998921,35.998921,35.998975,35.999074,35.999071,35.999143,35.998971,35.999056,35.999000,35.999103,35.999148,35.999157,34.998865,34.998985,34.999000,35.999133,35.999124
10,4.998685,4.999074,4.999324,4.999074,4.999138,4.998864,4.998958,4.998864,4.999265,4.999167,4.999167,4.998612,4.999074,4.998612,4.998913,4.999039,4.998685,5.998667,5.998759,5.998875,4.999074,4.999194
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1655,273.929860,273.940420,273.935853,273.955686,273.948875,273.946029,273.946261,273.950191,273.959101,273.956199,273.955552,273.937145,273.948239,273.951412,273.952706,273.958197,273.955957,271.942178,271.967589,271.949911,273.952602,273.955462
1656,14.999762,14.999762,14.999762,14.999761,14.999762,14.999753,14.999762,14.999762,14.999762,14.999762,14.999762,14.999754,14.999762,14.999762,14.999758,14.999762,14.999762,16.999730,16.999730,16.999730,14.999762,14.999762
1657,,23.990404,0.000000,23.992003,23.989338,23.988486,23.980153,23.990861,23.995120,23.994242,23.994859,23.991890,23.992322,23.990073,23.988006,23.991276,23.987207,0.000000,19.992160,19.992003,23.981435,23.983072
1659,717.904478,717.900704,717.923186,717.925586,717.925009,717.916934,717.909093,717.913205,717.927116,717.920073,717.924179,717.894810,717.920152,717.906872,717.914118,717.925752,717.922038,721.909635,721.933982,721.916814,717.921829,717.925208
