<a href="https://colab.research.google.com/github/yuto-kobayashi-1/signate-AIQuest/blob/develop/210716_AIQuest.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold
import lightgbm as lgb
from copy import deepcopy
import numpy as np
from datetime import datetime , date
from sklearn.inspection import permutation_importance

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
data = pd.read_csv("/content/drive/MyDrive/signate/competiton/210716_AI Quest/train.csv",encoding="utf_8")
test = pd.read_csv("/content/drive/MyDrive/signate/competiton/210716_AI Quest/test.csv",encoding="utf_8")

In [4]:
def preprocessing(df):
  df["host_response_rate"] = df["host_response_rate"].str.replace('%', '').astype(float)
  df["host_since"] = (date(2021,7,21) - pd.to_datetime(df["host_since"]).dt.date).dt.days
  # df["first_review"] = (date(2021,7,21) - pd.to_datetime(df["first_review"]).dt.date).dt.days
  # df["last_review"] = (date(2021,7,21) - pd.to_datetime(df["last_review"]).dt.date).dt.days
  # df["review_days"] = (data["first_review"] - data["last_review"])
  df['amenities'] = df['amenities'].str.replace('"', '')
  df['amenities'] = df['amenities'].str.replace('{', '')
  df['amenities'] = df['amenities'].str.replace('}', '')
  
  return df

def label_encorder(df_train,df_test,cols):
  df_train["flg"] = 0
  df_test["y"] = 100
  df_test["flg"] = 1

  df_ = pd.concat([df_train,df_test]).reset_index(drop=True)
  le = LabelEncoder()

  for col in cols:
    df_[col] = df_[col].fillna("OK")
    null_list = list(df_[df_[col] == "OK"].index)
    print(len(null_list))
    encoded = le.fit_transform(df_[col].values)
    decoded = le.inverse_transform(encoded)
    df_[col] = encoded
    df_.loc[null_list,col]=np.nan

  df_train_ = df_[df_["flg"]==0].drop("flg",axis=1)
  df_test_ = df_[df_["flg"]==1].drop(["flg","y"],axis=1)

  return df_train_ ,  df_test_

def get_dummys(df_train,df_test,cols):
  df_train["flg"] = 0
  df_test["y"] = 100
  df_test["flg"] = 1

  df_ = pd.concat([df_train,df_test]).reset_index(drop=True)

  amenities = df_["amenities"].str.get_dummies(sep=',')

  df_ = pd.concat([df_,amenities],axis=1)

  df_train_ = df_[df_["flg"]==0].drop("flg",axis=1)
  df_test_ = df_[df_["flg"]==1].drop(["flg","y"],axis=1)

  return df_train_ ,  df_test_


def train(X,y,label_cols):

  kf = KFold(n_splits=5,shuffle=True,random_state=0)

  models = []
  for train_index, test_index in kf.split(X, y):
    X_train = X.iloc[train_index]
    y_train =  y.iloc[train_index]
    X_valid = X.iloc[test_index]
    y_valid = y.iloc[test_index]

    X_train, y_train, X_valid, y_valid =  target_encording(X_train, y_train, X_valid, y_valid,label_cols)

    params = {
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': 'rmse',
    'num_leaves': 64,
    'min_data_in_leaf': 20,
    'max_depth': 7,
    'verbose': 0,
  }
    train_data = lgb.Dataset(
      data=X_train, 
      label=y_train,
  )

    validation_data = lgb.Dataset(
      data=X_valid, 
      label=y_valid, 
  )

    model = lgb.train(
      params=params, 
      train_set=train_data, 
      num_boost_round=1000, 
      early_stopping_rounds=100,
      valid_sets=[validation_data]
  )

    models.append(model)

  return models 

def feature_importance(models,data,cols):
  importance = pd.DataFrame(columns=data[cols].columns)

  for model in models:
    imp_ = pd.Series(model.feature_importance(),index=data[cols].columns)
    importance = importance.append(imp_,ignore_index=True)

  importance = importance.mean()

  importance = pd.DataFrame(importance,columns={"importance"}).sort_values("importance",ascending=False)

  importance = pd.DataFrame(importance/importance.sum())
  return importance

def adv_train(train,test,label_cols,drop_cols):

  df_train = train.copy()
  df_test = test.copy()

  df_train = df_train.drop(drop_cols,axis=1)
  df_test = df_test.drop(drop_cols,axis=1)
  
  df_train ,  df_test = label_encorder(df_train,df_test,label_cols)

  df_train = df_train.drop('y',axis=1)

  df_train["flg"] = 0
  df_test["flg"] = 1

  df_ = pd.concat([df_train,df_test]).reset_index(drop=True)

  kf = KFold(n_splits=5,shuffle=True,random_state=0)
  X  = df_ .drop('flg',axis=1)
  y   = df_['flg']

  models = []
  for train_index, test_index in kf.split(X, y):
    X_train = X.iloc[train_index]
    y_train =  y.iloc[train_index]
    X_valid = X.iloc[test_index]
    y_valid = y.iloc[test_index]

    params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'auc',
    'num_leaves': 64,
    'min_data_in_leaf': 20,
    'max_depth': 7,
    'verbose': 0,
  }
    train_data = lgb.Dataset(
      data=X_train, 
      label=y_train,
  )

    validation_data = lgb.Dataset(
      data=X_valid, 
      label=y_valid, 
  )

    model = lgb.train(
      params=params, 
      train_set=train_data, 
      num_boost_round=1000, 
      early_stopping_rounds=100,
      valid_sets=[validation_data]
  )

    models.append(model)
  
  preds = np.zeros((len(df_train),len(models)))
  for n , model in enumerate(models):
    # pred = model.predict(df_test[cols],num_iteration=model.best_iteration)
    pred = model.predict(df_train.drop('flg',axis=1),num_iteration=model.best_iteration)
    preds[:,n] = pred
  
  train['pred'] =  np.mean(preds,axis=1)
  train =  train.sort_values('pred',ascending=False)

  return train 

def train_specify(train,valid,label_cols,drop_cols):
  
  X = train.drop(drop_cols+["y"],axis=1)
  y  = train['y']
  X_valid = valid.drop(drop_cols+["y"],axis=1)
  y_valid = valid['y']

  kf = KFold(n_splits=5,shuffle=True,random_state=0)

  models = []
  for train_index, test_index in kf.split(X, y):
    X_train = X.iloc[train_index]
    y_train =  y.iloc[train_index]
    # X_valid = X.iloc[test_index]
    # y_valid = y.iloc[test_index]

    X_train, y_train, X_valid, y_valid =  target_encording(X_train, y_train, X_valid, y_valid,label_cols)

    params = {
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': 'rmse',
    'num_leaves': 64,
    'min_data_in_leaf': 20,
    'max_depth': 7,
    'verbose': 0,
  }
    train_data = lgb.Dataset(
      data=X_train, 
      label=y_train,
  )

    validation_data = lgb.Dataset(
      data=X_valid, 
      label=y_valid, 
  )

    model = lgb.train(
      params=params, 
      train_set=train_data, 
      num_boost_round=1000, 
      early_stopping_rounds=100,
      valid_sets=[validation_data]
  )

    models.append(model)

  return models 

In [118]:
def train_permutation(train,valid,label_cols,select_cols):
  
  X = train[select_cols]
  y  = train['y']
  X_valid = valid[select_cols]
  y_valid = valid['y']

  kf = KFold(n_splits=5,shuffle=True,random_state=0)

  models = []
  for train_index, test_index in kf.split(X, y):
    X_train = X.iloc[train_index]
    y_train =  y.iloc[train_index]
    # X_valid = X.iloc[test_index]
    # y_valid = y.iloc[test_index]

    X_train, y_train, X_valid, y_valid =  target_encording(X_train, y_train, X_valid, y_valid,label_cols)

    params = {
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': 'rmse',
    'num_leaves': 64,
    'min_data_in_leaf': 20,
    'max_depth': 7,
    'verbose': 0,
  }
    train_data = lgb.Dataset(
      data=X_train, 
      label=y_train,
  )

    validation_data = lgb.Dataset(
      data=X_valid, 
      label=y_valid, 
  )

    model = lgb.train(
      params=params, 
      train_set=train_data, 
      num_boost_round=1000, 
      early_stopping_rounds=100,
      valid_sets=[validation_data]
  )

    models.append(model)

  return models 

def train_permutation_useValid(train,valid,label_cols,select_cols):
  
  data = pd.concat([train,valid])

  X = data[select_cols]
  y  = data['y']

  kf = KFold(n_splits=5,shuffle=True,random_state=0)

  models = []
  for train_index, test_index in kf.split(X, y):
    X_train = X.iloc[train_index]
    y_train =  y.iloc[train_index]
    X_valid = X.iloc[test_index]
    y_valid = y.iloc[test_index]

    X_train, y_train, X_valid, y_valid =  target_encording(X_train, y_train, X_valid, y_valid,label_cols)

    params = {
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': 'rmse',
    'num_leaves': 64,
    'min_data_in_leaf': 20,
    'max_depth': 7,
    'verbose': 0,
  }
    train_data = lgb.Dataset(
      data=X_train, 
      label=y_train,
  )

    validation_data = lgb.Dataset(
      data=X_valid, 
      label=y_valid, 
  )

    model = lgb.train(
      params=params, 
      train_set=train_data, 
      num_boost_round=1000, 
      early_stopping_rounds=100,
      valid_sets=[validation_data]
  )

    models.append(model)

  return models 

In [145]:
def target_encording(tr_x, tr_y, va_x, va_y, cols):

  for c in cols:

    data_tmp = pd.DataFrame({c:tr_x[c],'target':tr_y})
    target_mean = data_tmp.groupby(c)['target'].mean()
    #バリデーションデータのカテゴリを置換
    va_x.loc[:,c] = va_x[c].map(target_mean)

    #学習データの変換後の値を格納する配列を準備
    tmp = np.repeat(np.nan,tr_x.shape[0])
    kf_encording = KFold(n_splits=5,shuffle=True,random_state=72)
    for idx_1, idx_2 in kf_encording.split(tr_x):
      #out-of-foldで各カテゴリにおける目的変数の平均を計算
      target_mean = data_tmp.iloc[idx_1].groupby(c)['target'].mean()
      #変換後の値を一時配列に格納
      tmp[idx_2] = tr_x[c].iloc[idx_2].map(target_mean)

    tr_x.loc[:,c] = tmp

  return tr_x, tr_y, va_x, va_y

def target_encording_test(tr_x, tr_y, va_x, cols):
  for c in cols:
    data_tmp = pd.DataFrame({c:tr_x[c],'target':tr_y})
    target_mean = data_tmp.groupby(c)['target'].mean()
    #バリデーションデータのカテゴリを置換
    va_x.loc[:,c] = va_x[c].map(target_mean)

  return va_x

def predict(models,df):
  preds = np.zeros((len(df),len(models)))
  for n , model in enumerate(models):
    # pred = model.predict(df_test[cols],num_iteration=model.best_iteration)
    pred = model.predict(df.drop("id",axis=1),num_iteration=model.best_iteration)
    preds[:,n] = pred

  score = pd.DataFrame()
  score["id"] = df["id"].astype(int)
  score["pred"] = np.mean(preds,axis=1)
  
  return score

In [6]:
from sklearn.metrics import mean_squared_error, make_scorer
def rmse(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred)) 
mse_scorer = make_scorer(rmse)

In [21]:
def permutation_train(train,valid,label_cols,drop_cols):
  
  X = train.drop(drop_cols+["y"],axis=1)
  y  = train['y']
  X_valid = valid.drop(drop_cols+["y"],axis=1)
  y_valid = valid['y']

  kf = KFold(n_splits=5,shuffle=True,random_state=0)

  models = []
  perm_imp_list = []
  for train_index, test_index in kf.split(X, y):
    X_train = X.iloc[train_index]
    y_train =  y.iloc[train_index]
    # X_valid = X.iloc[test_index]
    # y_valid = y.iloc[test_index]

    X_train, y_train, X_valid, y_valid =  target_encording(X_train, y_train, X_valid, y_valid,label_cols)

    parms= {'X': X_train.values,
                  'y': y_train.values,
                  'eval_set': (X_valid.values, y_valid.values),
                  'early_stopping_rounds': 100,
                  'eval_metric':'rmse',
                  'verbose': False,}

    model = lgb.LGBMRegressor(objective="regression", n_estimators=100, importance_type="gain", num_leave=64,max_depth=7)

    # gbm = model.fit(**parms, callbacks=callbacks)
    model = model.fit(**parms)

    models += [model]

    # run permutation importance
    result = permutation_importance(model, X_valid.values, y_valid.values, scoring=mse_scorer, n_repeats=10, n_jobs=-1, random_state=71)
    perm_imp_df = pd.DataFrame({"importances_mean":result["importances_mean"], "importances_std":result["importances_std"]}, index=X.columns)
    perm_imp_list += [perm_imp_df]

  return perm_imp_list 

In [8]:
len(data)

55583

In [9]:
len(test)

18528

In [109]:
cols = ['accommodates','bathrooms', 'bed_type', 'bedrooms',
       'beds', 'cancellation_policy', 'city', 'cleaning_fee',
       'host_has_profile_pic', 'host_identity_verified',
       'instant_bookable',
       'latitude', 'longitude', 'neighbourhood', 'number_of_reviews',
       'property_type', 'review_scores_rating', 'room_type',"host_response_rate",
        "host_since","amenities"
        ]

drop_cols = ["id","amenities","description","first_review","last_review","name","thumbnail_url","zipcode"]

label_cols = ['bed_type','cancellation_policy',"city","cleaning_fee","host_has_profile_pic","host_identity_verified","instant_bookable","neighbourhood","property_type","room_type"]


In [11]:
data = preprocessing(data)
test = preprocessing(test)

In [12]:
df_train , df_test  = get_dummys(data,test,label_cols)
# df_train , df_test  = data,test

adversarial train を行いtestデータに似ているtrainデータをvalidデータにする

In [None]:
df_train = adv_train(df_train , df_test,label_cols,drop_cols)

In [14]:
print(len(df_train)*0.2)

11116.6


In [15]:
train_data = df_train[:11116]
valid_data = df_train[11116:]

In [16]:
# train_data.to_csv("/content/drive/MyDrive/signate/competiton/210716_AI Quest/train_data.csv",header=True,index=True)
# valid_data.to_csv("/content/drive/MyDrive/signate/competiton/210716_AI Quest/valid_data.csv",header=True,index=True)
# df_test.to_csv("/content/drive/MyDrive/signate/competiton/210716_AI Quest/test_data.csv",header=True,index=True)

permutation_importance

In [None]:
imoprtanceDF = permutation_train(train_data,valid_data,label_cols,drop_cols)

特徴量選択

In [83]:
imoprtanceDF_mean = pd.concat([imoprtanceDF[i][['importances_mean']] for i in range(len(imoprtanceDF))],axis=1)
imoprtanceDF_mean = pd.DataFrame(imoprtanceDF_mean.mean(axis=1),columns={'importances_mean'}).sort_values('importances_mean')
imoprtanceDF_mean = imoprtanceDF_mean[imoprtanceDF_mean['importances_mean']<0]

In [98]:
imoprtanceDF_mean

Unnamed: 0,importances_mean
bedrooms,-15.5585
accommodates,-13.622986
bathrooms,-8.619656
longitude,-3.120755
number_of_reviews,-2.561581
latitude,-2.174304
room_type,-1.943493
review_scores_rating,-1.287212
neighbourhood,-1.260013
host_response_rate,-0.895739


In [99]:
features = list(imoprtanceDF_mean.reset_index()['index'].values)
print(features)

['bedrooms', 'accommodates', 'bathrooms', 'longitude', 'number_of_reviews', 'latitude', 'room_type', 'review_scores_rating', 'neighbourhood', 'host_response_rate', 'city', 'Cable TV', 'Pool', 'Indoor fireplace', 'property_type', 'beds', 'Elevator', 'Suitable for events', 'Gym', 'Hot tub', 'Fire extinguisher', 'Buzzer/wireless intercom', 'Dryer', 'Essentials', 'cleaning_fee', 'TV', 'host_since', 'Breakfast', 'Kitchen', 'translation missing: en.hosting_amenity_50', 'Doorman', 'Heating', 'Hangers', 'Wireless Internet', 'cancellation_policy', 'Smoke detector', 'Dishwasher', 'Shampoo', 'instant_bookable', 'Wheelchair accessible', 'Cat(s)', 'Safety card', 'Carbon monoxide detector', 'Lock on bedroom door', 'Extra pillows and blankets', 'First aid kit', 'Ethernet connection', 'Bathtub', 'Keypad', 'Pets live on this property', 'Self Check-In', 'Well-lit path to entrance', ' smooth pathway to front door']


In [119]:
label_cols = ['cancellation_policy',"city","cleaning_fee","instant_bookable","neighbourhood","property_type","room_type"]



```
# これはコードとして書式設定されます
```

学習

In [121]:
len(features)

53

In [122]:
# models = train_specify(train_data,valid_data,label_cols,drop_cols)
# models = train_permutation(train_data,valid_data,label_cols,features)
models = train_permutation_useValid(train_data,valid_data,label_cols,features)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)


[1]	valid_0's rmse: 156.929
Training until validation scores don't improve for 100 rounds.
[2]	valid_0's rmse: 149.645
[3]	valid_0's rmse: 143.442
[4]	valid_0's rmse: 138.107
[5]	valid_0's rmse: 133.465
[6]	valid_0's rmse: 129.566
[7]	valid_0's rmse: 126.338
[8]	valid_0's rmse: 123.395
[9]	valid_0's rmse: 120.695
[10]	valid_0's rmse: 118.357
[11]	valid_0's rmse: 116.457
[12]	valid_0's rmse: 114.879
[13]	valid_0's rmse: 113.326
[14]	valid_0's rmse: 112.137
[15]	valid_0's rmse: 111.197
[16]	valid_0's rmse: 110.063
[17]	valid_0's rmse: 109.095
[18]	valid_0's rmse: 108.359
[19]	valid_0's rmse: 107.723
[20]	valid_0's rmse: 107.142
[21]	valid_0's rmse: 106.48
[22]	valid_0's rmse: 106.041
[23]	valid_0's rmse: 105.566
[24]	valid_0's rmse: 105.099
[25]	valid_0's rmse: 104.83
[26]	valid_0's rmse: 104.511
[27]	valid_0's rmse: 104.16
[28]	valid_0's rmse: 103.842
[29]	valid_0's rmse: 103.701
[30]	valid_0's rmse: 103.482
[31]	valid_0's rmse: 103.302
[32]	valid_0's rmse: 103.142
[33]	valid_0's rmse: 

In [123]:
# models = train(df_train.drop(drop_cols+["y"],axis=1),df_train['y'],label_cols)

In [139]:
aucs=[]
for model in models:
  aucs.append(model.best_score["valid_0"]["rmse"])
np.mean(aucs)

104.33890995879639

In [140]:
# importance = feature_importance(models,df_train,df_train.drop(drop_cols+["y"],axis=1).columns)
# importance[importance["importance"]>0].head(50)
importance = feature_importance(models,df_train,features)
importance[importance["importance"]>0]

Unnamed: 0,importance
neighbourhood,0.120413
longitude,0.091228
latitude,0.070779
host_since,0.066009
accommodates,0.065132
bathrooms,0.057018
number_of_reviews,0.051133
review_scores_rating,0.047259
host_response_rate,0.046637
property_type,0.044846


In [141]:
len(df_train)

55583

In [130]:
df_test = target_encording_test(df_train.drop(drop_cols+["y"],axis=1),df_train['y'],df_test,label_cols)

In [146]:
# drop_cols.remove("id")
score = predict(models,df_test[['id']+features])

In [1]:
score

NameError: ignored

In [148]:
score.dtypes

id        int64
pred    float64
dtype: object

In [149]:
score.to_csv("/content/drive/MyDrive/signate/competiton/210716_AI Quest/submit.csv",header=False,index=False)