<a href="https://colab.research.google.com/github/yuto-kobayashi-1/signate-AIQuest/blob/develop/210716_AIQuest.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [225]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold
import lightgbm as lgb
from copy import deepcopy
import numpy as np
from datetime import datetime , date

In [166]:
# Mount Google Drive at /content/drive (Colab only); the CSV paths used below live under it.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [249]:
# Single place to change when the competition folder moves; the directory name
# (including its spelling) must match the folder that exists on Drive.
DATA_DIR = "/content/drive/MyDrive/signate/competiton/210716_AI Quest"

# Raw competition files: `data` is the training split, `test` the split to predict.
data = pd.read_csv(f"{DATA_DIR}/train.csv",encoding="utf_8")
test = pd.read_csv(f"{DATA_DIR}/test.csv",encoding="utf_8")

In [250]:
def preprocessing(df, reference_date=date(2021,7,21)):
  """Return a cleaned copy of `df` with numeric `host_response_rate` and `host_since`.

  * `host_response_rate`: strings such as "95%" become the float 95.0;
    missing values stay NaN.
  * `host_since`: date values become the number of whole days between that
    date and `reference_date`; missing values stay NaN.

  The input frame is not modified. A column that is already numeric is left
  untouched, so calling this function again on its own output is harmless.

  Parameters
  ----------
  df : pandas.DataFrame
    Must contain the columns "host_response_rate" and "host_since".
  reference_date : datetime.date, str or pandas.Timestamp
    Day the host tenure is measured against (default 2021-07-21).

  Returns
  -------
  pandas.DataFrame
    A new frame with the two columns converted.
  """
  df = df.copy()

  rate = df["host_response_rate"]
  if not pd.api.types.is_numeric_dtype(rate):
    # regex=False: '%' is a literal character, not a pattern.
    stripped = rate.str.replace('%', '', regex=False)
    df["host_response_rate"] = pd.to_numeric(stripped).astype(float)

  since = df["host_since"]
  if not pd.api.types.is_numeric_dtype(since):
    # Stay in datetime64/timedelta64 so `.dt.days` never depends on dtype
    # inference over Python date objects; normalize() drops any time of day.
    elapsed = pd.Timestamp(reference_date) - pd.to_datetime(since).dt.normalize()
    df["host_since"] = elapsed.dt.days

  return df

def label_encorder(df_train,df_test,cols):
  """Label-encode `cols` consistently across the train and test frames.

  Both frames are stacked so that a category receives the same integer code
  in train and test. Codes are the rank of the category among the sorted
  non-null values of the stacked column. Missing values stay NaN.

  Neither input frame is modified.

  Parameters
  ----------
  df_train, df_test : pandas.DataFrame
    Frames that both contain every column in `cols`.
  cols : iterable of str
    Columns to encode.

  Returns
  -------
  (pandas.DataFrame, pandas.DataFrame)
    Encoded train and test frames. Rows are indexed by their position in the
    stacked frame (train first, then test). A "y" column is never present in
    the returned test frame.
  """
  # assign() returns new frames, so the helper columns never leak into the
  # caller's data. The placeholder y keeps the dtype of the train target intact
  # through the concat; it is dropped again from the test part below.
  train_part = df_train.assign(flg=0)
  test_part = df_test.assign(y=100, flg=1)

  df_ = pd.concat([train_part,test_part]).reset_index(drop=True)

  for col in cols:
    # Use a real null mask instead of a string sentinel: a sentinel value
    # would collide with a genuine category of the same name.
    is_null = df_[col].isna()
    print(int(is_null.sum()))

    # sort=True ranks the sorted unique non-null values; nulls get code -1.
    codes, _ = pd.factorize(df_[col], sort=True)
    encoded = pd.Series(codes, index=df_.index)
    if is_null.any():
      encoded = encoded.astype(float).mask(is_null)
    df_[col] = encoded

  df_train_ = df_[df_["flg"]==0].drop("flg",axis=1)
  df_test_ = df_[df_["flg"]==1].drop(["flg","y"],axis=1)

  return df_train_ ,  df_test_

def train(X,y,label_cols):
  """Train one LightGBM regressor per fold of a shuffled 5-fold split.

  For every fold the columns in `label_cols` are target-encoded with
  `target_encording` (statistics come from the training part of the fold
  only), then a model is fitted with early stopping on the validation part.

  Parameters
  ----------
  X : pandas.DataFrame
    Feature frame; rows are selected positionally per fold.
  y : pandas.Series
    Target aligned with `X`.
  label_cols : list of str
    Columns of `X` to target-encode inside each fold.

  Returns
  -------
  list
    The objects returned by `lgb.train`, one per fold, in fold order.
  """
  # Identical for every fold, so defined once outside the loop.
  params = {
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': 'rmse',
    'num_leaves': 64,
    'min_data_in_leaf': 20,
    'max_depth': 7,
    'verbose': 0,
  }

  kf = KFold(n_splits=5,shuffle=True,random_state=0)

  models = []
  for train_index, valid_index in kf.split(X, y):
    # Explicit copies: the encoder rewrites columns, and writing into an
    # iloc slice is what raises pandas' SettingWithCopyWarning.
    X_train = X.iloc[train_index].copy()
    y_train = y.iloc[train_index]
    X_valid = X.iloc[valid_index].copy()
    y_valid = y.iloc[valid_index]

    X_train, y_train, X_valid, y_valid = target_encording(X_train, y_train, X_valid, y_valid, label_cols)

    train_data = lgb.Dataset(data=X_train, label=y_train)
    validation_data = lgb.Dataset(data=X_valid, label=y_valid)

    model = lgb.train(
      params=dict(params),  # fresh dict per fold so no fold can affect another
      train_set=train_data,
      num_boost_round=1000,
      valid_sets=[validation_data],
      # The callback form of early stopping is accepted by old and new
      # LightGBM releases; the `early_stopping_rounds=` keyword of
      # lgb.train was removed in LightGBM 4.
      callbacks=[lgb.early_stopping(stopping_rounds=100)],
    )

    models.append(model)

  return models

def feature_importance(models,data,cols):
  """Average the feature importances of `models` and normalise them to sum to 1.

  Parameters
  ----------
  models : iterable
    Objects exposing `feature_importance()`, which must return one value per
    entry of `cols`, in the same order.
  data : pandas.DataFrame
    Frame containing `cols`; only used to obtain the feature names.
  cols : list of str
    Feature columns the models were trained on.

  Returns
  -------
  pandas.DataFrame
    One row per feature (index = feature name) and a single column
    "importance", sorted in descending order.
  """
  feature_names = list(data[cols].columns)

  # Build the per-model table in one go: DataFrame.append no longer exists in
  # pandas 2.x and growing a frame row by row is quadratic.
  per_model = pd.DataFrame(
    [model.feature_importance() for model in models],
    columns=feature_names,
    dtype=float,
  )

  importance = per_model.mean().to_frame("importance")
  importance = importance.sort_values("importance",ascending=False)

  return importance/importance.sum()

In [251]:
def target_encording(tr_x, tr_y, va_x, va_y, cols):
  """Target-encode `cols` for one training/validation split.

  * Validation rows receive the mean target of their category computed on
    the whole training part.
  * Training rows receive an out-of-fold mean: the training part is split
    into 5 shuffled folds and each row is encoded with the category means of
    the other folds.

  Categories not seen in the data used for a mean become NaN.
  The input frames are not modified; encoded copies are returned.

  Parameters
  ----------
  tr_x, va_x : pandas.DataFrame
    Training and validation features containing `cols`.
  tr_y, va_y : pandas.Series
    Targets; `tr_y` must share its index with `tr_x`. Both are returned
    unchanged.
  cols : iterable of str
    Columns to encode.

  Returns
  -------
  tuple
    (encoded tr_x, tr_y, encoded va_x, va_y)
  """
  # Copies keep the caller's frames intact and avoid writing into slices.
  tr_x = tr_x.copy()
  va_x = va_x.copy()
  cols = list(cols)

  # With an integer random_state the split is the same on every call, so the
  # folds are computed once and reused for every column.
  kf_encording = KFold(n_splits=5,shuffle=True,random_state=72)
  folds = list(kf_encording.split(tr_x)) if cols else []

  for c in cols:
    data_tmp = pd.DataFrame({c:tr_x[c],'target':tr_y})

    # Replace the validation categories with the training-part means.
    target_mean = data_tmp.groupby(c)['target'].mean()
    # Plain column assignment replaces the column (and its dtype);
    # `.loc[:, c] = ...` may keep the old object dtype on recent pandas.
    va_x[c] = va_x[c].map(target_mean)

    # Array that will hold the encoded values of the training rows.
    tmp = np.full(tr_x.shape[0], np.nan)
    for idx_1, idx_2 in folds:
      # Out-of-fold: category means from the rows outside the current fold.
      fold_mean = data_tmp.iloc[idx_1].groupby(c)['target'].mean()
      # Store the encoded values for the rows of the current fold.
      tmp[idx_2] = tr_x[c].iloc[idx_2].map(fold_mean)

    tr_x[c] = tmp

  return tr_x, tr_y, va_x, va_y

def target_encording_test(tr_x, tr_y, va_x, cols):
  """Target-encode `cols` of `va_x` with category means from the training data.

  Every value of a column in `cols` is replaced by the mean of `tr_y` over
  the training rows that have the same category. Categories absent from the
  training data become NaN.

  `va_x` is not modified; an encoded copy is returned.

  Parameters
  ----------
  tr_x : pandas.DataFrame
    Training features containing `cols` (still holding raw categories).
  tr_y : pandas.Series
    Training target sharing its index with `tr_x`.
  va_x : pandas.DataFrame
    Frame to encode; must contain `cols`.
  cols : iterable of str
    Columns to encode.

  Returns
  -------
  pandas.DataFrame
    Copy of `va_x` with `cols` replaced by float target means.
  """
  va_x = va_x.copy()

  for c in cols:
    data_tmp = pd.DataFrame({c:tr_x[c],'target':tr_y})
    target_mean = data_tmp.groupby(c)['target'].mean()
    # Plain column assignment swaps in the float column; `.loc[:, c] = ...`
    # may keep the previous object dtype on recent pandas versions.
    va_x[c] = va_x[c].map(target_mean)

  return va_x

def predict(models,df):
  """Average the predictions of `models` over the rows of `df`.

  Parameters
  ----------
  models : sequence
    Trained models exposing `predict(X, num_iteration=...)` and a
    `best_iteration` attribute.
  df : pandas.DataFrame
    Rows to predict. If it contains an "id" column, that column supplies the
    ids and is removed before predicting; the remaining columns must be the
    model features in training order. If there is no "id" column, `df` is
    used as the feature matrix as-is and the ids are read from the
    notebook-level `df_test` frame (legacy call style
    `predict(models, df_test[cols])`).

  Returns
  -------
  pandas.DataFrame
    Columns "id" (int) and "pred" (mean prediction over all models).

  Raises
  ------
  ValueError
    If `models` is empty (the mean over zero models would be all NaN).
  """
  if len(models) == 0:
    raise ValueError("predict() needs at least one trained model")

  if "id" in df.columns:
    ids = df["id"]
    features = df.drop(columns=["id"])
  else:
    # Legacy path: the caller passed only feature columns, so the ids have to
    # come from the global frame those rows were taken from.
    ids = df_test["id"]
    features = df

  preds = np.zeros((len(features),len(models)))
  for n , model in enumerate(models):
    preds[:,n] = model.predict(features,num_iteration=model.best_iteration)

  score = pd.DataFrame({"id": ids.astype(int)})
  score["pred"] = preds.mean(axis=1)

  return score

In [252]:
# Row count of the training split.
data.shape[0]

55583

In [253]:
# Row count of the split to predict.
test.shape[0]

18528

In [254]:
# Model input features; the list order is the column order handed to the models.
cols = [
  "accommodates",
  "bathrooms",
  "bed_type",
  "bedrooms",
  "beds",
  "cancellation_policy",
  "city",
  "cleaning_fee",
  "host_has_profile_pic",
  "host_identity_verified",
  "instant_bookable",
  "latitude",
  "longitude",
  "neighbourhood",
  "number_of_reviews",
  "property_type",
  "review_scores_rating",
  "room_type",
  "host_response_rate",
  "host_since",
]

# Subset of `cols` that is passed to the encoders as categorical columns.
label_cols = [
  "bed_type",
  "cancellation_policy",
  "city",
  "cleaning_fee",
  "host_has_profile_pic",
  "host_identity_verified",
  "instant_bookable",
  "neighbourhood",
  "property_type",
  "room_type",
]


In [255]:
# Run the same cleaning step on both splits.
data = preprocessing(df=data)
test = preprocessing(df=test)

In [256]:
# Label encoding via label_encorder(data, test, label_cols) is currently disabled;
# the preprocessed frames are used directly (df_train / df_test are aliases, not copies).
df_train = data
df_test = test

In [257]:
# One model per cross-validation fold.
models = train(
  X=df_train[cols],
  y=df_train["y"],
  label_cols=label_cols,
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)


[1]	valid_0's rmse: 159.332
Training until validation scores don't improve for 100 rounds.
[2]	valid_0's rmse: 152.208
[3]	valid_0's rmse: 145.93
[4]	valid_0's rmse: 140.67
[5]	valid_0's rmse: 136.246
[6]	valid_0's rmse: 132.571
[7]	valid_0's rmse: 129.541
[8]	valid_0's rmse: 126.983
[9]	valid_0's rmse: 124.59
[10]	valid_0's rmse: 122.596
[11]	valid_0's rmse: 120.869
[12]	valid_0's rmse: 119.445
[13]	valid_0's rmse: 118.422
[14]	valid_0's rmse: 117.35
[15]	valid_0's rmse: 116.551
[16]	valid_0's rmse: 115.988
[17]	valid_0's rmse: 115.286
[18]	valid_0's rmse: 114.815
[19]	valid_0's rmse: 114.319
[20]	valid_0's rmse: 113.995
[21]	valid_0's rmse: 113.685
[22]	valid_0's rmse: 113.339
[23]	valid_0's rmse: 113.036
[24]	valid_0's rmse: 112.828
[25]	valid_0's rmse: 112.679
[26]	valid_0's rmse: 112.329
[27]	valid_0's rmse: 112.077
[28]	valid_0's rmse: 111.872
[29]	valid_0's rmse: 111.674
[30]	valid_0's rmse: 111.525
[31]	valid_0's rmse: 111.516
[32]	valid_0's rmse: 111.434
[33]	valid_0's rmse: 1

In [258]:
# Best validation RMSE of every fold model, then their mean.
# (The list is called `aucs`, but the values are RMSE scores.)
aucs = [fold_model.best_score["valid_0"]["rmse"] for fold_model in models]
np.mean(aucs)

106.62279219806915

In [259]:
# Normalised feature importance averaged over the fold models.
importance = feature_importance(models=models, data=data, cols=cols)
importance

Unnamed: 0,importance
neighbourhood,0.131053
longitude,0.102565
host_since,0.083663
latitude,0.082379
accommodates,0.067377
bathrooms,0.061109
number_of_reviews,0.058559
review_scores_rating,0.054258
host_response_rate,0.052024
property_type,0.050108


In [260]:
# Encode the categorical test columns with target means from the full training data.
df_test = target_encording_test(
  tr_x=df_train[cols],
  tr_y=df_train["y"],
  va_x=df_test,
  cols=label_cols,
)

In [261]:
# Peek at the first five encoded test rows.
df_test.loc[:, cols].head(5)

Unnamed: 0,accommodates,bathrooms,bed_type,bedrooms,beds,cancellation_policy,city,cleaning_fee,host_has_profile_pic,host_identity_verified,instant_bookable,latitude,longitude,neighbourhood,number_of_reviews,property_type,review_scores_rating,room_type,host_response_rate,host_since
0,6,2.0,161.855415,2.0,2.0,177.708076,165.629287,163.180471,160.028517,162.540752,152.712832,42.359278,-71.069962,201.766234,58,187.191627,90.0,218.635896,100.0,1793.0
1,3,1.0,161.855415,1.0,1.0,141.202828,154.555324,163.180471,160.028517,159.091831,162.805405,34.084747,-118.367355,162.843985,4,125.772959,100.0,218.635896,100.0,2513.0
2,2,1.0,161.855415,0.0,1.0,149.785029,143.096152,151.821298,160.028517,159.091831,162.805405,40.720541,-73.959192,138.342622,0,146.58608,,88.334405,100.0,3199.0
3,4,1.0,161.855415,1.0,2.0,177.708076,143.096152,151.821298,160.028517,159.091831,162.805405,40.681117,-73.944091,101.388235,0,146.58608,,218.635896,,3101.0
4,3,1.5,161.855415,1.0,2.0,177.708076,154.555324,163.180471,160.028517,159.091831,162.805405,34.150995,-118.409359,,6,202.257112,92.0,218.635896,100.0,2397.0


In [262]:
# Mean prediction of the fold models for every test row.
score = predict(models=models, df=df_test[cols])

In [263]:
# Show the id / averaged-prediction table.
score

Unnamed: 0,id,pred
0,0,237.917782
1,1,131.405375
2,2,103.732415
3,3,159.455153
4,4,148.437243
...,...,...
18523,18523,201.124233
18524,18524,132.448742
18525,18525,194.179577
18526,18526,133.702683


In [264]:
# Column dtypes of the prediction table.
score.dtypes

id        int64
pred    float64
dtype: object

In [265]:
# Write the submission file: two unnamed columns (id, prediction), no index column.
score.to_csv(
  "/content/drive/MyDrive/signate/competiton/210716_AI Quest/submit.csv",
  index=False,
  header=False,
)