In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold
import lightgbm as lgb
from copy import deepcopy
import numpy as np

In [None]:
# Colab-only step: mount Google Drive at /content/drive so the CSV files
# referenced by the later cells can be read and the submission written.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Competition folder on the mounted Drive. Defined once so the train and test
# paths cannot drift apart; the folder name is kept byte-for-byte.
DATA_DIR = "/content/drive/MyDrive/signate/competiton/210716_AI Quest"

# `data` is the labelled training set, `test` the rows to predict.
data = pd.read_csv(f"{DATA_DIR}/train.csv", encoding="utf_8")
test = pd.read_csv(f"{DATA_DIR}/test.csv", encoding="utf_8")

In [None]:
def preprocessing(df):
  """Convert ``host_response_rate`` from percent strings to floats.

  The column is rewritten on ``df`` itself (the caller's frame is modified)
  and the same object is returned; e.g. ``"95%"`` becomes ``95.0``. Missing
  values are left as ``NaN`` (no fill is applied).

  The function is safe to call more than once: if the column is already
  numeric (it was converted earlier, or it never contained strings) it is
  passed through instead of failing on the ``.str`` accessor.

  Args:
    df: DataFrame with a ``host_response_rate`` column.

  Returns:
    ``df``, with ``host_response_rate`` stored as float.
  """
  rate = df["host_response_rate"]
  if not pd.api.types.is_numeric_dtype(rate):
    # regex=False: '%' is a literal character to strip, not a pattern.
    rate = rate.str.replace('%', '', regex=False)
  df["host_response_rate"] = rate.astype(float)

  return df

def label_encorder(df_train,df_test,cols):
  """Integer-encode categorical columns consistently across train and test.

  Both frames are stacked so every column in ``cols`` is encoded with one
  shared mapping. Codes follow the sorted order of the distinct non-missing
  values (0, 1, 2, ...). Missing values stay ``NaN`` in the output; a column
  containing any is returned as float, otherwise as integer codes.

  Missing values are detected directly rather than through a placeholder
  string, so a genuine category such as ``"OK"`` is encoded like any other
  value. The input frames are not modified.

  For each column the number of missing entries (train + test) is printed.

  Args:
    df_train: Training frame; expected to contain the target column ``"y"``.
    df_test: Test frame. Any ``"y"`` column it carries is discarded.
    cols: Names of the columns to encode.

  Returns:
    ``(train, test)``: the encoded frames. They keep the row index of the
    stacked frame (train rows first, test rows continuing after them), and
    the test frame has no ``"y"`` column.
  """
  # assign() returns copies, so the caller's frames do not gain the helper
  # columns as a side effect.
  train_part = df_train.assign(flg=0)
  # Placeholder target so the stacked "y" column is not padded with NaN for
  # the test rows; it is dropped again before returning.
  test_part = df_test.assign(y=100, flg=1)

  df_ = pd.concat([train_part, test_part]).reset_index(drop=True)

  for col in cols:
    # sort=True numbers categories in sorted order; missing values get -1.
    codes, _ = pd.factorize(df_[col], sort=True)
    is_null = codes < 0
    print(int(is_null.sum()))
    if is_null.any():
      # Integer arrays cannot hold NaN, so switch to float only when needed.
      codes = codes.astype(float)
      codes[is_null] = np.nan
    df_[col] = codes

  df_train_ = df_[df_["flg"]==0].drop("flg",axis=1)
  df_test_ = df_[df_["flg"]==1].drop(["flg","y"],axis=1)

  return df_train_ ,  df_test_

def train(X,y,n_splits=5,random_state=0,num_boost_round=1000,early_stopping_rounds=100):
  """Train one LightGBM regressor per fold of a shuffled K-fold split.

  Each fold's held-out part is used as the validation set for early
  stopping, so every returned booster exposes ``best_iteration`` and
  ``best_score["valid_0"]["rmse"]``.

  Early stopping is requested through the ``lgb.early_stopping`` callback
  rather than the ``early_stopping_rounds=`` keyword of ``lgb.train``, which
  newer LightGBM releases no longer accept.

  Args:
    X: Feature DataFrame (rows are selected with ``.iloc``).
    y: Target Series aligned with ``X``.
    n_splits: Number of folds.
    random_state: Seed for the fold shuffle.
    num_boost_round: Upper bound on boosting rounds per fold.
    early_stopping_rounds: Stop a fold after this many rounds without
      improvement of the validation RMSE.

  Returns:
    List of trained boosters, one per fold, in fold order.
  """
  # Identical for every fold, so built once instead of once per iteration.
  params = {
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': 'rmse',
    'num_leaves': 64,
    'min_data_in_leaf': 20,
    'max_depth': 7,
    'verbose': 0,
  }

  kf = KFold(n_splits=n_splits,shuffle=True,random_state=random_state)

  models = []
  for train_index, valid_index in kf.split(X, y):
    train_data = lgb.Dataset(
      data=X.iloc[train_index],
      label=y.iloc[train_index],
    )
    validation_data = lgb.Dataset(
      data=X.iloc[valid_index],
      label=y.iloc[valid_index],
    )

    model = lgb.train(
      # Fresh dict per fold so no fold can see another fold's parameter edits.
      params=dict(params),
      train_set=train_data,
      num_boost_round=num_boost_round,
      valid_sets=[validation_data],
      callbacks=[lgb.early_stopping(stopping_rounds=early_stopping_rounds)],
    )

    models.append(model)

  return models

def feature_importance(models,data,cols):
  """Average the per-model feature importances and normalise them to sum to 1.

  Args:
    models: Iterable of fitted models exposing ``feature_importance()``,
      each returning one value per entry of ``cols`` in the same order.
    data: DataFrame containing ``cols``; only its column labels are used.
    cols: Feature names the models were trained on.

  Returns:
    DataFrame indexed by feature name with a single ``"importance"`` column,
    sorted from most to least important; the column sums to 1.
  """
  feature_names = data[cols].columns

  # One row per model, built in a single step; appending row by row relied on
  # DataFrame.append, which pandas 2.0 removed.
  per_model = pd.DataFrame(
    [model.feature_importance() for model in models],
    columns=feature_names,
    dtype=float,
  )

  importance = per_model.mean().to_frame("importance")
  importance = importance.sort_values("importance",ascending=False)

  return importance/importance.sum()

In [None]:
# Number of rows in the training frame.
data.shape[0]

55583

In [None]:
# Number of rows in the test frame.
test.shape[0]

18528

In [None]:
# Feature columns passed to the model, in the order the model receives them.
cols = [
  "accommodates",
  "bathrooms",
  "bed_type",
  "bedrooms",
  "beds",
  "cancellation_policy",
  "city",
  "cleaning_fee",
  "host_has_profile_pic",
  "host_identity_verified",
  "instant_bookable",
  "latitude",
  "longitude",
  "neighbourhood",
  "number_of_reviews",
  "property_type",
  "review_scores_rating",
  "room_type",
  "host_response_rate",
]

# Subset of `cols` that is integer-encoded before training.
label_cols = [
  "bed_type",
  "cancellation_policy",
  "city",
  "cleaning_fee",
  "host_has_profile_pic",
  "host_identity_verified",
  "instant_bookable",
  "neighbourhood",
  "property_type",
  "room_type",
]

In [None]:
# Run the same cleaning step on the training frame, then on the test frame.
data, test = (preprocessing(frame) for frame in (data, test))

In [None]:
# Encode the categorical columns of both frames with one shared mapping.
df_train, df_test = label_encorder(df_train=data, df_test=test, cols=label_cols)

0
0
0
0
188
188
0
6872
0
0


In [None]:
# Fit one model per cross-validation fold on the selected features.
models = train(X=df_train[cols], y=df_train["y"])

[1]	valid_0's rmse: 159.736
Training until validation scores don't improve for 100 rounds.
[2]	valid_0's rmse: 152.807
[3]	valid_0's rmse: 146.82
[4]	valid_0's rmse: 141.785
[5]	valid_0's rmse: 137.49
[6]	valid_0's rmse: 133.701
[7]	valid_0's rmse: 130.551
[8]	valid_0's rmse: 128.023
[9]	valid_0's rmse: 125.693
[10]	valid_0's rmse: 123.932
[11]	valid_0's rmse: 122.389
[12]	valid_0's rmse: 120.945
[13]	valid_0's rmse: 119.735
[14]	valid_0's rmse: 118.69
[15]	valid_0's rmse: 117.864
[16]	valid_0's rmse: 117.096
[17]	valid_0's rmse: 116.466
[18]	valid_0's rmse: 115.793
[19]	valid_0's rmse: 115.286
[20]	valid_0's rmse: 114.871
[21]	valid_0's rmse: 114.491
[22]	valid_0's rmse: 114.024
[23]	valid_0's rmse: 113.746
[24]	valid_0's rmse: 113.493
[25]	valid_0's rmse: 113.107
[26]	valid_0's rmse: 112.821
[27]	valid_0's rmse: 112.579
[28]	valid_0's rmse: 112.386
[29]	valid_0's rmse: 112.037
[30]	valid_0's rmse: 111.892
[31]	valid_0's rmse: 111.735
[32]	valid_0's rmse: 111.603
[33]	valid_0's rmse: 

In [None]:
# NOTE: despite its name, `aucs` collects each fold's best validation RMSE;
# the cell displays their mean as the cross-validation score.
aucs = [model.best_score["valid_0"]["rmse"] for model in models]
np.mean(aucs)

106.71715473963881

In [None]:
# Normalised importance per feature, averaged over the fold models.
importance = feature_importance(models=models, data=data, cols=cols)
importance

Unnamed: 0,importance
longitude,0.157842
latitude,0.145902
neighbourhood,0.103858
bathrooms,0.079528
accommodates,0.078706
number_of_reviews,0.067196
review_scores_rating,0.065043
host_response_rate,0.059524
bedrooms,0.05144
beds,0.046918


In [None]:
# One column of test predictions per fold model; the columns are averaged
# into the final prediction in a later cell.
test_features = df_test[cols]  # identical for every model, so sliced once
# Sized from df_test, the frame the predictions are actually made on.
preds = np.zeros((len(df_test), len(models)))
for fold_idx, model in enumerate(models):
  preds[:, fold_idx] = model.predict(test_features, num_iteration=model.best_iteration)

In [None]:
# Submission frame: integer test id plus the mean of the per-fold predictions.
# Predictions are kept as floats (no rounding is applied).
score = df_test[["id"]].astype(int).assign(pred=preds.mean(axis=1))

In [None]:
# Display the submission frame: one row per test id with its averaged prediction.
score

Unnamed: 0,id,pred
55583,0,243.525461
55584,1,144.228364
55585,2,118.937154
55586,3,164.334785
55587,4,160.300292
...,...,...
74106,18523,198.557949
74107,18524,117.728608
74108,18525,236.200806
74109,18526,134.676637


In [None]:
# Check the column dtypes of the submission frame before it is written out.
score.dtypes

id        int64
pred    float64
dtype: object

In [None]:
# Write the submission as bare "id,pred" rows: no header line, no index column.
score.to_csv(
  path_or_buf="/content/drive/MyDrive/signate/competiton/210716_AI Quest/submit.csv",
  header=False,
  index=False,
)