In [14]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error as mse
from sklearn.ensemble import *
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge
from sklearn.decomposition import PCA
from scipy.stats import mode
from sklearn.impute import SimpleImputer
import warnings
from sklearn import metrics
import xgboost as xgb
import lightgbm as lgb
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from fun import *
import time
import nltk
from nltk.tokenize import word_tokenize
#function to split text into word
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.base import TransformerMixin
from sklearn.base import BaseEstimator
from catboost import CatBoostRegressor
import pycountry

In [15]:
# Raw data import
train_raw = pd.read_csv("data/train.csv", low_memory=False)
test_raw = pd.read_csv("data/test.csv", low_memory=False)

## Preprocessing

In [16]:
def drop_initial(df): 
    # Drop the columns that do not need
    cols = ["id", "space", "experiences_offered", 
            "host_acceptance_rate",
            "city", "state", "zipcode", "country",
            "country_code",
            "maximum_nights","host_location","first_review", "last_review", 
            "minimum_nights","is_business_travel_ready"]
    return df.drop(cols,  axis = 1)

In [17]:
##################################### FEATURE ENGINEERING HELPER FUNCTIONS #####################################
# ----------------------------- Text Processing ----------------------------- #
def tokenize(sentence):
    # From text to words
    if pd.isnull(sentence):
        return "NoDescription"
    words = nltk.word_tokenize(sentence)
    new_words= [word for word in words if word.isalnum()]
    tokens = [w for w in new_words if not w in stop_words]
    snowball = nltk.stem.SnowballStemmer("english")
    stems = []
    for t in tokens:    
        stems.append(snowball.stem(t))
    return " ".join(stems)

def if_lux(sentence):
    # Mark as luxury listing if there contains following keywards in the processing text feature
    keys = [
        "celebr", "stylist", "premium",
        "lux", "citi view", "million",
        "superb", "prestig","magnific",
        "triplex", "duplex", "panoram","sleek",
        "eleg", "massiv", "photographi",
        "squar feet", "triple", "ultim"
        ]
    for i in keys:
        if i in sentence:
            return 1
    return 0

# class TextProcessor(BaseEstimator, TransformerMixin):
#     # TF-IDF vectorizer
#     def __init__(self):
#         self.tweet_text_transformer = Pipeline(steps=[
#         ('count_vectoriser', TfidfVectorizer(min_df = 0.05, max_features = 1000)),    ])   
#     def fit(self, X, y=None):
#         print("fitting...")
#         self.tweet_text_transformer.fit(X.squeeze())
#         return self
#     def transform(self, X, y=None):
#         print("transforming...")
#         return  self.tweet_text_transformer.transform(X.squeeze()).toarray()

# -------------------------- Extra People Processing -------------------------- #
def to_num(data, col):
    # drop $ and convert to float
    res = data[col].str.replace("$", "", regex = False).astype(float)
    return res


# -------------------------- Date Feature Processing -------------------------- #
def to_date(col, data):
    # process date features into three seperate features: year, month, day
    dt = pd.to_datetime(data[col]).dt
    data = data.drop(col, axis = 1)
    data[col+"_year"] = dt.year
    data[col+"_month"] = dt.month
    data[col+"_day"] = dt.day
    return data
    
# ---------------------------- Amenities Processing --------------------------- #
def get_all_amenities():
    # Get all unique amenities in the listings
    all_ams = set()
    for i in train_raw.amenities:
        lst = i.replace("{", "").replace("}", "").replace("\"", "").strip().split(",")
        for j in lst:
            all_ams.add(j)
    all_ams.remove("")
    return all_ams

def to_amenities(data):
    all_ams = get_all_amenities()
    am = data.amenities.apply(lambda x: x.replace("{", "").replace("}", "").replace("\"", "").strip().split(","))
    for amenity in all_ams:
        temp = []
        for idx in am.index:
            if amenity in am[idx]:
                temp.append(1)
            else:
                temp.append(0)
        data["has_"+amenity] = temp
    return data.drop("amenities", axis = 1)

# ----------------------- Host Neighbourhood Processing ----------------------- #
def get_host_neibourhood_value_count():
    return train_raw.host_neighbourhood.fillna("NoRecord").value_counts()   
raw_vc = get_host_neibourhood_value_count()
def to_host_nbhd(nbhd):
    if nbhd not in raw_vc.index:
        return "Other"
    elif raw_vc[nbhd] >6:
        return nbhd
    else:
        return "Other"

# --------------------------- Square Feet Processing --------------------------- #
def to_area_range(area):
    # Convert area feature to categorical features
    if pd.isnull(area):
        return "na"
    elif area < 500:
        return "<500"
    elif area < 1000:
        return "<1000"
    elif area < 1500:
        return "<1500"
    else:
        return ">1500"    

In [18]:
####################################### FEATURE ENGINEERING #######################################
def engineer_features(df):
    # Process extra people
    print("Processing extra_people...")
    df["extra_people"] = to_num(data = df, col = "extra_people")
    
    # process text features
    print("Processing text features...")
    txt_data = ["description", "summary", "name"]
    for i in txt_data:
        print("     Tokenizing {}...".format(i))
        df[i] = df[i].fillna("NoRecord").apply(tokenize)
    all_text = (df["description"]+" "+ df["summary"]+" "+df["name"])
    print("     Constructing if_lux feature...")
    df["potential_lux"] = all_text.apply(if_lux)
    df.drop(txt_data, axis = 1, inplace = True)
    
    # process date features
    print("Processing date features...")
    date_feature = ["host_since"]
    for i in date_feature:
        df = to_date(i, df)
        
    # host_neighbourhood transformation
    print("Processing host_neighbourhood...")
    df["host_neighbourhood"] = df["host_neighbourhood"].fillna("NoRecord").apply(to_host_nbhd)
    
    # Transfer amenities feature to multiple one hot featuers
    print("Processing amenities...")
    df = to_amenities(df)

    # If nan, fill 0, otherwise fill 1
    print("Processing nan OneHot...")
    nan_one_hot = ["access", 
                   "neighborhood_overview", 
                   "interaction", 
                   "host_name", 
                   "notes", 
                   "transit", 
                   "host_about", 
                   "house_rules"]
    for i in nan_one_hot:
        df[i] = df[i].apply(lambda x: 0 if pd.isnull(x) else 1)
    
    # transform from square feet to categorical area range
    print("Processing sq_feet...")
    df["square_feet"] = df["square_feet"].apply(to_area_range)
    
    # Count the number of host verifications in the dataset
    print("Processing host_verifications...")
    df["host_verifications"] = df["host_verifications"].apply(lambda x: x.count(",")+1 if len(x)>4 else 0)
    
##################################### THESE ARE THE FAILED ATTEMPTS #####################################
#     df["host_location"] =df.host_location.str.split(",").fillna("*").apply(lambda x: x[-1]).str.strip()
#     transit = df["transit"].apply(tokenize).str.split(" ")
#     df["has_bus"] = transit.apply(lambda x: "bus" in x)
#     df["has_subway"] = transit.apply(lambda x: "metro" in x or "subway" in x or "train" in x)
#     df["has_taxi"] = transit.apply(lambda x: "uber" in x or "lyft" in x or "cab" in x or "taxi" in x)
#     df["has_walk"] = transit.apply(lambda x: "walk" in x or "foot" in x or "feet" in x)    
#     def is_us(code):
#         if code == "United States" or code == "US" or code == "NY":
#             return "US"
#         elif code == "*":
#             return "N/A"
#         else:
#             return "Other"
#     df["host_location"] = df["host_location"].apply(is_us)
#     def tokenize(sentence):
#         words = nltk.word_tokenize(sentence)
#         new_words= [word for word in words if word.isalnum()]
#         tokens = [w for w in new_words if not w in stop_words]
#         snowball = nltk.stem.SnowballStemmer("english")
#         stems = []
#         for t in tokens:    
#             stems.append(snowball.stem(t))
#         return " ".join(stems)
#     df["summary"] = df["summary"].apply(lambda x: tokenize(x) if pd.notnull(x) else "NoDescription")
    # transform property types
#     ptmap = {"Villa":"Other",
#              "Bungalow":"Other",
#              "Tiny house":"Other",
#              "Aparthotel":"Other",
#              "Boat":"Other",
#              "Camper/RV":"Other",
#              "Tent":"Other",
#              "Cottage":"Other",
#              "Houseboat":"Other",
#              "Cabin":"Other",
#              "Chalet":"Other",
#              "Timeshare":"Other",
#              "Train":"Other",
#              "Island":"Other",
#              "Casa particular (Cuba)":"Other"}
# #     ptmap = {"Villa":"Other_value",
# #                  "Bungalow":"Other",
# #                  "Tiny house":"Other",
# #                  "Aparthotel":"Other_lux",
# #                  "Boat":"Other",
# #                  "Camper/RV":"Other",
# #                  "Tent":"Other",
# #                  "Cottage":"Other",
# #                  "Houseboat":"Other_lux",
# #                  "Cabin":"Other",
# #                  "Chalet":"Other",
# #                  "Timeshare":"Other_lux",
# #                  "Train":"Other_value",
# #                  "Island":"Other_value",
# #                  "Casa particular (Cuba)":"Other_value",
# #                  "Resort": "Other_lux",
# #                  "Hotel": "Other_lux",
# #                  "Hostel": "Other_value"}
#     df["property_type"].replace(ptmap, inplace = True)
    # transforom cancellation policy
#     cpmap = {"super_strict_60": "strict",
#              "super_strict_30": "strict",
#              "strict": "strict",
#              "long_term": "strict"}
#     df["cancellation_policy"].replace(cpmap, inplace = True)
    # na
    print("------- FEATURE ENGINEERING FINISHED -------")
    return df

In [19]:
####################################### IMPUTATION #######################################
def filling_values(df):
    print("Constructing filling map...")
    filling = {}
    # 0 imputation
    zeros = [
              "reviews_per_month",
              "host_listings_count",
              "host_since_year", 
              "host_since_month", 
              "host_since_day",
              "bathrooms", 
              "bedrooms", 
              "beds", 
              "review_scores_rating",
              "review_scores_accuracy",
              "review_scores_cleanliness",
              "review_scores_checkin",
              "review_scores_communication",
              "review_scores_location",
              "review_scores_value",
            ]
    for i in zeros:
        filling[i] = 0
        
    # unknown imputation
    unknown = [
                "host_is_superhost", 
                "host_identity_verified", 
                "host_has_profile_pic", 
                "require_guest_profile_picture",
                "host_response_time"
               ]
    for i in unknown:
        # Fill the features with "unknown"
        filling[i] = "unknown"
    
    # Special imputation
    filling["host_response_rate"] = (df["host_response_rate"].fillna(mode(df["host_response_rate"])
                                                        .mode[0]).str.replace("%","")
                                                        .astype(int))/100
    
#     modes = [
#                 "first_review_year", "first_review_month",
#                "first_review_day", "last_review_year",
#                "last_review_month", "last_review_day",
#              ]
#     for i in modes:
#         filling[i] = df.groupby("neighbourhood_group_cleansed")[i].mean()
    return filling


def fill_missing(df, fill_dict):
    print("Filling missing...")
    for i in fill_dict:
        df[i].fillna(fill_dict[i], inplace = True)
    print("------- IMPUTATION FINISHED -------")
    return df

In [20]:
####################################### AGGREGATION #######################################
def pre_transformation(df):
    res = drop_initial(df)
    res = engineer_features(res)
    fill_dict = filling_values(res)
    res = fill_missing(res, fill_dict)
    return res, fill_dict

In [21]:
%%time
pre_trans = pre_transformation(train_raw)
pt = pre_trans[0]
fill = pre_trans[1]

Processing extra_people...
Processing text features...
     Tokenizing description...
     Tokenizing summary...
     Tokenizing name...
     Constructing if_lux feature...
Processing date features...
Processing host_neighbourhood...
Processing amenities...
Processing nan OneHot...
Processing sq_feet...
Processing host_verifications...
------- FEATURE ENGINEERING FINISHED -------
Constructing filling map...
Filling missing...
------- IMPUTATION FINISHED -------
Wall time: 2min 11s


In [22]:
################################## COLUMN TRANSFROMATION ##################################
# All categorical featuers for OneHotEncoder
categorical = ["neighbourhood_cleansed", "neighbourhood_group_cleansed", "property_type", "room_type", "bed_type", "cancellation_policy",
              "market", "host_response_time", "square_feet", "host_is_superhost","host_neighbourhood","host_identity_verified", "host_has_profile_pic",
               "require_guest_profile_picture", "require_guest_phone_verification", "instant_bookable"]
# All numerical featuers for StandardScaler
std= ["accommodates", "bathrooms", "bedrooms", "beds", "guests_included",
      "extra_people", "number_of_reviews", "review_scores_rating", "review_scores_accuracy",
     "review_scores_cleanliness", "review_scores_checkin", "review_scores_communication",
     "review_scores_location", "review_scores_value", "calculated_host_listings_count","host_since_year","host_verifications", "host_id",
     "host_since_month", "host_since_day", "host_listings_count" ]

# Construct ColumnTransformer with OHE AND STD transformer
clt = ColumnTransformer(
    transformers=[
        ("one-hot", OneHotEncoder(handle_unknown="ignore", sparse=False), categorical),
        ("standardization", StandardScaler(), std)
#         ("tfidf_summary", TextProcessor(), ["summary"]),
#         ("tfidf_des", TextProcessor(), ["description"]),
#         ("tfidf_name", TextProcessor(), ["name"])
    ],
    remainder="passthrough"
)
clt.fit(pt.drop("price", axis = 1))

def preprocessing_train(df = pt):
    X = clt.transform(df.drop("price", axis = 1))
    y = df.price
    return X, y
def preprocessing_test(df = test_raw):
    return clt.transform(fill_missing(engineer_features(drop_initial(test_raw)), fill))

In [23]:
%%time
# train preprocessing
X, y = preprocessing_train()
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2)

Wall time: 743 ms


In [24]:
%%time
#test preprocessing
test = preprocessing_test()

Processing extra_people...
Processing text features...
     Tokenizing description...
     Tokenizing summary...
     Tokenizing name...
     Constructing if_lux feature...
Processing date features...
Processing host_neighbourhood...
Processing amenities...
Processing nan OneHot...
Processing sq_feet...
Processing host_verifications...
------- FEATURE ENGINEERING FINISHED -------
Filling missing...
------- IMPUTATION FINISHED -------
Wall time: 1min 12s


In [28]:
X_train[0]

array([0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
       0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
       0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
       0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
       0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
       0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
       0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
       0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
       0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
       0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
       0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
       0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
       0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
       0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.

In [25]:

def tune_lgbm():
    res = []
    best_max_dp = [13, 14, 15, 16,17,18,19]
    best_min_data_in_leaf = [10, 13,16,17,18,19, 20, 21, 22,23,24]
    num_leaves = [40, 45, 50,55, 60,65, 70]
    total = len(best_max_dp)*len(best_min_data_in_leaf)*len(num_leaves)
    counter = 0
    for i in best_min_data_in_leaf:
        for j in num_leaves:
            for k in best_max_dp:
                start = time.time()
                gbm = lgb.LGBMRegressor(max_depth=k, min_data_in_leaf = i, num_leaves = j)
                res.append([-(cross_val_score(gbm, X_train, y_train, cv = 5, scoring="neg_root_mean_squared_error").mean()), i, j,k ])
                counter +=1
                end = time.time()
                process_in(counter, total, start, end)
    return res
res_lgbm = tune_lgbm()

5 fits failed out of a total of 5.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "c:\users\zhaow\appdata\local\programs\python\python39\lib\site-packages\sklearn\model_selection\_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\users\zhaow\appdata\local\programs\python\python39\lib\site-packages\lightgbm\sklearn.py", line 895, in fit
    super().fit(X, y, sample_weight=sample_weight, init_score=init_score,
  File "c:\users\zhaow\appdata\local\programs\python\python39\lib\site-packages\lightgbm\sklearn.py", line 658, in fit
    _X, _y = _LGBMCheckXY(X, y, accept_sparse=True, force_all_finite=False, en

****------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

5 fits failed out of a total of 5.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "c:\users\zhaow\appdata\local\programs\python\python39\lib\site-packages\sklearn\model_selection\_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\users\zhaow\appdata\local\programs\python\python39\lib\site-packages\lightgbm\sklearn.py", line 895, in fit
    super().fit(X, y, sample_weight=sample_weight, init_score=init_score,
  File "c:\users\zhaow\appdata\local\programs\python\python39\lib\site-packages\lightgbm\sklearn.py", line 658, in fit
    _X, _y = _LGBMCheckXY(X, y, accept_sparse=True, force_all_finite=False, en

********--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

5 fits failed out of a total of 5.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "c:\users\zhaow\appdata\local\programs\python\python39\lib\site-packages\sklearn\model_selection\_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\users\zhaow\appdata\local\programs\python\python39\lib\site-packages\lightgbm\sklearn.py", line 895, in fit
    super().fit(X, y, sample_weight=sample_weight, init_score=init_score,
  File "c:\users\zhaow\appdata\local\programs\python\python39\lib\site-packages\lightgbm\sklearn.py", line 658, in fit
    _X, _y = _LGBMCheckXY(X, y, accept_sparse=True, force_all_finite=False, en

************----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

5 fits failed out of a total of 5.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "c:\users\zhaow\appdata\local\programs\python\python39\lib\site-packages\sklearn\model_selection\_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\users\zhaow\appdata\local\programs\python\python39\lib\site-packages\lightgbm\sklearn.py", line 895, in fit
    super().fit(X, y, sample_weight=sample_weight, init_score=init_score,
  File "c:\users\zhaow\appdata\local\programs\python\python39\lib\site-packages\lightgbm\sklearn.py", line 658, in fit
    _X, _y = _LGBMCheckXY(X, y, accept_sparse=True, force_all_finite=False, en

****************------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

5 fits failed out of a total of 5.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "c:\users\zhaow\appdata\local\programs\python\python39\lib\site-packages\sklearn\model_selection\_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\users\zhaow\appdata\local\programs\python\python39\lib\site-packages\lightgbm\sklearn.py", line 895, in fit
    super().fit(X, y, sample_weight=sample_weight, init_score=init_score,
  File "c:\users\zhaow\appdata\local\programs\python\python39\lib\site-packages\lightgbm\sklearn.py", line 658, in fit
    _X, _y = _LGBMCheckXY(X, y, accept_sparse=True, force_all_finite=False, en

********************--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

5 fits failed out of a total of 5.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "c:\users\zhaow\appdata\local\programs\python\python39\lib\site-packages\sklearn\model_selection\_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\users\zhaow\appdata\local\programs\python\python39\lib\site-packages\lightgbm\sklearn.py", line 895, in fit
    super().fit(X, y, sample_weight=sample_weight, init_score=init_score,
  File "c:\users\zhaow\appdata\local\programs\python\python39\lib\site-packages\lightgbm\sklearn.py", line 658, in fit
    _X, _y = _LGBMCheckXY(X, y, accept_sparse=True, force_all_finite=False, en

************************----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

5 fits failed out of a total of 5.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "c:\users\zhaow\appdata\local\programs\python\python39\lib\site-packages\sklearn\model_selection\_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\users\zhaow\appdata\local\programs\python\python39\lib\site-packages\lightgbm\sklearn.py", line 895, in fit
    super().fit(X, y, sample_weight=sample_weight, init_score=init_score,
  File "c:\users\zhaow\appdata\local\programs\python\python39\lib\site-packages\lightgbm\sklearn.py", line 658, in fit
    _X, _y = _LGBMCheckXY(X, y, accept_sparse=True, force_all_finite=False, en

****************************------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

5 fits failed out of a total of 5.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "c:\users\zhaow\appdata\local\programs\python\python39\lib\site-packages\sklearn\model_selection\_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\users\zhaow\appdata\local\programs\python\python39\lib\site-packages\lightgbm\sklearn.py", line 895, in fit
    super().fit(X, y, sample_weight=sample_weight, init_score=init_score,
  File "c:\users\zhaow\appdata\local\programs\python\python39\lib\site-packages\lightgbm\sklearn.py", line 658, in fit
    _X, _y = _LGBMCheckXY(X, y, accept_sparse=True, force_all_finite=False, en

********************************--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

5 fits failed out of a total of 5.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "c:\users\zhaow\appdata\local\programs\python\python39\lib\site-packages\sklearn\model_selection\_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\users\zhaow\appdata\local\programs\python\python39\lib\site-packages\lightgbm\sklearn.py", line 895, in fit
    super().fit(X, y, sample_weight=sample_weight, init_score=init_score,
  File "c:\users\zhaow\appdata\local\programs\python\python39\lib\site-packages\lightgbm\sklearn.py", line 658, in fit
    _X, _y = _LGBMCheckXY(X, y, accept_sparse=True, force_all_finite=False, en

************************************----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

5 fits failed out of a total of 5.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "c:\users\zhaow\appdata\local\programs\python\python39\lib\site-packages\sklearn\model_selection\_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\users\zhaow\appdata\local\programs\python\python39\lib\site-packages\lightgbm\sklearn.py", line 895, in fit
    super().fit(X, y, sample_weight=sample_weight, init_score=init_score,
  File "c:\users\zhaow\appdata\local\programs\python\python39\lib\site-packages\lightgbm\sklearn.py", line 658, in fit
    _X, _y = _LGBMCheckXY(X, y, accept_sparse=True, force_all_finite=False, en

****************************************------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

5 fits failed out of a total of 5.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "c:\users\zhaow\appdata\local\programs\python\python39\lib\site-packages\sklearn\model_selection\_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\users\zhaow\appdata\local\programs\python\python39\lib\site-packages\lightgbm\sklearn.py", line 895, in fit
    super().fit(X, y, sample_weight=sample_weight, init_score=init_score,
  File "c:\users\zhaow\appdata\local\programs\python\python39\lib\site-packages\lightgbm\sklearn.py", line 658, in fit
    _X, _y = _LGBMCheckXY(X, y, accept_sparse=True, force_all_finite=False, en

********************************************--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

5 fits failed out of a total of 5.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "c:\users\zhaow\appdata\local\programs\python\python39\lib\site-packages\sklearn\model_selection\_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\users\zhaow\appdata\local\programs\python\python39\lib\site-packages\lightgbm\sklearn.py", line 895, in fit
    super().fit(X, y, sample_weight=sample_weight, init_score=init_score,
  File "c:\users\zhaow\appdata\local\programs\python\python39\lib\site-packages\lightgbm\sklearn.py", line 658, in fit
    _X, _y = _LGBMCheckXY(X, y, accept_sparse=True, force_all_finite=False, en

************************************************----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

5 fits failed out of a total of 5.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "c:\users\zhaow\appdata\local\programs\python\python39\lib\site-packages\sklearn\model_selection\_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\users\zhaow\appdata\local\programs\python\python39\lib\site-packages\lightgbm\sklearn.py", line 895, in fit
    super().fit(X, y, sample_weight=sample_weight, init_score=init_score,
  File "c:\users\zhaow\appdata\local\programs\python\python39\lib\site-packages\lightgbm\sklearn.py", line 658, in fit
    _X, _y = _LGBMCheckXY(X, y, accept_sparse=True, force_all_finite=False, en

****************************************************------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

5 fits failed out of a total of 5.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "c:\users\zhaow\appdata\local\programs\python\python39\lib\site-packages\sklearn\model_selection\_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\users\zhaow\appdata\local\programs\python\python39\lib\site-packages\lightgbm\sklearn.py", line 895, in fit
    super().fit(X, y, sample_weight=sample_weight, init_score=init_score,
  File "c:\users\zhaow\appdata\local\programs\python\python39\lib\site-packages\lightgbm\sklearn.py", line 658, in fit
    _X, _y = _LGBMCheckXY(X, y, accept_sparse=True, force_all_finite=False, en

********************************************************--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

5 fits failed out of a total of 5.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "c:\users\zhaow\appdata\local\programs\python\python39\lib\site-packages\sklearn\model_selection\_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\users\zhaow\appdata\local\programs\python\python39\lib\site-packages\lightgbm\sklearn.py", line 895, in fit
    super().fit(X, y, sample_weight=sample_weight, init_score=init_score,
  File "c:\users\zhaow\appdata\local\programs\python\python39\lib\site-packages\lightgbm\sklearn.py", line 658, in fit
    _X, _y = _LGBMCheckXY(X, y, accept_sparse=True, force_all_finite=False, en

************************************************************----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

5 fits failed out of a total of 5.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "c:\users\zhaow\appdata\local\programs\python\python39\lib\site-packages\sklearn\model_selection\_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\users\zhaow\appdata\local\programs\python\python39\lib\site-packages\lightgbm\sklearn.py", line 895, in fit
    super().fit(X, y, sample_weight=sample_weight, init_score=init_score,
  File "c:\users\zhaow\appdata\local\programs\python\python39\lib\site-packages\lightgbm\sklearn.py", line 658, in fit
    _X, _y = _LGBMCheckXY(X, y, accept_sparse=True, force_all_finite=False, en

****************************************************************------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

5 fits failed out of a total of 5.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "c:\users\zhaow\appdata\local\programs\python\python39\lib\site-packages\sklearn\model_selection\_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\users\zhaow\appdata\local\programs\python\python39\lib\site-packages\lightgbm\sklearn.py", line 895, in fit
    super().fit(X, y, sample_weight=sample_weight, init_score=init_score,
  File "c:\users\zhaow\appdata\local\programs\python\python39\lib\site-packages\lightgbm\sklearn.py", line 658, in fit
    _X, _y = _LGBMCheckXY(X, y, accept_sparse=True, force_all_finite=False, en

********************************************************************--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

5 fits failed out of a total of 5.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "c:\users\zhaow\appdata\local\programs\python\python39\lib\site-packages\sklearn\model_selection\_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\users\zhaow\appdata\local\programs\python\python39\lib\site-packages\lightgbm\sklearn.py", line 895, in fit
    super().fit(X, y, sample_weight=sample_weight, init_score=init_score,
  File "c:\users\zhaow\appdata\local\programs\python\python39\lib\site-packages\lightgbm\sklearn.py", line 658, in fit
    _X, _y = _LGBMCheckXY(X, y, accept_sparse=True, force_all_finite=False, en

************************************************************************----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

5 fits failed out of a total of 5.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "c:\users\zhaow\appdata\local\programs\python\python39\lib\site-packages\sklearn\model_selection\_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\users\zhaow\appdata\local\programs\python\python39\lib\site-packages\lightgbm\sklearn.py", line 895, in fit
    super().fit(X, y, sample_weight=sample_weight, init_score=init_score,
  File "c:\users\zhaow\appdata\local\programs\python\python39\lib\site-packages\lightgbm\sklearn.py", line 658, in fit
    _X, _y = _LGBMCheckXY(X, y, accept_sparse=True, force_all_finite=False, en

****************************************************************************------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

5 fits failed out of a total of 5.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "c:\users\zhaow\appdata\local\programs\python\python39\lib\site-packages\sklearn\model_selection\_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\users\zhaow\appdata\local\programs\python\python39\lib\site-packages\lightgbm\sklearn.py", line 895, in fit
    super().fit(X, y, sample_weight=sample_weight, init_score=init_score,
  File "c:\users\zhaow\appdata\local\programs\python\python39\lib\site-packages\lightgbm\sklearn.py", line 658, in fit
    _X, _y = _LGBMCheckXY(X, y, accept_sparse=True, force_all_finite=False, en

********************************************************************************--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

5 fits failed out of a total of 5.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "c:\users\zhaow\appdata\local\programs\python\python39\lib\site-packages\sklearn\model_selection\_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\users\zhaow\appdata\local\programs\python\python39\lib\site-packages\lightgbm\sklearn.py", line 895, in fit
    super().fit(X, y, sample_weight=sample_weight, init_score=init_score,
  File "c:\users\zhaow\appdata\local\programs\python\python39\lib\site-packages\lightgbm\sklearn.py", line 658, in fit
    _X, _y = _LGBMCheckXY(X, y, accept_sparse=True, force_all_finite=False, en

************************************************************************************----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

5 fits failed out of a total of 5.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "c:\users\zhaow\appdata\local\programs\python\python39\lib\site-packages\sklearn\model_selection\_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\users\zhaow\appdata\local\programs\python\python39\lib\site-packages\lightgbm\sklearn.py", line 895, in fit
    super().fit(X, y, sample_weight=sample_weight, init_score=init_score,
  File "c:\users\zhaow\appdata\local\programs\python\python39\lib\site-packages\lightgbm\sklearn.py", line 658, in fit
    _X, _y = _LGBMCheckXY(X, y, accept_sparse=True, force_all_finite=False, en

****************************************************************************************------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

5 fits failed out of a total of 5.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "c:\users\zhaow\appdata\local\programs\python\python39\lib\site-packages\sklearn\model_selection\_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\users\zhaow\appdata\local\programs\python\python39\lib\site-packages\lightgbm\sklearn.py", line 895, in fit
    super().fit(X, y, sample_weight=sample_weight, init_score=init_score,
  File "c:\users\zhaow\appdata\local\programs\python\python39\lib\site-packages\lightgbm\sklearn.py", line 658, in fit
    _X, _y = _LGBMCheckXY(X, y, accept_sparse=True, force_all_finite=False, en

********************************************************************************************--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

KeyboardInterrupt: 

In [None]:
min(res_lgbm)
#22， 45， 15， 85.03

In [None]:
%%time
def tune_xgb():
    depth = [9, 10,11,12,13,14,15]
    min_cw = [15,16,17,18,19,20,21,22,23]
    total = len(depth) * len(min_cw)
    counter = 0
    res = []
    for i in depth:
        for j in min_cw:
            start = time.time()
            xgbr = xgb.XGBRegressor(eta = 0.05, eval_metric  = "rmse", max_depth=i, learning_rate=0.05, n_estimators=100, min_child_weight = j, subsample = 0.8, )
            res.append([-(cross_val_score(xgbr, X, y, cv = 5, scoring="neg_root_mean_squared_error").mean()), i, j])
            counter+=1
            end = time.time()
            process_in(counter, total, start, end)
    return res
res_xgb = tune_xgb() 
ring() 

In [None]:
min(res_xgb)


In [None]:
def tune_ridge():
    alphas = [10, 20, 30]
    total = len(alphas)
    counter=0
    res = []
    for i in alphas:
        start = time.time()
        ridge=Ridge(alpha=i)
        res.append([-(cross_val_score(ridge, X, y, cv = 5, scoring="neg_root_mean_squared_error").mean()), i])
        counter+=1
        end = time.time()
        process_in(counter, total, start, end)
    return res
res_ridge = tune_ridge()

In [None]:
res_ridge

In [None]:
def tune_catb():
#     cat_params = {
#                 'n_estimators':5000,
#                 'learning_rate': 0.07,
#                 'eval_metric':'AUC',
#                 'loss_function':'RMSE',
#                 'metric_period':500,
#                 'od_wait':500,
#                 'depth': 8,
#                 #'colsample_bylevel':0.7,
#                 } 
    depths = [5,6,7,8,9,10]
    total = len(depths)
    counter=0
    res = []
    for i in depths:
        start = time.time()
        catb = CatBoostRegressor(depth=i, loss_function="RMSE", n_estimators=1000, learning_rate=0.07, metric_period=250)
        res.append([-(cross_val_score(catb, X, y, cv = 5, scoring="neg_root_mean_squared_error").mean()), i])
        counter+=1
        end = time.time()
        process_in(counter, total, start, end)
    return res
res_catb = tune_catb()
res_catb

In [None]:
res_catb

In [147]:
def to_pred():
    prediction = test_raw[["id"]].set_index("id")
#     ridge_pred = Ridge(alpha = 5.75).fit(X, y).predict(test)*0.01
    cbr_pred = CatBoostRegressor(n_estimators=1000, max_depth=8).fit(X, y).predict(test)*0.54
    lgb_pred=lgb.LGBMRegressor(max_depth=18, min_data_in_leaf = 23, num_leaves = 50).fit(X, y).predict(test)*0.41
    xgb_pred =xgb.XGBRegressor(max_depth=14, learning_rate=0.1, n_estimators=100, min_child_weight = 10).fit(X, y).predict(test)*0.05
    prediction["Predicted"] = lgb_pred+cbr_pred+xgb_pred#+#ridge_pred 
    prediction.to_csv("output/prediction.csv")

In [146]:
%%time
to_pred()

Learning rate set to 0.071321
0:	learn: 128.9892038	total: 46.7ms	remaining: 46.6s
1:	learn: 125.3998548	total: 88.7ms	remaining: 44.3s
2:	learn: 122.1682796	total: 130ms	remaining: 43.1s
3:	learn: 119.0672851	total: 169ms	remaining: 42.2s
4:	learn: 116.3605044	total: 226ms	remaining: 45s
5:	learn: 113.7943366	total: 268ms	remaining: 44.4s
6:	learn: 111.5940716	total: 309ms	remaining: 43.8s
7:	learn: 109.6213752	total: 350ms	remaining: 43.5s
8:	learn: 107.8377757	total: 398ms	remaining: 43.8s
9:	learn: 106.2593112	total: 437ms	remaining: 43.3s
10:	learn: 104.7520513	total: 475ms	remaining: 42.7s
11:	learn: 103.4238582	total: 516ms	remaining: 42.4s
12:	learn: 102.1206120	total: 572ms	remaining: 43.5s
13:	learn: 101.0335371	total: 615ms	remaining: 43.3s
14:	learn: 100.0476864	total: 658ms	remaining: 43.2s
15:	learn: 99.1235334	total: 709ms	remaining: 43.6s
16:	learn: 98.3043126	total: 751ms	remaining: 43.4s
17:	learn: 97.4896907	total: 801ms	remaining: 43.7s
18:	learn: 96.7262824	total: 

161:	learn: 76.6772255	total: 6.92s	remaining: 35.8s
162:	learn: 76.6668172	total: 6.97s	remaining: 35.8s
163:	learn: 76.6055425	total: 7.01s	remaining: 35.7s
164:	learn: 76.5727331	total: 7.05s	remaining: 35.7s
165:	learn: 76.5345904	total: 7.09s	remaining: 35.6s
166:	learn: 76.4891272	total: 7.13s	remaining: 35.6s
167:	learn: 76.4001505	total: 7.17s	remaining: 35.5s
168:	learn: 76.3671044	total: 7.21s	remaining: 35.4s
169:	learn: 76.3057650	total: 7.25s	remaining: 35.4s
170:	learn: 76.2303890	total: 7.29s	remaining: 35.3s
171:	learn: 76.2056737	total: 7.33s	remaining: 35.3s
172:	learn: 76.1540911	total: 7.37s	remaining: 35.2s
173:	learn: 76.1123275	total: 7.4s	remaining: 35.1s
174:	learn: 76.0646148	total: 7.44s	remaining: 35.1s
175:	learn: 76.0128086	total: 7.48s	remaining: 35s
176:	learn: 75.9560412	total: 7.52s	remaining: 35s
177:	learn: 75.9375922	total: 7.56s	remaining: 34.9s
178:	learn: 75.8637079	total: 7.6s	remaining: 34.9s
179:	learn: 75.8145256	total: 7.65s	remaining: 34.9s

318:	learn: 69.8360048	total: 13.3s	remaining: 28.4s
319:	learn: 69.7979014	total: 13.3s	remaining: 28.3s
320:	learn: 69.7538830	total: 13.4s	remaining: 28.3s
321:	learn: 69.7202714	total: 13.4s	remaining: 28.2s
322:	learn: 69.6763581	total: 13.5s	remaining: 28.2s
323:	learn: 69.6285457	total: 13.5s	remaining: 28.2s
324:	learn: 69.5771226	total: 13.6s	remaining: 28.2s
325:	learn: 69.5515140	total: 13.6s	remaining: 28.1s
326:	learn: 69.5376219	total: 13.6s	remaining: 28.1s
327:	learn: 69.5243823	total: 13.7s	remaining: 28s
328:	learn: 69.5007831	total: 13.7s	remaining: 28s
329:	learn: 69.4880072	total: 13.8s	remaining: 27.9s
330:	learn: 69.4364112	total: 13.8s	remaining: 27.9s
331:	learn: 69.4261770	total: 13.8s	remaining: 27.8s
332:	learn: 69.3902091	total: 13.9s	remaining: 27.8s
333:	learn: 69.3803979	total: 13.9s	remaining: 27.7s
334:	learn: 69.3468378	total: 13.9s	remaining: 27.7s
335:	learn: 69.2946068	total: 14s	remaining: 27.6s
336:	learn: 69.2812256	total: 14s	remaining: 27.6s
3

474:	learn: 65.3461063	total: 19.7s	remaining: 21.8s
475:	learn: 65.3301540	total: 19.8s	remaining: 21.8s
476:	learn: 65.3119079	total: 19.8s	remaining: 21.7s
477:	learn: 65.2861152	total: 19.8s	remaining: 21.7s
478:	learn: 65.2659975	total: 19.9s	remaining: 21.6s
479:	learn: 65.2490477	total: 19.9s	remaining: 21.6s
480:	learn: 65.2389880	total: 20s	remaining: 21.5s
481:	learn: 65.2208413	total: 20s	remaining: 21.5s
482:	learn: 65.1997997	total: 20s	remaining: 21.5s
483:	learn: 65.1623636	total: 20.1s	remaining: 21.4s
484:	learn: 65.1499431	total: 20.1s	remaining: 21.4s
485:	learn: 65.1322023	total: 20.2s	remaining: 21.3s
486:	learn: 65.1100588	total: 20.2s	remaining: 21.3s
487:	learn: 65.0840008	total: 20.3s	remaining: 21.2s
488:	learn: 65.0692321	total: 20.3s	remaining: 21.2s
489:	learn: 65.0473893	total: 20.3s	remaining: 21.2s
490:	learn: 65.0260225	total: 20.4s	remaining: 21.1s
491:	learn: 64.9948841	total: 20.4s	remaining: 21.1s
492:	learn: 64.9797467	total: 20.5s	remaining: 21s
4

633:	learn: 61.9651548	total: 26.2s	remaining: 15.1s
634:	learn: 61.9330434	total: 26.2s	remaining: 15.1s
635:	learn: 61.9083893	total: 26.3s	remaining: 15s
636:	learn: 61.9002563	total: 26.3s	remaining: 15s
637:	learn: 61.8993391	total: 26.4s	remaining: 15s
638:	learn: 61.8761875	total: 26.4s	remaining: 14.9s
639:	learn: 61.8432526	total: 26.4s	remaining: 14.9s
640:	learn: 61.8306231	total: 26.5s	remaining: 14.8s
641:	learn: 61.8122197	total: 26.5s	remaining: 14.8s
642:	learn: 61.7970636	total: 26.6s	remaining: 14.7s
643:	learn: 61.7454399	total: 26.6s	remaining: 14.7s
644:	learn: 61.7289936	total: 26.6s	remaining: 14.7s
645:	learn: 61.7170760	total: 26.7s	remaining: 14.6s
646:	learn: 61.7054371	total: 26.7s	remaining: 14.6s
647:	learn: 61.6856233	total: 26.8s	remaining: 14.5s
648:	learn: 61.6771336	total: 26.8s	remaining: 14.5s
649:	learn: 61.6613724	total: 26.8s	remaining: 14.4s
650:	learn: 61.6383423	total: 26.9s	remaining: 14.4s
651:	learn: 61.6254375	total: 26.9s	remaining: 14.4s

792:	learn: 58.9867969	total: 32.8s	remaining: 8.56s
793:	learn: 58.9801555	total: 32.8s	remaining: 8.52s
794:	learn: 58.9714435	total: 32.9s	remaining: 8.48s
795:	learn: 58.9471365	total: 32.9s	remaining: 8.43s
796:	learn: 58.9109395	total: 33s	remaining: 8.39s
797:	learn: 58.8950426	total: 33s	remaining: 8.35s
798:	learn: 58.8724115	total: 33s	remaining: 8.31s
799:	learn: 58.8523770	total: 33.1s	remaining: 8.27s
800:	learn: 58.8269368	total: 33.1s	remaining: 8.23s
801:	learn: 58.8209421	total: 33.2s	remaining: 8.19s
802:	learn: 58.8067433	total: 33.2s	remaining: 8.15s
803:	learn: 58.7823701	total: 33.2s	remaining: 8.1s
804:	learn: 58.7637973	total: 33.3s	remaining: 8.06s
805:	learn: 58.7523589	total: 33.3s	remaining: 8.02s
806:	learn: 58.7414215	total: 33.4s	remaining: 7.98s
807:	learn: 58.7240172	total: 33.4s	remaining: 7.94s
808:	learn: 58.7109216	total: 33.4s	remaining: 7.89s
809:	learn: 58.6936598	total: 33.5s	remaining: 7.85s
810:	learn: 58.6866245	total: 33.5s	remaining: 7.81s


949:	learn: 56.5651317	total: 39.3s	remaining: 2.06s
950:	learn: 56.5473406	total: 39.3s	remaining: 2.02s
951:	learn: 56.5228784	total: 39.3s	remaining: 1.98s
952:	learn: 56.5025298	total: 39.4s	remaining: 1.94s
953:	learn: 56.4939203	total: 39.4s	remaining: 1.9s
954:	learn: 56.4781694	total: 39.5s	remaining: 1.86s
955:	learn: 56.4650981	total: 39.5s	remaining: 1.82s
956:	learn: 56.4372323	total: 39.5s	remaining: 1.78s
957:	learn: 56.4196597	total: 39.6s	remaining: 1.74s
958:	learn: 56.3933429	total: 39.6s	remaining: 1.69s
959:	learn: 56.3772555	total: 39.7s	remaining: 1.65s
960:	learn: 56.3615236	total: 39.7s	remaining: 1.61s
961:	learn: 56.3374864	total: 39.7s	remaining: 1.57s
962:	learn: 56.3247722	total: 39.8s	remaining: 1.53s
963:	learn: 56.3148782	total: 39.8s	remaining: 1.49s
964:	learn: 56.3067163	total: 39.9s	remaining: 1.45s
965:	learn: 56.2773248	total: 39.9s	remaining: 1.4s
966:	learn: 56.2722549	total: 39.9s	remaining: 1.36s
967:	learn: 56.2662497	total: 40s	remaining: 1.3

In [115]:
from fun import *
ring()

In [None]:

def tokenize(sentence):
    if pd.isnull(sentence):
        return "NoDescription"
    words = nltk.word_tokenize(sentence)
    new_words= [word for word in words if word.isalnum()]
    tokens = [w for w in new_words if not w in stop_words]
    snowball = nltk.stem.SnowballStemmer("english")
    stems = []
    for t in tokens:    
        stems.append(snowball.stem(t))
    return " ".join(stems)

In [None]:
des = train_raw.description.apply(tokenize)
des