In [1]:
import os
import sys
import operator
import numpy as np
import pandas as pd
from scipy import sparse
import xgboost as xgb
from sklearn import model_selection, preprocessing
from sklearn.metrics import log_loss
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

# 数据划分方法

In [2]:
# hold-out
from sklearn.model_selection import train_test_split

# K折交叉验证
from sklearn.model_selection import KFold
from sklearn.model_selection import RepeatedKFold

# K折分布保持交叉验证
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RepeatedStratifiedKFold

# 时间序列划分方法
from sklearn.model_selection import TimeSeriesSplit

# booststrap 采样
from sklearn.utils import resample

In [3]:
# X = np.zeros((20, 5))
# Y = np.array([1, 2, 3, 4] * 5)
# print(X, Y)

X = np.zeros((20, 5))
Y = np.array([1]*5 + [2]*5 + [3]*5 + [4]*5)
print(X, Y)

[[0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]] [1 1 1 1 1 2 2 2 2 2 3 3 3 3 3 4 4 4 4 4]


In [4]:
# 直接按照比例拆分
train_X, val_X, train_y, val_y = train_test_split(X, Y, test_size = 0.2)
print(train_y, val_y)

# 按照比例 & 标签分布划分
train_X, val_X, train_y, val_y = train_test_split(X, Y, test_size = 0.2, stratify=Y)
print(train_y, val_y)

[3 2 3 3 2 4 1 2 1 3 4 4 2 2 1 4] [4 1 1 3]
[4 3 1 4 3 3 2 3 2 2 2 1 4 1 1 4] [4 2 3 1]


In [5]:
kf = KFold(n_splits=5)
for train_idx, test_idx, in kf.split(X, Y):
    print(train_idx, test_idx)
    print('Label', Y[test_idx])
    print('')

[ 4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19] [0 1 2 3]
Label [1 1 1 1]

[ 0  1  2  3  8  9 10 11 12 13 14 15 16 17 18 19] [4 5 6 7]
Label [1 2 2 2]

[ 0  1  2  3  4  5  6  7 12 13 14 15 16 17 18 19] [ 8  9 10 11]
Label [2 2 3 3]

[ 0  1  2  3  4  5  6  7  8  9 10 11 16 17 18 19] [12 13 14 15]
Label [3 3 3 4]

[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15] [16 17 18 19]
Label [4 4 4 4]



In [6]:
kf = RepeatedKFold(n_splits=5, n_repeats=2)
for train_idx, test_idx, in kf.split(X, Y):
    print(train_idx, test_idx)
    print('Label', Y[test_idx])
    print('')

[ 1  2  3  4  5  6  7  8  9 10 11 12 15 16 17 19] [ 0 13 14 18]
Label [1 3 3 4]

[ 0  1  3  4  5  6  7  8 10 11 12 13 14 15 16 18] [ 2  9 17 19]
Label [1 2 4 4]

[ 0  2  3  4  6  7  8  9 10 12 13 14 16 17 18 19] [ 1  5 11 15]
Label [1 2 3 4]

[ 0  1  2  3  4  5  7  9 10 11 13 14 15 17 18 19] [ 6  8 12 16]
Label [2 2 3 4]

[ 0  1  2  5  6  8  9 11 12 13 14 15 16 17 18 19] [ 3  4  7 10]
Label [1 1 2 3]

[ 0  1  3  4  5  7  8  9 10 11 12 14 15 16 17 19] [ 2  6 13 18]
Label [1 2 3 4]

[ 0  1  2  4  6  7  8  9 10 12 13 14 15 16 17 18] [ 3  5 11 19]
Label [1 2 3 4]

[ 0  1  2  3  5  6  8  9 10 11 13 14 15 16 18 19] [ 4  7 12 17]
Label [1 2 3 4]

[ 0  2  3  4  5  6  7  8  9 11 12 13 14 17 18 19] [ 1 10 15 16]
Label [1 3 4 4]

[ 1  2  3  4  5  6  7 10 11 12 13 15 16 17 18 19] [ 0  8  9 14]
Label [1 2 2 3]



In [7]:
kf = StratifiedKFold(n_splits=5)
for train_idx, test_idx, in kf.split(X, Y):
    print(train_idx, test_idx)
    print('Label', Y[test_idx])
    print('')

[ 1  2  3  4  6  7  8  9 11 12 13 14 16 17 18 19] [ 0  5 10 15]
Label [1 2 3 4]

[ 0  2  3  4  5  7  8  9 10 12 13 14 15 17 18 19] [ 1  6 11 16]
Label [1 2 3 4]

[ 0  1  3  4  5  6  8  9 10 11 13 14 15 16 18 19] [ 2  7 12 17]
Label [1 2 3 4]

[ 0  1  2  4  5  6  7  9 10 11 12 14 15 16 17 19] [ 3  8 13 18]
Label [1 2 3 4]

[ 0  1  2  3  5  6  7  8 10 11 12 13 15 16 17 18] [ 4  9 14 19]
Label [1 2 3 4]



In [8]:
kf = RepeatedStratifiedKFold(n_splits=5, n_repeats=2)
for train_idx, test_idx, in kf.split(X, Y):
    print(train_idx, test_idx)
    print('Label', Y[test_idx])
    print('')

[ 0  1  2  4  5  6  7  8 10 11 12 13 15 17 18 19] [ 3  9 14 16]
Label [1 2 3 4]

[ 0  1  3  4  5  7  8  9 11 12 13 14 15 16 17 18] [ 2  6 10 19]
Label [1 2 3 4]

[ 0  1  2  3  5  6  7  9 10 11 12 14 15 16 17 19] [ 4  8 13 18]
Label [1 2 3 4]

[ 0  2  3  4  6  7  8  9 10 11 13 14 16 17 18 19] [ 1  5 12 15]
Label [1 2 3 4]

[ 1  2  3  4  5  6  8  9 10 12 13 14 15 16 18 19] [ 0  7 11 17]
Label [1 2 3 4]

[ 0  1  2  4  5  7  8  9 10 11 12 14 15 16 17 18] [ 3  6 13 19]
Label [1 2 3 4]

[ 1  2  3  4  5  6  7  9 10 11 12 13 15 16 18 19] [ 0  8 14 17]
Label [1 2 3 4]

[ 0  1  2  3  5  6  8  9 10 12 13 14 15 16 17 19] [ 4  7 11 18]
Label [1 2 3 4]

[ 0  2  3  4  5  6  7  8 10 11 13 14 16 17 18 19] [ 1  9 12 15]
Label [1 2 3 4]

[ 0  1  3  4  6  7  8  9 11 12 13 14 15 17 18 19] [ 2  5 10 16]
Label [1 2 3 4]



In [9]:
kf = TimeSeriesSplit(n_splits=5)
for train_idx, test_idx, in kf.split(X, Y):
    print(train_idx, test_idx)
    print('Label', Y[test_idx])
    print('')

[0 1 2 3 4] [5 6 7]
Label [2 2 2]

[0 1 2 3 4 5 6 7] [ 8  9 10]
Label [2 2 3]

[ 0  1  2  3  4  5  6  7  8  9 10] [11 12 13]
Label [3 3 3]

[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13] [14 15 16]
Label [3 4 4]

[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16] [17 18 19]
Label [4 4 4]



In [10]:
train_X, train_Y = resample(X, Y, n_samples=16)
val_X, val_Y = resample(X, Y, n_samples=4)
print(train_Y, val_Y)

[3 1 2 1 1 3 1 4 3 1 3 1 2 3 4 1] [2 2 3 3]


# 提取 Two-Sigma比赛特征

In [11]:
! unzip ../input/two-sigma-connect-rental-listing-inquiries/train.json.zip
! unzip ../input/two-sigma-connect-rental-listing-inquiries/test.json.zip

Archive:  ../input/two-sigma-connect-rental-listing-inquiries/train.json.zip
  inflating: train.json              
Archive:  ../input/two-sigma-connect-rental-listing-inquiries/test.json.zip
  inflating: test.json               


In [12]:
!ls ./

__notebook__.ipynb  test.json  train.json


In [13]:
train_df = pd.read_json('./train.json')
test_df = pd.read_json('./test.json')
print(train_df.shape)
print(test_df.shape)

features_to_use  = ["bathrooms", "bedrooms", "latitude", "longitude", "price", 
                    "num_photos", "num_features", "num_description_words","created_year", 
                    "created_month", "created_day", "listing_id", "created_hour"]

# count of photos #
train_df["num_photos"] = train_df["photos"].apply(len)
test_df["num_photos"] = test_df["photos"].apply(len)

# count of "features" #
train_df["num_features"] = train_df["features"].apply(len)
test_df["num_features"] = test_df["features"].apply(len)

# count of words present in description column #
train_df["num_description_words"] = train_df["description"].apply(lambda x: len(x.split(" ")))
test_df["num_description_words"] = test_df["description"].apply(lambda x: len(x.split(" ")))

# convert the created column to datetime object so as to extract more features 
train_df["created"] = pd.to_datetime(train_df["created"])
test_df["created"] = pd.to_datetime(test_df["created"])

# Let us extract some features like year, month, day, hour from date columns #
train_df["created_year"] = train_df["created"].dt.year
test_df["created_year"] = test_df["created"].dt.year
train_df["created_month"] = train_df["created"].dt.month
test_df["created_month"] = test_df["created"].dt.month
train_df["created_day"] = train_df["created"].dt.day
test_df["created_day"] = test_df["created"].dt.day
train_df["created_hour"] = train_df["created"].dt.hour
test_df["created_hour"] = test_df["created"].dt.hour

(49352, 15)
(74659, 14)


In [14]:
categorical = ["display_address", "manager_id", "building_id", "street_address"]
for f in categorical:
        if train_df[f].dtype=='object':
            #print(f)
            lbl = preprocessing.LabelEncoder()
            lbl.fit(list(train_df[f].values) + list(test_df[f].values))
            train_df[f] = lbl.transform(list(train_df[f].values))
            test_df[f] = lbl.transform(list(test_df[f].values))
            features_to_use.append(f)

In [15]:
train_df['features'] = train_df["features"].apply(lambda x: " ".join(["_".join(i.split(" ")) for i in x]))
test_df['features'] = test_df["features"].apply(lambda x: " ".join(["_".join(i.split(" ")) for i in x]))
print(train_df["features"].head())
tfidf = CountVectorizer(stop_words='english', max_features=200)
tr_sparse = tfidf.fit_transform(train_df["features"])
te_sparse = tfidf.transform(test_df["features"])

train_X = sparse.hstack([train_df[features_to_use], tr_sparse]).tocsr()
test_X = sparse.hstack([test_df[features_to_use], te_sparse]).tocsr()

target_num_map = {'high':0, 'medium':1, 'low':2}
train_y = np.array(train_df['interest_level'].apply(lambda x: target_num_map[x]))
print(train_X.shape, test_X.shape)

4     Dining_Room Pre-War Laundry_in_Building Dishwa...
6     Doorman Elevator Laundry_in_Building Dishwashe...
9     Doorman Elevator Laundry_in_Building Laundry_i...
10                                                     
15    Doorman Elevator Fitness_Center Laundry_in_Bui...
Name: features, dtype: object
(49352, 217) (74659, 217)


In [16]:
from warnings import filterwarnings
filterwarnings('ignore')

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

from sklearn.preprocessing import StandardScaler

In [17]:
clf = LogisticRegression()
clf = RandomForestClassifier()
clf = LGBMClassifier()
# clf = XGBClassifier()

In [18]:
# 这写了一个bug，你能改好吗？
train_X = StandardScaler().fit_transform(train_df[features_to_use])
test_X = StandardScaler().fit_transform(test_df[features_to_use])

kf = StratifiedKFold(n_splits=5)
test_pred = None
for train_idx, test_idx, in kf.split(train_X, train_df['interest_level']):
    
    print(train_idx, test_idx)
    clf.fit(train_X[train_idx], train_y[train_idx])
    print('Val loss', log_loss(train_y[test_idx], 
                   clf.predict_proba(train_X[test_idx])))
    
    if test_pred is None:
        test_pred = clf.predict_proba(test_X)
    else:
        test_pred += clf.predict_proba(test_X)

test_pred /= 5

[ 9769  9770  9771 ... 49349 49350 49351] [    0     1     2 ... 10177 10178 10181]
Val loss 0.5745569344795427
[    0     1     2 ... 49349 49350 49351] [ 9769  9770  9771 ... 20092 20093 20097]
Val loss 0.5766758820502259
[    0     1     2 ... 49349 49350 49351] [19604 19605 19607 ... 29807 29812 29816]
Val loss 0.5743822676079222
[    0     1     2 ... 49349 49350 49351] [29539 29544 29545 ... 39547 39563 39568]
Val loss 0.5817049980112237
[    0     1     2 ... 39547 39563 39568] [38968 38978 38994 ... 49349 49350 49351]
Val loss 0.5864974205079067


In [19]:
train_X = train_df[features_to_use].values
test_X = test_df[features_to_use].values

kf = StratifiedKFold(n_splits=5)
test_pred = None
for train_idx, test_idx, in kf.split(train_X, train_df['interest_level']):
    
    print(train_idx, test_idx)
    clf.fit(train_X[train_idx], train_y[train_idx])
    print('Val loss', log_loss(train_y[test_idx], 
                   clf.predict_proba(train_X[test_idx])))
    
    if test_pred is None:
        test_pred = clf.predict_proba(test_X)
    else:
        test_pred += clf.predict_proba(test_X)

test_pred /= 5

[ 9769  9770  9771 ... 49349 49350 49351] [    0     1     2 ... 10177 10178 10181]
Val loss 0.5750199591295473
[    0     1     2 ... 49349 49350 49351] [ 9769  9770  9771 ... 20092 20093 20097]
Val loss 0.5766117192874167
[    0     1     2 ... 49349 49350 49351] [19604 19605 19607 ... 29807 29812 29816]
Val loss 0.5747959956995047
[    0     1     2 ... 49349 49350 49351] [29539 29544 29545 ... 39547 39563 39568]
Val loss 0.5808522198361222
[    0     1     2 ... 39547 39563 39568] [38968 38978 38994 ... 49349 49350 49351]
Val loss 0.5865867909178608


In [20]:
# lightGBM
clf = LGBMClassifier(learning_rate=0.05, n_estimators=2000, n_jobs=2)

train_X = train_df[features_to_use].values
test_X = test_df[features_to_use].values

kf = StratifiedKFold(n_splits=5)
test_pred = None
for train_idx, test_idx, in kf.split(train_X, train_df['interest_level']):
    
    print(train_idx, test_idx)
    clf.fit(train_X[train_idx], train_y[train_idx], 
            eval_set=[(train_X[test_idx], train_y[test_idx]), (train_X[test_idx], train_y[test_idx])],
           verbose=50, early_stopping_rounds=50)
    print('Val loss', log_loss(train_y[test_idx], 
                   clf.predict_proba(train_X[test_idx])))
    
    if test_pred is None:
        test_pred = clf.predict_proba(test_X)
    else:
        test_pred += clf.predict_proba(test_X)

test_pred /= 5

[ 9769  9770  9771 ... 49349 49350 49351] [    0     1     2 ... 10177 10178 10181]
Training until validation scores don't improve for 50 rounds
[50]	valid_0's multi_logloss: 0.618601	valid_1's multi_logloss: 0.618601
[100]	valid_0's multi_logloss: 0.591652	valid_1's multi_logloss: 0.591652
[150]	valid_0's multi_logloss: 0.580064	valid_1's multi_logloss: 0.580064
[200]	valid_0's multi_logloss: 0.574176	valid_1's multi_logloss: 0.574176
[250]	valid_0's multi_logloss: 0.570872	valid_1's multi_logloss: 0.570872
[300]	valid_0's multi_logloss: 0.568752	valid_1's multi_logloss: 0.568752
[350]	valid_0's multi_logloss: 0.567441	valid_1's multi_logloss: 0.567441
[400]	valid_0's multi_logloss: 0.566707	valid_1's multi_logloss: 0.566707
[450]	valid_0's multi_logloss: 0.566147	valid_1's multi_logloss: 0.566147
[500]	valid_0's multi_logloss: 0.565949	valid_1's multi_logloss: 0.565949
[550]	valid_0's multi_logloss: 0.565372	valid_1's multi_logloss: 0.565372
[600]	valid_0's multi_logloss: 0.564999	va

In [21]:
out_df = pd.DataFrame(test_pred)
out_df.columns = ["high", "medium", "low"]
out_df["listing_id"] = test_df.listing_id.values
out_df.to_csv("xgb_starter2.csv", index=False)

# 参数搜索

In [22]:
LGBMClassifier?

In [23]:
from sklearn.metrics import make_scorer
def my_scorer(clf, X, y_true):
    class_labels = clf.classes_
    y_pred_proba = clf.predict_proba(X)
    return log_loss(y_true, y_pred_proba)

from sklearn.model_selection import GridSearchCV
parameters = {
    'num_leaves':( 4, 8, 16, 32), 
    'subsample':(0.75, 0.85, 0.95),
    'min_child_samples': (5, 10, 15)
}

clf = GridSearchCV(LGBMClassifier(), param_grid=parameters, n_jobs=6, scoring=my_scorer, cv=5)
clf.fit(train_X, train_y)

GridSearchCV(cv=5, estimator=LGBMClassifier(), n_jobs=6,
             param_grid={'min_child_samples': (5, 10, 15),
                         'num_leaves': (4, 8, 16, 32),
                         'subsample': (0.75, 0.85, 0.95)},
             scoring=<function my_scorer at 0x7f9fcf36e3b0>)

In [24]:
from sklearn.metrics import make_scorer
def my_scorer(clf, X, y_true):
    class_labels = clf.classes_
    y_pred_proba = clf.predict_proba(X)
    return log_loss(y_true, y_pred_proba)

from sklearn.model_selection import RandomizedSearchCV
parameters = {
    'num_leaves':( 4, 8, 16, 32), 
    'subsample':[0.75, 1],
    'min_child_samples': (5, 10, 15)
}

clf = GridSearchCV(LGBMClassifier(), param_grid=parameters, 
                   n_jobs=6, scoring=my_scorer, cv=StratifiedKFold(n_splits=5))
clf.fit(train_X, train_y)

GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=None, shuffle=False),
             estimator=LGBMClassifier(), n_jobs=6,
             param_grid={'min_child_samples': (5, 10, 15),
                         'num_leaves': (4, 8, 16, 32), 'subsample': [0.75, 1]},
             scoring=<function my_scorer at 0x7f9fcf5de5f0>)

# 阅读链接

- https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html
- https://lightgbm.readthedocs.io/en/latest/Python-API.html
- https://xgboost.readthedocs.io/en/latest//python/index.html


- https://github.com/fmfn/BayesianOptimization