In [10]:
import gc
import time
import numpy as np
import pandas as pd

from scipy.sparse import csr_matrix, hstack

from sklearn.linear_model import Ridge
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import LabelBinarizer
from sklearn.model_selection import train_test_split, cross_val_score
import lightgbm as lgb

In [11]:
# Number of most frequent brand names kept by `cutting`; other brands become 'missing'.
NUM_BRANDS = 4000
# Number of most frequent category names kept by `cutting`; other categories become 'missing'.
NUM_CATEGORIES = 1000
# Passed as `min_df` to the CountVectorizer fitted on the `name` column.
NAME_MIN_DF = 10
# Passed as `max_features` to the TfidfVectorizer fitted on `item_description`.
MAX_FEATURES_ITEM_DESCRIPTION = 55000


def handle_missing_inplace(df):
    """Replace missing values in the text columns of ``df`` with 'missing'.

    The columns ``category_name``, ``brand_name`` and ``item_description``
    are updated on ``df`` itself; nothing is returned.

    Raises:
        KeyError: if any of the three columns is absent from ``df``.
    """
    for column in ('category_name', 'brand_name', 'item_description'):
        # Assign the filled column back instead of calling
        # `df[column].fillna(..., inplace=True)`: the in-place call acts on a
        # temporary selection, which pandas deprecates and which does not
        # update `df` when Copy-on-Write is active.
        df[column] = df[column].fillna('missing')


def cutting(df, num_brands=None, num_categories=None):
    """Collapse infrequent brands and categories of ``df`` into 'missing', in place.

    For ``brand_name`` and ``category_name`` the most frequent values
    (not counting the placeholder 'missing') are kept; every other value in
    that column is overwritten with 'missing'.

    Args:
        df: frame with ``brand_name`` and ``category_name`` columns; mutated.
        num_brands: how many of the most frequent brands to keep.
            Defaults to the module constant ``NUM_BRANDS``.
        num_categories: how many of the most frequent categories to keep.
            Defaults to the module constant ``NUM_CATEGORIES``.
    """
    if num_brands is None:
        num_brands = NUM_BRANDS
    if num_categories is None:
        num_categories = NUM_CATEGORIES

    for column, keep in (('brand_name', num_brands),
                         ('category_name', num_categories)):
        counts = df[column].value_counts()
        # value_counts() orders by descending frequency, so after dropping the
        # 'missing' placeholder the first `keep` labels are the popular ones.
        popular = counts[counts.index != 'missing'].index[:keep]
        df.loc[~df[column].isin(popular), column] = 'missing'


def to_categorical(df):
    """Cast the category, brand and item-condition columns of ``df`` to
    pandas ``category`` dtype. The frame is modified in place."""
    for column in ('category_name', 'brand_name', 'item_condition_id'):
        df[column] = df[column].astype('category')

In [18]:
# Reference point for the elapsed-time messages printed by the cells below.
start_time = time.time()

# Both files are tab-separated; the paths are relative to the user's home directory.
train = pd.read_table('~/source/Data/mercari_price/train.tsv', engine='c')
test = pd.read_table('~/source/Data/mercari_price/test.tsv', engine='c')

print('[{}] Finished to load data'.format(time.time() - start_time))
print('Train shape: ', train.shape)
print('Test shape: ', test.shape)

# Train rows come first in `merge`; nrow_train is the split point used later.
nrow_train = train.shape[0]
# Models are trained on log1p(price); predictions are inverted with expm1.
y = np.log1p(train["price"])
# ignore_index=True gives the combined frame unique row labels; otherwise the
# labels of `train` and `test` would both start at 0 and repeat.
merge: pd.DataFrame = pd.concat([train, test], ignore_index=True)
# .copy() detaches the frame from `test`, so adding the price column later
# writes to an independent object rather than to a slice of `test`.
submission: pd.DataFrame = test[['test_id']].copy()

# The raw frames are no longer needed once `merge`, `y` and `submission` exist.
del train
del test
gc.collect()

[18.689811944961548] Finished to load data
Train shape:  (1482535, 8)
Test shape:  (693359, 7)


311

In [19]:
# Fill missing category / brand / description values in the combined frame
# (mutates `merge`), then log seconds elapsed since `start_time`.
handle_missing_inplace(merge)
print('[{}] Finished to handle missing'.format(time.time() - start_time))

[20.585251092910767] Finished to handle missing


In [20]:
merge.head()

Unnamed: 0,brand_name,category_name,item_condition_id,item_description,name,price,shipping,test_id,train_id
0,missing,Men/Tops/T-shirts,3,No description yet,MLB Cincinnati Reds T Shirt Size XL,10.0,1,,0.0
1,Razer,Electronics/Computers & Tablets/Components & P...,3,This keyboard is in great condition and works ...,Razer BlackWidow Chroma Keyboard,52.0,0,,1.0
2,Target,Women/Tops & Blouses/Blouse,1,Adorable top with a hint of lace and a key hol...,AVA-VIV Blouse,10.0,1,,2.0
3,missing,Home/Home Décor/Home Décor Accents,1,New with tags. Leather horses. Retail for [rm]...,Leather Horse Statues,35.0,1,,3.0
4,missing,Women/Jewelry/Necklaces,1,Complete with certificate of authenticity,24K GOLD plated rose,44.0,0,,4.0


In [28]:
# Collapse infrequent brands and categories to 'missing' (mutates `merge`).
cutting(merge)
print('[{}] Finished to cut'.format(time.time() - start_time))

[246.43642687797546] Finished to cut


In [29]:
merge.head()

Unnamed: 0,brand_name,category_name,item_condition_id,item_description,name,price,shipping,test_id,train_id
0,missing,Men/Tops/T-shirts,3,No description yet,MLB Cincinnati Reds T Shirt Size XL,10.0,1,,0.0
1,Razer,Electronics/Computers & Tablets/Components & P...,3,This keyboard is in great condition and works ...,Razer BlackWidow Chroma Keyboard,52.0,0,,1.0
2,Target,Women/Tops & Blouses/Blouse,1,Adorable top with a hint of lace and a key hol...,AVA-VIV Blouse,10.0,1,,2.0
3,missing,Home/Home Décor/Home Décor Accents,1,New with tags. Leather horses. Retail for [rm]...,Leather Horse Statues,35.0,1,,3.0
4,missing,Women/Jewelry/Necklaces,1,Complete with certificate of authenticity,24K GOLD plated rose,44.0,0,,4.0


In [30]:
# Cast the brand, category and item-condition columns to `category` dtype
# (mutates `merge`).
to_categorical(merge)
print('[{}] Finished to convert categorical'.format(time.time() - start_time))

[247.08253002166748] Finished to convert categorical


In [31]:
merge.head()

Unnamed: 0,brand_name,category_name,item_condition_id,item_description,name,price,shipping,test_id,train_id
0,missing,Men/Tops/T-shirts,3,No description yet,MLB Cincinnati Reds T Shirt Size XL,10.0,1,,0.0
1,Razer,Electronics/Computers & Tablets/Components & P...,3,This keyboard is in great condition and works ...,Razer BlackWidow Chroma Keyboard,52.0,0,,1.0
2,Target,Women/Tops & Blouses/Blouse,1,Adorable top with a hint of lace and a key hol...,AVA-VIV Blouse,10.0,1,,2.0
3,missing,Home/Home Décor/Home Décor Accents,1,New with tags. Leather horses. Retail for [rm]...,Leather Horse Statues,35.0,1,,3.0
4,missing,Women/Jewelry/Necklaces,1,Complete with certificate of authenticity,24K GOLD plated rose,44.0,0,,4.0


In [32]:
merge.dtypes

brand_name           category
category_name        category
item_condition_id    category
item_description       object
name                   object
price                 float64
shipping                int64
test_id               float64
train_id              float64
dtype: object

In [33]:
# Distinct-value counts of the three categorical columns, one per line
# (brand, category, item condition).
print(
    *(merge[column].nunique()
      for column in ('brand_name', 'category_name', 'item_condition_id')),
    sep='\n',
)

4001
1001
5


In [34]:
# Token counts for the listing `name`; NAME_MIN_DF is passed as the
# vectorizer's `min_df`. The vectorizer is fitted on train and test rows
# together, since `merge` holds both.
cv = CountVectorizer(min_df=NAME_MIN_DF)
X_name = cv.fit_transform(merge['name'])
print('[{}] Finished count vectorize `name`'.format(time.time() - start_time))

[407.9425690174103] Finished count vectorize `name`


In [47]:
X_name

<2175894x21257 sparse matrix of type '<class 'numpy.int64'>'
	with 8946093 stored elements in Compressed Sparse Row format>

In [35]:
# Token counts for `category_name` using a CountVectorizer with default settings.
# Note: this rebinds `cv`, so the vectorizer fitted on `name` is no longer
# reachable under that name.
cv = CountVectorizer()
X_category = cv.fit_transform(merge['category_name'])
print('[{}] Finished count vectorize `category_name`'.format(time.time() - start_time))

[421.8015308380127] Finished count vectorize `category_name`


In [48]:
X_category

<2175894x874 sparse matrix of type '<class 'numpy.int64'>'
	with 8707919 stored elements in Compressed Sparse Row format>

In [36]:
# TF-IDF features for `item_description`: n-grams of length 1 to 3, the built-in
# English stop-word list, and a vocabulary capped at MAX_FEATURES_ITEM_DESCRIPTION.
tv = TfidfVectorizer(max_features=MAX_FEATURES_ITEM_DESCRIPTION,
                     ngram_range=(1, 3),
                     stop_words='english')
X_description = tv.fit_transform(merge['item_description'])
print('[{}] Finished TFIDF vectorize `item_description`'.format(time.time() - start_time))

In [49]:
X_description

<2175894x55000 sparse matrix of type '<class 'numpy.float64'>'
	with 49142431 stored elements in Compressed Sparse Row format>

In [38]:
# One-hot encode `brand_name`; sparse_output=True requests a sparse result.
lb = LabelBinarizer(sparse_output=True)
X_brand = lb.fit_transform(merge['brand_name'])
print('[{}] Finished label binarize `brand_name`'.format(time.time() - start_time))

[715.862056016922] Finished label binarize `brand_name`


In [50]:
X_brand

<2175894x4001 sparse matrix of type '<class 'numpy.int64'>'
	with 2175894 stored elements in Compressed Sparse Row format>

In [39]:
# Dummy-encode `item_condition_id` and `shipping`, then wrap the result in a
# CSR matrix so it can be stacked with the other feature blocks.
# NOTE(review): `.values` presumably materialises the dummy frame as a dense
# array before the CSR conversion — confirm this is acceptable for memory.
X_dummies = csr_matrix(pd.get_dummies(merge[['item_condition_id', 'shipping']],
                                      sparse=True).values)
print('[{}] Finished to get dummies on `item_condition_id` and `shipping`'.format(time.time() - start_time))

[720.5948569774628] Finished to get dummies on `item_condition_id` and `shipping`


In [51]:
X_dummies

<2175894x6 sparse matrix of type '<class 'numpy.int64'>'
	with 3149424 stored elements in Compressed Sparse Row format>

In [40]:
# Join all feature blocks column-wise into one matrix (rows stay in `merge`
# order) and convert it to CSR; the matrix is sliced by row afterwards.
sparse_merge = hstack((X_dummies, X_description, X_brand, X_category, X_name)).tocsr()
print('[{}] Finished to create sparse merge'.format(time.time() - start_time))

[753.0859997272491] Finished to create sparse merge


In [42]:
sparse_merge

<2175894x81138 sparse matrix of type '<class 'numpy.float64'>'
	with 72121761 stored elements in Compressed Sparse Row format>

In [43]:
# The first `nrow_train` rows of the stacked matrix are the training rows;
# everything after them belongs to the test set.
X, X_test = sparse_merge[:nrow_train], sparse_merge[nrow_train:]

In [56]:
X.shape

(1482535, 81138)

In [57]:
X_test.shape

(693359, 81138)

In [58]:
y.shape

(1482535,)

In [68]:
# import scipy.sparse
# scipy.sparse.save_npz('tmp/X_train.npz', X)
# scipy.sparse.save_npz('tmp/X_test.npz', X_test)

# sparse_matrix = scipy.sparse.load_npz('/tmp/sparse_matrix.npz')

In [46]:
# Ridge regression on the log1p(price) target, fitted with the "sag" solver.
# NOTE(review): the recorded output of this cell is a KeyboardInterrupt, so
# `predsR` only exists after the cell has been run to completion; the final
# blend cell reads it.
model = Ridge(alpha=3, fit_intercept=True, solver="sag", random_state=205).fit(X, y)
print('[{}] Finished to train ridge sag'.format(time.time() - start_time))
predsR = model.predict(X_test)
print('[{}] Finished to predict ridge sag'.format(time.time() - start_time))

KeyboardInterrupt: 

In [None]:
# Second ridge model, same regularisation strength, fitted with the "lsqr"
# solver. (The log messages spell the solver "lsqrt".)
model = Ridge(alpha=3, fit_intercept=True, solver="lsqr", random_state=145).fit(X, y)
print('[{}] Finished to train ridge lsqrt'.format(time.time() - start_time))
predsR2 = model.predict(X_test)
print('[{}] Finished to predict ridge lsqrt'.format(time.time() - start_time))

In [None]:
# First LightGBM model: hold out 10% of the training rows as a validation set.
train_X, valid_X, train_y, valid_y = train_test_split(
    X, y, test_size=0.1, random_state=144
)
d_train = lgb.Dataset(train_X, label=train_y, max_bin=8192)
d_valid = lgb.Dataset(valid_X, label=valid_y, max_bin=8192)
# Both the training and the validation set are passed as evaluation sets.
watchlist = [d_train, d_valid]

# NOTE(review): a tree of depth 3 has at most 8 leaves, so num_leaves=99
# presumably never becomes the binding limit — confirm this is intended.
params = dict(
    learning_rate=0.76,
    application='regression',
    max_depth=3,
    num_leaves=99,
    verbosity=-1,
    metric='RMSE',
    nthread=4,
)

model = lgb.train(
    params,
    train_set=d_train,
    num_boost_round=7500,
    valid_sets=watchlist,
    early_stopping_rounds=500,
    verbose_eval=500,
)
predsL = model.predict(X_test)

print('[{}] Finished to predict lgb 1'.format(time.time() - start_time))

In [None]:
# Second LightGBM model: a different 10% hold-out (other random_state), a
# higher learning rate, fewer boosting rounds and a shorter early-stopping window.
train_X2, valid_X2, train_y2, valid_y2 = train_test_split(
    X, y, test_size=0.1, random_state=101
)
d_train2 = lgb.Dataset(train_X2, label=train_y2, max_bin=8192)
d_valid2 = lgb.Dataset(valid_X2, label=valid_y2, max_bin=8192)
# Both the training and the validation set are passed as evaluation sets.
watchlist2 = [d_train2, d_valid2]

# NOTE(review): a tree of depth 3 has at most 8 leaves, so num_leaves=110
# presumably never becomes the binding limit — confirm this is intended.
params2 = dict(
    learning_rate=0.85,
    application='regression',
    max_depth=3,
    num_leaves=110,
    verbosity=-1,
    metric='RMSE',
    nthread=4,
)

model = lgb.train(
    params2,
    train_set=d_train2,
    num_boost_round=3000,
    valid_sets=watchlist2,
    early_stopping_rounds=50,
    verbose_eval=500,
)
predsL2 = model.predict(X_test)

print('[{}] Finished to predict lgb 2'.format(time.time() - start_time))

In [None]:
# Weighted blend of the two ridge and two LightGBM predictions, still in
# log1p(price) space. The weights 0.15 + 0.15 + 0.5 + 0.2 sum to 1.0.
preds = predsR2*0.15 + predsR*0.15 + predsL*0.5 + predsL2*0.2

# expm1 inverts the log1p applied to the training target. `assign` builds a new
# frame rather than writing a column into `submission`, which was created by
# selecting a column subset of `test`.
submission = submission.assign(price=np.expm1(preds))
# The original line ended with a stray `}`, which is a syntax error.
submission.to_csv("submission_lgbm_ridge_11.csv", index=False)