# [Mercari Price Suggestion Challenge](https://www.kaggle.com/c/mercari-price-suggestion-challenge)
Can you automatically suggest product prices to online sellers?

# Import packages

In [55]:
import pandas as pd #data processing
import numpy as np #linear algebra
from scipy import stats
from scipy.cluster.hierarchy import linkage, dendrogram

%matplotlib inline
import matplotlib.pyplot as plt #commonly used visualization tool
import seaborn as sns #new visualization tool
from string import ascii_letters
from wordcloud import WordCloud

from sklearn.cluster import KMeans
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV #hyper-parameter tuning
from sklearn.ensemble import RandomForestRegressor

# Import data

In [56]:
%%time
train_df = pd.read_csv("data/train_df.csv", low_memory= True)
test_df = pd.read_csv("data/test_df.tsv", low_memory= True)

CPU times: user 7.46 s, sys: 1.25 s, total: 8.71 s
Wall time: 8.07 s


In [57]:
# Preview the first rows of the freshly loaded training table.
train_df.head()

Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description
0,0,MLB Cincinnati Reds T Shirt Size XL,3,Men/Tops/T-shirts,,10.0,1,No description yet
1,1,Razer BlackWidow Chroma Keyboard,3,Electronics/Computers & Tablets/Components & P...,Razer,52.0,0,This keyboard is in great condition and works ...
2,2,AVA-VIV Blouse,1,Women/Tops & Blouses/Blouse,Target,10.0,1,Adorable top with a hint of lace and a key hol...
3,3,Leather Horse Statues,1,Home/Home Décor/Home Décor Accents,,35.0,1,New with tags. Leather horses. Retail for [rm]...
4,4,24K GOLD plated rose,1,Women/Jewelry/Necklaces,,44.0,0,Complete with certificate of authenticity


In [58]:
# List the column names of the training table.
train_df.columns

Index(['train_id', 'name', 'item_condition_id', 'category_name', 'brand_name',
       'price', 'shipping', 'item_description'],
      dtype='object')

# Predictive Modeling

先把id改好，改成統一

In [137]:
# Give both frames the same identifier column name ('id').
train_df = train_df.rename({'train_id': 'id'}, axis='columns')
test_df = test_df.rename({"test_id": "id"}, axis='columns')

接著把 train 與 test 資料集合併（combine）：先用 `is_train` 欄位標記每一列的來源，之後才能再把兩者分開。

In [138]:
# Tag every row with its origin (1 = training file, 0 = test file) so the
# two sets can be told apart after they are stacked.
for frame, origin_flag in ((train_df, 1), (test_df, 0)):
    frame['is_train'] = origin_flag

利用pd.concat，不包含price

In [139]:
# Set the target aside first: the stacked feature frame must not contain it.
y_train = train_df['price']

# Stack the training rows (without `price`) on top of the test rows.
# `sort=True` is passed explicitly. The two frames do not share an identical
# column set, and for that case pandas changed its default from sorting the
# columns to not sorting them (this is the FutureWarning the cell used to
# emit). Pinning the value keeps the alphabetical column order on every
# pandas version.
# Row labels are deliberately left as they are (no `ignore_index`).
train_test_combine = pd.concat(
    [train_df.drop(columns=['price']), test_df],
    axis=0,
    sort=True,
)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  


In [140]:
# Preview the stacked train + test frame.
# NOTE(review): the recorded output lists columns (des_len, general_category,
# log_price, name_len, subcategory_1, subcategory_2) that none of the cells
# shown here create, and the execution counter jumps from 58 to 137 — this
# section appears to rely on feature-engineering cells run elsewhere.
# TODO confirm the notebook still works after Restart & Run All.
train_test_combine.head()

Unnamed: 0,brand_name,category_name,des_len,general_category,id,is_train,item_condition_id,item_description,log_price,name,name_len,shipping,subcategory_1,subcategory_2
0,NoBrand,Men/Tops/T-shirts,18,Men,0,1,3,No description yet,2.397895,MLB Cincinnati Reds T Shirt Size XL,35,1,Tops,T-shirts
1,Razer,Electronics/Computers & Tablets/Components & P...,188,Electronics,1,1,3,This keyboard is in great condition and works ...,3.970292,Razer BlackWidow Chroma Keyboard,32,0,Computers & Tablets,Components & Parts
2,Target,Women/Tops & Blouses/Blouse,124,Women,2,1,1,Adorable top with a hint of lace and a key hol...,2.397895,AVA-VIV Blouse,14,1,Tops & Blouses,Blouse
3,NoBrand,Home/Home Décor/Home Décor Accents,173,Home,3,1,1,New with tags. Leather horses. Retail for [rm]...,3.583519,Leather Horse Statues,21,1,Home Décor,Home Décor Accents
4,NoBrand,Women/Jewelry/Necklaces,41,Women,4,1,1,Complete with certificate of authenticity,3.806662,24K GOLD plated rose,20,0,Jewelry,Necklaces


Convert the string columns to the pandas `category` dtype so that they can be replaced by integer codes in the next step.

In [141]:
# Text-valued columns that are turned into pandas categoricals.
text_columns = [
    'category_name',
    'item_description',
    'name',
    'brand_name',
    'general_category',
    'subcategory_1',
    'subcategory_2',
]

# Bracket indexing is used instead of attribute access so that column names
# such as `name` can never be confused with DataFrame attributes.
for text_col in text_columns:
    train_test_combine[text_col] = train_test_combine[text_col].astype('category')

cat.codes提取底層整數的方法

In [142]:
# Replace each categorical column by the integer code of its category.
# The columns must already have the `category` dtype when this runs,
# otherwise the `.cat` accessor is not available.
coded_columns = [
    'name',
    'brand_name',
    'item_description',
    'category_name',
    'general_category',
    'subcategory_1',
    'subcategory_2',
]

for coded_col in coded_columns:
    train_test_combine[coded_col] = train_test_combine[coded_col].cat.codes

In [143]:
# Check that the former text columns now hold integer codes.
train_test_combine.head()

Unnamed: 0,brand_name,category_name,des_len,general_category,id,is_train,item_condition_id,item_description,log_price,name,name_len,shipping,subcategory_1,subcategory_2
0,3357,829,18,5,0,1,3,1172053,2.397895,916335,35,1,103,774
1,3890,86,188,1,1,1,3,1585539,3.970292,1292428,32,0,30,215
2,4589,1278,124,10,2,1,1,167133,2.397895,131013,14,1,104,97
3,3357,503,173,3,3,1,1,1136643,3.583519,802671,21,1,55,410
4,3357,1205,41,10,4,1,1,531909,3.806662,65051,20,0,58,542


In [144]:
# Count the missing values in every column of the stacked frame.
missing_per_column = train_test_combine.isnull().sum()
print(missing_per_column)

brand_name                0
category_name             0
des_len                   0
general_category          0
id                        0
is_train                  0
item_condition_id         0
item_description          0
log_price            693359
name                      0
name_len                  0
shipping                  0
subcategory_1             0
subcategory_2             0
dtype: int64


In [145]:
# `log_price` is the only column with missing values in the stacked frame,
# so it is removed before the frame is used as model input.
columns = ['log_price']

# The frame is rebound rather than modified with `inplace=True`, and an
# already-missing column is tolerated, so running this cell a second time
# is a no-op instead of a KeyError.
train_test_combine = train_test_combine.drop(columns=columns, errors='ignore')

成功轉換

In [146]:
# Inspect the dtype of every remaining column after the encoding step.
train_test_combine.dtypes

brand_name           int16
category_name        int16
des_len              int64
general_category      int8
id                   int64
is_train             int64
item_condition_id    int32
item_description     int32
name                 int32
name_len             int64
shipping              int8
subcategory_1         int8
subcategory_2        int16
dtype: object

再把他分開

In [147]:
# Undo the stacking: rows flagged 1 become the training frame again,
# rows flagged 0 the test frame.
origin_flag = train_test_combine['is_train']
train_df = train_test_combine[origin_flag == 1]
test_df = train_test_combine[origin_flag == 0]

In [148]:
# The origin flag has served its purpose; remove it from both frames.
train_df = train_df.drop(columns=['is_train'])
test_df = test_df.drop(columns=['is_train'])

In [149]:
# Preview the training frame after the split.
train_df.head()

Unnamed: 0,brand_name,category_name,des_len,general_category,id,item_condition_id,item_description,name,name_len,shipping,subcategory_1,subcategory_2
0,3357,829,18,5,0,3,1172053,916335,35,1,103,774
1,3890,86,188,1,1,3,1585539,1292428,32,0,30,215
2,4589,1278,124,10,2,1,167133,131013,14,1,104,97
3,3357,503,173,3,3,1,1136643,802671,21,1,55,410
4,3357,1205,41,10,4,1,531909,65051,20,0,58,542


In [150]:
# Preview the test frame after the split.
test_df.head()

Unnamed: 0,brand_name,category_name,des_len,general_category,id,item_condition_id,item_description,name,name_len,shipping,subcategory_1,subcategory_2
0,3357,1206,6,10,0,1,1395407,323913,40,1,58,667
1,3357,900,251,7,1,1,95839,65692,40,1,72,701
2,1094,1001,55,9,2,1,383739,410310,9,1,7,382
3,3357,1257,67,10,3,2,34209,544668,13,0,97,166
4,3357,861,167,7,4,3,1316300,810652,16,1,14,662


In [151]:
# (rows, columns) of the training frame.
train_df.shape

(1482535, 12)

In [152]:
# (rows, columns) of the test frame; the column count should match the training frame.
test_df.shape

(693359, 12)

In [154]:
# NOTE(review): dead code kept for reference only. `price` is removed from the
# training frame before the train/test frames are stacked, and the target is
# captured in `y_train` at that point, so `train_df.price` no longer exists here.
#train_df['price'] = train_df.price
#x_train, y_train = train_df.drop(['price'] , axis = 1) , train_df.price

In [157]:
# Feature matrix: the encoded training frame as-is.
# NOTE(review): this frame still contains the `id` column, so the row
# identifier is fed to the model as a feature — confirm that is intended.
x_train = train_df

# Target: natural log of the price for strictly positive prices; any other
# value is passed through unchanged.
# NOTE(review): this rebinds `y_train` from itself, so running the cell twice
# applies the log twice.
positive_price = y_train > 0
y_train = y_train.mask(positive_price, np.log(y_train.where(positive_price)))

隨機森林開始!

In [158]:
%%time
rf = RandomForestRegressor(n_jobs = -1, min_samples_leaf = 3 , n_estimators = 200)
rf.fit(x_train, y_train)
rf.score(x_train, y_train)

0.8391567698255982

最後匯出資料

In [159]:
# Predict on the test rows and undo the log transform applied to the
# training target.
predict_df = pd.Series(np.exp(rf.predict(test_df)))

# Pair ids and predictions by position.
# Concatenating the two Series along axis=1 aligns them on index labels, which
# only lines up when the test rows happen to be labelled 0..n-1; any other
# labelling would silently produce NaNs and shifted rows. Using the raw values
# removes that dependency on the index.
submission = pd.DataFrame(
    {
        'test_id': test_df['id'].values,
        'price': predict_df.values,
    },
    columns=['test_id', 'price'],
)
submission.to_csv('data/y_pred_rf.csv', index=False)