# [Mercari Price Suggestion Challenge](https://www.kaggle.com/c/mercari-price-suggestion-challenge)
Can you automatically suggest product prices to online sellers?

# Import packages

In [1]:
import pandas as pd #data processing
import numpy as np #linear algebra

%matplotlib inline
import matplotlib.pyplot as plt #commonly used visualization tool
import seaborn as sns #new visualization tool
import pydot

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_log_error
from sklearn.model_selection import train_test_split, GridSearchCV #hyper-parameter tuning
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import export_graphviz

# Import data

In [2]:
%%time
train_df = pd.read_csv("data/train_df.csv", low_memory= True)
test_df = pd.read_csv("data/test_df.csv", low_memory= True)

CPU times: user 10.6 s, sys: 1.29 s, total: 11.9 s
Wall time: 11 s


In [3]:
train_df.head()

Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description,general_category,subcategory_1,...,item_description_char_count,item_description_word_count,item_description_word_density,name_char_count,name_word_count,name_word_density,item_description_top_tfidf_word,item_description_top_tfidf_value,name_top_tfidf_word,name_top_tfidf_value
0,0,MLB Cincinnati Reds T Shirt Size XL,3,Men/Tops/T-shirts,NoBrand,10.0,1,No description yet,Men,Tops,...,18,3,4.5,35,7,4.375,,0.0,,0.0
1,1,Razer BlackWidow Chroma Keyboard,3,Electronics/Computers & Tablets/Components & P...,Razer,52.0,0,This keyboard is in great condition and works ...,Electronics,Computers & Tablets,...,188,36,5.081081,32,4,6.4,test,7.723773,,0.0
2,2,AVA-VIV Blouse,1,Women/Tops & Blouses/Blouse,Target,10.0,1,Adorable top with a hint of lace and a key hol...,Women,Tops & Blouses,...,124,29,4.133333,14,2,4.666667,ink,7.717062,,0.0
3,3,Leather Horse Statues,1,Home/Home Décor/Home Décor Accents,NoBrand,35.0,1,New with tags. Leather horses. Retail for [rm]...,Home,Home Décor,...,173,32,5.242424,21,3,5.25,,0.0,,0.0
4,4,24K GOLD plated rose,1,Women/Jewelry/Necklaces,NoBrand,44.0,0,Complete with certificate of authenticity,Women,Jewelry,...,41,5,6.833333,20,4,4.0,,0.0,,0.0


In [4]:
train_df.columns

Index(['train_id', 'name', 'item_condition_id', 'category_name', 'brand_name',
       'price', 'shipping', 'item_description', 'general_category',
       'subcategory_1', 'subcategory_2', 'item_description_char_count',
       'item_description_word_count', 'item_description_word_density',
       'name_char_count', 'name_word_count', 'name_word_density',
       'item_description_top_tfidf_word', 'item_description_top_tfidf_value',
       'name_top_tfidf_word', 'name_top_tfidf_value'],
      dtype='object')

# Predictive Modeling

change train_id and test_id to id

In [5]:
train_df.rename(columns = {'train_id' : 'id'}, inplace=True)
test_df.rename(columns = {"test_id" : "id"}, inplace=True)

In [6]:
train_df.columns

Index(['id', 'name', 'item_condition_id', 'category_name', 'brand_name',
       'price', 'shipping', 'item_description', 'general_category',
       'subcategory_1', 'subcategory_2', 'item_description_char_count',
       'item_description_word_count', 'item_description_word_density',
       'name_char_count', 'name_word_count', 'name_word_density',
       'item_description_top_tfidf_word', 'item_description_top_tfidf_value',
       'name_top_tfidf_word', 'name_top_tfidf_value'],
      dtype='object')

convert string to category

In [7]:
%%time
def str_to_cat(data, col=["name", "brand_name", "category_name", "item_description", "general_category", "subcategory_1", "subcategory_2", "item_description_top_tfidf_word", "name_top_tfidf_word"]):
    for i in col:
        data[i] = data[i].astype("category").cat.codes
    return data[col]

str_to_cat(train_df)
str_to_cat(test_df)

CPU times: user 21.5 s, sys: 816 ms, total: 22.3 s
Wall time: 18.1 s


Unnamed: 0,name,brand_name,category_name,item_description,general_category,subcategory_1,subcategory_2,item_description_top_tfidf_word,name_top_tfidf_word
0,110376,2490,1119,457436,10,58,626,10,9
1,21757,2490,826,31753,7,72,657,10,9
2,140281,814,922,125968,9,7,355,10,9
3,186403,2490,1170,11516,10,97,158,10,9
4,277503,2490,790,431600,7,14,621,1,9
...,...,...,...,...,...,...,...,...,...
693354,433680,2490,443,207449,3,55,381,9,9
693355,242028,1616,31,272071,0,63,473,219,9
693356,204562,2490,79,378171,1,23,165,10,9
693357,228765,2490,1188,207686,10,99,215,10,9


In [8]:
train_df.dtypes

id                                    int64
name                                  int32
item_condition_id                     int64
category_name                         int16
brand_name                            int16
price                               float64
shipping                              int64
item_description                      int32
general_category                       int8
subcategory_1                          int8
subcategory_2                         int16
item_description_char_count           int64
item_description_word_count           int64
item_description_word_density       float64
name_char_count                       int64
name_word_count                       int64
name_word_density                   float64
item_description_top_tfidf_word       int16
item_description_top_tfidf_value    float64
name_top_tfidf_word                   int16
name_top_tfidf_value                float64
dtype: object

In [9]:
train_df.head()

Unnamed: 0,id,name,item_condition_id,category_name,brand_name,price,shipping,item_description,general_category,subcategory_1,...,item_description_char_count,item_description_word_count,item_description_word_density,name_char_count,name_word_count,name_word_density,item_description_top_tfidf_word,item_description_top_tfidf_value,name_top_tfidf_word,name_top_tfidf_value
0,0,640809,3,808,3074,10.0,1,806610,5,103,...,18,3,4.5,35,7,4.375,10,0.0,9,0.0
1,1,903932,3,86,3558,52.0,0,1090878,1,30,...,188,36,5.081081,32,4,6.4,212,7.723773,9,0.0
2,2,91532,1,1255,4181,10.0,1,115289,10,104,...,124,29,4.133333,14,2,4.666667,103,7.717062,9,0.0
3,3,561143,1,485,3074,35.0,1,782305,3,55,...,173,32,5.242424,21,3,5.25,10,0.0,9,0.0
4,4,45483,1,1182,3074,44.0,0,366652,10,58,...,41,5,6.833333,20,4,4.0,10,0.0,9,0.0


In [10]:
test_df.head()

Unnamed: 0,id,name,item_condition_id,category_name,brand_name,shipping,item_description,general_category,subcategory_1,subcategory_2,item_description_char_count,item_description_word_count,item_description_word_density,name_char_count,name_word_count,name_word_density,item_description_top_tfidf_word,item_description_top_tfidf_value,name_top_tfidf_word,name_top_tfidf_value
0,0,110376,1,1119,2490,1,457436,10,58,626,6,2,2.0,40,8,4.444444,10,0.0,9,0.0
1,1,21757,1,826,2490,1,31753,7,72,657,251,38,6.435897,40,7,5.0,10,0.0,9,0.0
2,2,140281,1,922,814,1,125968,9,7,355,55,11,4.583333,9,2,3.0,10,0.0,9,0.0
3,3,186403,2,1170,2490,0,11516,10,97,158,67,10,6.090909,13,2,4.333333,10,0.0,9,0.0
4,4,277503,3,790,2490,1,431600,7,14,621,167,29,5.566667,16,3,4.0,1,7.696101,9,0.0


In [11]:
train_df.shape

(1482535, 21)

In [12]:
test_df.shape

(693359, 20)

In [13]:
train_df.var().map('{:.2f}'.format)

id                                  183159292396.39
name                                124109338563.97
item_condition_id                              0.82
category_name                             219013.71
brand_name                               1129465.33
price                                       1488.88
shipping                                       0.25
item_description                    129805895968.83
general_category                              15.91
subcategory_1                               1267.29
subcategory_2                              63921.71
item_description_char_count                30431.55
item_description_word_count                  923.77
item_description_word_density                  0.78
name_char_count                               83.99
name_word_count                                2.74
name_word_density                              0.84
item_description_top_tfidf_word             2842.52
item_description_top_tfidf_value              12.79
name_top_tfi

In [14]:
X = train_df.drop("price", axis=1)
y = train_df["price"]

In [15]:
scaler = StandardScaler()
X_scaled = X.copy()
test_df_scaled = test_df.copy()

X_scaled[X_scaled.columns] = scaler.fit_transform(X[X.columns])
test_df_scaled[test_df_scaled.columns] = scaler.fit_transform(test_df[test_df.columns])

In [16]:
X_scaled.var().map('{:.2f}'.format)

id                                  1.00
name                                1.00
item_condition_id                   1.00
category_name                       1.00
brand_name                          1.00
shipping                            1.00
item_description                    1.00
general_category                    1.00
subcategory_1                       1.00
subcategory_2                       1.00
item_description_char_count         1.00
item_description_word_count         1.00
item_description_word_density       1.00
name_char_count                     1.00
name_word_count                     1.00
name_word_density                   1.00
item_description_top_tfidf_word     1.00
item_description_top_tfidf_value    1.00
name_top_tfidf_word                 1.00
name_top_tfidf_value                1.00
dtype: object

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

x_train = train_df
y_train = y_train.apply(lambda x: np.log(x) if x > 0 else x)

## Random Forest

In [18]:
%%time
rfr = RandomForestRegressor(n_jobs = -1, min_samples_leaf = 3 , n_estimators = 1000)
rfr.fit(X_train, y_train)

CPU times: user 9h 57min 26s, sys: 2min 58s, total: 10h 24s
Wall time: 1h 25min 17s


RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=3, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=1000,
                      n_jobs=-1, oob_score=False, random_state=None, verbose=0,
                      warm_start=False)

In [19]:
rfr.score(X_train, y_train)

0.7585515314858644

In [20]:
mean_squared_log_error(y_test, rfr.predict(X_test))

0.3316270147660772

In [23]:
tree = rfr.estimators_[0]

In [26]:
%%time
export_graphviz(tree, out_file = 'tree.dot', feature_names = X_scaled.columns, rounded = True, precision = 1)
(graph, ) = pydot.graph_from_dot_file('tree.dot')

In [None]:
graph.write_png('tree.png')

In [27]:
importances = list(rfr.feature_importances_)
feature_importances = [(feature, round(importance, 2)) for feature, importance in zip(X_scaled.columns, importances)]
feature_importances = sorted(feature_importances, key = lambda x: x[1], reverse = True)
[print('Variable: {:20} Importance: {}'.format(*pair)) for pair in feature_importances];

Variable: category_name        Importance: 0.16
Variable: brand_name           Importance: 0.15
Variable: name                 Importance: 0.11
Variable: item_description     Importance: 0.07
Variable: id                   Importance: 0.06
Variable: subcategory_2        Importance: 0.06
Variable: item_description_char_count Importance: 0.06
Variable: item_description_word_density Importance: 0.06
Variable: name_word_density    Importance: 0.05
Variable: item_condition_id    Importance: 0.04
Variable: name_char_count      Importance: 0.04
Variable: subcategory_1        Importance: 0.03
Variable: item_description_word_count Importance: 0.03
Variable: name_word_count      Importance: 0.02
Variable: item_description_top_tfidf_word Importance: 0.02
Variable: item_description_top_tfidf_value Importance: 0.02
Variable: shipping             Importance: 0.01
Variable: name_top_tfidf_value Importance: 0.01
Variable: general_category     Importance: 0.0
Variable: name_top_tfidf_word  Importance: 

# Output submission file

In [21]:
y_pred_rfr = rfr.predict(test_df_scaled)
y_pred_rfr = pd.Series(y_pred_rfr)

submission = pd.concat([test_df.id , y_pred_rfr] , axis = 1)
submission.columns = ["test_id" , "price"]
submission.to_csv("data/y_pred_rf.csv" , index = False)