# [Mercari Price Suggestion Challenge](https://www.kaggle.com/c/mercari-price-suggestion-challenge)
Can you automatically suggest product prices to online sellers?

# Import packages

In [85]:
import pandas as pd #data processing
import numpy as np #linear algebra
from scipy import stats
from scipy.cluster.hierarchy import linkage, dendrogram

%matplotlib inline
import matplotlib.pyplot as plt #commonly used visualization tool
import seaborn as sns #new visualization tool
from string import ascii_letters
from wordcloud import WordCloud

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV #hyper-parameter tuning
from sklearn.ensemble import RandomForestRegressor

# Import data

set train and test data, split by tab because of the tsv format.

In [56]:
%%time
train_df = pd.read_csv("data/train.tsv", delimiter='\t', low_memory= True)
test_df = pd.read_csv("data/test.tsv", delimiter='\t', low_memory= True)

CPU times: user 7.46 s, sys: 1.25 s, total: 8.71 s
Wall time: 8.07 s


In [57]:
train_df.head()

Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description
0,0,MLB Cincinnati Reds T Shirt Size XL,3,Men/Tops/T-shirts,,10.0,1,No description yet
1,1,Razer BlackWidow Chroma Keyboard,3,Electronics/Computers & Tablets/Components & P...,Razer,52.0,0,This keyboard is in great condition and works ...
2,2,AVA-VIV Blouse,1,Women/Tops & Blouses/Blouse,Target,10.0,1,Adorable top with a hint of lace and a key hol...
3,3,Leather Horse Statues,1,Home/Home Décor/Home Décor Accents,,35.0,1,New with tags. Leather horses. Retail for [rm]...
4,4,24K GOLD plated rose,1,Women/Jewelry/Necklaces,,44.0,0,Complete with certificate of authenticity


In [58]:
train_df.columns

Index(['train_id', 'name', 'item_condition_id', 'category_name', 'brand_name',
       'price', 'shipping', 'item_description'],
      dtype='object')

### converting column types

In [59]:
print(train_df.dtypes)
print("------------")
print(test_df.dtypes)

train_id               int64
name                  object
item_condition_id      int64
category_name         object
brand_name            object
price                float64
shipping               int64
item_description      object
dtype: object
------------
test_id               int64
name                 object
item_condition_id     int64
category_name        object
brand_name           object
shipping              int64
item_description     object
dtype: object


In [60]:
train_df["item_condition_id"] = train_df["item_condition_id"].astype("int32")
train_df["shipping"] = train_df["shipping"].astype("int8")

test_df["item_condition_id"] = test_df["item_condition_id"].astype("int32")
test_df["shipping"] = test_df["shipping"].astype("int8")

In [61]:
print(train_df.dtypes)
print("------------")
print(test_df.dtypes)

train_id               int64
name                  object
item_condition_id      int32
category_name         object
brand_name            object
price                float64
shipping                int8
item_description      object
dtype: object
------------
test_id               int64
name                 object
item_condition_id     int32
category_name        object
brand_name           object
shipping               int8
item_description     object
dtype: object


do a little exploring on discriptive statistics

In [62]:
print('train_df shape: {}\ntest_df shape: {}'.format(train_df.shape, test_df.shape))

train_df shape: (1482535, 8)
test_df shape: (693359, 7)


In [63]:
pd.set_option('float_format', '{:f}'.format)
train_df.describe()

Unnamed: 0,train_id,item_condition_id,price,shipping
count,1482535.0,1482535.0,1482535.0,1482535.0
mean,741267.0,1.90738,26.737516,0.447274
std,427971.135004,0.903159,38.586066,0.497212
min,0.0,1.0,0.0,0.0
25%,370633.5,1.0,10.0,0.0
50%,741267.0,2.0,17.0,0.0
75%,1111900.5,3.0,29.0,1.0
max,1482534.0,5.0,2009.0,1.0


checking individual values

In [64]:
train_df.apply(lambda x: x.nunique())

train_id             1482535
name                 1225273
item_condition_id          5
category_name           1287
brand_name              4809
price                    828
shipping                   2
item_description     1281426
dtype: int64

# data preprocessing

### checking missing values

found missing values in category_name, brand_name, and item_discription

In [65]:
print(train_df.isnull().sum()[train_df.isnull().sum() != 0])
print("------------")
print(test_df.isnull().sum()[test_df.isnull().sum() != 0])

category_name         6327
brand_name          632682
item_description         4
dtype: int64
------------
category_name      3058
brand_name       295525
dtype: int64


- Fill products with no brand name with 'NoBrand'
- Fill products with no category name with 'No/No/No'
- Fill products with no item descriptions with 'No description yet' (same as the first data)

In [66]:
train_df["brand_name"] = train_df["brand_name"].fillna("NoBrand")
test_df["brand_name"] = test_df["brand_name"].fillna("NoBrand")

train_df["category_name"] = train_df["category_name"].fillna("No/No/No")
test_df["category_name"] = test_df["category_name"].fillna("No/No/No")

train_df["item_description"] = train_df["item_description"].fillna("No description yet")

In [67]:
print(train_df.isnull().sum()[train_df.isnull().sum() != 0])
print("------------")
print(test_df.isnull().sum()[test_df.isnull().sum() != 0])

Series([], dtype: int64)
------------
Series([], dtype: int64)


# feature engineering

extracting data from category_name, brand_name, and item_discription

split category_name into:
- general_category
- subcategory_1
- subcategory_2

In [68]:
split_category_name = train_df["category_name"].str.split("/", n = 2, expand = True)
split_category_name

Unnamed: 0,0,1,2
0,Men,Tops,T-shirts
1,Electronics,Computers & Tablets,Components & Parts
2,Women,Tops & Blouses,Blouse
3,Home,Home Décor,Home Décor Accents
4,Women,Jewelry,Necklaces
...,...,...,...
1482530,Women,Dresses,Mid-Calf
1482531,Kids,Girls 2T-5T,Dresses
1482532,Sports & Outdoors,Exercise,Fitness accessories
1482533,Home,Home Décor,Home Décor Accents


In [69]:
train_df['general_category'] = split_category_name[0]
train_df['subcategory_1'] = split_category_name[1]
train_df['subcategory_2'] = split_category_name[2]

In [70]:
split_category_name_2 = test_df["category_name"].str.split("/", n = 2, expand = True)
split_category_name_2

Unnamed: 0,0,1,2
0,Women,Jewelry,Rings
1,Other,Office supplies,Shipping Supplies
2,Vintage & Collectibles,Bags and Purses,Handbag
3,Women,Sweaters,Cardigan
4,Other,Books,Religion & Spirituality
...,...,...,...
693354,Home,Home Décor,Home Décor Accents
693355,Beauty,Makeup,Makeup Sets
693356,Electronics,Cell Phones & Accessories,"Cases, Covers & Skins"
693357,Women,Swimwear,Cover-Ups


In [71]:
test_df['general_category'] = split_category_name_2[0]
test_df['subcategory_1'] = split_category_name_2[1]
test_df['subcategory_2'] = split_category_name_2[2]

In [72]:
train_df.drop(columns =["category_name"], inplace = True) 

test_df.drop(columns =["category_name"], inplace = True) 

dealing with item_description
<br>[Extensive Text Data Feature Engineering](https://www.kaggle.com/shivamb/extensive-text-data-feature-engineering)
- character length
- word count
- word density
- puncutation count

In [82]:
train_df["char_count"] = train_df["item_description"].apply(len)
train_df["word_count"] = train_df["item_description"].apply(lambda x: len(x.split()))
train_df['word_density'] = train_df['char_count'] / (train_df['word_count']+1)

test_df["char_count"] = test_df["item_description"].apply(len)
test_df["word_count"] = test_df["item_description"].apply(lambda x: len(x.split()))
test_df['word_density'] = test_df['char_count'] / (test_df['word_count']+1)

[CountVectorizer, TfidfVectorizer, Predict Comments](https://www.kaggle.com/adamschroeder/countvectorizer-tfidfvectorizer-predict-comments)
<br>[Feature Engineering + Tfidfvectorizer](https://www.kaggle.com/anu0012/feature-engineering-tfidfvectorizer)
- TFIDF

In [118]:
vectorizer = TfidfVectorizer(min_df=3, max_features=2500, dtype=np.float32, 
                             strip_accents='unicode', analyzer='word', ngram_range=(1, 3), stop_words={'english', 'rm'})

In [119]:
%%time
description_text = list(train_df["item_description"].values)
tfidf_matrix = vectorizer.fit_transform(description_text)

In [120]:
tfidf_matrix

<1482535x2500 sparse matrix of type '<class 'numpy.float32'>'
	with 33068023 stored elements in Compressed Sparse Row format>

In [121]:
tfidf_matrix.shape

(1482535, 2500)

The learned corpus vocabulary

In [122]:
vectorizer.vocabulary_

{'no': 1393,
 'description': 593,
 'yet': 2462,
 'no description': 1395,
 'description yet': 594,
 'no description yet': 1396,
 'this': 2162,
 'is': 1035,
 'in': 975,
 'great': 867,
 'condition': 513,
 'and': 122,
 'works': 2432,
 'like': 1160,
 'it': 1059,
 'came': 407,
 'out': 1535,
 'of': 1443,
 'the': 2114,
 'box': 326,
 'all': 93,
 'are': 181,
 'tested': 2098,
 'work': 2429,
 'perfectly': 1594,
 'lights': 1158,
 'via': 2314,
 'app': 173,
 'on': 1466,
 'your': 2491,
 'pc': 1581,
 'is in': 1043,
 'in great': 983,
 'great condition': 868,
 'condition and': 514,
 'out of': 1539,
 'of the': 1449,
 'the box': 2119,
 'all of': 102,
 'on your': 1484,
 'is in great': 1045,
 'in great condition': 984,
 'adorable': 85,
 'top': 2221,
 'with': 2392,
 'lace': 1118,
 'key': 1102,
 'hole': 944,
 'back': 228,
 'pink': 1618,
 '1x': 18,
 'also': 110,
 'have': 910,
 '3x': 48,
 'available': 222,
 'white': 2367,
 'top with': 2226,
 'in the': 998,
 'the back': 2115,
 'also have': 111,
 'available in': 2

In [123]:
#  create a dictionary mapping the tokens to their tfidf values
tfidf = dict(zip(vectorizer.get_feature_names(), vectorizer.idf_))
tfidf = pd.DataFrame(columns=['description_text_tfidf']).from_dict(dict(tfidf), orient='index')
tfidf.columns = ['description_text_tfidf']

In [124]:
# features with highest tf-idf (in title)
tfidf.sort_values(by=['description_text_tfidf'], ascending=False).head(20)

Unnamed: 0,description_text_tfidf
pipes,9.532511
note note,8.140242
teeth,7.754545
s6 edge,7.723212
generation,7.717619
ipad mini,7.703223
acne,7.570585
ipod touch,7.496821
playstation,7.494587
labels,7.466429


view the data again

In [83]:
train_df.head()

Unnamed: 0,train_id,name,item_condition_id,brand_name,price,shipping,item_description,general_category,subcategory_1,subcategory_2,char_count,word_count,word_density
0,0,MLB Cincinnati Reds T Shirt Size XL,3,NoBrand,10.0,1,No description yet,Men,Tops,T-shirts,18,3,4.5
1,1,Razer BlackWidow Chroma Keyboard,3,Razer,52.0,0,This keyboard is in great condition and works ...,Electronics,Computers & Tablets,Components & Parts,188,36,5.081081
2,2,AVA-VIV Blouse,1,Target,10.0,1,Adorable top with a hint of lace and a key hol...,Women,Tops & Blouses,Blouse,124,29,4.133333
3,3,Leather Horse Statues,1,NoBrand,35.0,1,New with tags. Leather horses. Retail for [rm]...,Home,Home Décor,Home Décor Accents,173,32,5.242424
4,4,24K GOLD plated rose,1,NoBrand,44.0,0,Complete with certificate of authenticity,Women,Jewelry,Necklaces,41,5,6.833333


In [84]:
test_df.head()

Unnamed: 0,test_id,name,item_condition_id,brand_name,shipping,item_description,general_category,subcategory_1,subcategory_2,char_count,word_count,word_density
0,0,"Breast cancer ""I fight like a girl"" ring",1,NoBrand,1,Size 7,Women,Jewelry,Rings,6,2,2.0
1,1,"25 pcs NEW 7.5""x12"" Kraft Bubble Mailers",1,NoBrand,1,"25 pcs NEW 7.5""x12"" Kraft Bubble Mailers Lined...",Other,Office supplies,Shipping Supplies,251,38,6.435897
2,2,Coach bag,1,Coach,1,Brand new coach bag. Bought for [rm] at a Coac...,Vintage & Collectibles,Bags and Purses,Handbag,55,11,4.583333
3,3,Floral Kimono,2,NoBrand,0,-floral kimono -never worn -lightweight and pe...,Women,Sweaters,Cardigan,67,10,6.090909
4,4,Life after Death,3,NoBrand,1,Rediscovering life after the loss of a loved o...,Other,Books,Religion & Spirituality,167,29,5.566667


checking the shape again

In [75]:
print('Train shape: {}\nTest shape: {}'.format(train_df.shape, test_df.shape))

Train shape: (1482535, 10)
Test shape: (693359, 9)
