# [Mercari Price Suggestion Challenge](https://www.kaggle.com/c/mercari-price-suggestion-challenge)
Can you automatically suggest product prices to online sellers?

# Import packages

In [1]:
import pandas as pd #data processing
import numpy as np #linear algebra
from scipy import stats
from scipy.cluster.hierarchy import linkage, dendrogram

%matplotlib inline
import matplotlib.pyplot as plt #commonly used visualization tool
import seaborn as sns #new visualization tool
from string import ascii_letters
from wordcloud import WordCloud

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV #hyper-parameter tuning
from sklearn.ensemble import RandomForestRegressor

# Import data

set train and test data, split by tab because of the tsv format.

In [2]:
%%time
train_df = pd.read_csv("data/train.tsv", delimiter="\t", low_memory= True)
test_df = pd.read_csv("data/test.tsv", delimiter="\t", low_memory= True)

CPU times: user 7.21 s, sys: 765 ms, total: 7.97 s
Wall time: 7.12 s


In [3]:
train_df.head()

Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description
0,0,MLB Cincinnati Reds T Shirt Size XL,3,Men/Tops/T-shirts,,10.0,1,No description yet
1,1,Razer BlackWidow Chroma Keyboard,3,Electronics/Computers & Tablets/Components & P...,Razer,52.0,0,This keyboard is in great condition and works ...
2,2,AVA-VIV Blouse,1,Women/Tops & Blouses/Blouse,Target,10.0,1,Adorable top with a hint of lace and a key hol...
3,3,Leather Horse Statues,1,Home/Home Décor/Home Décor Accents,,35.0,1,New with tags. Leather horses. Retail for [rm]...
4,4,24K GOLD plated rose,1,Women/Jewelry/Necklaces,,44.0,0,Complete with certificate of authenticity


In [4]:
train_df.columns

Index(['train_id', 'name', 'item_condition_id', 'category_name', 'brand_name',
       'price', 'shipping', 'item_description'],
      dtype='object')

### converting column types

In [5]:
print(train_df.dtypes)
print("------------")
print(test_df.dtypes)

train_id               int64
name                  object
item_condition_id      int64
category_name         object
brand_name            object
price                float64
shipping               int64
item_description      object
dtype: object
------------
test_id               int64
name                 object
item_condition_id     int64
category_name        object
brand_name           object
shipping              int64
item_description     object
dtype: object


In [6]:
train_df["item_condition_id"] = train_df["item_condition_id"].astype("int32")
train_df["shipping"] = train_df["shipping"].astype("int8")

test_df["item_condition_id"] = test_df["item_condition_id"].astype("int32")
test_df["shipping"] = test_df["shipping"].astype("int8")

In [7]:
print(train_df.dtypes)
print("------------")
print(test_df.dtypes)

train_id               int64
name                  object
item_condition_id      int32
category_name         object
brand_name            object
price                float64
shipping                int8
item_description      object
dtype: object
------------
test_id               int64
name                 object
item_condition_id     int32
category_name        object
brand_name           object
shipping               int8
item_description     object
dtype: object


do a little exploring on discriptive statistics

In [8]:
print("train_df shape: {}\ntest_df shape: {}".format(train_df.shape, test_df.shape))

train_df shape: (1482535, 8)
test_df shape: (693359, 7)


In [9]:
pd.set_option("float_format", "{:f}".format)
train_df.describe()

Unnamed: 0,train_id,item_condition_id,price,shipping
count,1482535.0,1482535.0,1482535.0,1482535.0
mean,741267.0,1.90738,26.737516,0.447274
std,427971.135004,0.903159,38.586066,0.497212
min,0.0,1.0,0.0,0.0
25%,370633.5,1.0,10.0,0.0
50%,741267.0,2.0,17.0,0.0
75%,1111900.5,3.0,29.0,1.0
max,1482534.0,5.0,2009.0,1.0


checking individual values

In [10]:
train_df.apply(lambda x: x.nunique())

train_id             1482535
name                 1225273
item_condition_id          5
category_name           1287
brand_name              4809
price                    828
shipping                   2
item_description     1281426
dtype: int64

# data preprocessing

### checking missing values

found missing values in category_name, brand_name, and item_discription

In [11]:
print(train_df.isnull().sum()[train_df.isnull().sum() != 0])
print("------------")
print(test_df.isnull().sum()[test_df.isnull().sum() != 0])

category_name         6327
brand_name          632682
item_description         4
dtype: int64
------------
category_name      3058
brand_name       295525
dtype: int64


- Fill products with no brand name with 'NoBrand'
- Fill products with no category name with 'No/No/No'
- Fill products with no item descriptions with 'No description yet' (same as the first data)

In [12]:
train_df["brand_name"] = train_df["brand_name"].fillna("NoBrand")
test_df["brand_name"] = test_df["brand_name"].fillna("NoBrand")

train_df["category_name"] = train_df["category_name"].fillna("No/No/No")
test_df["category_name"] = test_df["category_name"].fillna("No/No/No")

train_df["item_description"] = train_df["item_description"].fillna("No description yet")

In [13]:
print(train_df.isnull().sum()[train_df.isnull().sum() != 0])
print("------------")
print(test_df.isnull().sum()[test_df.isnull().sum() != 0])

Series([], dtype: int64)
------------
Series([], dtype: int64)


# feature engineering

extracting data from category_name, brand_name, and item_discription

split category_name into:
- general_category
- subcategory_1
- subcategory_2

In [14]:
split_category_name = train_df["category_name"].str.split("/", n = 2, expand = True)
split_category_name

Unnamed: 0,0,1,2
0,Men,Tops,T-shirts
1,Electronics,Computers & Tablets,Components & Parts
2,Women,Tops & Blouses,Blouse
3,Home,Home Décor,Home Décor Accents
4,Women,Jewelry,Necklaces
...,...,...,...
1482530,Women,Dresses,Mid-Calf
1482531,Kids,Girls 2T-5T,Dresses
1482532,Sports & Outdoors,Exercise,Fitness accessories
1482533,Home,Home Décor,Home Décor Accents


In [15]:
train_df["general_category"] = split_category_name[0]
train_df["subcategory_1"] = split_category_name[1]
train_df["subcategory_2"] = split_category_name[2]

In [16]:
split_category_name_2 = test_df["category_name"].str.split("/", n = 2, expand = True)
split_category_name_2

Unnamed: 0,0,1,2
0,Women,Jewelry,Rings
1,Other,Office supplies,Shipping Supplies
2,Vintage & Collectibles,Bags and Purses,Handbag
3,Women,Sweaters,Cardigan
4,Other,Books,Religion & Spirituality
...,...,...,...
693354,Home,Home Décor,Home Décor Accents
693355,Beauty,Makeup,Makeup Sets
693356,Electronics,Cell Phones & Accessories,"Cases, Covers & Skins"
693357,Women,Swimwear,Cover-Ups


In [17]:
test_df["general_category"] = split_category_name_2[0]
test_df["subcategory_1"] = split_category_name_2[1]
test_df["subcategory_2"] = split_category_name_2[2]

In [18]:
#train_df.drop(columns =["category_name"], inplace = True) 

#test_df.drop(columns =["category_name"], inplace = True) 

dealing with item_description
<br>[Extensive Text Data Feature Engineering](https://www.kaggle.com/shivamb/extensive-text-data-feature-engineering)
- character length
- word count
- word density
- puncutation count

In [19]:
train_df["char_count"] = train_df["item_description"].apply(len)
train_df["word_count"] = train_df["item_description"].apply(lambda x: len(x.split()))
train_df["word_density"] = train_df["char_count"] / (train_df["word_count"]+1)

test_df["char_count"] = test_df["item_description"].apply(len)
test_df["word_count"] = test_df["item_description"].apply(lambda x: len(x.split()))
test_df["word_density"] = test_df["char_count"] / (test_df["word_count"]+1)

[CountVectorizer, TfidfVectorizer, Predict Comments](https://www.kaggle.com/adamschroeder/countvectorizer-tfidfvectorizer-predict-comments)
- TFIDF

In [20]:
vectorizer = TfidfVectorizer(min_df=3, max_features=2500, dtype=np.float32, 
                             strip_accents="unicode", analyzer="word", ngram_range=(1, 3), stop_words={"english", "rm", "co"})

In [21]:
%%time
description_text = list(train_df["item_description"].values)
tfidf_matrix = vectorizer.fit_transform(description_text)

CPU times: user 4min 41s, sys: 20.7 s, total: 5min 2s
Wall time: 5min 20s


In [22]:
tfidf_matrix

<1482535x2500 sparse matrix of type '<class 'numpy.float32'>'
	with 33068128 stored elements in Compressed Sparse Row format>

In [23]:
tfidf_matrix.shape

(1482535, 2500)

[Using TfidfVectorizer output to create columns in a pandas df](https://www.reddit.com/r/learnpython/comments/7aduzh/using_tfidfvectorizer_output_to_create_columns_in/)

the learned corpus vocabulary

In [24]:
vectorizer.vocabulary_

{'no': 1392,
 'description': 592,
 'yet': 2462,
 'no description': 1394,
 'description yet': 593,
 'no description yet': 1395,
 'this': 2162,
 'is': 1034,
 'in': 974,
 'great': 866,
 'condition': 512,
 'and': 122,
 'works': 2432,
 'like': 1159,
 'it': 1058,
 'came': 407,
 'out': 1534,
 'of': 1442,
 'the': 2114,
 'box': 326,
 'all': 93,
 'are': 181,
 'tested': 2098,
 'work': 2429,
 'perfectly': 1593,
 'lights': 1157,
 'via': 2314,
 'app': 173,
 'on': 1465,
 'your': 2491,
 'pc': 1580,
 'is in': 1042,
 'in great': 982,
 'great condition': 867,
 'condition and': 513,
 'out of': 1538,
 'of the': 1448,
 'the box': 2119,
 'all of': 102,
 'on your': 1483,
 'is in great': 1044,
 'in great condition': 983,
 'adorable': 85,
 'top': 2221,
 'with': 2392,
 'lace': 1117,
 'key': 1101,
 'hole': 943,
 'back': 228,
 'pink': 1617,
 '1x': 18,
 'also': 110,
 'have': 909,
 '3x': 48,
 'available': 222,
 'white': 2367,
 'top with': 2226,
 'in the': 997,
 'the back': 2115,
 'also have': 111,
 'available in': 2

In [25]:
# create a dictionary mapping the tokens to their tfidf values
tfidf = dict(zip(vectorizer.get_feature_names(), vectorizer.idf_))
tfidf_df = pd.DataFrame(columns=["description_text_tfidf"]).from_dict(dict(tfidf), orient='index')
tfidf_df.columns = ["description_text_tfidf"]

In [26]:
tfidf_df.shape

(2500, 1)

In [27]:
# features with highest tf-idf (in title)
top_n_tfidf = tfidf_df.sort_values(by=["description_text_tfidf"], ascending=False).head(100)
top_n_tfidf

Unnamed: 0,description_text_tfidf
pipes,9.532511
note note,8.140242
teeth,7.754545
s6 edge,7.723212
generation,7.717619
...,...
peel,7.245109
max,7.245109
your skin,7.244066
bra size,7.243372


In [28]:
tfidf_df.to_dict()

{'description_text_tfidf': {'00': 6.609201908111572,
  '10': 4.2809906005859375,
  '100': 4.332520008087158,
  '100 authentic': 5.300342082977295,
  '100 brand': 6.821724891662598,
  '100 brand new': 6.82628870010376,
  '100 cotton': 6.781996250152588,
  '100 polyester': 7.100942611694336,
  '11': 5.499847412109375,
  '12': 4.786893844604492,
  '13': 5.896909236907959,
  '14': 5.570263862609863,
  '14k': 7.381226539611816,
  '15': 5.58800745010376,
  '16': 5.642789363861084,
  '17': 6.400447368621826,
  '18': 5.509363174438477,
  '19': 6.9482550621032715,
  '1x': 6.629848003387451,
  '20': 5.517424583435059,
  '2016': 6.845222473144531,
  '2017': 6.940276622772217,
  '21': 5.612709999084473,
  '22': 6.4055399894714355,
  '23': 7.027264595031738,
  '24': 5.367812156677246,
  '24 hours': 6.640998363494873,
  '25': 5.740413665771484,
  '26': 6.555619239807129,
  '27': 6.744839668273926,
  '28': 6.482945919036865,
  '29': 6.924255847930908,
  '2nd': 6.892719745635986,
  '2t': 7.11062192916

In [29]:
%%time
top_tfidf_word = []
top_tfidf_value = []
for i in range(len(list(train_df["item_description"].values))):
    for word in top_n_tfidf.index:
        if word in train_df["item_description"][i]:
            #print("YES, #", i, ":", word, "is in", train_df["item_description"][i])
            chosen_word = word
            chosen_tfidf = float(top_n_tfidf.loc[word])
            #print(chosen_word, chosen_tfidf)
            #print(type(chosen_word), type(chosen_tfidf))
            top_tfidf_word.insert(i, chosen_word)
            top_tfidf_value.insert(i, chosen_tfidf)
            break
        else:
            chosen_word = "None"
            chosen_tfidf = 0.0
            top_tfidf_word.insert(i, chosen_word)
            top_tfidf_value.insert(i, chosen_tfidf)
            continue
    #if i > 30:
        #break

KeyboardInterrupt: 

In [None]:
%%time
top_tfidf_word_2 = []
top_tfidf_value_2 = []
for i in range(len(list(test_df["item_description"].values))):
    for word in top_n_tfidf.index:
        if word in test_df["item_description"][i]:
            #print("YES, #", i, ":", word, "is in", train_df["item_description"][i])
            chosen_word = word
            chosen_tfidf = float(top_n_tfidf.loc[word])
            #print(chosen_word, chosen_tfidf)
            #print(type(chosen_word), type(chosen_tfidf))
            top_tfidf_word_2.insert(i, chosen_word)
            top_tfidf_value_2.insert(i, chosen_tfidf)
            break
        else:
            chosen_word = "None"
            chosen_tfidf = 0.0
            top_tfidf_word_2.insert(i, chosen_word)
            top_tfidf_value_2.insert(i, chosen_tfidf)
            continue

In [None]:
train_df["top_tfidf_word"] = top_tfidf_word
train_df["top_tfidf_value"] = top_tfidf_value

test_df["top_tfidf_word"] = top_tfidf_word_2
test_df["top_tfidf_value"] = top_tfidf_value_2

view the data again

In [None]:
train_df.head()

In [None]:
test_df.head()

checking the shape again

In [None]:
print('Train shape: {}\nTest shape: {}'.format(train_df.shape, test_df.shape))

In [None]:
train_df.to_csv("data/train_df.csv", index=False)
test_df.to_csv("data/test_df.csv", index=False)