# Product Price Suggestion

In [51]:
import pandas as pd

## Mercari data

Read the data

In [52]:
# The training file is tab-separated (.tsv), hence the explicit sep='\t'.
train_df = pd.read_csv('mercari_data/train.tsv', sep='\t')

In [53]:
train_df.head(100)

Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description
0,0,MLB Cincinnati Reds T Shirt Size XL,3,Men/Tops/T-shirts,,10.0,1,No description yet
1,1,Razer BlackWidow Chroma Keyboard,3,Electronics/Computers & Tablets/Components & P...,Razer,52.0,0,This keyboard is in great condition and works ...
2,2,AVA-VIV Blouse,1,Women/Tops & Blouses/Blouse,Target,10.0,1,Adorable top with a hint of lace and a key hol...
3,3,Leather Horse Statues,1,Home/Home Décor/Home Décor Accents,,35.0,1,New with tags. Leather horses. Retail for [rm]...
4,4,24K GOLD plated rose,1,Women/Jewelry/Necklaces,,44.0,0,Complete with certificate of authenticity
5,5,Bundled items requested for Ruie,3,Women/Other/Other,,59.0,0,"Banana republic bottoms, Candies skirt with ma..."
6,6,Acacia pacific tides santorini top,3,Women/Swimwear/Two-Piece,Acacia Swimwear,64.0,0,Size small but straps slightly shortened to fi...
7,7,Girls cheer and tumbling bundle of 7,3,Sports & Outdoors/Apparel/Girls,Soffe,6.0,1,You get three pairs of Sophie cheer shorts siz...
8,8,Girls Nike Pro shorts,3,Sports & Outdoors/Apparel/Girls,Nike,19.0,0,Girls Size small Plus green. Three shorts total.
9,9,Porcelain clown doll checker pants VTG,3,Vintage & Collectibles/Collectibles/Doll,,8.0,0,I realized his pants are on backwards after th...


1. Extract information from the item description
2. Separate categories


Separating Categories

In [54]:
from nltk.tokenize import sent_tokenize, word_tokenize 
import warnings 
warnings.filterwarnings(action = 'ignore') 
  

Change description to lower case.

Replace the placeholder text "no description yet" with an empty string ("").

In [55]:
# Normalise the free-text description before tokenising:
#   * fill missing descriptions with '' so every value is a string
#     (a float NaN is not valid input for the string tokenizer),
#   * lower-case everything,
#   * blank out the "no description yet" placeholder text.
train_df['item_description'] = train_df['item_description'].fillna('').str.lower()
train_df.loc[train_df['item_description'] == 'no description yet', 'item_description'] = ''

In [56]:
# Uncomment the next line if you have not installed nltk already
#! pip install nltk
import nltk

nltk.data.path = ['/home/jovyan/work/nltk_data']

from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer

import numpy as np
import re

"""
# Returns True if the input (string) parameter has
# any sort of letter in it, else returns False.
"""
def has_letter(x):
    return re.match('.*[a-zA-Z].*',x) != None

# Stopwords are words we will ignore for search
# purposes, because they are too common to be useful.
# One stopword per line; the ``with`` block closes the file handle as soon
# as the set has been built.
with open('stopwords.txt') as stop_file:
    stopwords = {line.strip() for line in stop_file}

# The NLTK parser breaks apostrophe-s into a separate "word"
# so we'll want to add it to the list... Though it's technically
# not a stop word in the traditional sense.
stopwords.add("'s")

# Use this as the maximum number of words we will index
MAX_WORDS = 10000

# Create the word stemmer
stemmer = PorterStemmer()

In [57]:
def clean_description(description):
    """Turn a raw item description into a list of normalised word stems.

    Pipeline: NLTK word tokenisation -> lower-casing -> stopword removal ->
    Porter stemming -> keep only stems that contain a letter.

    Parameters
    ----------
    description : str
        Free-text description. Non-string values (``None``, float ``NaN``)
        and the empty string are accepted and yield an empty list.

    Returns
    -------
    list of str
        The surviving stems, in the order they appear in the description.
    """
    # Nothing to tokenise for a missing or empty description; returning early
    # also avoids handing a non-string value to the tokenizer.
    if not isinstance(description, str) or not description:
        return []

    lowered = (token.lower() for token in nltk.word_tokenize(description))
    # Stopwords are filtered on the surface form, i.e. before stemming.
    stems = (stemmer.stem(token) for token in lowered if token not in stopwords)
    # Drop stems with no letter at all (pure punctuation, plain numbers).
    return [stem for stem in stems if has_letter(stem)]

In [58]:
# Prototype on the first 1000 rows only. `.copy()` makes short_df an
# independent frame, so the new columns assigned to it later are not written
# to a slice of train_df (the chained-assignment pitfall).
short_df = train_df.head(1000).copy()

In [59]:
# Tokenise, filter and stem every description; each row ends up with a list of stems.
raw_descriptions = short_df['item_description']
short_df['item_description_tokens'] = raw_descriptions.apply(clean_description)

In [60]:
short_df.head()

Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description,item_description_tokens
0,0,MLB Cincinnati Reds T Shirt Size XL,3,Men/Tops/T-shirts,,10.0,1,,[]
1,1,Razer BlackWidow Chroma Keyboard,3,Electronics/Computers & Tablets/Components & P...,Razer,52.0,0,this keyboard is in great condition and works ...,"[keyboard, great, condit, work, like, came, bo..."
2,2,AVA-VIV Blouse,1,Women/Tops & Blouses/Blouse,Target,10.0,1,adorable top with a hint of lace and a key hol...,"[ador, top, hint, lace, key, hole, back, pale,..."
3,3,Leather Horse Statues,1,Home/Home Décor/Home Décor Accents,,35.0,1,new with tags. leather horses. retail for [rm]...,"[new, tag, leather, hors, retail, rm, stand, f..."
4,4,24K GOLD plated rose,1,Women/Jewelry/Necklaces,,44.0,0,complete with certificate of authenticity,"[complet, certif, authent]"


In [61]:
import itertools

# Flatten every document's token list into one long stream of tokens.
combined_words = list(itertools.chain.from_iterable(short_df.item_description_tokens))
word_freq = nltk.FreqDist(combined_words)

# Rank words by (count, word) in descending order and keep the MAX_WORDS
# most frequent; a word's position in `wordids` is its vector index.
frequencies = [(count, word) for word, count in word_freq.items()]
wordids = [word for _, word in sorted(frequencies, reverse=True)][:MAX_WORDS]

# word -> index mapping. A dict lookup replaces the repeated
# `x in wordids` / `wordids.index(x)` list scans, which were quadratic in the
# vocabulary size. Iterating `combined_words` keeps the keys in order of
# first appearance, exactly as before.
word_rank = {word: position for position, word in enumerate(wordids)}
lexicon = {word: word_rank[word] for word in combined_words if word in word_rank}

In [62]:
lexicon

{'keyboard': 2736,
 'great': 10,
 'condit': 3,
 'work': 85,
 'like': 20,
 'came': 667,
 'box': 18,
 'port': 750,
 'test': 575,
 'perfectli': 756,
 'light': 46,
 'customiz': 3177,
 'via': 482,
 'razer': 2296,
 'synaps': 1991,
 'app': 1120,
 'pc': 759,
 'ador': 685,
 'top': 33,
 'hint': 1020,
 'lace': 349,
 'key': 307,
 'hole': 179,
 'back': 47,
 'pale': 2434,
 'pink': 22,
 '1x': 420,
 'also': 98,
 '3x': 3612,
 'avail': 157,
 'white': 37,
 'new': 0,
 'tag': 17,
 'leather': 101,
 'hors': 1416,
 'retail': 71,
 'rm': 6,
 'stand': 382,
 'foot': 643,
 'high': 120,
 'sold': 216,
 'pair': 88,
 'question': 87,
 'pleas': 28,
 'ask': 66,
 'free': 7,
 'ship': 2,
 'just': 39,
 'got': 224,
 'storag': 717,
 'complet': 279,
 'certif': 3297,
 'authent': 35,
 'banana': 560,
 'republ': 597,
 'bottom': 104,
 'candi': 471,
 'skirt': 436,
 'match': 271,
 'blazer': 3416,
 'ami': 3534,
 'byer': 3337,
 'suit': 581,
 'loft': 2662,
 'cami': 3324,
 'size': 1,
 'small': 23,
 'strap': 67,
 'slightli': 334,
 'shorten

In [63]:
def doc_vector(content):
    word_freq = nltk.FreqDist(content)
    vector = np.zeros(len(lexicon))
    if(content!=""):
        max_freq = max(word_freq.values())
        for word in content:
            if(word in wordids):
                vector[lexicon[word]] = float(word_freq[word])/float(max_freq)
    return vector

In [64]:
short_df['tf'] = short_df['item_description'].apply(doc_vector)

In [65]:
sum([ 1 if'great' in x else 0 for x in short_df['item_description_tokens']])

95

In [66]:
import math
from collections import Counter

# Document frequency: in how many documents does each word occur?
# One pass over the documents (counting each word once per document via
# set()) replaces a full rescan of all documents for every lexicon word.
doc_freq = Counter()
for tokens in short_df['item_description_tokens']:
    doc_freq.update(set(tokens))

# idf[i] = ln(N / df(word)) for the word whose lexicon index is i.
n_docs = len(short_df)
idf = [0] * len(lexicon)
for word, index in lexicon.items():
    idf[index] = math.log(n_docs / doc_freq[word])
idf

[1.2275826699650698,
 1.313043899380298,
 1.9173226922034008,
 1.8451602459551701,
 1.8904754421672127,
 1.9805015938249324,
 2.407945608651872,
 2.137070654516472,
 2.162823150618887,
 2.2256240518579173,
 2.3538783873815965,
 2.4769384801388235,
 2.7806208939370456,
 2.5902671654458267,
 2.5257286443082556,
 2.703062659591171,
 2.6882475738060303,
 2.5383074265151158,
 2.659260036932778,
 2.6036901857779675,
 2.5510464522925456,
 2.6736487743848776,
 2.7488721956224653,
 2.7181005369557116,
 2.9374633654300153,
 2.8302178350764176,
 2.882403588246988,
 2.995732273553991,
 2.9374633654300153,
 2.9187712324178627,
 2.9187712324178627,
 2.9374633654300153,
 2.995732273553991,
 3.0159349808715104,
 3.079113882493042,
 2.9759296462578115,
 3.123565645063876,
 3.079113882493042,
 2.995732273553991,
 3.036554268074246,
 3.170085660698769,
 3.079113882493042,
 3.0576076772720784,
 3.123565645063876,
 3.123565645063876,
 3.2188758248682006,
 3.3813947543659757,
 3.170085660698769,
 3.19418321

In [67]:
# Weight every document's tf vector element-wise by the idf vector.
# The multiplication is done per row on purpose: `Series * list` asks pandas
# to combine a len(short_df)-long Series with a len(lexicon)-long list, and
# those lengths differ.
idf_vector = np.asarray(idf)
short_df['tfidf'] = short_df['tf'].apply(lambda tf_vector: tf_vector * idf_vector)

In [68]:
short_df.tfidf[1]


array([ 0.,  0.,  0., ...,  0.,  0.,  0.])

In [69]:
# Expand the per-row tf-idf arrays into one numeric column per lexicon entry
# (column "word<k>" holds lexicon index k-1). Reusing short_df's index keeps
# the rows aligned when the two frames are concatenated with axis=1, even if
# short_df does not carry a default 0..n-1 index.
word_columns = ['word' + str(k) for k in range(1, len(lexicon) + 1)]
tfidf_df = pd.DataFrame(short_df['tfidf'].tolist(), index=short_df.index, columns=word_columns)

In [70]:
# Columns to keep: the item attributes followed by every tf-idf word column.
item_columns = ['name','item_condition_id','category_name','brand_name','price','shipping']
col_list = item_columns + ['word' + str(x) for x in range(1,len(lexicon)+1)]
# Put the tf-idf columns next to the item columns, then drop everything else.
combined_df = pd.concat([short_df, tfidf_df], axis=1, sort=False)
short_df = combined_df[col_list]

In [71]:
def splitCategory(categoryString):
    """Break a '/'-delimited category path into its individual levels.

    For example ``"Men/Tops/T-shirts"`` becomes ``["Men", "Tops", "T-shirts"]``.
    """
    levels = categoryString.split("/")
    return levels

In [72]:
# Rows without a category get the placeholder path '/ / ', so splitting them
# still produces three levels.
filled_categories = short_df['category_name'].fillna('/ / ')
short_df['category_name'] = filled_categories
short_df['categories'] = filled_categories.apply(splitCategory)

In [73]:
short_df

Unnamed: 0,name,item_condition_id,category_name,brand_name,price,shipping,word1,word2,word3,word4,...,word3764,word3765,word3766,word3767,word3768,word3769,word3770,word3771,word3772,categories
0,MLB Cincinnati Reds T Shirt Size XL,3,Men/Tops/T-shirts,,10.0,1,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"[Men, Tops, T-shirts]"
1,Razer BlackWidow Chroma Keyboard,3,Electronics/Computers & Tablets/Components & P...,Razer,52.0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"[Electronics, Computers & Tablets, Components ..."
2,AVA-VIV Blouse,1,Women/Tops & Blouses/Blouse,Target,10.0,1,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"[Women, Tops & Blouses, Blouse]"
3,Leather Horse Statues,1,Home/Home Décor/Home Décor Accents,,35.0,1,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"[Home, Home Décor, Home Décor Accents]"
4,24K GOLD plated rose,1,Women/Jewelry/Necklaces,,44.0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"[Women, Jewelry, Necklaces]"
5,Bundled items requested for Ruie,3,Women/Other/Other,,59.0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"[Women, Other, Other]"
6,Acacia pacific tides santorini top,3,Women/Swimwear/Two-Piece,Acacia Swimwear,64.0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"[Women, Swimwear, Two-Piece]"
7,Girls cheer and tumbling bundle of 7,3,Sports & Outdoors/Apparel/Girls,Soffe,6.0,1,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"[Sports & Outdoors, Apparel, Girls]"
8,Girls Nike Pro shorts,3,Sports & Outdoors/Apparel/Girls,Nike,19.0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"[Sports & Outdoors, Apparel, Girls]"
9,Porcelain clown doll checker pants VTG,3,Vintage & Collectibles/Collectibles/Doll,,8.0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"[Vintage & Collectibles, Collectibles, Doll]"


In [74]:
# Spread the first three category levels into their own columns.
# Positional `.str[i]` access works on the list values and yields NaN for a
# row with fewer than three levels; unpacking `zip(*...)` into three names
# would raise as soon as one such row exists.
for level in range(3):
    short_df['category_sub_%d' % (level + 1)] = short_df['categories'].str[level]

In [75]:
# Final feature layout: item attributes (with the three category levels in
# place of the raw category path) followed by the tf-idf word columns.
feature_columns = ['name','item_condition_id','category_sub_1','category_sub_2','category_sub_3','brand_name','price','shipping']
col_list = feature_columns + ['word' + str(x) for x in range(1,len(lexicon)+1)]
short_df = short_df[col_list]

In [76]:
short_df

Unnamed: 0,name,item_condition_id,category_sub_1,category_sub_2,category_sub_3,brand_name,price,shipping,word1,word2,...,word3763,word3764,word3765,word3766,word3767,word3768,word3769,word3770,word3771,word3772
0,MLB Cincinnati Reds T Shirt Size XL,3,Men,Tops,T-shirts,,10.0,1,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Razer BlackWidow Chroma Keyboard,3,Electronics,Computers & Tablets,Components & Parts,Razer,52.0,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,AVA-VIV Blouse,1,Women,Tops & Blouses,Blouse,Target,10.0,1,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Leather Horse Statues,1,Home,Home Décor,Home Décor Accents,,35.0,1,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,24K GOLD plated rose,1,Women,Jewelry,Necklaces,,44.0,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,Bundled items requested for Ruie,3,Women,Other,Other,,59.0,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,Acacia pacific tides santorini top,3,Women,Swimwear,Two-Piece,Acacia Swimwear,64.0,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,Girls cheer and tumbling bundle of 7,3,Sports & Outdoors,Apparel,Girls,Soffe,6.0,1,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,Girls Nike Pro shorts,3,Sports & Outdoors,Apparel,Girls,Nike,19.0,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,Porcelain clown doll checker pants VTG,3,Vintage & Collectibles,Collectibles,Doll,,8.0,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
