# Price for OfferUp

In [43]:
import graphlab
import re, string
import graphlab.aggregate as agg
# Send GraphLab Canvas visualisations to the 'ipynb' target
# (presumably inline rendering inside this notebook - confirm against the Canvas docs).
graphlab.canvas.set_target('ipynb')

## Load image analysis datasets

I've reduced the data to 6 categories in 3 groups: phones, home (Furniture, Household, Home & Garden), and apparel (Baby & Kids, Clothing & Shoes).
This dataset is already split into a training set and test set (80/20).

In [44]:
# Load the pre-split training and test SFrames, one pair per category group.
phones_train, phones_test = (
    graphlab.load_sframe('data/models/phones_train'),
    graphlab.load_sframe('data/models/phones_test'),
)

home_train, home_test = (
    graphlab.load_sframe('data/models/home_train'),
    graphlab.load_sframe('data/models/home_test'),
)

apparel_train, apparel_test = (
    graphlab.load_sframe('data/models/apparel_train'),
    graphlab.load_sframe('data/models/apparel_test'),
)

### Features

In [45]:
features_lst = ['id', 'category_id', 'category_name', 'count_words', 'tfidf', 'image', 'deep_features','price']

def features(sf, max_price=1500):
    """Build the text features for one data split and keep only the modelling columns.

    The price is cast to float and rows priced at or above ``max_price`` are
    dropped. Title and description are then merged into one text field,
    stripped of punctuation, tokenized, turned into a stop-word-free
    bag of words and finally into TF-IDF weights.

    Parameters
    ----------
    sf : graphlab.SFrame
        Input split. Must contain 'title', 'description' and 'price', plus the
        remaining columns named in ``features_lst`` that are not created here.
    max_price : float, optional
        Exclusive upper bound used to drop price outliers (default 1500).

    Returns
    -------
    graphlab.SFrame
        A new SFrame restricted to the columns in ``features_lst``.
        The SFrame passed in is not modified.
    """
    # Filter on the converted prices first; the filter yields a new SFrame,
    # so every column assignment below leaves the caller's SFrame untouched.
    in_range = sf['price'].astype(float) < max_price
    sf = sf[in_range]
    # Make price float from string
    sf['price'] = sf['price'].astype(float)

    # Combine words from title and description
    sf['words'] = sf['title'] + ' ' + sf['description']
    # Remove punctuation
    regex = re.compile('[%s]' % re.escape(string.punctuation))
    sf['words'] = sf['words'].apply(lambda x: regex.sub('', x))
    # Transform each row into an ordered list of strings using a simpler
    # version of the Penn-Tree-Bank-style (PTB-style) tokenization
    sf['words'] = graphlab.text_analytics.tokenize(sf['words'])

    # Bag-of-words
    sf['count_words'] = graphlab.text_analytics.count_words(sf['words'])
    # Text cleaning: remove stop words.
    sf['count_words'] = sf['count_words'].dict_trim_by_keys(graphlab.text_analytics.stopwords(), exclude=True)
    # TF-IDF (term frequency - inverse document frequency).
    # NOTE(review): the weights are computed from the rows of this SFrame only,
    # so separate calls for train and test use different document frequencies.
    sf['tfidf'] = graphlab.text_analytics.tf_idf(sf['count_words'])

    return sf.select_columns(features_lst)

In [46]:
# Replace every split with its feature-engineered version, one category group per line.
# NOTE(review): each name is overwritten with the output of features(), which no longer
# has 'title'/'description', so this cell cannot be re-run without reloading the data.
phones_train, phones_test = features(phones_train), features(phones_test)
home_train, home_test = features(home_train), features(home_test)
apparel_train, apparel_test = features(apparel_train), features(apparel_test)

## Work with title and description

I use topic models from GraphLab Create to assign a topic to each listing based on its title and description.

In [47]:
# Fit a topic model on the bag-of-words column of the apparel training split.
model = graphlab.topic_model.create(
    apparel_train['count_words'],
    num_topics=100,
    num_iterations=100
)

In [None]:
# Store the topic model's prediction for each apparel training row in a new 'topic' column.
apparel_train['topic'] = model.predict(apparel_train['count_words'])

In [None]:
#As with other models in GraphLab Create, it's also easy to save and load models.

#model.save('my_model')
#new_model = graphlab.load_model('my_model')

### Evaluating topic models

In [48]:
# Evaluate the topic model on the held-out apparel bag-of-words; the output below reports perplexity.
model.evaluate(apparel_test['count_words'])

{'perplexity': 885.1069232300025}

I settled on 100 topics with 100 iterations, as this configuration gives one of the best results.

In [49]:
# Learn topic model
def topic_model(sf, num_topics=100, num_iterations=100):
    """Fit a topic model on ``sf['count_words']`` and add topic columns to ``sf``.

    Adds a 'topic' column holding the model's prediction for every row, plus
    one indicator column per topic ('topic_0' ... 'topic_<num_topics - 1>')
    that is 1 when the row's predicted topic equals that index and 0 otherwise.

    Parameters
    ----------
    sf : graphlab.SFrame
        Must contain a 'count_words' bag-of-words column. It is modified in
        place and also returned.
    num_topics : int, optional
        Number of topics to learn; also the number of indicator columns
        created (default 100).
    num_iterations : int, optional
        Number of training iterations passed to the topic model (default 100).

    Returns
    -------
    graphlab.SFrame
        The same ``sf`` object with the added columns.

    Notes
    -----
    A fresh model is fitted on every call and is not returned, so the topic
    ids produced by separate calls come from different models.
    """
    model = graphlab.topic_model.create(sf['count_words'],
                                        num_topics=num_topics,
                                        num_iterations=num_iterations)
    sf['topic'] = model.predict(sf['count_words'])
    for i in xrange(num_topics):
        # Bind the current index as a default argument so each indicator column
        # keeps its own topic id, whenever the apply is actually evaluated.
        sf['topic_' + str(i)] = sf['topic'].apply(
            lambda x, topic_id=i: (1 if int(x) == topic_id else 0))
    return sf

In [50]:
# Add the 'topic' and 'topic_<i>' columns to each training split (one model fit per split).
phones_train, home_train, apparel_train = (
    topic_model(phones_train),
    topic_model(home_train),
    topic_model(apparel_train),
)

# Final model 1
## Find 5 nearest neighbours

In [82]:
def nearest_neighbors(sf, name):
    """Create a nearest-neighbours model over the feature columns of ``sf`` and save it.

    Every column of ``sf`` is used as a feature except the identifier, raw and
    target columns listed below (e.g. 'deep_features', 'tfidf' and the
    'topic_<i>' indicator columns remain).

    Parameters
    ----------
    sf : graphlab.SFrame
        Reference data for the model.
    name : str
        Suffix of the save location 'data/similar_images_for_<name>'.

    Returns
    -------
    The fitted GraphLab nearest-neighbours model.
    """
    # Columns that must not take part in the distance computation
    excluded = set(['id', 'category_id', 'category_name', 'count_words', 'image', 'price', 'topic'])
    available = set(sf.column_names())
    feature_lst = available - excluded

    nn_model = graphlab.nearest_neighbors.create(sf, features=list(feature_lst))
    nn_model.save('data/similar_images_for_' + name)
    return nn_model

In [102]:
def get_neighbours(model, item, k=5):
    """Return the labels of the ``k`` nearest neighbours of each query row.

    Parameters
    ----------
    model
        A fitted nearest-neighbours model exposing ``query(dataset, k=...)``.
    item : graphlab.SFrame
        Query rows, with the feature columns the model was built on.
    k : int, optional
        Number of neighbours to retrieve per query row (default 5).

    Returns
    -------
    graphlab.SFrame
        One row per 'query_label' with a 'neighbours' column holding the
        list of matching 'reference_label' values.
    """
    similar_images = model.query(item, k=k)
    # Collapse the one-row-per-match result into one list of neighbours per query row
    return similar_images.groupby(key_columns='query_label',
                                  operations={"neighbours": agg.CONCAT("reference_label")})

In [83]:
# Build the nearest-neighbours model for the phones training split (also saved under 'data/').
phones_model = nearest_neighbors(phones_train, 'phones_train')

Defaulting to brute force instead of ball tree because there are multiple distance components.


In [105]:
#phones_model = graphlab.load_model('data/similar_images_for_phones_train')

In [103]:
# Look up the nearest neighbours of the first phones training row.
get_neighbours(phones_model, graphlab.SFrame(phones_train.head(1)))

query_label,neighbours
0,"[0, 2, 1, 5430, 5431]"


In [110]:
# Build the nearest-neighbours model for the apparel training split (also saved under 'data/').
apparel_model = nearest_neighbors(apparel_train, 'apparel_train')

Defaulting to brute force instead of ball tree because there are multiple distance components.


In [112]:
#apparel_model = graphlab.load_model('data/similar_images_for_apparel_train')

In [111]:
# Look up the nearest neighbours of the first apparel training row.
get_neighbours(apparel_model, graphlab.SFrame(apparel_train.head(1)))

query_label,neighbours
0,"[10233, 2963, 64391, 64389, 0] ..."


In [113]:
# Build the nearest-neighbours model for the home training split (also saved under 'data/').
home_model = nearest_neighbors(home_train, 'home_train')

Defaulting to brute force instead of ball tree because there are multiple distance components.


In [114]:
#home_model = graphlab.load_model('data/similar_images_for_home_train')

In [135]:
# Look up the nearest neighbours of the first home training row.
get_neighbours(home_model, graphlab.SFrame(home_train.head(1)))

query_label,neighbours
0,"[49131, 0, 54850, 32480, 6175] ..."


In [130]:
# Visual check of the apparel neighbours: show the image of the query row (row 0) first.
graphlab.canvas.set_target('ipynb')
graphlab.SArray([apparel_train['image'][0]]).show() 

In [131]:
# Image of row 10233, listed among the neighbours of apparel row 0 in the query output above.
graphlab.SArray([apparel_train['image'][10233]]).show()

In [132]:
# Image of row 2963, listed among the neighbours of apparel row 0 in the query output above.
graphlab.SArray([apparel_train['image'][2963]]).show()

In [133]:
# Image of row 64391, listed among the neighbours of apparel row 0 in the query output above.
graphlab.SArray([apparel_train['image'][64391]]).show()

In [134]:
# Image of row 64389, listed among the neighbours of apparel row 0 in the query output above.
graphlab.SArray([apparel_train['image'][64389]]).show()

# Final model 2
## Predict exact price

In [None]:
# The regression uses the 'topic' feature, but the earlier cells only add that
# column to the training split, so the test split cannot be scored as-is.
# Fit one topic model on the training words and apply it to BOTH splits so the
# topic ids mean the same thing at training and prediction time.
price_topics = graphlab.topic_model.create(apparel_train['count_words'],
                                           num_topics=100, num_iterations=100)

# Work on slim copies so the shared apparel SFrames are left unchanged.
price_train = apparel_train.select_columns(['deep_features', 'price'])
price_train['topic'] = price_topics.predict(apparel_train['count_words'])
price_test = apparel_test.select_columns(['deep_features', 'price'])
price_test['topic'] = price_topics.predict(apparel_test['count_words'])

# Create a model.
# NOTE(review): 'topic' is an integer id and is presumably treated as a numeric
# feature here; consider casting it to str if it should be categorical.
model = graphlab.random_forest_regression.create(price_train, target='price',
                                          features = ['deep_features', 'topic'],
                                          max_iterations = 20,
                                          max_depth =  3
                                          )

# Predict a price for every test row and compute the evaluation metrics on the test split
predictions = model.predict(price_test)
results = model.evaluate(price_test)
results