# Price for OfferUp

In [1]:
import graphlab
import re, string
import graphlab.aggregate as agg
graphlab.canvas.set_target('ipynb')

A newer version of GraphLab Create (v2.1) is available! Your current version is v1.9.

You can use pip to upgrade the graphlab-create package. For more information see https://dato.com/products/create/upgrade.


## Load image analysis datasets

I've reduced the data to 6 categories in 3 groups: phones, home (Furniture, Household, Home & Garden), apparel (Baby & Kids, Clothing & Shoes) 
This dataset is already split into a training set and test set (80/20).

In [2]:
phones_train = graphlab.load_sframe('../../data/models/phones_train')
phones_test = graphlab.load_sframe('../../data/models/phones_test')

home_train = graphlab.load_sframe('../../data/models/home_train')
home_test = graphlab.load_sframe('../../data/models/home_test')

apparel_train = graphlab.load_sframe('../../data/models/apparel_train')
apparel_test = graphlab.load_sframe('../../data/models/apparel_test')

2016-08-07 09:23:56,482 [INFO] graphlab.cython.cy_server, 176: GraphLab Create v1.9 started. Logging: /tmp/graphlab_server_1470587034.log


This trial license of GraphLab Create is assigned to tkutsenko@gmail.com and will expire on April 14, 2017. Please contact trial@turi.com for licensing options or to request a free non-commercial license for academic use.


IOError: /Users/tk/Projects/Galvanize/data/models/phones_train not found.: unspecified iostream_category error: unspecified iostream_category error

### Features

In [3]:
features_lst = ['id', 'category_id', 'category_name', 'count_words', 'tfidf', 'image', 'deep_features','price']

def features(sf):
    # Make price float from string
    sf['price'] = sf['price'].astype(float)
    #Remove outliers
    sf = sf[sf['price'] < 1500]
    
    # Combine words from title and description
    sf['words'] = sf['title'] + ' ' + sf['description']
    # Remove punctuation
    regex = re.compile('[%s]' % re.escape(string.punctuation))
    sf['words'] = sf['words'].apply(lambda x: regex.sub('', x))
    #Ttransforms row into an ordered list of strings that represents the a simpler version of the 
    #Penn-Tree-Bank-style (PTB-style) tokenization
    sf['words'] = graphlab.text_analytics.tokenize(sf['words'])

    #Bag-of-words
    sf['count_words'] = graphlab.text_analytics.count_words(sf['words'])
    #Text cleaning. Remove stop words.
    sf['count_words'] = sf['count_words'].dict_trim_by_keys(graphlab.text_analytics.stopwords(), exclude=True)
    #TF-IDF (term frequency - inverse document frequency)
    sf['tfidf'] = graphlab.text_analytics.tf_idf(sf['count_words'])
    
    return sf.select_columns(features_lst)

In [4]:
phones_train = features(phones_train)
home_train = features(home_train)
apparel_train = features(apparel_train)
phones_test = features(phones_test)
home_test = features(home_test)
apparel_test = features(apparel_test)

## Work with title and description

I use Topic models from GraphLab Create to generate topic by title and description

In [6]:
# Learn topic model
#model = graphlab.topic_model.create(apparel_train['count_words'], num_topics=100, num_iterations=100)

In [None]:
#apparel_train['topic'] = model.predict(apparel_train['count_words'])

In [None]:
#As with other models in GraphLab Create, it's also easy to save and load models.

#model.save('my_model')
#new_model = graphlab.load_model('my_model')

### Evaluating topic models

In [None]:
#model.evaluate(apparel_test['count_words'])

Stop on 100 topics with 100 iterations as it gives one of the best results.

In [5]:
# Learn topic model
def topic_model(sf):
    model = graphlab.topic_model.create(sf['count_words'], num_topics=100, num_iterations=100)
    sf['topic'] = model.predict(sf['count_words'])
    for i in  xrange(100):
        sf['topic_' + str(i)] = sf['topic'].apply(lambda x: (1 if int(x) == i else 0))
    return sf

In [68]:
phones_train = topic_model(phones_train)
home_train = topic_model(home_train)
apparel_train = topic_model(apparel_train)
phones_test = topic_model(phones_test)
home_test = topic_model(home_test)
apparel_test = topic_model(apparel_test)


# Final model 1
## Find 5 nearest neighbours

In [70]:
def nearest_neighbors(sf, name):
    #'deep_features','tfidf','topic_0','topic_1','topic_2', ...
    feature_lst = set(
                      sf.column_names()) - \
                      set(['id', 'category_id', 'category_name', 'count_words', 'image', 'price', 'topic']
                      )
    model = graphlab.nearest_neighbors.create(sf, features=list(feature_lst))
    model.save('data/similar_images_for_' + name)
    return model

In [71]:
def get_neighbours(model, item):
    similar_images = model.query(item, k = 5)
    similar_images = similar_images.groupby(key_columns='query_label', operations={"neighbours":agg.CONCAT("reference_label")})
    return similar_images

In [72]:
phones_model = nearest_neighbors(phones_train, 'phones_train')

Defaulting to brute force instead of ball tree because there are multiple distance components.


In [73]:
#phones_model = graphlab.load_model('data/similar_images_for_phones_train')

In [74]:
get_neighbours(phones_model, graphlab.SFrame(phones_test.head(1)))

query_label,neighbours
0,"[8531, 7164, 2450, 1418, 5443] ..."


In [75]:
apparel_model = nearest_neighbors(apparel_train, 'apparel_train')

Defaulting to brute force instead of ball tree because there are multiple distance components.


In [76]:
#apparel_model = graphlab.load_model('data/similar_images_for_apparel_train')

In [77]:
get_neighbours(apparel_model, graphlab.SFrame(apparel_test.head(1)))

query_label,neighbours
0,"[11, 22475, 62252, 15549, 31027] ..."


In [13]:
home_model = nearest_neighbors(home_train, 'home_train')

Defaulting to brute force instead of ball tree because there are multiple distance components.


In [14]:
#home_model = graphlab.load_model('data/similar_images_for_home_train')

In [15]:
get_neighbours(home_model, graphlab.SFrame(home_test.head(1)))

query_label,neighbours
0,"[32480, 6175, 54850, 0, 49131] ..."


In [88]:
graphlab.canvas.set_target('ipynb')
graphlab.SArray([apparel_test['image'][0]]).show() 
apparel_train['price'][0]

100.0

In [87]:
graphlab.SArray([apparel_train['image'][11]]).show()
apparel_train['price'][11]

50.0

In [89]:
graphlab.SArray([apparel_train['image'][22475]]).show()
apparel_train['price'][22475]

75.0

In [90]:
graphlab.SArray([apparel_train['image'][62252]]).show()
apparel_train['price'][62252]

100.0

In [91]:
graphlab.SArray([apparel_train['image'][15549]]).show()
apparel_train['price'][15549]

25.0

# Final model 2
## Predict exact price

In [96]:
# Create a model.
def random_forest_model(sf, name):
    #'deep_features','tfidf','topic_0','topic_1','topic_2', ...
    feature_lst = set(
                      sf.column_names()) - \
                      set(['id', 'category_id', 'category_name', 'count_words', 'image', 'price', 'topic']
                      )

    model = graphlab.random_forest_regression.create(sf, target='price',
                                          features=list(feature_lst),
                                          #max_iterations = 20,
                                          max_depth =  3
                                          )

    model.save('data/random_forest_regression_for_' + name)
    return model

In [97]:
def evaluate(model, test_set):
    results = model.evaluate(test_set)
    return results

In [98]:
phones_model = random_forest_model(phones_train, 'phones_train')

PROGRESS: Creating a validation set from 5 percent of training data. This may take a while.
          You can set ``validation_set=None`` to disable validation tracking.



In [99]:
#phones_model = graphlab.load_model('data/random_forest_regression_for_phones_train')

In [100]:
evaluate(phones_model, graphlab.SFrame(phones_test.head(1)))

{'max_error': 70.70128631591797, 'rmse': 70.70128631591797}

In [26]:
home_model = random_forest_model(home_train, 'home_train')

PROGRESS: Creating a validation set from 5 percent of training data. This may take a while.
          You can set ``validation_set=None`` to disable validation tracking.



In [61]:
#home_model = graphlab.load_model('data/random_forest_regression_for_home_train')

In [28]:
evaluate(home_model, graphlab.SFrame(home_train.head(1)))

{'max_error': 58.84905242919922, 'rmse': 58.84905242919922}

In [31]:
apparel_model = random_forest_model(apparel_train, 'apparel_train')

PROGRESS: Creating a validation set from 5 percent of training data. This may take a while.
          You can set ``validation_set=None`` to disable validation tracking.



In [32]:
#apparel_model = graphlab.load_model('data/random_forest_regression_for_apparel_train')

In [33]:
evaluate(apparel_model, graphlab.SFrame(apparel_train.head(1)))

{'max_error': 60.160614013671875, 'rmse': 60.160614013671875}

In [122]:
# Create a model.
def  gradient_boosted_regression_trees_model(sf, name):
    #'deep_features','tfidf','topic_0','topic_1','topic_2', ...
    feature_lst = set(
                      sf.column_names()) - \
                      set(['id', 'category_id', 'category_name', 'count_words', 'image', 'price', 'topic']
                      )

    model = graphlab.boosted_trees_regression.create(sf, target='price',
                                          features=list(feature_lst),
                                          max_iterations = 20,
                                          max_depth =  3
                                          )

    model.save('data/boosted_trees_regression_for_' + name)
    return model

In [123]:
phones_model = gradient_boosted_regression_trees_model(phones_train, 'phones_train')

PROGRESS: Creating a validation set from 5 percent of training data. This may take a while.
          You can set ``validation_set=None`` to disable validation tracking.



In [124]:
evaluate(phones_model, graphlab.SFrame(phones_test.head(1)))

{'max_error': 35.06368637084961, 'rmse': 35.06368637084961}

# Final model 3
## Predict price bin