In [1]:
import pandas as pd
import numpy as np
import fasttext
import collections
import string
from tqdm import tqdm
import pickle
import csv
import re
import os
from collections import Counter

In [2]:
df = pd.read_csv('../../data/SkidSteer_2019-08.csv')
df.shape

(8172, 20)

In [3]:
COLUMN_NAMES = ['Unique_ID', 'Winning Bid', 'Hours Final', 'Age at Sale (bin)',
                'Bucket', 'Engine', 'Tires', 'Transmission', 'details remaining']
RENAME_SCHEMA = {
    'Unique_ID': "unique_id",
    'Hours Final': "hours_final",
    'Winning Bid': "winning_bid",
    'Age at Sale (bin)': "age_at_sale",
    'Bucket': "bucket",
    'Engine': "engine",
    'Tires': "tires",
    'Transmission': "transmission",
    'details remaining': "details_remaining",
    'socre': "colorfulness_score"
}

In [4]:
image_root = "../../data/images/"
score_df = pd.read_csv("../colorfulness/skid_steer_color_score.csv")

processed_df = df.copy()
processed_df['Unique_ID'] = processed_df[['Source', 'item#']].apply(lambda x: '_'.join(x), axis=1)
processed_df = processed_df.filter(COLUMN_NAMES, axis=1)
processed_df = pd.merge(processed_df, score_df, on='Unique_ID', how='inner')
processed_df = processed_df.rename(columns=RENAME_SCHEMA)

duplicated_item = [item for item, count in Counter(processed_df["unique_id"]).items() if count > 1]
processed_df = processed_df[~processed_df['unique_id'].isin(duplicated_item)]

image_item = [img_name.strip(".jpg") for img_name in os.listdir(image_root)]
processed_df = processed_df[processed_df["unique_id"].isin(image_item)]

processed_df = processed_df[processed_df['unique_id'] != "rbauction_10525632"]

processed_df["winning_bid"] = processed_df["winning_bid"].str.replace(',', '').astype(int)

In [5]:
df = processed_df
df.shape

(6168, 10)

In [118]:
df.tail()

Unnamed: 0,unique_id,winning_bid,hours_final,age_at_sale,bucket,engine,tires,transmission,details_remaining,score
6177,rbauction_10471129,5000,9016,18.0,bkt,,,,"aux hyd, canopy",30.589238
6178,rbauction_11251937,11000,2396,12.0,bkt,,,,canopy,36.586281
6179,ironplanet_1963734,8300,1,,,Manual Coupler The engine started and ran. The...,Cushion Tires,,"Auxiliary Hydraulic Plumbing, Open Operator St...",16.456728
6180,ironplanet_1864149,14200,322,3.0,"66"" Wide General Purpose Smooth Edge Bucket",,Cushion Tires,,"Heater, Hydraulic Coupler, Enclosed Cab",26.860051
6181,ironplanet_1686964,8500,2,9.0,"68"" General Purpose Smooth Edge Bucket The eng...",,,,"Auxiliary Hydraulic Plumbing, Manual Coupler, ...",27.659484


### Two ways to preprocess 'detail_remaining' column
1. Basic: with every punctuation removed, each word has its own embedding
2. Another: each phrase separated by a comma has its own embedding

### Two approches to treat embedding coming from 4 different sources
1. Treat them as coming from one source, only deal with text
2. Train one fasttext embedding for each source, i.e. rbauction, PW,et.

### 1) Basic

In [125]:
def getNonfloat(textSeries, textIndex):
    subtext = []
    subindex = []
    for sentence, index in zip(textSeries, textIndex):
        if type(sentence) != float:
            subtext.append(sentence)
            subindex.append(index)
    return subtext,subindex

In [126]:
def getIndexText(df,source):
    details_text = df[df['unique_id'].str.startswith(source)]['details_remaining'].str.strip().str.lower().str.replace('[{}]'.format(string.punctuation), '')
    details_index = list(details_text.index)
    text, index = getNonfloat(details_text, details_index)
    return text, index

In [127]:
"""
Run it only once to 
1) save the textdf csv
2) get the fasttext model saved
"""
def getEmbedModel(df, source):
    text, index = getIndexText(df,source)
    textdf = pd.DataFrame(text)
    textdf.to_csv("data/{}_text.csv".format(source), sep='\t', index=False)
    
    # train fasttext embedding --> this is slow
    model = fasttext.train_unsupervised("./data/{}_text.csv".format(source), model='skipgram')
    model.save_model('./models/{}_skipgram_model.bin'.format(source))

In [128]:
"""
Read embeddings from fasttext model
return: 1) embeddings for source
        2) index for embeddings in the original df
"""
def getEmbed(df, source):
    text, index = getIndexText(df,source)
    model = fasttext.load_model('./models/{}_skipgram_model.bin'.format(source))
    sentence_embedding = []
    for i in range(len(text)):
        sentence_embedding.append(model.get_sentence_vector(text[i]))
    return sentence_embedding, index

In [129]:
## main execution
sources = ['rbauct','big','iron','PW']
list_embed = []
list_index = []
for source in tqdm(sources):
    getEmbedModel(df, source)
    sentence_embedding, index = getEmbed(df, source)
    list_embed += sentence_embedding
    list_index += index

100%|██████████| 4/4 [00:06<00:00,  1.69s/it]


In [130]:
len(list_embed), len(list_index)

(6155, 6155)

In [131]:
embed_df = pd.DataFrame(np.vstack(list_embed))
embed_df.insert(0, column="index", value=list_index) # the index are not sorted yet
embed_df = embed_df.sort_values(embed_df.columns[0], ascending = True)

In [132]:
embed_df.shape

(6155, 101)

In [133]:
pickle.dump(embed_df, open("models/embed.p", "wb"))

In [134]:
# Check the row number of each source
# 4 different sources: rbauct, bigiron, ironplanet, PW
rbauct_text, rbauct_index = getIndexText(df, 'rbauct')
big_text, big_index = getIndexText(df, 'big')
iron_text, iron_index = getIndexText(df, 'iron')
PW_text, PW_index = getIndexText(df, 'PW')

In [135]:
len(rbauct_text), len(rbauct_index), len(big_text), len(big_index), len(iron_text), len(iron_index), len(PW_text), len(PW_index)

(3930, 3930, 423, 423, 501, 501, 1301, 1301)

In [136]:
set(df.index) - set(list_index)

{300, 379, 393, 422, 1241, 3790, 4781, 4790, 4802, 4935, 5188, 5371, 6052}

In [138]:
list(np.where(df.details_remaining.isna())[0])

[300, 379, 393, 422, 1241, 3776, 4767, 4776, 4788, 4921, 5174, 5357, 6038]

In [142]:
df.loc[5371]

unique_id            ironplanet_1830435
winning_bid                       12000
hours_final                         NaN
age_at_sale                           6
bucket                              NaN
engine                              NaN
tires                               NaN
transmission                        NaN
details_remaining                   NaN
score                           39.2404
Name: 5371, dtype: object

### 2) Another -- actually don't quite make sense except for text from 'rbauction'

In [17]:
def getNonfloat2(textSeries, textIndex):
    text_list = []
    index_list = []
    for sentence, index in zip(textSeries, textIndex):
        subtext = ''
        if type(sentence)!= float:
            for phrase in sentence:
                word_concat = '_'.join(phrase.strip().split(' '))
                subtext += word_concat + ' '
            text_list.append(subtext.strip())
            index_list.append(index)
    return text_list,index_list

In [18]:
def getIndexText2(df,source):
    details_text = df[df['unique_id'].str.startswith(source)]['details_remaining'].str.strip().str.lower().str.replace('[^,\w\s]','').str.split(',')
    details_index = list(details_text.index)
    text, index = getNonfloat2(details_text, details_index)
    return text, index

In [19]:
"""
Run it only once to 
1) save the textdf csv
2) get the fasttext model saved
"""
def getEmbedModel2(df, source):
    text, index = getIndexText2(df,source)
    textdf = pd.DataFrame(text)
    textdf.to_csv("data2/{}_text.csv".format(source), sep='\t', index=False)
    
    # train fasttext embedding --> this is slow
    model = fasttext.train_unsupervised("data2/{}_text.csv".format(source), model='skipgram')
    model.save_model('./models2/{}_skipgram_model.bin'.format(source))

In [20]:
"""
Read embeddings from fasttext model
return: 1) embeddings for source
        2) index for embeddings in the original df
"""
def getEmbed2(df, source):
    text, index = getIndexText2(df,source)
    model = fasttext.load_model('./models2/{}_skipgram_model.bin'.format(source))
    sentence_embedding = []
    for i in range(len(text)):
        sentence_embedding.append(model.get_sentence_vector(text[i]))
    return sentence_embedding, index

In [21]:
## main execution
sources = ['rbauct','big','iron','PW']
list_embed = []
list_index = []
for source in tqdm(sources):
    getEmbedModel2(df, source)
    sentence_embedding, index = getEmbed2(df, source)
    list_embed += sentence_embedding
    list_index += index

100%|██████████| 4/4 [00:06<00:00,  1.67s/it]
