In [1]:
import os
import re
import copy
import string
import pickle
import logging
from PIL import Image
from collections import Counter

import numpy as np
import pandas as pd
import fasttext

In [2]:
def getNonfloat(textSeries, textIndex):
    subtext = []
    subindex = []
    for sentence, index in zip(textSeries, textIndex):
        if type(sentence) != float:
            subtext.append(sentence)
            subindex.append(index)
    return subtext,subindex


def getIndexText(df, source):
    details_text = df[df['unique_id'].str.startswith(source)]['details_remaining'].str.strip().str.lower().str.replace('[{}]'.format(string.punctuation), '')
    details_index = list(details_text.index)
    text, index = getNonfloat(details_text, details_index)
    return text, index


def getEmbedModel(df, source, model_save_path):
    '''Save the textdf csv and the fasttext model.'''
    text, index = getIndexText(df, source)
    textdf = pd.DataFrame(text)
    textdf_path = os.path.join(model_save_path, "details_text_{}.csv".format(source))
    textdf.to_csv(textdf_path, sep='\t', index=False)

    model = fasttext.train_unsupervised(textdf_path, model='skipgram')
    model_path = os.path.join(model_save_path, "text_skipgram_model_{}.bin".format(source))
    model.save_model(model_path)


def getEmbed(df, source, model_save_path):
    '''Read embeddings from fasttext model and return embeddings and index.'''
    text, index = getIndexText(df,source)
    model_path = os.path.join(model_save_path, "text_skipgram_model_{}.bin".format(source))
    model = fasttext.load_model(model_path)
    sentence_embedding = []
    for i in range(len(text)):
        sentence_embedding.append(model.get_sentence_vector(text[i]))
    return sentence_embedding, index

In [3]:
df = pd.read_csv('../../data/SkidSteer_2019-08.csv')
df.shape

(8172, 20)

In [4]:
COLUMN_NAMES = ['Unique_ID', 'Winning Bid', 'Hours Final', 'Age at Sale (bin)',
                'Bucket', 'Engine', 'Tires', 'Transmission', 'details remaining']
RENAME_SCHEMA = {
    'Unique_ID': "unique_id",
    'Hours Final': "hours_final",
    'Winning Bid': "winning_bid",
    'Age at Sale (bin)': "age_at_sale",
    'Bucket': "bucket",
    'Engine': "engine",
    'Tires': "tires",
    'Transmission': "transmission",
    'details remaining': "details_remaining",
    'socre': "colorfulness_score"
}

In [5]:
score_df = pd.read_csv("../colorfulness/skid_steer_color_score.csv")
image_root = "../../data/images"

processed_df = df.copy()
processed_df['Unique_ID'] = processed_df[['Source', 'item#']].apply(lambda x: '_'.join(x), axis=1)
processed_df = processed_df.filter(COLUMN_NAMES, axis=1)
processed_df = pd.merge(processed_df, score_df, on='Unique_ID', how='inner')
processed_df = processed_df.rename(columns=RENAME_SCHEMA)

duplicated_item = [item for item, count in Counter(processed_df["unique_id"]).items() if count > 1]
processed_df = processed_df[~processed_df['unique_id'].isin(duplicated_item)]

image_item = [img_name.strip(".jpg") for img_name in os.listdir(image_root)]
processed_df = processed_df[processed_df["unique_id"].isin(image_item)]

processed_df = processed_df[processed_df['unique_id'] != "rbauction_10525632"]

processed_df["winning_bid"] = processed_df["winning_bid"].str.replace(',', '').astype(int)

df = processed_df

In [7]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer


## parts sentiments
def part_sentiment_df(data, parts):
    for part in parts:
        part_sentiment_name = part + "_sentiment"
        data[part_sentiment_name] = ""
        analyzer = SentimentIntensityAnalyzer()
        list_indices = np.where(~data[part].isnull())[0]
        for indice in list_indices:
            sentence = data[part].tolist()[indice]
            if len(sentence)>100:  #pick up only very strong signals
                vs = analyzer.polarity_scores(sentence)
                lb = vs['compound']
                if lb >= 0.05:
                    score = 1
                elif (lb > -0.05) and (lb < 0.05):
                    score = 0
                else:
                    score = -1
                data[part_sentiment_name][indice] = score
            else:
                data[part_sentiment_name][indice] = 0
        data[part_sentiment_name] = pd.to_numeric(data[part_sentiment_name]).fillna(0)

In [10]:
df.tail()

Unnamed: 0,unique_id,winning_bid,hours_final,age_at_sale,bucket,engine,tires,transmission,details_remaining,score,engine_sentiment,bucket_sentiment,tires_sentiment,transmission_sentiment
6177,rbauction_10471129,5000,9016,18.0,bkt,,,,"aux hyd, canopy",30.589238,0.0,0.0,0.0,0.0
6178,rbauction_11251937,11000,2396,12.0,bkt,,,,canopy,36.586281,0.0,0.0,0.0,0.0
6179,ironplanet_1963734,8300,1,,,Manual Coupler The engine started and ran. The...,Cushion Tires,,"Auxiliary Hydraulic Plumbing, Open Operator St...",16.456728,0.0,0.0,0.0,0.0
6180,ironplanet_1864149,14200,322,3.0,"66"" Wide General Purpose Smooth Edge Bucket",,Cushion Tires,,"Heater, Hydraulic Coupler, Enclosed Cab",26.860051,0.0,0.0,0.0,0.0
6181,ironplanet_1686964,8500,2,9.0,"68"" General Purpose Smooth Edge Bucket The eng...",,,,"Auxiliary Hydraulic Plumbing, Manual Coupler, ...",27.659484,0.0,0.0,0.0,0.0


In [8]:
part_sentiment('engine')
part_sentiment('bucket')
part_sentiment('tires')
part_sentiment('transmission')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[part_sentiment][indice] = 0
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[part_sentiment][indice] = score


In [11]:
df.tail()

Unnamed: 0,unique_id,winning_bid,hours_final,age_at_sale,bucket,engine,tires,transmission,details_remaining,score,engine_sentiment,bucket_sentiment,tires_sentiment,transmission_sentiment
6177,rbauction_10471129,5000,9016,18.0,bkt,,,,"aux hyd, canopy",30.589238,0.0,0.0,0.0,0.0
6178,rbauction_11251937,11000,2396,12.0,bkt,,,,canopy,36.586281,0.0,0.0,0.0,0.0
6179,ironplanet_1963734,8300,1,,,Manual Coupler The engine started and ran. The...,Cushion Tires,,"Auxiliary Hydraulic Plumbing, Open Operator St...",16.456728,0.0,0.0,0.0,0.0
6180,ironplanet_1864149,14200,322,3.0,"66"" Wide General Purpose Smooth Edge Bucket",,Cushion Tires,,"Heater, Hydraulic Coupler, Enclosed Cab",26.860051,0.0,0.0,0.0,0.0
6181,ironplanet_1686964,8500,2,9.0,"68"" General Purpose Smooth Edge Bucket The eng...",,,,"Auxiliary Hydraulic Plumbing, Manual Coupler, ...",27.659484,0.0,0.0,0.0,0.0


In [6]:
def add_embeddings_df(data, model_save_path):
    '''Compute, save and add embeddings of detais_remaining to the dataframe'''
    os.makedirs(model_save_path)
    sources = ['rbauct','big','iron','PW']
    list_embed = []
    list_index = []
    for source in sources:
        getEmbedModel(df, source, model_save_path)
        sentence_embedding, index = getEmbed(df, source, model_save_path)
        list_embed += sentence_embedding
        list_index += index
    index_nan = set(df.index) - set(list_index)
    
    source_col = []
    for ii in df.unique_id:
        if ii.startswith(sources[0]):
            source_col.append(0)
        elif ii.startswith(sources[1]):
            source_col.append(1)
        elif ii.startswith(sources[2]):
            source_col.append(2)
        elif ii.startswith(sources[3]):
            source_col.append(3)
    data.insert(len(data.columns), "details_remaining_source", source_col)
    
    data.insert(len(data.columns), "details_remaining_nan", 0)
    data.loc[tt["details_remaining"].isna(), "details_remaining_nan"] = 1
    
    embeds_median = np.median(np.array(list_embed), axis=0)
    embeds = {index:embed for index, embed in zip(list_index, list_embed)}
    embeds_list = [embeds[ii] if ii not in index_nan else embeds_median for ii in tt.index]
    data.insert(len(data.columns), "details_remaining_embedding", embeds_list)
    return None

In [8]:
model_save_path = "./model"
tt = df.copy()
add_embeddings_df(tt, model_save_path)



In [10]:
import pickle

In [11]:
pickle.dump(tt, open("./temp.pickle", "wb"))

In [12]:
ttt = pickle.load(open("./temp.pickle", "rb"))

In [35]:
a = torch.tensor(ttt.iloc[0, [3,9]])

In [36]:
b = torch.tensor(ttt.iloc[0, 12])

In [40]:
torch.cat([a, b])

tensor([ 6.0000e+00,  3.7880e+01,  1.4204e-01,  1.5712e-02,  5.4453e-02,
        -5.8210e-02,  8.6322e-02,  1.8004e-01, -2.8230e-01,  6.9809e-02,
         5.2404e-02,  3.4871e-02, -5.2265e-03, -4.7505e-03, -5.8680e-02,
         1.1041e-01, -4.9544e-02, -5.0127e-02,  1.2761e-01, -1.0516e-03,
        -1.4326e-01,  1.7514e-01,  6.4962e-02,  3.2404e-02,  1.7245e-01,
         1.6262e-01, -7.8393e-02, -7.3804e-02, -7.6977e-02, -2.7525e-02,
         6.7521e-02, -9.8476e-02,  1.6419e-02, -1.1085e-02,  2.3954e-02,
         1.7158e-03, -1.0959e-01, -4.6925e-02, -9.6573e-02,  7.3604e-02,
        -1.7706e-02, -5.6208e-02, -1.2453e-01,  1.3220e-01,  1.0696e-02,
         1.0472e-01,  2.4383e-01,  6.5440e-02,  1.3386e-01,  1.0416e-02,
         9.9227e-02,  5.7530e-02, -1.6891e-01,  1.4985e-03, -6.3314e-02,
         1.1022e-02, -6.5610e-02, -1.0488e-01, -1.3849e-01,  5.0657e-02,
         1.3833e-01, -1.3773e-03,  3.1448e-02, -1.2821e-02,  1.9473e-01,
         5.6056e-02, -4.2543e-02,  4.0395e-02, -3.4

In [201]:
tt.insert(len(tt.columns), "details_remaining_nan", 0)
tt.loc[tt["details_remaining"].isna(), "details_remaining_nan"] = 1

In [202]:
embeds_median = np.median(np.array(list_embed), axis=0)
embeds = dict(zip(list_index, list_embed))
embeds_list = [embeds[ii] if ii not in details_remaining_nan else embeds_median for ii in tt.index]

In [191]:
tt.insert(len(tt.columns), "details_remaining_embedding", embeds_list)

In [20]:
tt.head()

Unnamed: 0,unique_id,winning_bid,hours_final,age_at_sale,bucket,engine,tires,transmission,details_remaining,score,details_remaining_embedding,details_remaining_nan
0,rbauction_10199737,18500,4695,6.0,,,,,"canopy, aux hyd",37.880233,"[0.14849268, 0.01646403, 0.043964192, -0.05360...",0
1,rbauction_10323508,7000,4675,18.0,bkt,,,,"aux hyd, cab",14.743169,"[0.14741972, 0.015798977, 0.043977134, -0.0528...",0
2,rbauction_10544103,22000,4047,6.0,bkt,,solid tires,,"A/C cab, aux hyd, joystick strg",34.750538,"[0.14739814, 0.016416423, 0.044850267, -0.0553...",0
3,rbauction_10612921,18000,1370,6.0,bkt,,solid tires,,"A/C cab, aux hyd",32.387521,"[0.14705817, 0.015862277, 0.044167537, -0.0548...",0
4,rbauction_10533374,34000,1531,6.0,bkt,,,,"A/C cab, aux hyd, hyd Q/C",22.688279,"[0.1482124, 0.015561438, 0.044632696, -0.05430...",0


In [197]:
ttt.head()

Unnamed: 0,unique_id,winning_bid,hours_final,age_at_sale,bucket,engine,tires,transmission,details_remaining,score,details_remaining_nan,details_remaining_embedding
0,rbauction_10199737,18500,4695,6.0,,,,,"canopy, aux hyd",37.880233,0,"[0.14982077, 0.003935953, 0.05000712, -0.04379..."
1,rbauction_10323508,7000,4675,18.0,bkt,,,,"aux hyd, cab",14.743169,0,"[0.1489636, 0.0030177056, 0.050183028, -0.0430..."
2,rbauction_10544103,22000,4047,6.0,bkt,,solid tires,,"A/C cab, aux hyd, joystick strg",34.750538,0,"[0.14887814, 0.003691967, 0.050965883, -0.0454..."
3,rbauction_10612921,18000,1370,6.0,bkt,,solid tires,,"A/C cab, aux hyd",32.387521,0,"[0.14858319, 0.0031052101, 0.050345324, -0.044..."
4,rbauction_10533374,34000,1531,6.0,bkt,,,,"A/C cab, aux hyd, hyd Q/C",22.688279,0,"[0.14974837, 0.0027739904, 0.05080121, -0.0444..."


In [194]:
(ttt.details_remaining_embedding == tt.details_remaining_embedding).sum()

ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()

In [148]:
temp = [embeds[ii] if ii not in details_remaining_nan else embeds_median for ii in tt.index]

In [142]:
details_remaining_nan = set(df.index) - set(list_index)

In [145]:
embeds = [embed for _, embed in sorted(zip(list_index, list_embed))]
embeds_median = np.median(np.array(embeds), axis=0)
embeds = {index:embed for index, embed in sorted(zip(list_index, list_embed))}

In [86]:
pd.read_csv("../../models/2020-03-27T22:06_test/results_val.csv", index_col=0).iloc[557]

unique_id                                          PW_A6021
winning_bid                                       -0.682682
hours_final                                        0.974533
hours_final_nan                                           0
age_at_sale                                        0.333333
age_at_sale_nan                                           0
bucket                                           70" bucket
bucket_bin                                                1
engine             73 HP, Deutz four cylinder diesel engine
tires                    Recently replaced 12-16.5NHS tires
transmission                                            NaN
score                                               42.2019
original_price                                         7600
predicted_price                                     7300.09
Name: 2332, dtype: object