In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

# Run `ls ../input` and print its captured output; the output is decoded as
# UTF-8 before printing so it shows up as plain text.
from subprocess import check_output
print(check_output(["ls", "../input"]).decode("utf8"))

# Any results you write to the current directory are saved as output.

sample_submission.csv
test.tsv
train.tsv



In [None]:
%matplotlib inline
import matplotlib
import math
import os
import seaborn as sns
import collections
import matplotlib.pyplot as plt
import keras.layers as KL
import keras.optimizers as KO
import keras.models as KM
import keras.backend as K

In [None]:
# Load the training split into a DataFrame; the file is tab-separated.
train_raw = pd.read_csv('../input/train.tsv', sep='\t')

In [None]:
# Load the test split into a DataFrame; the file is tab-separated.
test_raw = pd.read_csv('../input/test.tsv', sep='\t')

In [None]:
# Build one brand -> column-index vocabulary shared by both splits, so that
# encode_brand() yields vectors of the same width for train and test rows.
train_brand_list = list(set(train_raw['brand_name']))
test_brand_list = list(set(test_raw['brand_name']))
print('training brand size: ', len(train_brand_list))
print('testing brand size: ', len(test_brand_list))
print('common brand size: ', len(set(train_brand_list) & set(test_brand_list)))
# Iteration order of a set of strings changes between interpreter runs, which
# made the brand -> index assignment differ on every kernel restart.  Sorting
# gives a reproducible mapping; key=str lets non-string entries (presumably the
# NaN used for a missing brand -- TODO confirm) be ordered alongside strings.
total_brand_list = sorted(set(train_brand_list) | set(test_brand_list), key=str)
brand_dict = {brand: index for index, brand in enumerate(total_brand_list)}
print('brand dict size: ', len(brand_dict))

In [None]:
# Collect every category path from both splits and build one label -> index
# vocabulary per hierarchy level: dicts[0] for the first '/'-separated
# component, dicts[1] for the second, and so on.
total_category = train_raw['category_name'].tolist() + test_raw['category_name'].tolist()
print('total category count: ', len(total_category))
dicts = []
for path in total_category:
    # Non-string entries carry no category path and are skipped.
    if not isinstance(path, str):
        continue
    for depth, label in enumerate(path.split('/')):
        # Grow the list of per-level vocabularies the first time a depth is seen.
        if depth == len(dicts):
            dicts.append({})
        level_vocab = dicts[depth]
        if label not in level_vocab:
            # Indices are handed out in order of first appearance.
            level_vocab[label] = len(level_vocab)
print('level of dicts: ', len(dicts))
# Combined width of the first two levels.
category_dict_size = len(dicts[0]) + len(dicts[1])

In [None]:
# Pool the item-condition ids of both splits and give each distinct id a
# column index for one-hot encoding.
total_item_condition = list(train_raw['item_condition_id'])
total_item_condition += list(test_raw['item_condition_id'])
print('total item condition id count: ', len(total_item_condition))
condition_list = list(set(total_item_condition))
print(condition_list)
# Index = position of the id in condition_list.
condition_dict = {condition: position for position, condition in enumerate(condition_list)}
print('condition dict: ', condition_dict)

In [None]:
def encode_category(data, dicts, level):
    """One-hot encode the first `level` components of '/'-separated category paths.

    Parameters
    ----------
    data : iterable
        Category paths such as ``'A/B/C'``. Entries that are not strings
        produce an all-zero row.
    dicts : sequence of dict
        ``dicts[lv]`` maps a label at hierarchy level ``lv`` to its column
        index inside that level's segment.
    level : int
        Number of leading hierarchy levels to encode; must not exceed
        ``len(dicts)``.

    Returns
    -------
    numpy.ndarray
        Integer array of shape ``(len(data), sum(len(dicts[lv]) for lv < level))``.
        The per-level one-hot segments are concatenated left to right. A level
        that is missing from a path, or whose label is not in the vocabulary,
        is left as zeros. Empty input yields a 2-D array with zero rows.
    """
    widths = [len(dicts[lv]) for lv in range(level)]
    rows = list(data)  # materialise once so any iterable (list, Series, generator) works
    output = np.zeros((len(rows), sum(widths)), dtype=int)
    for r, path in enumerate(rows):
        if not isinstance(path, str):
            continue
        clist = path.split('/')  # split once per row rather than once per level
        offset = 0
        for lv in range(level):
            if lv < len(clist) and clist[lv] in dicts[lv]:
                output[r, offset + dicts[lv][clist[lv]]] = 1
            offset += widths[lv]  # advance to the next level's segment
    return output

In [None]:
def encode_condition(data, condition_dict):
    """One-hot encode item-condition ids.

    Each value in `data` is looked up in `condition_dict` (id -> column index)
    and turned into a float vector of length ``len(condition_dict)`` with a
    single 1 at that index. A value missing from `condition_dict` raises
    ``KeyError``.

    Returns the stacked vectors as a numpy array, one row per input value.
    """
    def one_hot(value):
        vector = np.zeros(len(condition_dict))
        vector[condition_dict[value]] = 1
        return vector

    return np.array([one_hot(value) for value in data])

In [None]:
def encode_shipping(data):
    """Turn a sequence of shipping values into a single-column numpy array.

    Every value becomes its own one-element row, so the result can be stacked
    horizontally next to other per-row feature matrices.
    """
    column = []
    for flag in data:
        column.append([flag])
    return np.array(column)

In [None]:
def encode_brand(data, brand_dict):
    """One-hot encode brand values.

    Parameters
    ----------
    data : iterable
        Brand values, one per sample.
    brand_dict : dict
        Maps a brand value to its column index.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(len(data), len(brand_dict))``. A row has a
        single 1 in the brand's column, or is all zeros when the brand is not
        a key of `brand_dict`. Empty input yields a 2-D array with zero rows.
    """
    brands = list(data)  # materialise once so any iterable (list, Series, generator) works
    # Allocate the whole matrix up front instead of building one dense row per
    # sample and copying them all into a second array afterwards.
    output = np.zeros((len(brands), len(brand_dict)))
    for r, brand in enumerate(brands):
        if brand in brand_dict:
            output[r, brand_dict[brand]] = 1
    return output

In [None]:
def get_price_range(price):
    """Map a price to the integer index of its price bin.

    Bin layout:
      * price < 100        -> width-10 bins, indices 0..9
      * 100 <= price < 500 -> width-50 bins, indices 11..18
      * price >= 500       -> width-100 bins, indices 20, 21, ...

    Indices 10 and 19 are never produced because each range starts one past
    the end of the previous one.
    """
    mid_start = (100 // 10) + 1                        # 11: first index of the 50-wide bins
    high_start = mid_start + ((500 - 100) // 50) + 1   # 20: first index of the 100-wide bins
    if price >= 500:
        bucket = high_start + (price - 500) // 100
    elif price >= 100:
        bucket = mid_start + (price - 100) // 50
    else:
        bucket = price // 10
    return int(bucket)

In [None]:
def encode_price_category(data):
    """One-hot encode prices by the bin index from get_price_range().

    The vector length is ``get_price_range(2250) + 1``, so a price whose bin
    index reaches or exceeds that length raises ``IndexError``.

    Returns a numpy array with one float row per price in `data`.
    """
    max_price = 2250
    category_size = get_price_range(max_price) + 1

    def one_hot(price):
        vector = np.zeros(category_size)
        vector[get_price_range(price)] = 1
        return vector

    return np.array([one_hot(price) for price in data])

In [None]:
# Layer widths are derived from the vocabularies and encoders defined in the
# cells above instead of being hard-coded, so the network keeps matching the
# encoded feature matrices if the data (and therefore the vocabularies) change.
brand_input_dim = len(brand_dict)                # width of encode_brand() rows
category_input_dim = category_dict_size          # width of encode_category(..., dicts, 2) rows
condition_input_dim = len(condition_dict) + 1    # condition one-hot plus the shipping column
price_category_dim = get_price_range(2250) + 1   # must mirror max_price in encode_price_category()

# brand portion: brand one-hot -> softmax over price bins
brand_inputs = KL.Input(shape=(brand_input_dim,))
brand_dnn_1 = KL.Dense(units=128, activation='relu')
x_brand = brand_dnn_1(brand_inputs)
x_brand = KL.Dropout(0.2)(x_brand)
brand_output_layer = KL.Dense(units=price_category_dim, activation='softmax')
brand_outputs = brand_output_layer(x_brand)
# category portion: category one-hot -> softmax over price bins
category_inputs = KL.Input(shape=(category_input_dim,))
category_dnn_1 = KL.Dense(units=128, activation='relu')
x_category = category_dnn_1(category_inputs)
x_category = KL.Dropout(0.2)(x_category)
category_output_layer = KL.Dense(units=price_category_dim, activation='softmax')
category_outputs = category_output_layer(x_category)
# condition portion: condition one-hot + shipping -> 16 linear features
condition_inputs = KL.Input(shape=(condition_input_dim,))
x_condition = KL.Dense(128, activation='relu')(condition_inputs)
x_condition = KL.Dropout(0.2)(x_condition)
condition_outputs = KL.Dense(16)(x_condition)
# combine portion: concatenate the three branch outputs and regress one value
x_combine = KL.concatenate([brand_outputs, category_outputs, condition_outputs], axis=-1)
x_combine = KL.Dense(128, activation='relu')(x_combine)
x_combine = KL.Dropout(0.2)(x_combine)
# ReLU keeps the single regression output non-negative.
final_outputs = KL.Dense(1, activation='relu')(x_combine)

In [None]:
# Three models over the same layer graph:
#   * brand_model / category_model map one input branch to its softmax output;
#   * combined_model takes all three inputs and ends in the single-unit output.
# brand_outputs and category_outputs also feed the concatenate layer of the
# combined graph, so the branch models and combined_model use the same layers.
brand_model = KM.Model(inputs=brand_inputs, outputs=brand_outputs)
category_model = KM.Model(inputs=category_inputs, outputs=category_outputs)
combined_model = KM.Model(inputs=[brand_inputs, category_inputs, condition_inputs], outputs=[final_outputs])

In [None]:
# Train chunk by chunk so that only `trunk` rows are one-hot encoded at a time.
# For every chunk:
#   1. fit the brand and category branches as classifiers over price bins,
#   2. freeze those branches and fit the combined regressor on the raw price.
total_sample_size = len(train_raw)
trunk = 1000     # rows encoded and trained on per chunk
iter_cnt = 50    # epochs each model is trained for on every chunk
# Layers belonging to the brand/category branches; they are frozen only while
# the combined model is being trained.
branch_layers = [brand_dnn_1, brand_output_layer, category_dnn_1, category_output_layer]
# Stepping through start offsets never produces an empty trailing chunk, which
# range(total // trunk + 1) did whenever the row count was a multiple of trunk.
for chunk_idx, start in enumerate(range(0, total_sample_size, trunk)):
    print(chunk_idx, '/', total_sample_size//trunk)
    end = min(start + trunk, total_sample_size)
    train_condition = encode_condition(train_raw['item_condition_id'][start:end], condition_dict)
    train_shipping = encode_shipping(train_raw['shipping'][start:end])
    # Shipping is appended as one extra column of the condition features.
    train_condition = np.hstack([train_condition, train_shipping])
    train_category = encode_category(train_raw['category_name'][start:end], dicts, 2)
    train_brand = encode_brand(train_raw['brand_name'][start:end], brand_dict)
    train_price_category = encode_price_category(train_raw['price'][start:end])
    train_price = np.array([[price] for price in train_raw['price'][start:end]])
    # Un-freeze the branches before compiling their models.  They were frozen at
    # the end of the previous chunk, and without this step the branch models
    # would be compiled with frozen weights on every chunk after the first.
    for layer in branch_layers:
        layer.trainable = True
    brand_model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])
    brand_model.fit(x=train_brand, y=train_price_category, epochs=iter_cnt, verbose=0)
    category_model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])
    category_model.fit(x=train_category, y=train_price_category, epochs=iter_cnt, verbose=0)
    # Freeze the branches so the regression loss only updates the condition
    # branch and the combining layers; recompiling applies the new flags.
    for layer in branch_layers:
        layer.trainable = False
    combined_model.compile(optimizer='rmsprop', loss='mean_squared_error')
    combined_model.fit(x=[train_brand, train_category, train_condition], y=train_price, epochs=iter_cnt, verbose=0)

In [None]:
# Predict the test prices chunk by chunk so that only `trunk` rows are one-hot
# encoded at a time; `res` collects one predicted price per test row, in order.
test_size = len(test_raw)
trunk = 100
res = []
# Stepping through start offsets never produces an empty trailing chunk, which
# range(test_size // trunk + 1) did whenever test_size was a multiple of trunk.
for chunk_idx, start in enumerate(range(0, test_size, trunk)):
    if chunk_idx % 100 == 0:
        print(chunk_idx, '/', test_size//trunk)
    end = min(start + trunk, test_size)
    test_condition = encode_condition(test_raw['item_condition_id'][start:end], condition_dict)
    test_shipping = encode_shipping(test_raw['shipping'][start:end])
    # Shipping is appended as one extra column of the condition features.
    test_condition = np.hstack([test_condition, test_shipping])
    test_category = encode_category(test_raw['category_name'][start:end], dicts, 2)
    test_brand = encode_brand(test_raw['brand_name'][start:end], brand_dict)
    tmp = combined_model.predict(x=[test_brand, test_category, test_condition], verbose=0)
    # The model has a single output unit, so flatten the (rows, 1) predictions
    # directly instead of concatenating one-element lists with sum(..., []).
    res.extend(tmp.ravel().tolist())
print(len(res))

In [None]:
# Write the predictions as a two-column CSV: test_id, price.
output_dict = {'test_id': test_raw['test_id'], 'price': res}
# Pin the column order explicitly rather than relying on dict ordering.
output_dataframe = pd.DataFrame(output_dict, columns=['test_id', 'price'])
# index=False: by default to_csv also writes the DataFrame index as an extra
# unnamed leading column, which is not part of the submission data.
output_dataframe.to_csv('submission.csv', index=False)

In [3]:
# Run `ls` in the working directory and print its captured output as text.
print(check_output(["ls"]).decode("utf8"))

__notebook_source__.ipynb

