In [1]:
import pandas as pd
import os
import datetime
import re
import numpy as np
from matplotlib import pyplot
import pyflux as pf
import warnings
import statsmodels.api as sm
from math import sqrt
from itertools import permutations

In [2]:
# Brand whose listings are kept, and the folder whose CSV files are loaded.
brand, directory = 'Saint Laurent', 'SL_Sac_de_Jour_small'

# Make that folder the working directory: the CSV listing and the pickle
# export further down both use paths relative to the current directory.
os.chdir(directory)

In [22]:
# Accepted `bags_fabric` values per brand. Matching is exact (`isin`), so
# every case variant and spelling variant is listed verbatim -- do not
# "correct" entries such as 'Lamskin' or 'Cavier Leather' (presumably they
# occur in the raw data; verify before removing).
fabrics = {
    'Chanel': [
        'Lambskin Leather',
        'Leather',
        'Lambskin',
        'Patent Leather',
        'Caviar Leather',
        'lambskin',
        'Calfskin Leather',
        'Caviar',
        'leather',
        'caviar',
        'Calfskin',
        'lambskin leather',
        'Lambskin leather',
        'caviar leather',
        'calfskin',
        'Caviar leather',
        'LEATHER',
        'patent leather',
        'Classic Flap',
        'Patent',
        'Suede Leather',
        'Goat Skin Leather',
        'Calf Leather',
        'CAVIAR LEATHER',
        'Lamb leather',
        'Lamskin',
        'Quilted',
        'Patent leather',
        'Lamb Skin',
        'CAVIAR',
        'LAMBSKIN',
        'Cavier Leather',
        'Suede',
        'Goatskin',
        'Iridescent Caviar',
    ],
    'Fendi': [
        'Leather',
        'Calfskin Leather',
        'leather',
        'Calf Leather',
        'LEATHER',
    ],
    'Gucci': [
        'Leather',
        'Calfskin Leather',
        'leather',
        'Calfskin',
        'Sylvie',
        'calfskin',
        'Calf Leather',
    ],
    'Saint Laurent': [
        'Leather',
        'Calfskin Leather',
        'leather',
        'Grained Leather',
        'Sac de Jour',
        'Calfskin',
        'Grained leather',
        'Smooth Leather',
        'Calf Leather',
        'Grain Leather',
    ],
}

# Reference size per brand as (dim_1, dim_2, dim_3, tolerance). A listing is
# kept when the Euclidean distance between its parsed size and the three
# reference dimensions (tried in every axis order) is below `tolerance`.
# Units are presumably inches, since sizes are parsed from text like 12.5"L.
sizes = {
    'Chanel': (10, 3, 6.3, 5),
    'LV': (12.6, 9.4, 6, 7),
    'Saint Laurent': (12.5, 9.8, 6.4, 7),
}

In [4]:
# Load every CSV file of the working directory into a single DataFrame.
# Each file name must contain "_<month>_<day>"; that date (the year is
# hard-coded to 2019) is stored in `sc_date` for all rows of the file.
frames = []
# Sorted so the row order does not depend on the OS directory listing order.
for file in sorted(name for name in os.listdir() if name.endswith('.csv')):
    date_match = re.search(r'_(\d+)_(\d+)', file)
    if date_match is None:
        # Fail with a readable message instead of "'NoneType' is not subscriptable".
        raise ValueError('Cannot read month/day from CSV file name %r' % file)
    MONTH, DAY = int(date_match[1]), int(date_match[2])
    df1 = pd.read_csv(file)
    df1['sc_date'] = datetime.date(year=2019, month=MONTH, day=DAY)
    frames.append(df1)

# One concat at the end replaces the per-file DataFrame.append, which was
# removed in pandas 2.0 and re-copied the accumulated frame on every iteration.
# Original row labels are kept (no ignore_index), as append did.
df = pd.concat(frames) if frames else pd.DataFrame()

# clean dataset

In [9]:
# Keep only the rows whose `bags_brand` equals the selected brand.
df = df.loc[df['bags_brand'].eq(brand)]

In [10]:
# Keep only rows whose fabric is listed for the selected brand; a brand with
# no entry in `fabrics` is left unfiltered.
df = df.loc[df['bags_fabric'].isin(fabrics[brand])] if brand in fabrics else df

In [11]:
def extract_price(text_string):
    """Parse a price such as ``'$1,234.00'`` into a whole number of dollars.

    Parameters
    ----------
    text_string : object
        Raw cell value. Only strings (including ``str`` subclasses) are parsed.

    Returns
    -------
    int, str or None
        * ``int`` -- the amount between the first ``$`` and the trailing
          two-digit cents, with thousands separators removed;
        * ``'Error'`` -- the value is a string but holds no parsable price
          (sentinel kept for compatibility with existing consumers);
        * ``None`` -- the value is not a string (e.g. NaN for a missing cell).
    """
    if not isinstance(text_string, str):
        return None

    # Greedy capture: everything from the first '$' up to the final ".dd".
    match = re.search(r'\$(.*)\.\d{2}$', text_string)
    if match is None:
        return 'Error'

    try:
        return int(match[1].replace(',', ''))
    except ValueError:
        # Captured text was not a plain integer (e.g. letters, a second '.').
        return 'Error'

In [12]:
# Parse the three raw price columns with extract_price; each result column
# holds an int, the 'Error' sentinel, or None per row.
df['bags_price_refined'] = df['bags_price'].map(extract_price)
df['sold_price_refined'] = df['bags_sold_for'].map(extract_price)
df['retail_price_refined'] = df['bags_retail_price'].map(extract_price)

In [13]:
def bag_id(bag_dict):
    """Return the listing id embedded in a row's link fields.

    The id is the first run of five or more digits enclosed in slashes
    (``/12345/``). ``bags_on_sale`` is tried first, then ``bags_sold``.
    A field is skipped when it is not a string (e.g. NaN) or when it holds
    no such id, so a usable ``bags_sold`` is no longer ignored just because
    ``bags_on_sale`` contains unrelated text.

    Parameters
    ----------
    bag_dict : mapping
        Row-like object indexable by ``'bags_on_sale'`` and ``'bags_sold'``
        (a dict or a DataFrame row).

    Returns
    -------
    str or None
        The id as a digit string, or ``None`` when neither field yields one.
    """
    for column in ('bags_on_sale', 'bags_sold'):
        link = bag_dict[column]
        if not isinstance(link, str):
            continue
        match = re.search(r'/(\d{5,})/', link)
        if match:
            return match[1]
    return None

In [14]:
# Row-wise extraction of the listing id (digit string, or None when absent).
df['id'] = df.apply(bag_id, axis=1)

In [15]:
def popular(text):
    """Return the leading number of ``text`` or 0 when there is none.

    ``text`` is converted with ``str`` and cut at the first space. If that
    first token consists only of digits it is returned unchanged, i.e. as a
    *string*; otherwise the integer ``0`` is returned.
    """
    first_token = str(text).partition(' ')[0]
    return first_token if first_token.isdigit() else 0

In [16]:
# Derive `likes` from the leading number of `bags_popular`: a digit string
# when the text starts with one, otherwise the integer 0 (cast to int later).
df['likes'] = df['bags_popular'].map(popular)

In [17]:
# Encode the textual condition as a number (3 = 'New with tags' down to
# 1 = 'Gently used'); any label not listed here becomes NaN.
df['condition'] = df['bags_condition'].map({
    'New with tags': 3,
    'Like new': 2,
    'Gently used': 1,
})

In [18]:
def extract_size(text):
    """Parse the three dimensions from text such as ``12.5"L x 9.8"W x 6.4"H``.

    Each dimension is the number written directly before ``"L``, ``"W`` and
    ``"H`` respectively.

    Parameters
    ----------
    text : object
        Raw size description; anything that is not a string is treated as
        unparsable.

    Returns
    -------
    tuple of float
        ``(l, w, h)`` on success. ``(100, 100, 100)`` when ``text`` is not a
        string, a marker is missing, or a number cannot be converted; the
        sentinel is presumably chosen to lie far from any reference size so
        that such rows fail the size check.
    """
    try:
        l = float(re.search(r'(\d*\.*\d*)"L', text)[1])
        w = float(re.search(r'(\d*\.*\d*)"W', text)[1])
        h = float(re.search(r'(\d*\.*\d*)"H', text)[1])
    except (TypeError, ValueError):
        # TypeError: non-string input, or a marker not found (None[1]).
        # ValueError: captured text is empty or not a valid float.
        return (100, 100, 100)
    return l, w, h

In [19]:
def vector_distance(x, y):
    """Euclidean distance between two equally long numeric sequences.

    Raises
    ------
    Exception
        If ``x`` and ``y`` do not have the same length.
    """
    if len(x) != len(y):
        raise Exception('Different dimension of vectors')
    squared_gaps = ((a - b) ** 2 for a, b in zip(x, y))
    return sqrt(sum(squared_gaps))

In [20]:
def correct_size(text):
    """Tell whether a size description matches the selected brand's reference.

    The size parsed from ``text`` is compared with the three reference
    dimensions stored in ``sizes[brand]``, trying every ordering of those
    dimensions so that a listing with swapped axes still matches. Returns
    ``True`` when the smallest Euclidean distance is strictly below the
    tolerance (fourth element of ``sizes[brand]``).

    Reads the module-level ``brand`` and ``sizes``.
    """
    ref_a, ref_b, ref_c, tolerance = sizes[brand]
    length, width, height = extract_size(text)
    measured = (length, width, height)
    closest = min(
        vector_distance(measured, candidate)
        for candidate in permutations((ref_a, ref_b, ref_c))
    )
    return closest < tolerance

In [25]:
# Drop listings whose parsed size does not match the brand's reference size;
# a brand with no entry in `sizes` is left unfiltered.
df = df[df['bags_size'].map(correct_size)] if brand in sizes else df

In [26]:
# Cast the digit-string columns to integers (raises if any `id` is missing).
df = df.astype({'id': int, 'likes': int})

In [27]:
# Columns written to the exported dataset, in output order.
export_columns = [
    'bags_brand',
    'sc_date',
    'bags_color',
    'bags_condition',
    'bags_fabric',
    'bags_price_refined',
    'sold_price_refined',
    'retail_price_refined',
    'id',
    'likes',
    'condition',
]

In [28]:
# Write the selected columns to "<brand>_dataset.pkl". The path is relative,
# so the file lands in the current working directory (changed by os.chdir
# earlier). reset_index() moves the old row labels into a column that is not
# part of export_columns, so the exported frame gets a fresh 0..n-1 index.
df.reset_index()[export_columns].to_pickle('%s_dataset.pkl' % brand)