In [1]:
import pandas as pd
import os
import datetime
import re
import statsmodels.api as sm
from math import sqrt
from itertools import permutations

In [2]:
# Brand to process: used below to filter rows and to look up `fabrics` / `sizes`.
brand = 'Chanel'
# Folder holding the scraped CSV files. NOTE(review): absolute, machine-specific path.
directory = '/Users/yulia/Desktop/chanel/Chanel_classic_flap_medium'
# Work from that folder so the relative os.listdir() and to_pickle() calls below resolve there.
os.chdir(directory)

In [3]:
# Material vocabulary per brand. Every inner list is one material group:
# material() searches listing text for each entry of the group and reports the
# group's FIRST entry as the label, so the remaining entries act as synonyms.
# NOTE(review): entries such as 'lamskin', 'cavier', 'calfakin', 'suded' look
# like misspellings kept on purpose (presumably they occur in the scraped
# text) -- confirm before "correcting" them.
fabrics = {
    'Chanel': [['lambskin', 'lamb', 'lamskin'], ['patent', 'vernis'], ['caviar', 'cavier'], ['calfskin', 'calf'], ['tweed'],
               ['canvas'], ['velvet'], ['denim'], ['jersey'], ['suede'], ['goat skin', 'goatskin'], ['fabric'], ['python'],
               ['cotton'], ['satin'], ['nylon'], ['ostrich'], ['snakeskin']],
    'Fendi': [['calfskin', 'calf leather'], ['velvet'], ['python'], ['patent']],
    'Gucci': [['calfskin', 'calf', 'calfakin'], ['velvet'], ['canvas'], ['jacquard'], ['velure']],
    'Saint Laurent': [['calfskin', 'calf', 'calf-skin leather'], ['grained leather'], ['suede', 'suded'], ['crocodile', 'croc leather'],
                     ['smooth'], ['patent'], ['lambskin']],
    'Louis Vuitton': [['canvas'], ['epi leather', 'epi', 'epic'], ['patent leather', 'patent'], ['canvas leather'],
                      ['vernis leather', 'vernis']]
}

# Reference dimensions per brand: size label -> (l, w, h, tolerance).
# correct_size() unpacks each tuple as `l, w, h, diff` and accepts the label
# only when the Euclidean distance between a listing's parsed dimensions and
# some permutation of (l, w, h) is strictly below `diff`; the order of the
# three dimensions therefore does not matter.
# NOTE(review): units are presumably inches (extract_size parses values
# followed by a double quote, e.g. 10"L) -- confirm.
sizes = {
    'Chanel': {
        'Medium': (10, 3, 6.3, 3)
    },
    'Louis Vuitton': {
        'Pm': (12.6, 9.4, 6, 5)
    },
    'Saint Laurent': {
        'Small': (12.5, 9.8, 6.4, 5)
    },
    'Gucci' : {
        'Handle': (12.5, 8.5, 4.25, 4),
        'Mini': (7.5, 5.5, 3, 4),
        'Shoulder': (10, 6.75, 3, 4)
    },
    'Fendi': {
        'Small': (5.2, 7.5, 3.5, 3),
        'Medium': (7.3, 9.8, 4.3, 3)
    }
}

In [4]:
# Load every scraped CSV of the working directory into one frame, tagging each
# row with the scrape date encoded in its file name (..._<month>_<day>...).
SCRAPE_YEAR = 2019  # file names carry only month and day
FILE_DATE_PATTERN = re.compile(r'_(\d+)_(\d+)')

frames = []
# sorted() makes the row order reproducible; os.listdir() order is arbitrary.
for file in sorted(name for name in os.listdir() if name.endswith('.csv')):
    date_match = FILE_DATE_PATTERN.search(file)
    if date_match is None:
        raise ValueError('Cannot find _<month>_<day> in file name: %s' % file)
    MONTH, DAY = int(date_match[1]), int(date_match[2])
    df1 = pd.read_csv(file)
    df1['sc_date'] = datetime.date(year=SCRAPE_YEAR, month=MONTH, day=DAY)
    frames.append(df1)

# A single concat replaces DataFrame.append inside the loop: append was removed
# in pandas 2.0 and re-copied the accumulated frame for every file. Each file
# keeps its own row index, as before. sort=False keeps the column order of the
# files instead of sorting column names when the files' columns differ.
df = pd.concat(frames, sort=False) if frames else pd.DataFrame()

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


# clean dataset

In [5]:
# Keep only the listings of the configured brand. .copy() detaches the result
# from the loaded frame, so the column assignments in the following cells
# write to an independent frame rather than to a slice of another one
# (this is what triggers pandas' SettingWithCopyWarning).
df = df[df['bags_brand'] == brand].copy()
# Row count right after the brand filter; compared with the final row count
# later to report how many rows were kept.
size_before = df.shape[0]

In [6]:
def material(string, brand):
    """Find the known materials of ``brand`` mentioned in a piece of listing text.

    The text is lower-cased and searched, as whole words, for every synonym of
    every material group in ``fabrics[brand]``. Each group that matches
    contributes its first entry (the canonical label) exactly once, in the
    order the groups are declared.

    Args:
        string: Text to search; any value is accepted and converted with str().
        brand: Key into the module-level ``fabrics`` mapping.

    Returns:
        ``str()`` of the list of canonical labels, e.g. ``"['caviar']"``.
        When no group matches but the text contains ``'leather'`` the result
        is ``"['leather']"``. ``None`` when nothing is recognised.

    Raises:
        KeyError: if ``brand`` is not a key of ``fabrics``.
    """
    text = str(string).lower()
    found = []
    for synonyms in fabrics[brand]:
        # One hit per group is enough. Without stopping at the first hit, text
        # such as "patent leather" would match both 'patent leather' and
        # 'patent' and report the same canonical label twice.
        # re.escape keeps a synonym literal even if it ever contains regex
        # metacharacters.
        if any(re.search(r'\b%s\b' % re.escape(synonym), text) for synonym in synonyms):
            found.append(synonyms[0])
    # Generic fallback, used only when no specific material was recognised.
    if not found and 'leather' in text:
        found.append('leather')
    return str(found) if found else None

In [7]:
def material_refine(row, brand):
    """Return the materials recognised for one listing, or ``'Other'``.

    The ``bags_name``, ``bags_description`` and ``bags_fabric`` fields of
    ``row`` are passed to ``material`` in that order; the first non-empty
    result is returned. ``'Other'`` is returned when all three yield nothing.
    """
    # Read all three fields up front, then evaluate them lazily in order.
    texts = (row['bags_name'], row['bags_description'], row['bags_fabric'])
    matches = (material(text, brand) for text in texts)
    return next((match for match in matches if match), 'Other')

In [8]:
# Materials are derived only for brands that have a vocabulary in `fabrics`;
# the brand is forwarded to material_refine as an extra positional argument.
if brand in fabrics:
    df['materials_list'] = df.apply(material_refine, axis=1, args=(brand,))

In [9]:
def extract_price(text_string):
    """Parse a price string such as ``'$1,234.00'`` into whole dollars.

    The amount is taken from between the first ``$`` and the two-digit cents
    that must end the string; thousands separators (``,``) are dropped.

    Args:
        text_string: Raw cell value.

    Returns:
        int: the dollar amount when the string parses.
        ``'Error'``: when the value is a string that does not parse.
        ``None``: when the value is not a string (e.g. a missing cell).
    """
    if not isinstance(text_string, str):
        return None
    try:
        # Raw string: '\$', '\.' and '\d' are invalid escapes in a plain literal.
        digits = re.search(r'\$(.*)\.\d{2}$', text_string)[1]
        return int(digits.replace(',', ''))
    except (TypeError, ValueError):
        # TypeError: no match (subscripting None); ValueError: non-numeric amount.
        # Only these are expected; anything else should surface, not be hidden.
        return 'Error'

In [10]:
# Parse each raw price column into its "*_refined" counterpart.
for raw_column, refined_column in (
        ('bags_price', 'bags_price_refined'),
        ('bags_sold_for', 'sold_price_refined'),
        ('bags_retail_price', 'retail_price_refined')):
    df[refined_column] = df[raw_column].map(extract_price)

In [11]:
def bag_id(bag_dict):
    """Extract the listing ID from a row's ``bags_on_sale`` or ``bags_sold`` value.

    The ID is the first run of five or more digits enclosed in slashes
    (``/12345678/``). ``bags_on_sale`` is tried first. ``bags_sold`` is used
    when ``bags_on_sale`` is not a string or contains no ID; previously a
    string ``bags_on_sale`` without an ID prevented ``bags_sold`` from being
    checked at all.

    Args:
        bag_dict: Mapping-like row (dict or pandas Series) with both keys.

    Returns:
        The ID as a string of digits, or ``None`` when neither field has one.
    """
    for field in ('bags_on_sale', 'bags_sold'):
        value = bag_dict[field]
        if isinstance(value, str):
            found = re.search(r'/(\d{5,})/', value)
            if found:
                return found[1]
    return None

In [12]:
# Derive the listing ID of every row from its on-sale / sold value.
df['id'] = df.apply(bag_id, axis=1)

In [13]:
def popular(text):
    """Return the leading count of a popularity text such as ``'121 likes'``.

    The value is converted with str() and cut at the first space. When that
    leading token consists only of digits it is returned unchanged (as a
    string); otherwise the integer 0 is returned.
    """
    leading_token = str(text).partition(' ')[0]
    return leading_token if leading_token.isdigit() else 0

In [14]:
# Leading count from the popularity text; popular() yields a digit string or 0,
# so the column is cast to int in a later cell.
df['likes'] = df['bags_popular'].map(popular)

In [15]:
# Encode the condition label as an ordinal score (3 = 'New with tags',
# 2 = 'Like new', 1 = 'Gently used'). Labels missing from the mapping become NaN.
df['condition'] = df['bags_condition'].map({'New with tags': 3, 'Like new': 2, 'Gently used': 1})

In [16]:
def extract_size(text):
    """Parse ``<number>"L``, ``<number>"W`` and ``<number>"H`` out of a text.

    Args:
        text: Text expected to contain the three measurements, e.g.
            ``'10"L x 6.3"W x 3"H'``.

    Returns:
        ``(l, w, h)`` as floats when all three measurements parse. Otherwise
        the sentinel ``(100, 100, 100)``, which correct_size() cannot match
        against any of the reference sizes within their tolerances.
    """
    try:
        # Raw string: '\d' and '\.' are invalid escapes in a plain literal.
        l, w, h = (
            float(re.search(r'(\d*\.*\d*)"%s' % axis, text)[1])
            for axis in 'LWH'
        )
    except (TypeError, ValueError):
        # TypeError: text is not a string, or a measurement is absent
        # (subscripting None). ValueError: the captured text is not a number
        # (the pattern can capture an empty string). Other errors propagate.
        return (100, 100, 100)
    return l, w, h

In [17]:
def vector_distance(x, y):
    """Return the Euclidean distance between two equal-length sequences.

    Raises:
        Exception: when ``x`` and ``y`` have different lengths.
    """
    size = len(x)
    if size != len(y):
        raise Exception('Different dimension of vectors')
    squared_gaps = [(x[k] - y[k]) ** 2 for k in range(size)]
    return sqrt(sum(squared_gaps))

In [18]:
def correct_size(text):
    """Return the reference size label that the dimensions in ``text`` match.

    The dimensions parsed by extract_size() are compared with every reference
    size of the module-level ``brand`` in ``sizes``. Because listings may give
    length/width/height in any order, the distance to a reference size is the
    smallest Euclidean distance over all permutations of its three dimensions.
    A size qualifies when that distance is strictly below the size's own
    tolerance; among qualifying sizes the closest wins (the first one on ties).

    Args:
        text: Text holding the measurements (passed to extract_size()).

    Returns:
        The matching size label, or ``None`` when no reference size qualifies.
    """
    # The parsed dimensions do not depend on the reference size, so parse once
    # instead of once per loop iteration.
    L, W, H = extract_size(text)
    best_label = None
    best_distance = 1000  # larger than any tolerance in `sizes`
    for label, (l, w, h, tolerance) in sizes[brand].items():
        distance = min(
            vector_distance((L, W, H), candidate)
            for candidate in permutations([l, w, h])
        )
        if distance < tolerance and distance < best_distance:
            best_distance = distance
            best_label = label
    return best_label

In [19]:
# Label each listing with the reference size its dimensions match: the
# 'bags_size' text is tried first, then 'bags_fabric'; listings matching
# neither are labelled 'Other'. No rows are removed here.
df['size'] = df.apply(lambda x: 
                 correct_size(x['bags_size']) or correct_size(x['bags_fabric']) or 'Other', axis = 1)

In [20]:
# Number of listings per assigned size label ('Other' = no reference size matched).
df['size'].value_counts()

Medium    142713
Other      16439
Name: size, dtype: int64

In [21]:
# bag_id() returns digit strings; cast to int.
# NOTE(review): bag_id() returns None when no ID is found, which this cast
# cannot convert -- it relies on every row having an ID.
df['id'] = df['id'].astype(int)
# popular() returns digit strings or the integer 0; unify as int.
df['likes'] = df['likes'].astype(int)
# Missing colours are grouped under the 'Other' label.
df['bags_color'] = df['bags_color'].fillna('Other')

In [22]:
# Share of rows kept since `size_before` was recorded (right after the brand filter).
# NOTE(review): no visible cell drops rows after that point, so this prints 100%.
print("{0:.0%}".format(df.shape[0] / size_before))

100%


In [23]:
# Columns written to the exported dataset: identifying fields plus the
# cleaned/derived columns built in the cells above.
export_columns = ['bags_brand', 'sc_date', 'bags_color', 'bags_condition', 'materials_list', 'bags_price_refined',
                 'sold_price_refined', 'retail_price_refined', 'id', 'likes', 'condition', 'size']

In [26]:
# Save the cleaned columns as '<brand>_dataset.pkl' in the working directory.
# reset_index() replaces the row index with a fresh 0..n-1 range; the old index
# becomes a column that is not part of export_columns and is thus left out.
df.reset_index()[export_columns].to_pickle('%s_dataset.pkl' % brand)

In [27]:
# Display the exported columns for a visual check of the cleaned data.
df.reset_index()[export_columns]

Unnamed: 0,bags_brand,sc_date,bags_color,bags_condition,materials_list,bags_price_refined,sold_price_refined,retail_price_refined,id,likes,condition,size
0,Chanel,2019-02-24,Black,Gently used,['caviar'],4177.0,,5600.0,24040653,121,1,Medium
1,Chanel,2019-02-24,Black,Like new,['leather'],6410.0,,,21451785,239,2,Medium
2,Chanel,2019-02-24,Black,Gently used,['lambskin'],2324.0,,5600.0,23325450,108,1,Medium
3,Chanel,2019-02-24,Black,Gently used,['lambskin'],5410.0,,,24675774,196,1,Medium
4,Chanel,2019-02-24,Beige,Gently used,['caviar'],4770.0,,5600.0,22619857,94,1,Medium
5,Chanel,2019-02-24,Black,Gently used,['caviar'],5530.0,,6300.0,14917834,335,1,Other
6,Chanel,2019-02-24,Black,Gently used,['lambskin'],5219.0,,5600.0,22195651,87,1,Medium
7,Chanel,2019-02-24,White,Gently used,['leather'],2871.0,,5300.0,22599214,149,1,Medium
8,Chanel,2019-02-24,Blue,Gently used,['velvet'],3322.0,,5300.0,20229737,75,1,Medium
9,Chanel,2019-02-24,Dark-red,New with tags,['caviar'],6250.0,,,24084859,60,3,Medium
