In [1]:
import pandas as pd
import os
import datetime
import re
import statsmodels.api as sm
from math import sqrt
from itertools import permutations

In [2]:
brand = 'Saint Laurent'
directory = '/Users/yulia/Desktop/chanel/SL_Sac_de_Jour_small'
os.chdir(directory)

In [3]:
fabrics = {
    'Chanel': [['lambskin', 'lamb', 'lamskin'], ['patent', 'vernis'], ['caviar', 'cavier'], ['calfskin', 'calf'], ['tweed'],
               ['canvas'], ['velvet'], ['denim'], ['jersey'], ['suede'], ['goat skin', 'goatskin'], ['fabric'], ['python'],
               ['cotton'], ['satin'], ['nylon'], ['ostrich'], ['snakeskin']],
    'Fendi': ['Leather', 'Calfskin Leather', 'Calf Leather'],
    'Gucci': [['calfskin', 'calf', 'calfakin'], ['velvet'], ['canvas'], ['jacquard'], ['velure']],
    'Saint Laurent': [['calfskin', 'calf', 'calf-skin leather'], ['grained leather'], ['suede', 'suded'], ['crocodile', 'croc leather'],
                     ['smooth'], ['patent'], ['lambskin']]
    'Louis Vuitton': [['canvas'], ['epi leather', 'epi', 'epic'], ['patent leather', 'patent'], ['canvas leather'],
                      ['vernis leather', 'vernis']]
}

sizes = {
    'Chanel': (10, 3, 6.3, 5),
    'Louis Vuitton': (12.6, 9.4, 6, 7),
    'Saint Laurent': (12.5, 9.8, 6.4, 7)
}

In [4]:
df = pd.DataFrame()
for file in [file for file in os.listdir() if file.endswith('.csv')]:
    MONTH, DAY = int(re.search('\_(\d+)\_(\d+)', file)[1]), int(re.search('\_(\d+)\_(\d+)', file)[2])
    df1 = pd.read_csv(file)
    df1['sc_date'] = datetime.date(year=2019, month=MONTH, day=DAY)
    df = pd.DataFrame(df.append(df1))

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


# clean dataset

In [5]:
df = df[df['bags_brand'] == brand]
size_before = df.shape[0]

In [7]:
df['bags_fabric'].value_counts()

Leather                                                      17097
Calfskin Leather                                              2820
leather                                                       1693
Grained Leather                                               1327
Sac de Jour                                                    574
suede                                                          431
Calfskin                                                       431
Suede Leather                                                  301
12.5"L x 6.5"W x 10"H                                          286
Grained leather                                                221
Crocodile Skin Leather                                         193
Smooth Leather                                                 188
Calf Leather                                                   177
Sac De Jour Small Leopard Hair                                 175
Suede                                                         

In [7]:
def material(string, brand):
    list_of_materials = []
    string_refined = str(string).lower()
    for material_list in fabrics[brand]:
        for material in material_list:
            r = r'\b%s\b'%material
            if re.search(r, string_refined):
                list_of_materials.append(material_list[0])
    if not list_of_materials:
        if 'leather' in string_refined:
            list_of_materials.append('leather')
    if list_of_materials:
        return str(list_of_materials)
    return

In [8]:
def material_refine(row, brand):
    for string in [row['bags_name'], row['bags_description'], row['bags_fabric']]:
        found_materials = material(string, brand)
        if found_materials:
            return found_materials
    return 'Other'

In [9]:
if brand in fabrics:
    df['materials_list'] = df.apply(lambda x: material_refine(x, brand), axis = 1)

In [10]:
def extract_price(text_string):
    if type(text_string) == str:
        try:
            digits = re.search('\$(.*)\.\d{2}$', text_string)[1]
            digits = ''.join(digits.split(','))
            return int(digits)
        except:
            return 'Error'
    return

In [11]:
df['bags_price_refined'] = df['bags_price'].map(lambda x: extract_price(x))
df['sold_price_refined'] = df['bags_sold_for'].map(lambda x: extract_price(x))
df['retail_price_refined'] = df['bags_retail_price'].map(lambda x: extract_price(x))

In [12]:
def bag_id(bag_dict):
    if type(bag_dict['bags_on_sale']) == str:
        res = re.search('\/(\d{5,})\/', bag_dict['bags_on_sale'])
        if res:
            return res[1]
    elif type(bag_dict['bags_sold']) == str:
        res = re.search('\/(\d{5,})\/', bag_dict['bags_sold'])
        if res:
            return res[1]
    else:
        return

In [13]:
df['id'] = df.apply(lambda x: bag_id(x), axis = 1)

ValueError: Cannot set a frame with no defined index and a value that cannot be converted to a Series

In [14]:
def popular(text):
    number = str(text).split(' ')[0]
    if number.isdigit():
        return number
    else:
        return 0

In [15]:
df['likes'] = df['bags_popular'].map(popular)

In [16]:
df['condition'] = df['bags_condition'].map({'New with tags': 3, 'Like new': 2, 'Gently used': 1})

In [17]:
def extract_size(text):
    try:
        l = float(re.search('(\d*\.*\d*)"L', text)[1])
        w = float(re.search('(\d*\.*\d*)"W', text)[1])
        h = float(re.search('(\d*\.*\d*)"H', text)[1])
        return l, w, h
    except:
        return (100, 100, 100)

In [18]:
def vector_distance(x, y):
    if len(x) == len(y):
        return sqrt(sum((x[i] - y[i])**2 for i in range(len(x))))
    else:
        raise Exception('Different dimension of vectors')

In [19]:
def correct_size(text):
    l, w, h, diff = sizes[brand]
    L, W, H = extract_size(text)
    return min([vector_distance((L,W,H), c) for c in permutations([l, w, h])]) < diff

In [20]:
#filter out incorrect sizes
if brand in sizes:
    df = df[df.apply(lambda x: 
                     correct_size(x['bags_size']) or correct_size(x['bags_fabric']), axis = 1)
           ]

In [21]:
df['id'] = df['id'].astype(int)
df['likes'] = df['likes'].astype(int)

In [22]:
print("{0:.0%}".format(df.shape[0] / size_before))

86%


In [23]:
df[df['materials_list'] == 'Other']['bags_name'].value_counts()

Alma Pm Brown Monogram Satchel                                    497
Alma Pm Damier Ebene Satchel                                      439
Alma Pm Monogram Satchel                                          365
Alma Pm Brown Satchel                                             248
Alma Multicolor White Murakami Alma Satchel                       174
Alma Pm Bb with Adjustable Strap Handbag Ebene Damier Satchel      89
Alma Pm (Ba0926) Monogram Satchel                                  89
Alma Pm Damier Ebene Brown Red Satchel                             89
Alma Pm M51230 Brown Satchel                                       88
Alma Pm Damier Brown Satchel                                       88
Alma Handle Multicolor Satchel                                     88
Alma Pm Damier Satchel                                             88
Brown Satchel                                                      88
Alma **will Need New Zipper** Alma Monogram Satchel                87
Alma Damier Ebene Pm

In [24]:
export_columns = ['bags_brand', 'sc_date', 'bags_color', 'bags_condition', 'materials_list', 'bags_price_refined',
                 'sold_price_refined', 'retail_price_refined', 'id', 'likes', 'condition']

In [25]:
df.reset_index()[export_columns].to_pickle('%s_dataset.pkl' % brand)