In [1]:
# !python3 -m venv env
# !source env/bin/activate
# !python3 -m pip install pandas numpy openpyxl
#

In [2]:
import pandas as pd
import numpy as np
from datetime import date, timedelta

In [3]:
sdate = date(2020,1,22)   
edate = date(2022,12,10)

cities = {'Russia' : ['Moscow', 'Ekaterinburg', 'St.Petersburg'],
          'USA' : ['New York', 'Boston', 'Chicago', 'San Francisco'],
          'Canada' : ['Ontario', 'Toronto', 'London'],
          'UK' : ['Oxford', 'London'],
          'Portugal' : ['Lisbon', 'Porto', 'Braga'],
          'Germany' : ['Berlin', 'Hamburg', 'Munich', 'Frankfurt'],
          'France' : ['Paris', 'Marseille', 'Lyon'],
          'Spain' : ['Barcelona', 'Madrid', 'Vigo'],
          'Italy' : ['Rome']}


streets = {'Russia' : ['Lenin Street', '1905 Street', 'Mira Avenue', 'Svobody Street'],
           'USA' : ['Martin Luther King Drive', 'Ross Clark Circle', 'Wall Street', 'Washington Street'],
           'Portugal' : ['Avenida da Liberdade', 'Rua Augusta', 'Rua Cor de Rosa', 'Rua da Bica'],
           'Germany' : ['Bernauer Straße', 'Ebertstraße', 'Chausseestraße', 'Kurfürstendamm'],
           'France' : ['Rue des Barres', 'Rue des Rosiers', 'Rue Chanoinesse', 'Rue Cremieux'],
           'Spain' : ['Calle Huertas', 'Calle Mayor', 'Calle de Preciados', 'Paseo del Prado'],
           'Italy' : ["Campo de' Fiori", "Via della Conciliazione", "Via del Corso", "Via Margutta"],
           'Canada' : ['Saint Laurent Boulevard', 'Queen Street', 'South Granville Street'],
           'UK' : ['Oxford Street', 'Abbey Road', 'Baker Street']}

zipcodes = {'Russia' : {'Moscow' : '125130', 'Ekaterinburg' : '239032', 'St.Petersburg': '635342'},
            'USA': {'New York' : '35004', 'Boston' : '37806', 'Chicago' : '56004', 'San Francisco' : '14824'},
            'Portugal' : {'Lisbon' : '145-616', 'Porto' : '432-516', 'Braga' : '421-361'},
            'Germany' : {'Berlin' : '76234', 'Hamburg' : '25138', 'Munich' : '27391', 'Frankfurt' : '56234'},
            'France' : {'Paris' : '57234', 'Marseille' : '32983', 'Lyon' : '93274'},
            'Spain' : {'Barcelona' : '01004', 'Madrid' : '06923', 'Vigo' : '33724'},
            'Italy' : {'Rome' : '98168'},
            'Canada' : {'Ontario' : 'K0C 9Z9', 'Toronto' : 'M9Z 9Z9', 'London' : 'N6Z 9Z9'},
            'UK' : {'Oxford' : 'W1G 8RH', 'London' : 'E1 2BL'}
           }

m_names = ['Andrew', 'Vladislav', 'Pedro', 'Jhonny', 
           'David', 'Ricardo', 'Cristiano', 'Jack', 
           'Robert', 'Martin', 'Diego', 'Daniel',
           'Liam', 'Oliver', 'James', 'William']

f_names = ['Ttyana', 'Maria', 'Patricia', 'Ines', 
           'Marianna', 'Anna', 'Julia', 'Helen', 
           'Barbara', 'Babette', 'Jane', 'Olivia',
           'Emma', 'Charlotte', 'Sophia']

l_names = ['Castro', 'Abramov', 'Zurabov', 'Pereira', 'Silva', 
           'Santos', 'Oliveira', 'Motta', 'Smith', 'Williams', 
           'Brown', 'Lavigne', 'Monet', 'Blanchet', 
           'Müller', 'Schneider', 'Schmidt']

categories = {
    1 : 'SmartPhones',
    2 : 'CellPhones',
    3 : 'SmartWatches',
    4 : 'Scales',
    5 : 'Face & Body',
    6 : 'Hair Care',
    7 : 'Laptops',
    8 : 'Desktop Computers',
    9 : 'Monitors',
    10 : 'Tablets',
    11 : 'HeadPhones',
    12 : 'Speakers',
    13 : 'Accessories',
    14 : 'Phones',
    15 : 'Sport&Fitness',
    16 : 'Home',
    17 : 'Computers',
    18 : 'Audio'
}

parent_categories = {
    1 : 14,
    2 : 14,
    3 : 15,
    4 : 16,
    5 : 16,
    6 : 16,
    7 : 17,
    8 : 17,
    9 : 17,
    10 : np.nan,
    11 : 18,
    12 : 18,
    13 : np.nan,
    14 : np.nan,
    15 : np.nan,
    16 : np.nan,
    17 : np.nan,
    18 : np.nan
}

reviews_phrases = {
    5 : [
        'Super nice device!', 
        'Amazingly fast shipped!!!',
        'Wow, it was the best offer',
        'Outstanding quality, as always', 
        'My kid loves it',
        'Oh, it is exactly what i needed. Thank you!'],
    4 : [
        'Good for this price, but the box was a bit damaged',
        'Had to wait three day to get the package', 
        'Good! But! I cannot believe it, the instruction is in Enlish only...',
        'The color is not so good'],

    3 : ['Too long delivery', 'Overpriced :(', 'The package was scratched a lot'],

    2 : ['The delivery guy was very rude, hate it', 'It does not work correctly'],

    1 : ['Cannot turn it on', 'The waranty of the manufacturer has expired!!!', 'Piece of sh*t!']
 }

countries = {
    1 : 'Russia',
    2 : 'USA',
    3 : 'Portugal',
    4 : 'Germany',
    5 : 'France',
    6 : 'Spain',
    7 : 'Italy',
    8 : 'Canada',
    9 : 'UK'
}

taxes = {
    1 : 13,
    2 : 13,
    3 : 6,
    4 : 6,
    5 : 6,
    6 : 6,
    7 : 20,
    8 : 20,
    9 : 20,
    10 : 20,
    11 : 10,
    12 : 10,
    13 : 6,
    14 : 13,
    15: 6, 
    16 : 6, 
    17 : 20, 
    18 : 10}

products_file = 'data/products.xlsx'

In [4]:
def clients_data_generation():
    def login_generator(name, last_name, year):
        include_year = np.random.choice([True, False])
        cut_year = np.random.choice([True, False])
        num_of_letters = np.random.randint(2, 6)
        login = name[:num_of_letters-1] + last_name[:num_of_letters]
        if include_year:
            if cut_year:
                login += year[2:]
            else:
                login += year
        return login.lower()

    def email_generator(name, last_name, year):
        nickname = login_generator(name, last_name, str(year)) + '@examplemail.lol'
        return nickname

    my_m_names = np.random.choice(m_names, 1000)
    my_ml_names = np.random.choice(l_names, 1000)

    my_f_names = np.random.choice(f_names, 1000)
    my_fl_names = np.random.choice(l_names, 1000)

    male_df = pd.DataFrame(data = {'First_Name' : my_m_names, 
                                   'Last_Name' : my_ml_names, 
                                   'Sex' : 'M'}).drop_duplicates()
    
    female_df = pd.DataFrame(data = {'First_Name' : my_f_names, 
                                     'Last_Name' : my_fl_names, 
                                     'Sex' : 'F'}).drop_duplicates()
    clients = pd.concat([male_df, female_df])

    clients['Birth_year'] = np.random.randint(1941, 2015, len(clients))
    clients['Phone_number'] = np.random.randint(3*10e+9, 10e+10, len(clients))
    clients['email'] = clients.apply(lambda x: email_generator(x['First_Name'], 
                                                               x['Last_Name'],
                                                               x['Birth_year']), axis = 1)
    clients = clients.sample(frac = 1).reset_index(drop = True)
    clients['ID'] = np.arange(1, len(clients)+1)
    clients = clients[['ID', 'First_Name', 'Last_Name', 'Sex', 'Birth_year', 'Phone_number', 'email']]
    return clients

def address_data_generation(clients, country_id):
    def give_addr_2_all_customers(clients):
        countries = np.random.choice(list(cities.keys()), len(clients))
        cur_cities, cur_streets, cur_zipcodes = list(), list(), list()

        for country in countries:
            city = np.random.choice(cities[country])
            zip_code = zipcodes[country][city]
            cur_cities.append(city)
            cur_zipcodes.append(zip_code)
            cur_streets.append(np.random.choice(streets[country]))

        df = pd.DataFrame(data = {'Country' : countries,
                                  'City' : cur_cities,
                                  'Street' : cur_streets,
                                  'PostCode' : cur_zipcodes,
                                  'CUSTOMER_ID' : clients['ID']})
        return df
        
    df = give_addr_2_all_customers(clients)
    some_dublicates_df = give_addr_2_all_customers(clients.sample(frac=0.3))
    df = pd.concat([df, some_dublicates_df]).reset_index(drop=True)
    df['Building'] = np.random.randint(1, 200, len(df))
    df['Apartment_tmp'] = np.random.randint(1, 1000, len(df))
    df['Apartment_presence'] = np.random.choice([1,None], p=[0.8, 0.2], size=len(df))
    df['Apartment'] = df['Apartment_tmp'] * df['Apartment_presence']
    df = df.sample(frac=1).reset_index(drop=True)
    df['ID'] = np.arange(1, len(df)+1)
    df = df.merge(country_id, how = 'left', left_on = ['Country'], right_on = ['Name'])
    df.rename(columns = {'ID_x' : 'ID', 'ID_y': 'Country_ID'}, inplace = True)
    df = df[['ID', 'Country_ID', 'City', 'Street', 'Building', 'Apartment', 'PostCode', 'CUSTOMER_ID']]
    return df

def orders_generation(clients, status, addresses):

    def address_selector(customer_id):
        address_id = addresses[addresses['CUSTOMER_ID'] == customer_id].sample(n=1)['ID'].values[0]
        return address_id

    dates = pd.date_range(sdate,edate-timedelta(days=1),freq='d').astype(str).to_list()
    sample_clients =  np.random.choice(clients['ID'], size=100)

    client_dates, client_ids = list(), list()
    for client in sample_clients:
        n_orders = np.random.choice([1,2,3,4], p=[0.5, 0.3, 0.1, 0.1])
        curr_dates = list(np.random.choice(dates, n_orders))
        client_dates += curr_dates
        client_ids += [client for _ in curr_dates]

    orders = pd.DataFrame(data = {'Date' : client_dates, 'CUSTOMER_ID' : client_ids})
    orders['Deadline_Date'] = pd.to_datetime(orders['Date']) + pd.DateOffset(days=3)
    orders['Deadline_Date'] = pd.to_datetime(orders['Deadline_Date']).dt.strftime('%Y-%m-%d')
    orders['Date'] = pd.to_datetime(orders['Date']).dt.strftime('%Y-%m-%d')
    orders['Status_ID'] = np.random.choice([4,5,6], p=[.8,.15,.05], size = len(orders))
    orders.sort_values(by = ['Date'], inplace = True)
    orders['ID'] = np.arange(1, len(orders)+1)
    orders['Address_ID'] = orders['CUSTOMER_ID'].apply(address_selector)
    orders = orders[['ID', 'Date', 'Deadline_Date', 'Status_ID', 'CUSTOMER_ID', 'Address_ID']]
    return orders

def payments_generation(orders):
    def payment_status(order_status_id):
        if order_status_id == 6:
            return 3
        if order_status_id == 5:
            return 2
        return 1

    def payment_date_selector(date1, date2, payment_way):
        if payment_way == 'Instant':
            return date1
        return date2

    payments = pd.DataFrame()
    payments['Order_ID'] = orders['ID'].copy()
    payments['Date_1'] = orders['Date'].copy()
    payments['Date_2'] = orders['Deadline_Date'].copy()
    payments['Method'] = np.random.choice(['Card', 'Cash', 'Crypto', 'Check'], p=[0.5, 0.3, 0.1, 0.1], size = len(orders))
    payments['Way'] = np.random.choice(['Instant', 'When delivered'], p=[0.6, 0.4], size = len(orders))
    payments['Status_ID'] = orders.apply(lambda x: payment_status(x['Status_ID']), axis = 1)
    payments['Date'] = payments.apply(lambda x: payment_date_selector(x['Date_1'], x['Date_2'], x['Way']), axis = 1)
    payments['ID'] = np.arange(1, len(payments)+1)
    return payments[['ID', 'Date', 'Method', 'Way', 'Status_ID', 'Order_ID']]

def products_data_generation(products_file):
    df = pd.read_excel(products_file)
    df = df.drop(columns = ['VAT_ID'])
    df['ABB'] = np.random.choice(['AB', 'HC', 'RT', 'IR', 'OP'], size = len(df))
    df['SKU'] = np.random.randint(10e+5, 10e+6, len(df)).astype('str')
    df['SKU'] = df['ABB'] + df['SKU']
    df = df.rename(columns = {'SKU_NAME' : 'NAME'})
    df = df[['ID', 'SKU', 'NAME', 'PRICE', 'STOCK', 'CATEGORY_ID', 'DESCRIPTION']]
    return df

def order_status_generation():
    ids = [1,2,3,4,5,6]
    status = ['Reviewing', 'Preparation', 'Shipping', 'Delivered', 'Canceled', 'Return']
    statuses_dict = pd.DataFrame(data = {'ID' : ids, 'Name': status})
    return statuses_dict

def order_items_generator(orders, products):
    prd_ids = products['ID'].to_list()
    frames = []
    for order_id in orders['ID'].unique():
        n_of_unique_skus = np.random.choice([1,2,3,4,5], p=[0.4, 0.3, 0.2, 0.05, 0.05])
        selected_sku_ids = np.random.choice(prd_ids, n_of_unique_skus)
        amounts = np.random.choice([1,2,3], p=[0.6, 0.3, 0.1], size = len(selected_sku_ids))
        frames.append(pd.DataFrame(data = {'Product_ID' : selected_sku_ids, 'Amount' : amounts, 'Order_ID' : order_id}))
    res = pd.concat(frames)

    res['ID'] = np.arange(1, len(res)+1)
    res = res[['ID', 'Product_ID', 'Amount', 'Order_ID']].reset_index(drop = True)
    return res

def categories_generator():
    res = pd.DataFrame(data = {
        'ID' : list(categories.keys()), 
        'Name' : list(categories.values()), 
        'Parent_Category_ID' : [parent_categories[_] for _ in list(categories.keys())]})

    return res

def payment_status_generator():
    ids = [1, 2, 3]
    names = ['Paid', 'Canceled', 'Return']
    res = pd.DataFrame(data = {'ID' : ids, 'Name' : names})
    return res

def country_id_generation():
    ids = list(countries.keys())
    names = list(countries.values())
    res = pd.DataFrame(data = {'ID' : ids, 'Name' : names})
    return res

def reviews_generation(orders, order_item):
    def score_giver(ord_stat):
        if ord_stat == 6:
            return 1
        if ord_stat == 5:
            return np.nan
        return np.random.choice([5, 4, 3, 2], p =[.7, .2, .05, .05])

    def phrase_giver(score):
        phrase = np.random.choice(reviews_phrases[score])
        return phrase

    all_orders = orders_df.merge(order_items_df, how = 'left', left_on = ['ID'], right_on = ['Order_ID'])
    all_orders['has_review'] = np.random.choice([1, 0], p =[.7, .3], size = len(all_orders))
    reviewed_orders = all_orders[all_orders['has_review'] == 1]
    reviews = pd.DataFrame()
    reviews['Review_Date'] = pd.to_datetime(reviewed_orders['Deadline_Date'])+ pd.DateOffset(days=3)
    reviews['Review_Date'] = reviews['Review_Date'].dt.strftime('%Y-%m-%d')
    reviews['order_item_id'] = reviewed_orders['ID_y'].copy()
    reviews['Status_ID'] = reviewed_orders['Status_ID'].copy()
    reviews['Review_Score'] = reviews['Status_ID'].apply(score_giver)
    reviews = reviews.dropna(subset = ['Review_Score'])
    reviews['ID'] = np.arange(1, len(reviews)+1)
    reviews['Review_Text'] = reviews['Review_Score'].apply(phrase_giver)
    reviews['Review_Score'] = reviews['Review_Score'].astype(int)
    reviews = reviews[['ID', 'Review_Date', 'order_item_id', 'Review_Text', 'Review_Score']].reset_index(drop = True)
    return reviews

def taxes_generator(categories_df):
    def tax_size(category_id):
        return taxes[category_id]
    tax_df = categories_df[['ID']].copy()
    tax_df['VAT_SIZE'] = tax_df['ID'].apply(tax_size)
    tax_df['CATEGORY_ID'] = tax_df['ID'].to_list()
    tax_df['ID'] = np.arange(1, len(tax_df)+1)
    tax_df = tax_df[['ID', 'CATEGORY_ID', 'VAT_SIZE']]
    return tax_df



In [5]:
clients_df = clients_data_generation()
country_id = country_id_generation()
addresses_df = address_data_generation(clients_df, country_id)
products_df = products_data_generation(products_file)
status_df = order_status_generation()
orders_df = orders_generation(clients_df, status_df, addresses_df)
payments_df = payments_generation(orders_df)
order_items_df = order_items_generator(orders_df, products_df)
categories_df = categories_generator()
payment_status_df = payment_status_generator()
reviews_df = reviews_generation(orders_df, order_items_df)
vat_df = taxes_generator(categories_df)


frames = {
    'customer' : clients_df,
    'country' : country_id,
    'address' : addresses_df.drop(columns = ['CUSTOMER_ID']),
    'product' : products_df,
    'order_status' : status_df,
    'order' : orders_df,
    'payment' : payments_df,
    'order_item' : order_items_df,
    'category' : categories_df,
    'payment_status' : payment_status_df,
    'review' : reviews_df,
    'vat' : vat_df
}

for frame in frames.keys():
    frames[frame].to_excel(f'tables/{frame}.xlsx', index = False)

In [6]:
for frame in frames.keys():
    print(frame)
    display(frames[frame])

customer


Unnamed: 0,ID,First_Name,Last_Name,Sex,Birth_year,Phone_number,email
0,1,Emma,Oliveira,F,1999,81427406294,eol1999@examplemail.lol
1,2,Olivia,Santos,F,2004,65304459450,osa@examplemail.lol
2,3,William,Smith,M,1973,93853409003,willsmith73@examplemail.lol
3,4,William,Schmidt,M,1950,65787655249,wisch1950@examplemail.lol
4,5,Helen,Silva,F,1978,83424190754,hesil@examplemail.lol
...,...,...,...,...,...,...,...
514,515,Vladislav,Santos,M,2009,53067217711,vladsanto09@examplemail.lol
515,516,Charlotte,Motta,F,1992,39072169982,chamott@examplemail.lol
516,517,Julia,Motta,F,1954,36909520407,julimotta@examplemail.lol
517,518,Martin,Blanchet,M,1988,93503973817,mabla@examplemail.lol


country


Unnamed: 0,ID,Name
0,1,Russia
1,2,USA
2,3,Portugal
3,4,Germany
4,5,France
5,6,Spain
6,7,Italy
7,8,Canada
8,9,UK


address


Unnamed: 0,ID,Country_ID,City,Street,Building,Apartment,PostCode
0,1,8,Ontario,South Granville Street,175,651,K0C 9Z9
1,2,4,Frankfurt,Bernauer Straße,7,832,56234
2,3,9,London,Abbey Road,130,49,E1 2BL
3,4,7,Rome,Via Margutta,122,412,98168
4,5,5,Lyon,Rue des Rosiers,87,858,93274
...,...,...,...,...,...,...,...
670,671,6,Barcelona,Calle Mayor,198,741,01004
671,672,6,Madrid,Calle Mayor,21,,06923
672,673,7,Rome,Via della Conciliazione,18,188,98168
673,674,9,Oxford,Baker Street,77,,W1G 8RH


product


Unnamed: 0,ID,SKU,NAME,PRICE,STOCK,CATEGORY_ID,DESCRIPTION
0,1,HC6755022,iPhone 14 PRO,1479.0,13,1,"256GB - Purple - 6,1' - IOS 16"
1,2,AB6220534,iPhone 14,799.0,22,1,"128GB - Black - 6,1' - IOS 16"
2,3,AB6635560,iPhone 14,899.0,23,1,"256GB - Black - 6,1' - IOS 16"
3,4,AB6867027,iPhone 13,699.0,45,1,"256GB - Purple - 6,1' - IOS 16"
4,5,AB3715665,iPhone 13,699.0,12,1,"256GB - Black - 6,1' - IOS 16"
5,6,OP9264411,iPhone 14 Plus,999.0,17,1,"256GB - Black - 6,7' - IOS 17"
6,7,OP6080750,iPhone 14 Plus,899.0,13,1,"128GB - Dark Green - 6,7' - IOS 17"
7,8,AB7788753,iPhone 14 Plus,999.0,15,1,"256GB - Dark Green - 6,7' - IOS 17"
8,9,AB3996141,iPhone 14 PRO max,1629.0,12,1,"512GB - Midnight - 6,7' - IOS 16"
9,10,HC8291264,Samsung Galaxy S22 ULTRA,949.0,11,1,256GB - Black - 6.8' - 12GB


order_status


Unnamed: 0,ID,Name
0,1,Reviewing
1,2,Preparation
2,3,Shipping
3,4,Delivered
4,5,Canceled
5,6,Return


order


Unnamed: 0,ID,Date,Deadline_Date,Status_ID,CUSTOMER_ID,Address_ID
2,1,2020-01-31,2020-02-03,4,13,644
29,2,2020-02-06,2020-02-09,4,331,601
168,3,2020-02-07,2020-02-10,4,450,200
156,4,2020-02-07,2020-02-10,4,196,111
8,5,2020-02-14,2020-02-17,4,190,370
...,...,...,...,...,...,...
149,170,2022-10-14,2022-10-17,5,51,481
9,171,2022-10-21,2022-10-24,4,190,370
130,172,2022-10-24,2022-10-27,4,77,586
48,173,2022-11-18,2022-11-21,5,189,124


payment


Unnamed: 0,ID,Date,Method,Way,Status_ID,Order_ID
2,1,2020-01-31,Check,Instant,1,1
29,2,2020-02-06,Card,Instant,1,2
168,3,2020-02-07,Card,Instant,1,3
156,4,2020-02-07,Crypto,Instant,1,4
8,5,2020-02-17,Cash,When delivered,1,5
...,...,...,...,...,...,...
149,170,2022-10-14,Card,Instant,2,170
9,171,2022-10-21,Cash,Instant,1,171
130,172,2022-10-27,Cash,When delivered,1,172
48,173,2022-11-18,Crypto,Instant,2,173


order_item


Unnamed: 0,ID,Product_ID,Amount,Order_ID
0,1,25,3,1
1,2,34,2,1
2,3,12,1,1
3,4,25,1,2
4,5,24,2,2
...,...,...,...,...
365,366,18,2,171
366,367,12,1,172
367,368,4,2,172
368,369,53,1,173


category


Unnamed: 0,ID,Name,Parent_Category_ID
0,1,SmartPhones,14.0
1,2,CellPhones,14.0
2,3,SmartWatches,15.0
3,4,Scales,16.0
4,5,Face & Body,16.0
5,6,Hair Care,16.0
6,7,Laptops,17.0
7,8,Desktop Computers,17.0
8,9,Monitors,17.0
9,10,Tablets,


payment_status


Unnamed: 0,ID,Name
0,1,Paid
1,2,Canceled
2,3,Return


review


Unnamed: 0,ID,Review_Date,order_item_id,Review_Text,Review_Score
0,1,2020-02-06,1,My kid loves it,5
1,2,2020-02-06,2,Amazingly fast shipped!!!,5
2,3,2020-02-06,3,"Wow, it was the best offer",5
3,4,2020-02-12,4,My kid loves it,5
4,5,2020-02-12,5,"Wow, it was the best offer",5
...,...,...,...,...,...
206,207,2022-10-19,360,The color is not so good,4
207,208,2022-10-19,361,"Oh, it is exactly what i needed. Thank you!",5
208,209,2022-10-27,366,"Oh, it is exactly what i needed. Thank you!",5
209,210,2022-10-30,368,"Outstanding quality, as always",5


vat


Unnamed: 0,ID,CATEGORY_ID,VAT_SIZE
0,1,1,13
1,2,2,13
2,3,3,6
3,4,4,6
4,5,5,6
5,6,6,6
6,7,7,20
7,8,8,20
8,9,9,20
9,10,10,20
