In [49]:
!python3 -m venv env
!source env/bin/activate
!python3 -m pip install pandas numpy openpyxl


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.2.2[0m[39;49m -> [0m[32;49m22.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3.10 -m pip install --upgrade pip[0m


In [5]:
from datetime import date, timedelta
from pathlib import Path

import numpy as np
import pandas as pd

In [6]:
# Bounds of the order-date window: orders_generation draws dates from
# sdate up to the day before edate.
sdate = date(year=2020, month=1, day=22)
edate = date(year=2022, month=11, day=22)

# Cities available for each country; the country keys are shared with `streets`.
cities = {
    'Russia': ['Moscow', 'Ekaterinburg', 'St.Petersburg'],
    'USA': ['New York', 'Boston', 'Chicago', 'San Francisco'],
    'Portugal': ['Lisbon', 'Porto', 'Braga'],
    'Germany': ['Berlin', 'Hamburg', 'Munich', 'Frankfurt'],
    'France': ['Paris', 'Marseille', 'Lyon'],
    'Spain': ['Barcelona', 'Madrid', 'Vigo'],
    'Italy': ['Rome'],
}

# Street names available for each country (same keys as `cities`).
streets = {
    'Russia': [
        'Lenin Street', '1905 Street', 'Mira Avenue', 'Svobody Street',
    ],
    'USA': [
        'Martin Luther King Drive', 'Ross Clark Circle', 'Wall Street', 'Washington Street',
    ],
    'Portugal': [
        'Avenida da Liberdade', 'Rua Augusta', 'Rua Cor de Rosa', 'Rua da Bica',
    ],
    'Germany': [
        'Bernauer Straße', 'Ebertstraße', 'Chausseestraße', 'Kurfürstendamm',
    ],
    'France': [
        'Rue des Barres', 'Rue des Rosiers', 'Rue Chanoinesse', 'Rue Cremieux',
    ],
    'Spain': [
        'Calle Huertas', 'Calle Mayor', 'Calle de Preciados', 'Paseo del Prado',
    ],
    'Italy': [
        "Campo de' Fiori", "Via della Conciliazione", "Via del Corso", "Via Margutta",
    ],
}

# Postcode string for every city listed in `cities`, keyed by city name.
# Kept as strings so leading zeros and dashes survive.
zipcodes = {
    # Russia
    'Moscow': '125130',
    'Ekaterinburg': '239032',
    'St.Petersburg': '635342',
    # USA
    'New York': '35004',
    'Boston': '37806',
    'Chicago': '56004',
    'San Francisco': '14824',
    # Portugal
    'Lisbon': '145-616',
    'Porto': '432-516',
    'Braga': '421-361',
    # Germany
    'Berlin': '76234',
    'Hamburg': '25138',
    'Munich': '27391',
    'Frankfurt': '56234',
    # France
    'Paris': '57234',
    'Marseille': '32983',
    'Lyon': '93274',
    # Spain
    'Barcelona': '01004',
    'Madrid': '06923',
    'Vigo': '33724',
    # Italy
    'Rome': '98168',
}

# Name pools for the synthetic clients: male first names, female first names,
# and the surnames shared by both groups.
m_names = [
    'Andrew', 'Vladislav', 'Pedro', 'Jhonny',
    'David', 'Ricardo', 'Cristiano', 'Jack',
    'Robert', 'Martin', 'Diego', 'Daniel',
    'Liam', 'Oliver', 'James', 'William',
]

# 'Tatyana' was previously misspelled as 'Ttyana', which leaked into generated
# names and e-mail logins.
f_names = [
    'Tatyana', 'Maria', 'Patricia', 'Ines',
    'Marianna', 'Anna', 'Julia', 'Helen',
    'Barbara', 'Babette', 'Jane', 'Olivia',
    'Emma', 'Charlotte', 'Sophia',
]

l_names = [
    'Castro', 'Abramov', 'Zurabov', 'Pereira', 'Silva',
    'Santos', 'Oliveira', 'Motta', 'Smith', 'Williams',
    'Brown', 'Lavigne', 'Monet', 'Blanchet',
    'Müller', 'Schneider', 'Schmidt',
]

# Product category names keyed by numeric category ID.
# 'Accessories' was previously misspelled as 'Acessories'.
# NOTE(review): if data/products.xlsx or downstream consumers match on the old
# misspelled label, update them together — confirm before merging.
categories = {
    1: 'Phones',
    2: 'Wearables',
    3: 'Home & Care',
    4: 'Computers',
    5: 'Tablets',
    6: 'Audio',
    7: 'Accessories',
}

# Excel workbook read by products_data_generation.
products_file = 'data/products.xlsx'

In [9]:
def clients_data_generation(n_samples=1000):
    """Generate a shuffled table of synthetic clients.

    For each sex, ``n_samples`` (first name, last name) pairs are drawn at random
    from the module-level name pools (``m_names`` / ``f_names`` with ``l_names``)
    and exact duplicate pairs are dropped, so the result holds at most one row per
    distinct (first name, last name, sex) combination.

    Parameters
    ----------
    n_samples : int, default 1000
        Number of random name pairs drawn per sex before de-duplication.

    Returns
    -------
    pandas.DataFrame
        Columns ``ID`` (1..N in shuffled row order), ``First_Name``, ``Last_Name``,
        ``Sex`` ('M'/'F'), ``Birth_year``, ``Phone_number`` and ``email``.
    """
    def login_generator(name, last_name, year):
        """Build a lowercase login from name prefixes, optionally followed by the year.

        ``year`` must be a string; when included it is appended either whole or
        with its first two characters removed.
        """
        include_year = np.random.choice([True, False])
        cut_year = np.random.choice([True, False])
        num_of_letters = np.random.randint(2, 6)
        login = name[:num_of_letters - 1] + last_name[:num_of_letters]
        if include_year:
            login += year[2:] if cut_year else year
        return login.lower()

    def email_generator(name, last_name, year):
        """Return a generated login with the fixed example mail domain appended."""
        return login_generator(name, last_name, str(year)) + '@examplemail.lol'

    def unique_people(first_names, sex):
        """Draw n_samples random (first, last) name pairs and drop repeated pairs."""
        drawn_first = np.random.choice(first_names, n_samples)
        drawn_last = np.random.choice(l_names, n_samples)
        people = pd.DataFrame(data={'First_Name': drawn_first,
                                    'Last_Name': drawn_last,
                                    'Sex': sex})
        return people.drop_duplicates()

    # ignore_index gives the combined frame unique row labels; the two halves
    # would otherwise carry overlapping labels from their own de-duplication.
    clients = pd.concat([unique_people(m_names, 'M'), unique_people(f_names, 'F')],
                        ignore_index=True)

    clients['Birth_year'] = np.random.randint(1941, 2015, len(clients))
    # 11-digit numbers in [30_000_000_000, 100_000_000_000). Integer bounds and an
    # explicit int64 dtype keep this working where NumPy's default integer is 32-bit.
    clients['Phone_number'] = np.random.randint(30_000_000_000, 100_000_000_000,
                                                len(clients), dtype=np.int64)
    # A plain comprehension (instead of a row-wise DataFrame.apply) also works
    # when the frame is empty.
    clients['email'] = [email_generator(first, last, year)
                        for first, last, year in zip(clients['First_Name'],
                                                     clients['Last_Name'],
                                                     clients['Birth_year'])]

    # Shuffle so that IDs are not grouped by sex.
    clients = clients.sample(frac=1).reset_index(drop=True)
    clients['ID'] = np.arange(1, len(clients) + 1)
    return clients[['ID', 'First_Name', 'Last_Name', 'Sex', 'Birth_year', 'Phone_number', 'email']]

def address_data_generation(clients, duplicate_frac=0.3):
    """Generate synthetic postal addresses for the given clients.

    Every client receives one address; a random ``duplicate_frac`` share of the
    clients receives a second one. Countries, cities, streets and postcodes come
    from the module-level ``cities``, ``streets`` and ``zipcodes`` tables.

    Parameters
    ----------
    clients : pandas.DataFrame
        Must contain an ``ID`` column.
    duplicate_frac : float, default 0.3
        Fraction of clients that get an additional address.

    Returns
    -------
    pandas.DataFrame
        Shuffled rows with columns ``ID`` (1..N), ``Country``, ``City``, ``Street``,
        ``Building``, ``Appartment`` (nullable integer; missing for roughly 20% of
        rows), ``PostCode`` and ``CUSTOMER_ID``.
    """
    def give_addr_2_all_customers(customers):
        """Draw one random country/city/street/postcode per row of `customers`."""
        countries = np.random.choice(list(cities.keys()), len(customers))
        cur_cities, cur_streets = list(), list()

        for country in countries:
            cur_cities.append(np.random.choice(cities[country]))
            cur_streets.append(np.random.choice(streets[country]))

        cur_zipcodes = [zipcodes[city] for city in cur_cities]

        # .to_numpy() drops the caller's index so that every column is matched
        # purely by position, whatever row labels `customers` carries.
        return pd.DataFrame(data={'Country': countries,
                                  'City': cur_cities,
                                  'Street': cur_streets,
                                  'PostCode': cur_zipcodes,
                                  'CUSTOMER_ID': customers['ID'].to_numpy()})

    primary_addresses = give_addr_2_all_customers(clients)
    extra_addresses = give_addr_2_all_customers(clients.sample(frac=duplicate_frac))
    df = pd.concat([primary_addresses, extra_addresses], ignore_index=True)

    df['Building'] = np.random.randint(1, 200, len(df))

    # About 80% of addresses get an apartment number; the rest stay missing.
    # An explicit mask on a nullable-integer column replaces multiplying by an
    # object array of 1/None, which depended on pandas' object-arithmetic fallback.
    apartment_numbers = np.random.randint(1, 1000, len(df))
    has_apartment = np.random.random(len(df)) < 0.8
    df['Appartment'] = pd.Series(apartment_numbers, index=df.index,
                                 dtype='Int64').where(has_apartment)

    df = df.sample(frac=1).reset_index(drop=True)
    df['ID'] = np.arange(1, len(df) + 1)
    return df[['ID', 'Country', 'City', 'Street', 'Building', 'Appartment', 'PostCode', 'CUSTOMER_ID']]

def orders_generation(clients, status, n_clients=100, start_date=None, end_date=None):
    """Generate synthetic orders for a random sample of clients.

    ``n_clients`` client IDs are drawn with replacement (the same client may be
    drawn more than once). Each draw places 1-4 orders (probabilities 0.5, 0.3,
    0.1, 0.1) on random days between ``start_date`` and the day before
    ``end_date``. Every order gets a random status and a deadline three days
    after its date.

    Parameters
    ----------
    clients : pandas.DataFrame
        Must contain an ``ID`` column.
    status : pandas.DataFrame
        Must contain an ``ID`` column with the available order-status IDs.
    n_clients : int, default 100
        Number of client draws.
    start_date, end_date : datetime.date, optional
        Order-date window; default to the module-level ``sdate`` / ``edate``.

    Returns
    -------
    pandas.DataFrame
        Rows sorted by ``Date`` with a fresh 0..N-1 index and columns ``ID``
        (1..N in date order), ``Date`` and ``Deadline_Date`` ('YYYY-MM-DD'
        strings), ``Status_ID``, ``Payment_ID`` (1..N in generation order) and
        ``CLIENT_ID``.

    Raises
    ------
    ValueError
        If the date window contains no days.
    """
    if start_date is None:
        start_date = sdate
    if end_date is None:
        end_date = edate

    # 'D' is the canonical daily frequency alias.
    dates = (pd.date_range(start_date, end_date - timedelta(days=1), freq='D')
             .strftime('%Y-%m-%d')
             .to_list())
    if not dates:
        raise ValueError('the order date window is empty: end_date must be after start_date')

    sample_clients = np.random.choice(clients['ID'], size=n_clients)

    client_dates, client_ids = list(), list()
    for client in sample_clients:
        # int() matters: a list multiplied by a NumPy integer becomes an array
        # product rather than a repeated list.
        n_orders = int(np.random.choice([1, 2, 3, 4], p=[0.5, 0.3, 0.1, 0.1]))
        client_dates.extend(np.random.choice(dates, n_orders))
        client_ids.extend([client] * n_orders)

    orders = pd.DataFrame(data={'Date': client_dates, 'CLIENT_ID': client_ids})

    # Parse the dates once and derive both string columns from the parsed values.
    placed_on = pd.to_datetime(orders['Date'])
    orders['Date'] = placed_on.dt.strftime('%Y-%m-%d')
    orders['Deadline_Date'] = (placed_on + pd.DateOffset(days=3)).dt.strftime('%Y-%m-%d')

    orders['Status_ID'] = np.random.choice(status['ID'].to_list(), len(orders))
    # Payment IDs are assigned before sorting, order IDs after, so order IDs
    # follow the dates while payment IDs do not.
    orders['Payment_ID'] = np.arange(1, len(orders) + 1)
    # ignore_index keeps row labels equal to row positions after the sort.
    orders = orders.sort_values(by=['Date'], ignore_index=True)
    orders['ID'] = np.arange(1, len(orders) + 1)
    return orders[['ID', 'Date', 'Deadline_Date', 'Status_ID', 'Payment_ID', 'CLIENT_ID']]

def payments_generation(orders):
    """Generate one synthetic payment record per order.

    Parameters
    ----------
    orders : pandas.DataFrame
        Must contain ``Payment_ID``, ``Date``, ``Deadline_Date`` and ``Status_ID``.

    Returns
    -------
    pandas.DataFrame
        Same row labels as ``orders``, with columns:

        * ``ID`` - the order's ``Payment_ID``;
        * ``Date`` - the order date for 'Instant' payments, otherwise the
          order's deadline date;
        * ``Method`` - 'Card', 'Cash', 'Crypto' or 'Bill'
          (probabilities 0.5 / 0.3 / 0.1 / 0.1);
        * ``Way`` - 'Instant' or 'When delivered' (probabilities 0.6 / 0.4);
        * ``Status_ID`` - 3 for order status 6, 2 for order status 5, else 1.

    An empty ``orders`` frame yields an empty frame with the same columns.
    """
    # Order status -> payment status; any order status not listed maps to 1.
    payment_status_by_order_status = {6: 3, 5: 2}
    default_payment_status = 1

    n_payments = len(orders)
    methods = np.random.choice(['Card', 'Cash', 'Crypto', 'Bill'],
                               p=[0.5, 0.3, 0.1, 0.1], size=n_payments)
    ways = np.random.choice(['Instant', 'When delivered'], p=[0.6, 0.4], size=n_payments)
    paid_instantly = ways == 'Instant'

    # Column-wise operations (instead of row-wise DataFrame.apply) keep working
    # when `orders` has no rows.
    payments = pd.DataFrame(index=orders.index)
    payments['ID'] = orders['Payment_ID']
    payments['Date'] = orders['Date'].where(paid_instantly, orders['Deadline_Date'])
    payments['Method'] = methods
    payments['Way'] = ways
    payments['Status_ID'] = orders['Status_ID'].map(
        lambda order_status_id: payment_status_by_order_status.get(order_status_id,
                                                                   default_payment_status))
    return payments[['ID', 'Date', 'Method', 'Way', 'Status_ID']]

def products_data_generation(products_file):
    """Load the product table from an Excel workbook, leaving out its VAT_ID column.

    Parameters
    ----------
    products_file : path-like
        Location of the workbook, passed straight to ``pandas.read_excel``.

    Returns
    -------
    pandas.DataFrame
        Every column of the workbook's sheet except ``VAT_ID``.

    Raises
    ------
    KeyError
        If the sheet has no ``VAT_ID`` column.
    """
    catalogue = pd.read_excel(products_file)
    return catalogue.drop(columns='VAT_ID')

def order_status_generation():
    """Return the order-status lookup table.

    Returns
    -------
    pandas.DataFrame
        Columns ``ID`` (1-6) and ``Name``, one row per order status.
    """
    status_names = ('Reviewing', 'Preparation', 'Shipping', 'Delivered', 'Canceled', 'Return')
    status_ids = list(range(1, len(status_names) + 1))
    return pd.DataFrame(data={'ID': status_ids, 'Name': list(status_names)})


In [10]:
# Build every table; tables that reference another one (addresses and orders ->
# clients, orders -> status, payments -> orders) are generated after it.
clients = clients_data_generation()
addresses = address_data_generation(clients)
products = products_data_generation(products_file)
status = order_status_generation()
orders = orders_generation(clients, status)
payments = payments_generation(orders)

# Output file stem -> table.
frames = {
    'clients' : clients,
    'address' : addresses,
    'product' : products,
    'order_status' : status,
    'orders' : orders,
    'payments' : payments
}

# to_excel does not create missing parent directories, so make sure the output
# folder exists before writing.
output_dir = Path('tables')
output_dir.mkdir(parents=True, exist_ok=True)

for table_name, frame in frames.items():
    frame.to_excel(output_dir / f'{table_name}.xlsx', index = False)