### Import Libraries

In [1]:
import numpy as np
import pandas as pd
import os
from faker import Faker
from datetime import date, timedelta
from random import choices

### Simulate Data: 10,000 entries

In [2]:
# size of simulation (number of simulated tickets)
k = int(1e4)

# fixed seed so that a fresh "Restart & Run All" repeats the same NumPy draws;
# this seeds NumPy's global RNG (np.random.*) only -- the stdlib `random`
# module and Faker keep their own, separate random state
SEED = 42
np.random.seed(SEED)

In [3]:
# estimated_duration: one whole number of days per ticket,
# anywhere from 1 to 20 (randint leaves `high` out, hence 21)
est_duration = np.random.randint(low=1, high=21, size=k)

In [4]:
# date ranges (both ends are valid start dates)
beginning, end = np.datetime64('2017-01-01'), np.datetime64('2022-12-31')

# every calendar day from `beginning` through `end`;
# np.arange leaves its stop value out, so stop one day after `end`
all_days = np.arange(beginning, end + np.timedelta64(1, 'D'), dtype='datetime64[D]')

# start_date: k days picked at random (with replacement) from that pool,
# kept as a plain list of np.datetime64 values; np.random.choice is used so
# this draw goes through the same NumPy RNG as the other simulated columns
start_date = list(np.random.choice(all_days, size=k))

In [5]:
# real task duration, in calendar days, drawn around each estimate:
# from 50% of the estimate up to (but not including) 150% of it,
# with both bounds truncated to whole days
# note: for an estimate of 1 the bounds are 0 and 1, so the draw is always 0
lower = (est_duration * 0.5).astype(int)
upper = (est_duration * 1.5).astype(int)

# a single vectorized draw (one value per ticket) instead of
# one np.random.randint call per loop iteration
real_duration = np.random.randint(lower, upper)

In [6]:
# end date: each start date pushed forward by its real duration (calendar days)
end_date = [
    np.datetime64(begin) + np.timedelta64(days, 'D')
    for begin, days in zip(start_date, real_duration)
]

In [7]:
# spot-check a single ticket: its start date, its duration in days
# and the end date computed from the two
(start_date[50], real_duration[50], end_date[50])

(numpy.datetime64('2017-09-17'), 0, numpy.datetime64('2017-09-17'))

> Note: Now, we have to make sure the dates don't fall on weekends. For that, I'll use [`np.is_busday`](https://numpy.org/devdocs/reference/generated/numpy.is_busday.html) and [`np.busday_offset`](https://numpy.org/doc/stable/reference/generated/numpy.busday_offset.html)

In [8]:
# start_date
# roll every non-business day back to the preceding business day in one
# vectorized call; with an offset of 0, dates that already are business
# days come back unchanged, so no is_busday check is needed.
# There is deliberately no loop variable called `date` here, so the `date`
# class imported from datetime is not overwritten.
st_date = list(
    np.busday_offset(np.array(start_date, dtype='datetime64[D]'), 0, roll='backward')
)

# now, st_date contains business days only
# so, I'll replace the original start_date
# list with this one
start_date = st_date

In [9]:
# end_date
# roll every non-business day back to the preceding business day in one
# vectorized call; with an offset of 0, dates that already are business
# days come back unchanged, so no is_busday check is needed.
# There is deliberately no loop variable called `date` here, so the `date`
# class imported from datetime is not overwritten.
e_date = list(
    np.busday_offset(np.array(end_date, dtype='datetime64[D]'), 0, roll='backward')
)

# now, e_date contains business days only
# so, I'll replace the original end_date
# list with this one
end_date = e_date

> Note: Getting [total business days](https://www.geeksforgeeks.org/python-program-to-get-total-business-days-between-two-dates/) using [NumPy](https://numpy.org/doc/stable/reference/generated/numpy.busday_count.html)

In [10]:
# recompute real_duration, this time as a count of business days;
# busday_count leaves its end date out, so one day is added to
# make the ticket's end date part of the count
real_duration = [
    np.busday_count(begin, finish + 1)
    for begin, finish in zip(start_date, end_date)
]

In [11]:
# how many days a ticket sat in the backlog before work started:
# 0 to 89, i.e. up to roughly three months
added_days = np.random.randint(low=0, high=90, size=k)

In [12]:
# added_date: the start date minus the days spent in the backlog,
# i.e. up to about three months before start_date.
# The result is rolled back to the previous business day when it is not
# one already (an offset of 0 only rolls, it does not shift further).
added_date = [
    np.busday_offset(np.datetime64(begin) - np.timedelta64(wait, 'D'), 0, roll='backward')
    for begin, wait in zip(start_date, added_days)
]

In [13]:
# # check to see if it worked
# sum_days = 0

# for date in added_date:
#     if np.is_busday(date) == False:
#         sum_days += 1

# sum_days

In [14]:
# hypothetical software products, labelled A through G
software_list = list('ABCDEFG')

In [15]:
# one randomly picked software label per ticket (k entries)
software_k = np.random.choice(a=software_list, size=k)

> Note: see the [Faker](https://faker.readthedocs.io/en/master/index.html) documentation for the name and company generators used below.

In [16]:
# for the dev names, I'll use the Faker library
fake = Faker(['pt_BR'])

# seed Faker so the same fake values are generated on every run
# (for a given Faker version); this is separate from NumPy's seed
Faker.seed(42)

# pool of 10 developer first names; going through `fake.unique` makes
# Faker refuse repeated values, so the pool really holds 10 distinct names
devs = [fake.unique.first_name_female() for _ in range(10)]

# get k entries
dev_k = np.random.choice(devs, size=k)

In [17]:
# # status
# status = ['Não Iniciado', 'Em Desenvolvimento', 'Em Homologação', 'Pausado', 'Concluído']

# # get k entries
# status_k = np.random.choice(status, k, p=[0.2, 0.1, 0.1, 0.1, 0.5])

# # test status_k size
# len(status_k)

In [18]:
# returns: how many times a ticket was sent back to the developer (0 to 3)
possible_returns = list(range(4))

# one weight per value above, in the same order;
# zero returns is by far the most likely outcome
return_weights = [0.65, 0.2, 0.1, 0.05]

# k-sized array of return counts
returns_k = np.random.choice(possible_returns, size=k, p=return_weights)

In [19]:
# squads that can own a ticket, each mapped to the probability of being picked
squad_weights = {
    'Epsilon': 0.3,
    'Theta': 0.2,
    'Zeta': 0.05,
    'Kappa': 0.4,
    'Chi': 0.05,
}
squad = list(squad_weights)

# one squad per ticket (k entries), drawn with the weights above
squad_k = np.random.choice(squad, size=k, p=list(squad_weights.values()))

In [20]:
# pool of 200 fictitious client companies generated with Faker
clients = [fake.company() for _ in range(200)]

# one randomly picked client per ticket (k entries)
clients_k = np.random.choice(clients, size=k)

In [21]:
# gather the simulated columns, one entry per ticket;
# the order of the keys is the column order of the frame
ticket_columns = {
    'added_date': added_date,
    'estimated_duration': est_duration,
    'start_date': start_date,
    'end_date': end_date,
    'real_duration': real_duration,
    'returns': returns_k,
    'software': software_k,
    'dev': dev_k,
    'squad': squad_k,
    'client': clients_k,
}
df = pd.DataFrame(ticket_columns)

In [22]:
# df.info() prints its report itself and returns None, so wrapping it
# in print() only adds a stray "None" line after the report
df.info()

# order the tickets by the date they entered the backlog and renumber the rows
df = df.sort_values(by=['added_date'], ignore_index=True)
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   added_date          10000 non-null  datetime64[ns]
 1   estimated_duration  10000 non-null  int32         
 2   start_date          10000 non-null  datetime64[ns]
 3   end_date            10000 non-null  datetime64[ns]
 4   real_duration       10000 non-null  int64         
 5   returns             10000 non-null  int32         
 6   software            10000 non-null  object        
 7   dev                 10000 non-null  object        
 8   squad               10000 non-null  object        
 9   client              10000 non-null  object        
dtypes: datetime64[ns](3), int32(2), int64(1), object(4)
memory usage: 703.2+ KB
None


Unnamed: 0,added_date,estimated_duration,start_date,end_date,real_duration,returns,software,dev,squad,client
0,2016-10-10,16,2017-01-06,2017-01-20,11,0,G,Eduarda,Kappa,Peixoto - ME
1,2016-10-11,14,2016-12-30,2017-01-13,11,3,C,Pietra,Theta,Fogaça
2,2016-10-14,16,2016-12-30,2017-01-24,18,2,C,Rebeca,Epsilon,Costa
3,2016-10-14,4,2017-01-05,2017-01-06,2,1,G,Ana Luiza,Epsilon,da Rocha - ME
4,2016-10-14,6,2017-01-05,2017-01-11,5,0,A,Ana Luiza,Epsilon,Nascimento - EI


### Data Wrangling

In [23]:
# at this point every simulated ticket is finished,
# so each row gets the status "Concluído"
df = df.assign(status="Concluído")

In [24]:
# # to create the status "Não Iniciado", I'll randomly choose
# # 200 tickets from the last three months
# msk = df['added_date'] >= '2022-10-01'
# df[msk]

In [25]:
# # if status is different from 'concluído' or 'em homologação', 
# # then logically the dev isn't done yet, thus there should be
# # no end date nor real duration for the task
# df.loc[(df['status'] != "Concluído") & (df['status'] != "Em Homologação"), 'end_date'] = np.nan
# df.loc[(df['status'] != "Concluído") & (df['status'] != "Em Homologação"), 'real_duration'] = np.nan

In [26]:
# df.loc[(df['status'] == "Em Desenvolvimento"), "start_date"] = df['added_date']

In [27]:
# create the output directory; exist_ok=True turns the call into a no-op
# when the folder is already there, so no separate existence check
# (and no gap between checking and creating) is needed
folder_name = 'data'
os.makedirs(folder_name, exist_ok=True)

In [28]:
# write the simulated tickets to a CSV file, without the index column,
# encoded as Latin-1
output_path = 'data/simulated-data.csv'
df.to_csv(output_path, index=False, encoding='latin1')