In [None]:
"""
For each day, find the failure rate p: the share of emails with email.type = 'no_completed' among emails sent from a normal user (user.is_blacklist = 0) to another normal user.
Round p to 3 decimal places and order the result by date ascending.
"""

In [None]:
"""SQL table creation and data insertion"""
-- Start from a clean slate so the script can be re-run.
DROP TABLE IF EXISTS email;
DROP TABLE IF EXISTS user;

-- One row per email; send_id / receive_id hold user ids and
-- `type` holds the outcome ('completed' or 'no_completed' in the seed data).
CREATE TABLE `email` (
    `id`         int(4)      NOT NULL,
    `send_id`    int(4)      NOT NULL,
    `receive_id` int(4)      NOT NULL,
    `type`       varchar(32) NOT NULL,
    `date`       date        NOT NULL,
    PRIMARY KEY (`id`)
);

-- One row per user; the task statement treats is_blacklist = 0 as a normal user.
CREATE TABLE `user` (
    `id`           int(4) NOT NULL,
    `is_blacklist` int(4) NOT NULL,
    PRIMARY KEY (`id`)
);

-- Seed emails (all on the same day, one of them failed).
INSERT INTO email (`id`, `send_id`, `receive_id`, `type`, `date`) VALUES
    (4, 3, 1, 'no_completed', '2020-01-12'),
    (5, 3, 4, 'completed',    '2020-01-12'),
    (6, 4, 1, 'completed',    '2020-01-12');


-- Seed users (user 2 is the only blacklisted one).
INSERT INTO user (`id`, `is_blacklist`) VALUES
    (1, 0),
    (2, 1),
    (3, 0),
    (4, 0);

In [None]:
"""SQL solution"""
-- Daily failure rate p for emails exchanged between two normal users.
--
-- Computed in a single pass: within each day, count the failed emails with a
-- conditional SUM and divide by the total number of emails. Unlike joining a
-- "failed per day" subquery to an "all per day" subquery with INNER JOIN, this
-- keeps days on which no email failed (they report p = 0.000 instead of
-- disappearing from the result).
SELECT
    e.date,
    ROUND(
        SUM(CASE WHEN e.type = 'no_completed' THEN 1 ELSE 0 END) / COUNT(*),
        3
    ) AS p
FROM email AS e
-- keep only emails whose sender AND receiver are both normal users
WHERE e.send_id IN (SELECT id FROM user WHERE is_blacklist = 0)
  AND e.receive_id IN (SELECT id FROM user WHERE is_blacklist = 0)
GROUP BY e.date
ORDER BY e.date;

In [2]:
"""pandas dataframe creation"""
import pandas as pd

# Same sample rows as the SQL seed data, written column by column.
email = pd.DataFrame({
    'id': [4, 5, 6],
    'send_id': [3, 3, 4],
    'receive_id': [1, 4, 1],
    'type': ['no_completed', 'completed', 'completed'],
    'date': ['2020-01-12', '2020-01-12', '2020-01-12'],
})
user = pd.DataFrame({
    'id': [1, 2, 3, 4],
    'is_blacklist': [0, 1, 0, 0],
})
email.head()

Unnamed: 0,id,send_id,receive_id,type,date
0,4,3,1,no_completed,2020-01-12
1,5,3,4,completed,2020-01-12
2,6,4,1,completed,2020-01-12


In [3]:
user.head()

Unnamed: 0,id,is_blacklist
0,1,0
1,2,1
2,3,0
3,4,0


In [7]:
# Give the user key its own name so it is not confused with email.id
user = user.rename({'id': 'user_id'}, axis='columns')
user.head()

Unnamed: 0,user_id,is_blacklist
0,1,0
1,2,1
2,3,0
3,4,0


In [24]:
# Attach the sender's blacklist flag; a left join keeps every email row
email_all = email.merge(user, how='left', left_on='send_id', right_on='user_id')
email_all

Unnamed: 0,id,send_id,receive_id,type,date,user_id,is_blacklist
0,4,3,1,no_completed,2020-01-12,3,0
1,5,3,4,completed,2020-01-12,3,0
2,6,4,1,completed,2020-01-12,4,0


In [25]:
# Attach the receiver's blacklist flag. The user columns are already present
# from the sender lookup, so the suffixes label them as _send / _receive.
email_all = email_all.merge(
    user,
    how='left',
    left_on='receive_id',
    right_on='user_id',
    suffixes=('_send', '_receive'),
)
email_all

Unnamed: 0,id,send_id,receive_id,type,date,user_id_send,is_blacklist_send,user_id_receive,is_blacklist_receive
0,4,3,1,no_completed,2020-01-12,3,0,1,0
1,5,3,4,completed,2020-01-12,3,0,4,0
2,6,4,1,completed,2020-01-12,4,0,1,0


In [36]:
# Keep emails between two normal users: with 0/1 flags, the sender and
# receiver flags add up to 0 only when neither side is blacklisted
combined_flag = email_all['is_blacklist_send'].add(email_all['is_blacklist_receive'])
email_all['is_blacklist'] = combined_flag
email_all = email_all.loc[email_all['is_blacklist'].eq(0)]
email_all

Unnamed: 0,id,send_id,receive_id,type,date,user_id_send,is_blacklist_send,user_id_receive,is_blacklist_receive,is_black_list,is_blacklist
0,4,3,1,no_completed,2020-01-12,3,0,1,0,0,0
1,5,3,4,completed,2020-01-12,3,0,4,0,0,0
2,6,4,1,completed,2020-01-12,4,0,1,0,0,0


In [38]:
# Drop the helper columns left over from the two user lookups
kept_columns = ['id', 'send_id', 'receive_id', 'type', 'date', 'is_blacklist']
email_all = email_all.loc[:, kept_columns]
email_all

Unnamed: 0,id,send_id,receive_id,type,date,is_blacklist
0,4,3,1,no_completed,2020-01-12,0
1,5,3,4,completed,2020-01-12,0
2,6,4,1,completed,2020-01-12,0


In [40]:
# Emails that failed to deliver. The mask is built from email_all itself so it
# always lines up with the rows being filtered; a mask taken from the
# unfiltered `email` frame only works while both frames share the same index.
email_nc = email_all[email_all['type'] == 'no_completed']
email_nc

Unnamed: 0,id,send_id,receive_id,type,date,is_blacklist
0,4,3,1,no_completed,2020-01-12,0


In [52]:
# Number of normal-to-normal emails per day (Series indexed by date)
email_all_date = email_all.groupby('date')['id'].agg('count')
email_all_date.rename('cnt').reset_index()

Unnamed: 0,date,cnt
0,2020-01-12,3


In [53]:
# Number of failed (no_completed) emails per day (Series indexed by date)
email_nc_date = email_nc.groupby('date')['id'].agg('count')
email_nc_date.rename('cnt').reset_index()

Unnamed: 0,date,cnt
0,2020-01-12,1


In [66]:
# Failure rate p per day = failed emails / all emails.
# A left merge keeps every day that has emails; a day with no failed email has
# no match on the right side, so its missing count is filled with 0 and it
# reports p = 0.0 instead of being dropped (which an inner merge would do).
df = pd.merge(email_all_date, email_nc_date, on='date', how='left', suffixes=('_all', '_no_completed')).reset_index()
df['id_no_completed'] = df['id_no_completed'].fillna(0)
df['p'] = (df['id_no_completed'] / df['id_all']).round(3)
# the task asks for the result ordered by date ascending
df[['date', 'p']].sort_values('date')

Unnamed: 0,date,p
0,2020-01-12,0.333
