In [None]:
"""Find the retention rate  of new users on the 2nd day with 3 decimals, order by date ascending"""

In [None]:
"""SQL table creation and data insertion"""
drop table if exists login;

CREATE TABLE `login` (
`id` int(4) NOT NULL,
`user_id` int(4) NOT NULL,
`client_id` int(4) NOT NULL,
`date` date NOT NULL,
PRIMARY KEY (`id`));

INSERT INTO login VALUES
(1,2,1,'2020-10-12'),
(2,3,2,'2020-10-12'),
(3,1,2,'2020-10-12'),
(4,2,2,'2020-10-13'),
(5,1,2,'2020-10-13'),
(6,3,1,'2020-10-14'),
(7,4,1,'2020-10-14'),
(8,4,1,'2020-10-15');

In [None]:
"""SQL solution"""
# 1
SELECT day1.date, ROUND(IF(ISNULL(day2.num_d2/day1.num_d1), 0, day2.num_d2/day1.num_d1), 3)
FROM
(SELECT DISTINCT a.date, IF(ISNULL(d1), 0, d1) AS num_d1
FROM login AS a
LEFT JOIN
(SELECT fl0.fdate0 AS date, COUNT(*) AS d1
FROM
(SELECT user_id, MIN(date)AS fdate0
FROM login
GROUP BY user_id) AS fl0
GROUP BY fl0.fdate0) AS n
ON a.date = n.date) AS day1
LEFT JOIN
(SELECT l.date - 1 AS date, SUM(IF(ISNULL(fl.fdate), 0, 1)) AS num_d2
FROM login AS l
LEFT JOIN
(SELECT user_id, MIN(date)AS fdate
FROM login
GROUP BY user_id) AS fl
ON l.user_id = fl.user_id
AND l.date = DATE_ADD(fl.fdate,INTERVAL 1 DAY)
GROUP BY l.date) AS day2
ON day1.date = day2.date

In [None]:
# Expected
2020-10-12|0.667
2020-10-13|0.000
2020-10-14|1.000
2020-10-15|0.000

In [1]:
"""pandas tadaframe creation"""
import pandas as pd

login = pd.read_csv('data.txt', header=None)
login

Unnamed: 0,0,1,2,3,4
0,(1,2,1,'2020-10-12'),
1,(2,3,2,'2020-10-12'),
2,(3,1,2,'2020-10-12'),
3,(4,2,2,'2020-10-13'),
4,(5,1,2,'2020-10-13'),
5,(6,3,1,'2020-10-14'),
6,(7,4,1,'2020-10-14'),
7,(8,4,1,'2020-10-15');,


In [2]:
# clean the data
login = login.drop(columns=4)

login.columns = ['id', 'user_id', 'client_id', 'date']
login['id'] = login['id'].str[1:]
login['date'] = login['date'].str[1:11]
login

Unnamed: 0,id,user_id,client_id,date
0,1,2,1,2020-10-12
1,2,3,2,2020-10-12
2,3,1,2,2020-10-12
3,4,2,2,2020-10-13
4,5,1,2,2020-10-13
5,6,3,1,2020-10-14
6,7,4,1,2020-10-14
7,8,4,1,2020-10-15


In [4]:
login['date'] = pd.to_datetime(login['date'])

In [12]:
login['day1'] = login.groupby('user_id')['date'].transform(min) 
login['day2'] = login['day1'] + pd.DateOffset(1)
login

Unnamed: 0,id,user_id,client_id,date,day1,day2
0,1,2,1,2020-10-12,2020-10-12,2020-10-13
1,2,3,2,2020-10-12,2020-10-12,2020-10-13
2,3,1,2,2020-10-12,2020-10-12,2020-10-13
3,4,2,2,2020-10-13,2020-10-12,2020-10-13
4,5,1,2,2020-10-13,2020-10-12,2020-10-13
5,6,3,1,2020-10-14,2020-10-12,2020-10-13
6,7,4,1,2020-10-14,2020-10-14,2020-10-15
7,8,4,1,2020-10-15,2020-10-14,2020-10-15


In [15]:
login['new_user'] = login['day1'] == login['date']
login['d2_retention'] = login['day2'] == login['date']
login

Unnamed: 0,id,user_id,client_id,date,day1,day2,d2_retention,new_user
0,1,2,1,2020-10-12,2020-10-12,2020-10-13,False,True
1,2,3,2,2020-10-12,2020-10-12,2020-10-13,False,True
2,3,1,2,2020-10-12,2020-10-12,2020-10-13,False,True
3,4,2,2,2020-10-13,2020-10-12,2020-10-13,True,False
4,5,1,2,2020-10-13,2020-10-12,2020-10-13,True,False
5,6,3,1,2020-10-14,2020-10-12,2020-10-13,False,False
6,7,4,1,2020-10-14,2020-10-14,2020-10-15,False,True
7,8,4,1,2020-10-15,2020-10-14,2020-10-15,True,False


In [27]:
df = login.groupby('date')['new_user','d2_retention'].sum().reset_index()
df

  df = login.groupby('date')['new_user','d2_retention'].sum().reset_index()


Unnamed: 0,date,new_user,d2_retention
0,2020-10-12,3.0,0.0
1,2020-10-13,0.0,2.0
2,2020-10-14,1.0,0.0
3,2020-10-15,0.0,1.0


In [32]:
df['retention rate'] = 0
for i in range(len(df)-1):
    if df.iloc[i, 1]:
        df.iloc[i, 3] = round(df.iloc[i+1, 2]/df.iloc[i, 1], 3)
df

Unnamed: 0,date,new_user,d2_retention,retention rate
0,2020-10-12,3.0,0.0,0.667
1,2020-10-13,0.0,2.0,0.0
2,2020-10-14,1.0,0.0,1.0
3,2020-10-15,0.0,1.0,0.0


In [40]:
df = login.groupby('date')['new_user','d2_retention'].sum().reset_index()
df['d2_retention'] = df['d2_retention'].shift(periods=-1, fill_value=0)
df['retention_rate'] = round(df['d2_retention'] / df['new_user'], 3)
df.loc[df['retention_rate'].isna(), 'retention_rate'] = 0.000
df

  df = login.groupby('date')['new_user','d2_retention'].sum().reset_index()


Unnamed: 0,date,new_user,d2_retention,retention_rate
0,2020-10-12,3.0,2.0,0.667
1,2020-10-13,0.0,0.0,0.0
2,2020-10-14,1.0,1.0,1.0
3,2020-10-15,0.0,0.0,0.0
