In [None]:
"""
Find out the retention rate of new users,
i.e., the probability that a user log in again on the 2nd day of his/her first login
keep 3 decimals
"""

In [None]:
"""SQL table creation and data insertion"""
drop table if exists login;

CREATE TABLE `login` (
`id` int(4) NOT NULL,
`user_id` int(4) NOT NULL,
`client_id` int(4) NOT NULL,
`date` date NOT NULL,
PRIMARY KEY (`id`));

INSERT INTO login VALUES
(1,2,1,'2020-10-12'),
(2,3,2,'2020-10-12'),
(3,2,2,'2020-10-13'),
(4,3,2,'2020-10-14');

In [None]:
"""SQL solution"""
SELECT ROUND((SELECT COUNT(*) AS num_d2
FROM login AS l2,
(SELECT user_id, DATE_ADD(MIN(date), INTERVAL 1 DAY) AS second_log
FROM login 
GROUP BY user_id) AS sl
WHERE l2.date = sl.second_log
AND l2.user_id = sl.user_id) / (SELECT COUNT(DISTINCT user_id) FROM login), 3)

In [6]:
"""pandas dataframe creation"""
import pandas as pd

login = pd.DataFrame([[1,2,1,'2020-10-12'],
                      [2,3,2,'2020-10-12'],
                      [3,2,2,'2020-10-13'],
                      [4,3,2,'2020-10-14']],
                     columns=['id', 'user_id', 'client_id', 'date'])
# convert string to datetime for the following processing
login['date'] = pd.to_datetime(login['date'])
login.head()

Unnamed: 0,id,user_id,client_id,date
0,1,2,1,2020-10-12
1,2,3,2,2020-10-12
2,3,2,2,2020-10-13
3,4,3,2,2020-10-14


In [15]:
# for each new user, find the date of the second day following their first login
login_1st = login[login.groupby('user_id')['date'].transform(min) == login['date']]
login_1st['day2_login'] = login_1st['date'] + pd.DateOffset(1)
login_1st

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  login_1st['day2_login'] = login_1st['date'] + pd.DateOffset(1)


Unnamed: 0,id,user_id,client_id,date,day2_login
0,1,2,1,2020-10-12,2020-10-13
1,2,3,2,2020-10-12,2020-10-13


In [18]:
# check if each user logged in on the 2nd day following their first login
# NaT if no login
df = pd.merge(login[['user_id', 'date']], login_1st[['user_id','day2_login']], left_on=['user_id', 'date'], right_on=['user_id', 'day2_login'], how='right')
df

Unnamed: 0,user_id,date,day2_login
0,2,2020-10-13,2020-10-13
1,3,NaT,2020-10-13


In [27]:
round((len(df) - sum(df['date'].isna()))/len(df), 3)

0.5