In [None]:
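"""Find the number of new users for each date (a user is new on the date of their first login) and order the results by date ascending. Dates with no new users are reported with a count of 0."""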

In [None]:
"""SQL table creation and data insertion"""
# Start from a clean slate so the cell can be executed repeatedly.
DROP TABLE IF EXISTS `login`;

# Login table: a surrogate key plus the user, the client and the date of each row.
CREATE TABLE `login` (
    `id`        int(4) NOT NULL,
    `user_id`   int(4) NOT NULL,
    `client_id` int(4) NOT NULL,
    `date`      date   NOT NULL,
    PRIMARY KEY (`id`)
);

# Sample rows: eight logins by four users between 2020-10-12 and 2020-10-15.
INSERT INTO `login` (`id`, `user_id`, `client_id`, `date`) VALUES
    (1, 2, 1, '2020-10-12'),
    (2, 3, 2, '2020-10-12'),
    (3, 1, 2, '2020-10-12'),
    (4, 2, 2, '2020-10-13'),
    (5, 1, 2, '2020-10-13'),
    (6, 3, 1, '2020-10-14'),
    (7, 4, 1, '2020-10-14'),
    (8, 4, 1, '2020-10-15');

In [None]:
"""SQL solution"""
# 1
# Flag every (date, user) pair with 1 when the date is that user's first login
# date and 0 otherwise, then add the flags up per date. The LEFT JOIN keeps
# dates on which nobody is new, so they are reported with 0.
SELECT a.date, SUM(a.new) AS new
FROM (
    # DISTINCT collapses repeated logins by the same user on the same date
    # (for example from several clients) so that a new user is counted once.
    SELECT DISTINCT
        l.date,
        l.user_id,
        IF(nl.fdate IS NULL, 0, 1) AS new
    FROM login AS l
    LEFT JOIN (
        # First login date of every user.
        SELECT user_id, MIN(date) AS fdate
        FROM login
        GROUP BY user_id
    ) AS nl
        ON l.user_id = nl.user_id
       AND l.date = nl.fdate
) AS a
GROUP BY a.date
ORDER BY a.date;

# 2
# Same join as solution 1, aggregated directly. fl.user_id is NULL unless the
# login falls on the user's first login date, so counting its distinct values
# gives the number of new users per date: 0 when there are none, and 1 per user
# even if that user logged in several times on their first day.
SELECT l.date, COUNT(DISTINCT fl.user_id) AS new
FROM login AS l
LEFT JOIN (
    # First login date of every user.
    SELECT user_id, MIN(date) AS nl
    FROM login
    GROUP BY user_id
) AS fl
    ON l.date = fl.nl
   AND l.user_id = fl.user_id
GROUP BY l.date
ORDER BY l.date;

# 3
# Number each user's logins in date order; the row numbered 1 is the user's
# first login, so summing those rows per date counts the new users.
SELECT a.date, SUM(CASE WHEN a.t_rank = 1 THEN 1 ELSE 0 END) AS new
FROM (
    SELECT date, ROW_NUMBER() OVER (PARTITION BY user_id ORDER BY date) AS t_rank
    FROM login
) AS a
GROUP BY a.date
# GROUP BY alone does not guarantee row order; the task asks for ascending dates.
ORDER BY a.date;

In [46]:
"""pandas DataFrame creation"""
import pandas as pd

# The file has no header row, so the columns get integer labels. As the displayed
# result shows, the parsed cells still carry the SQL tuple punctuation.
raw_path = 'data.txt'
login = pd.read_csv(raw_path, header=None)
login

Unnamed: 0,0,1,2,3,4
0,(1,2,1,'2020-10-12'),
1,(2,3,2,'2020-10-12'),
2,(3,1,2,'2020-10-12'),
3,(4,2,2,'2020-10-13'),
4,(5,1,2,'2020-10-13'),
5,(6,3,1,'2020-10-14'),
6,(7,4,1,'2020-10-14'),
7,(8,4,1,'2020-10-15');,


In [47]:
# clean the data
# Column 4 only holds whatever followed the fourth comma of each raw line; drop it.
login = login.drop(columns=4)

login.columns = ['id', 'user_id', 'client_id', 'date']
# "(1" -> 1: strip the opening parenthesis, then cast so that `id` is an integer,
# as declared in the table definition, instead of staying a string.
login['id'] = login['id'].str[1:].astype(int)
# "'2020-10-12')" -> "2020-10-12": keep the ten characters after the opening quote.
login['date'] = login['date'].str[1:11]
login

Unnamed: 0,id,user_id,client_id,date
0,1,2,1,2020-10-12
1,2,3,2,2020-10-12
2,3,1,2,2020-10-12
3,4,2,2,2020-10-13
4,5,1,2,2020-10-13
5,6,3,1,2020-10-14
6,7,4,1,2020-10-14
7,8,4,1,2020-10-15


In [18]:
# 1,2 find the date of first login for comparison
# Broadcast each user's earliest login date onto all of that user's rows, then keep
# only the rows that fall on that date. The string alias 'min' is used instead of
# the builtin `min`, which pandas has deprecated as a groupby transform argument.
first_login_date = login.groupby('user_id')['date'].transform('min')
login_1st = login[first_login_date == login['date']]
login_1st

Unnamed: 0,id,user_id,client_id,date
0,1,2,1,2020-10-12
1,2,3,2,2020-10-12
2,3,1,2,2020-10-12
6,7,4,1,2020-10-14


In [37]:
# Left-join every login onto the first-login rows: `id_1st` is filled only where the
# login happened on the user's first login date, so it doubles as the new-user marker.
# Both sides are reduced to one row per (user_id, date) beforehand; otherwise a user
# who logs in several times on their first day would be counted more than once.
key_cols = ['user_id', 'date']
kept_cols = ['id', 'user_id', 'date']
df = pd.merge(
    login[kept_cols].drop_duplicates(subset=key_cols),
    login_1st[kept_cols].drop_duplicates(subset=key_cols),
    on=key_cols,
    how='left',
    suffixes=['', '_1st'],
)
# 1 for a first-day login, 0 otherwise.
df['new_user'] = df['id_1st'].notna().astype('int64')
df

Unnamed: 0,id,user_id,date,id_1st,new_user
0,1,2,2020-10-12,1.0,1
1,2,3,2020-10-12,2.0,1
2,3,1,2020-10-12,3.0,1
3,4,2,2020-10-13,,0
4,5,1,2020-10-13,,0
5,6,3,2020-10-14,,0
6,7,4,2020-10-14,7.0,1
7,8,4,2020-10-15,,0


In [38]:
# Total the new-user flags per date and return them as a flat two-column frame.
df.groupby('date')['new_user'].sum().to_frame().reset_index()

Unnamed: 0,date,new_user
0,2020-10-12,3
1,2020-10-13,0
2,2020-10-14,1
3,2020-10-15,0


In [62]:
# 3 rank user logins by date
login['date'] = pd.to_datetime(login['date'])
# method='first' numbers each user's logins 1, 2, 3, ... in date order, like SQL
# ROW_NUMBER(): exactly one row per user gets rank 1. With 'dense', several logins on
# the user's first day would all get rank 1 and the user would be counted repeatedly.
login['#login_by_user'] = login.groupby('user_id')['date'].rank(method='first')
login['new_user'] = login['#login_by_user'] == 1
login

Unnamed: 0,id,user_id,client_id,date,#login_by_user,new_user
0,1,2,1,2020-10-12,1.0,True
1,2,3,2,2020-10-12,1.0,True
2,3,1,2,2020-10-12,1.0,True
3,4,2,2,2020-10-13,2.0,False
4,5,1,2,2020-10-13,2.0,False
5,6,3,1,2020-10-14,2.0,False
6,7,4,1,2020-10-14,1.0,True
7,8,4,1,2020-10-15,2.0,False


In [66]:
# Count the rows flagged as a first login for each date (True counts as 1).
login.groupby('date')['new_user'].sum().reset_index()

Unnamed: 0,date,new_user
0,2020-10-12,3.0
1,2020-10-13,0.0
2,2020-10-14,1.0
3,2020-10-15,0.0
