# Introduction to sqlite and SQL  

In [None]:
import re
import sqlite3

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pandas.io import sql

%matplotlib inline

### Setup

In [None]:
# Load the users data set: when users of a fictional company signed up,
# plus other metadata about them.
users = pd.read_csv('users.csv', low_memory=False)
# Preview the first rows to see which columns are available.
users.head()

In [None]:
# Open (or create) the SQLite database file that every query below runs against.
# PARSE_DECLTYPES asks sqlite3 to convert values according to the declared column types.
conn = sqlite3.connect('lesson-16.db', detect_types=sqlite3.PARSE_DECLTYPES)

In [None]:
# Write the users DataFrame into a `users` table in the SQLite database.
# if_exists='replace' overwrites an existing table, so this cell can be re-run;
# index=False keeps the DataFrame index out of the table.
users.to_sql('users',
            con=conn,
            if_exists='replace',
            index=False)

### Count, Sum, Average

In [None]:
# Our first query with sqlite from Python!  What does * mean?
# Who knows the "Hello World" of SQL?
# Exercise: replace the WRITE QUERY HERE placeholder with your query.
# The placeholder is not valid SQL, so this cell fails until you do.
a=sql.read_sql('''

WRITE QUERY HERE

''', con=conn)
a
# Question - given a new data set in SQL, how would you start getting to know it?
# What is the SQL equivalent of df.describe?

In [None]:
# See what tables currently exist in your database.
# sqlite_master is SQLite's built-in schema catalog. 'table' is a string literal,
# so it takes single quotes: in SQL, double quotes denote identifiers, and SQLite
# only accepts a double-quoted string as a legacy fallback.
a=sql.read_sql('''
SELECT name FROM sqlite_master WHERE type = 'table';
''', con=conn)
a

In [None]:
# Examples of the count, sum and avg aggregate functions.
# count(*) counts rows; count(distinct(...)) counts unique values.
# If mailinglist holds 0/1 flags, its sum is the number of 1s and its avg the share of 1s.
a=sql.read_sql('''
select  
    count(*)  as num_of_rows,
    count(distinct(customerid))  as distinct_customers,
    sum(mailinglist) as num_mailing,
    avg(mailinglist) as mailing_probability
from 
    users ;
''', con=conn)
a

How many distinct users (parentid) invited a customer?

### Min, Max

In [None]:
# Use min, max and date() to compute the date range covered by the sign-ups.
# date(x, 'unixepoch') interprets x as seconds since 1970-01-01 and returns a YYYY-MM-DD string.
# Date functionality is very useful.
a=sql.read_sql('''
select  
    date(min(datecreated), 'unixepoch') as start_date, 
    date(max(datecreated), 'unixepoch') as end_date
from 
    users ;
''', con=conn)
a

### Case

In [None]:
# Example of using CASE: each sum adds 1 for every row that matches its condition,
# giving the number of rows where mailinglist is 1, is 0, or is any other value.
# Rows where mailinglist is NULL match none of the three conditions.
a=sql.read_sql('''
select  
    sum(case when mailinglist = 1 then 1 else 0 end) as mailing_true,
    sum(case when mailinglist = 0 then 1 else 0 end) as mailing_false,
    sum(case when mailinglist not in (0, 1) then 1 else 0 end) as mailing_other
from 
    users ;
''', con=conn)
a

Use `CASE` to measure how many customers were invited by someone.

### Group By

In [None]:
# Show signups per day.
# "group by 1" groups by the first selected column (the sign-up day);
# "order by 1" makes the chronological order explicit instead of relying on
# whatever order the grouping happens to return.
a=sql.read_sql('''
select 
    date(datecreated, 'unixepoch') as day_created, 
    count(*) as new_users 
from 
    users 
group by 1
order by 1
''', con=conn)
# Plot against the sign-up day rather than the DataFrame's row number.
a.plot(x='day_created', y='new_users');

How many users came through each source?

In [None]:
# What are the most popular sources that users came from?
# "group by 1" / "order by 2 desc" refer to the selected columns by position:
# group by source, then sort by the user count, largest first.
a=sql.read_sql('''
select 
    source, 
    count(*) as new_users 
from 
    users 
group by 1 
order by 2 desc;
''', con=conn)
a

### Modulo

In [None]:
# Bucketing users.  Good for experiments!
# The remainder of customerid divided by 4 assigns every customer to a bucket;
# the query counts how many users land in each one.
a=sql.read_sql('''
select 
    customerid % 4, 
    count(*) as new_users 
from 
    users 
group by 1
''', con=conn)
a

# Growth Analytics with Master Fact 

In [None]:
# Load data about these users' subsequent activity.
activity = pd.read_csv('activity.csv', low_memory=False)
# Preview the first rows to see which columns are available.
activity.head()

In [None]:
# Write the activity DataFrame into a `master_fact` table in the SQLite database.
# if_exists='replace' overwrites an existing table, so this cell can be re-run;
# index=False keeps the DataFrame index out of the table.
activity.to_sql('master_fact',
            con=conn,
            if_exists='replace',
            index=False)

### Subquery

In [None]:
# A brief detour
# Example of a subquery:
# the inner query (aliased q) counts 'login' events per customer;
# the outer query counts how many customers share each visit count,
# listing the most common visit counts first.
a=sql.read_sql('''
select
visits,
count(distinct(customerid)) as customers
from
(
select
    customerid,
    count(*) as visits
from
    master_fact
where
    eventtype = 'login'
group by 1 
) q
group by 1 
order by 2 desc;
    ''', con=conn)
a.head(5)
               

### Join Example

In [None]:
# Exercise: write a join that returns the users who were active on '2014-01-01' and were also active on '2014-01-07'.

### Measuring a Growth Susceptibility: Retention Rate

In [None]:
# This query shows, for each monthly signup cohort, the fraction of customers
# that were active in the second week (days 7 through 14) after they signed up.
#  - subquery a: cohort size per signup month.
#  - subquery b: distinct customers per signup month with an event 7-14 days
#    after signup. The where clause on master_fact discards unmatched rows,
#    so a plain (inner) join states what actually happens.
#  - the outer left join keeps every cohort; coalesce turns "no returning
#    users" into a rate of 0 instead of NULL.
a=sql.read_sql('''
select 
    a.month_created,
    cast(coalesce(b.returning_users, 0) as real)/a.new_users as retention_rate
from
(
select
    substr(date(datecreated, 'unixepoch'), 1,7) as month_created,
    count(distinct(customerid)) as new_users
from
    users 
group by 1
) a
left join
(
select
    substr(date(a.datecreated, 'unixepoch'), 1,7) as month_created,
    count(distinct(b.customerid)) as returning_users
from
    users a
join
    master_fact b
on
    a.customerid = b.customerid
where
    b.timestamp between a.datecreated + 7*86400 and a.datecreated + 14*86400
group by 1
) b
on a.month_created = b.month_created
order by 1
    ''', con=conn)
a.head(10)

In [None]:
# This query looks at the retention rate for a selected cohort (signups in 2012-06).
#  - first subquery: for each age, measured in whole 30-day periods between signup
#    and the activity timestamp, the number of distinct cohort customers with activity.
#  - second subquery: the size of the cohort (a single row).
#  - the comma between the subqueries is a cross join, so every age row is
#    divided by the same initial cohort size.
a=sql.read_sql('''
select 
    month as age_in_months,
    cast(returning_users as real)/initial_cohort as retention_rate
from
(select
   cast((b.timestamp - a.datecreated)/(86400*30) as int) as month,
    count(distinct(b.customerid)) as returning_users
from
    users a
left join
    master_fact b
on
    a.customerid = b.customerid
where
    substr(date(a.datecreated, 'unixepoch'), 1,7) = '2012-06' and
    cast((b.timestamp - a.datecreated)/(86400*30) as int) >= 0
group by 1) a,
(select
    count(distinct(customerid)) as initial_cohort
from
    users a
where
    substr(date(a.datecreated, 'unixepoch'), 1,7) = '2012-06' 
)
    ''', con=conn)
a.head(5)

In [None]:
#Generalizing the calculation of a retention curve
def constructRetentionCurve(month):
    """Build the SQL text of the retention-curve query for one signup cohort.

    The query returns one row per cohort age, where age is the number of whole
    30-day periods between a customer's signup and an activity timestamp:

    * ``month`` - the cohort age in 30-day periods (0, 1, 2, ...)
    * ``retention_rate`` - distinct cohort customers active at that age,
      divided by the number of distinct customers in the cohort

    Parameters
    ----------
    month : str
        Signup month of the cohort, formatted ``'YYYY-MM'`` (e.g. ``'2012-06'``).

    Returns
    -------
    str
        SQL text to pass to ``sql.read_sql``.

    Raises
    ------
    ValueError
        If ``month`` is not a ``'YYYY-MM'`` string. The value is spliced
        directly into the SQL text, so anything else is rejected rather than
        allowed to alter the query.
    """
    # Strict ASCII check: four digits, a dash, and a month between 01 and 12.
    if not isinstance(month, str) or re.fullmatch(r'[0-9]{4}-(0[1-9]|1[0-2])', month) is None:
        raise ValueError("month must be a 'YYYY-MM' string, got %r" % (month,))
    string = '''
    select 
        month,
        cast(returning_users as real)/initial_cohort as retention_rate
    from
    (select
        cast((b.timestamp - a.datecreated)/(86400*30) as int) as month,
        count(distinct(b.customerid)) as returning_users
    from
        users a
    left join
        master_fact b
    on
        a.customerid = b.customerid
    where
        substr(date(a.datecreated, 'unixepoch'), 1,7) = '%s' and
        cast((b.timestamp - a.datecreated)/(86400*30) as int) >= 0
    group by 1) a,
    (select
        count(distinct(customerid)) as initial_cohort
    from
        users a
    where
        substr(date(a.datecreated, 'unixepoch'), 1,7) = '%s'  
    )
    ''' % (month, month)
    return string

In [None]:
# Retention curves for two cohorts: a = signups in 2012-10, b = signups in 2012-06.
a=sql.read_sql(constructRetentionCurve('2012-10'), con=conn)
b=sql.read_sql(constructRetentionCurve('2012-06'), con=conn)
# Exercise: how do we join the two retention curves together?

In [None]:
# A function to simulate linear susceptibility.
def linearG(t, b, m):
    """Evaluate a linearly decaying susceptibility at age ``t``.

    * negative ages give 0
    * age 0 gives the fixed value 0.75
    * any other age gives ``b - m*t`` rounded to 3 decimals, floored at 0

    Parameters
    ----------
    t : number
        Age at which to evaluate the curve.
    b : number
        Intercept of the line.
    m : number
        Decay per unit of age.
    """
    if t < 0:
        return 0
    if t == 0:
        return 0.75
    level = b - m*t
    return round(level, 3) if level > 0 else 0

In [None]:
# Overlay the retention curves of the two cohorts held in `a` and `b`.
# The 'seaborn-whitegrid' style was renamed 'seaborn-v0_8-whitegrid' in
# matplotlib 3.6 and the old name was later removed, so use whichever name this
# installation provides. The style is applied before the figure is created so
# that it affects the whole figure.
grid_style = 'seaborn-v0_8-whitegrid'
if grid_style not in plt.style.available:
    grid_style = 'seaborn-whitegrid'
plt.style.use(grid_style)
plt.figure(figsize=(8,5))
plt.rc('lines', linewidth=3)
# The Series can be plotted directly; no need to copy them into lists first.
plt.plot(a['month'], a['retention_rate'])
plt.plot(b['month'], b['retention_rate'])
#Compare these measurements to a functional form.
#plt.plot([item for item in range(30)] ,[linearG(item, 0.15, 0.002) for item in range(30)]) 
plt.xlabel('Age (ty - tx)')
plt.ylabel('P(y)')
plt.ylim((0,1))
plt.show()

### Use susceptibility to do projections

In [None]:
def computeYLin(cohorts, daysOut, b, m, susceptibility=None):
    """Project active users per period from cohort sizes and a susceptibility curve.

    For period ``T`` the projected actives are
    ``sum(cohorts[c] * susceptibility(T - c, b, m))`` over every cohort ``c``
    acquired on or before ``T``; that is, each cohort contributes according to
    its age ``T - c``. Each term is rounded to 3 decimals before summing.

    Parameters
    ----------
    cohorts : sequence of numbers
        New users acquired in period 0, 1, 2, ... (chronological order).
    daysOut : int
        Number of extra periods to project after the last acquired cohort.
    b, m : numbers
        Intercept and decay passed through to the susceptibility function.
    susceptibility : callable, optional
        ``f(age, b, m)`` giving the fraction of a cohort active at ``age``.
        Defaults to ``linearG``.

    Returns
    -------
    list
        ``len(cohorts) + daysOut`` projected active-user values.
    """
    if susceptibility is None:
        susceptibility = linearG
    currentDay = len(cohorts)
    Y = []
    # Periods while cohorts are still being acquired. Ages run from 0 (the
    # cohort acquired in period t) up to t (cohort 0). range(t + 1) is needed
    # so the oldest cohort is included; range(t) silently dropped cohort 0 and
    # made Y[0] always 0.
    for t in range(currentDay):
        Y.append(sum(
            round(cohorts[t - age] * susceptibility(age, b, m), 3)
            for age in range(t + 1)
        ))
    # Projected periods after acquisition stops. Projected period t is absolute
    # period currentDay + t, so cohort currentDay-1-i has age t + i + 1
    # (previously t + i - 1, which gave the newest cohort an age of -1).
    for t in range(daysOut):
        Y.append(sum(
            round(cohorts[currentDay - 1 - i] * susceptibility(t + i + 1, b, m), 3)
            for i in range(currentDay)
        ))
    return Y

In [None]:
# Show signups per month (the first 7 characters of the date are YYYY-MM).
# "order by 1" guarantees chronological order, which the projection further
# down relies on when it treats the rows as consecutive cohorts.
new_users=sql.read_sql('''
select 
    substr(date(datecreated, 'unixepoch'), 1,7) as month_created, 
    count(*) as new_users 
from 
    users 
group by 1
order by 1
''', con=conn)
# The 'seaborn-whitegrid' style was renamed 'seaborn-v0_8-whitegrid' in
# matplotlib 3.6 and the old name was later removed; use whichever exists.
grid_style = 'seaborn-v0_8-whitegrid'
if grid_style not in plt.style.available:
    grid_style = 'seaborn-whitegrid'
plt.style.use(grid_style)
plt.figure(figsize=(6,4))
plt.xlabel('time (months)')
plt.ylabel('New Users')
plt.plot(new_users['new_users']);

In [None]:
# Show active users per month: distinct customers with any activity in each
# calendar month (the first 7 characters of the date are YYYY-MM).
# "order by 1" guarantees chronological order for the plot.
# NOTE(review): the y-axis label says DAU, but the series is monthly - confirm the intended label.
dau=sql.read_sql('''
select 
    substr(date(timestamp, 'unixepoch'), 1,7) as month_active, 
    count(distinct(customerid)) as active_users 
from 
    master_fact
group by 1
order by 1
''', con=conn)
# The 'seaborn-whitegrid' style was renamed 'seaborn-v0_8-whitegrid' in
# matplotlib 3.6 and the old name was later removed; use whichever exists.
grid_style = 'seaborn-v0_8-whitegrid'
if grid_style not in plt.style.available:
    grid_style = 'seaborn-whitegrid'
plt.style.use(grid_style)
plt.figure(figsize=(6,4))
plt.xlabel('time (months)')
plt.ylabel('DAU')
plt.plot(dau['active_users']);


In [None]:
# Compare measured active users with the projection built from the monthly
# cohort sizes and the linear susceptibility curve.
cohorts = new_users['new_users'].tolist()
results=pd.DataFrame(computeYLin(cohorts, 1, 0.15, 0.002))
#Project out further
#results=pd.DataFrame(computeYLin(cohorts, 1, 0.15, 0.002))
# The 'seaborn-whitegrid' style was renamed 'seaborn-v0_8-whitegrid' in
# matplotlib 3.6 and the old name was later removed; use whichever exists.
grid_style = 'seaborn-v0_8-whitegrid'
if grid_style not in plt.style.available:
    grid_style = 'seaborn-whitegrid'
plt.style.use(grid_style)
plt.figure(figsize=(6,4))
plt.xlabel('time (months)')
plt.ylabel('DAU')
plt.plot(dau['active_users'])
plt.plot(results)
#What if we halve the rate of decay and increase the initial intercept?
#results2=pd.DataFrame(computeYLin(cohorts, 1, 0.2, 0.001))
#plt.plot(results2)

plt.show()

# Building a Churn Prediction Model

#### Write a query that uses the number of logins a customer performs in their first 30 days (as well as other metadata such as source and mailinglist) to predict whether that customer will be active in their fourth month (days 90 through 120).