### Used Libraries<a class="anchor" id="chapter1"></a>

In [1]:
import pandas as pd
from sqlalchemy import create_engine
import psycopg2

### Access to the DB <a class="anchor" id="chapter2"></a>

In [2]:
db_config = {'user': 'practicum_student',         # username
             'pwd': 's65BlTKV3faNIGhmvJVzOqhs', # password
             'host': 'rc1b-wcoijxj3yxfsf3fs.mdb.yandexcloud.net',
             'port': 6432,              # connection port
             'db': "data-analyst-eth-payouts-db"}          # the name of the database

connection_string = 'postgresql://{}:{}@{}:{}/{}'.format(db_config['user'],
                                                                     db_config['pwd'],
                                                                       db_config['host'],
                                                                       db_config['port'],
                                                                       db_config['db'])

engine = create_engine(connection_string, connect_args={'sslmode':'require'})


#  Table Queries <a class="anchor" id="chapter3"></a>

function that takes a query and return dataframe for general use

In [3]:
def queryResult(q):
    return pd.io.sql.read_sql(q, con = engine)

In [4]:
# from sqlalchemy import inspect
# inspector = inspect(engine)

# inspector.get_table_names()
engine.table_names()

  engine.table_names()


['payout', 'plan']

## payout table:

**user_id:** user's id

**eth_address:** the eth address that the user used to get the payout in Ethernium. (users can have multiple addresses)

**date:** date of payout to user

**payout:** amount that was paid to user

## plan table:

**user_id:** user's id

**"OS":** Operating system of user

**"Plan":** user's plan on the site

In [5]:
q = '''SELECT *
FROM   payout  '''
df_payout = queryResult(q)
df_payout

Unnamed: 0,user_id,eth_address,date,payout
0,1005,0x4f9117b14426ac44ead4eaef0223830cb16bdb07,2020-07-27,1.68113
1,1005,0xccbe3f17a61f0e8ef05b04dc0c3f510522df6cc8,2020-09-04,2.27831
2,1020,0x2e22950c26899fd0dc0c593b147f6beb7f1c0c37,2020-11-11,0.25199
3,1020,0x2081974d67307ec737dd13c60b524ca4d644ec1d,2020-09-30,2.21076
4,1021,0x370ff36440a36825279db341c6b85bc0371e7c3f,2020-12-10,1.47999
...,...,...,...,...
563,4848,0x72c65e71cc7a2dc71096b64f7a42f8b3192539ab,2020-10-15,2.02452
564,4848,0xd8444ab60e7078d163ecc3ff65fbe01e8770e72d,2020-07-15,3.11877
565,4862,0xbdacd0eca1bfa3b2aabe9687bd3b65969227af34,2020-07-22,1.83956
566,4862,0x446447a208c6e2809f605906a4683e61335e8024,2020-08-13,1.44528


In [6]:
q = '''SELECT *
FROM   plan '''
df_plan = queryResult(q)
df_plan

Unnamed: 0,user_id,OS,Plan
0,1005,Windows,Free
1,1020,MAC,Free
2,1021,Linux,Premium
3,1041,Linux,Free
4,1048,Windows,Free
...,...,...,...
428,4762,MAC,Premium
429,4765,Windows,Free
430,4798,MAC,Premium
431,4848,Linux,Premium


1. How many users got paid?

In [7]:
q = "select count(distinct user_id) as paid_users from payout"
queryResult(q)

Unnamed: 0,paid_users
0,433


<div class="alert alert-success" role="alert">
Great!</div>

 2. Show the 5 users with the highest payouts

In [8]:
q='''SELECT user_id,
       Sum(payout) AS total_payout
FROM   payout
GROUP  BY user_id
ORDER  BY Sum(payout) DESC
LIMIT  5  '''
queryResult(q)

Unnamed: 0,user_id,total_payout
0,1537,10.74169
1,3051,9.73121
2,1512,8.80816
3,1127,8.78861
4,4848,8.42445


<div class="alert alert-success" role="alert">
Great!</div>

3. Show the 5 users with the lowest payouts

In [9]:
q='''SELECT user_id,
       Sum(payout) AS total_payout
FROM   payout
GROUP  BY user_id
ORDER  BY Sum(payout)
LIMIT  5  '''
queryResult(q)

Unnamed: 0,user_id,total_payout
0,2003,0.00775
1,3410,0.00818
2,2813,0.02914
3,2462,0.03381
4,4467,0.05056


<div class="alert alert-success" role="alert">
Great!</div>

4. How much ether was paid out in November 2020?

In [10]:
# one option:
# q='''SELECT Extract(month FROM date) AS month,
#        Extract(year FROM date)  AS year,
#        Sum(payout)
# FROM   payout
# GROUP  BY Extract(month FROM date),
#           Extract(year FROM date)
# HAVING Extract(month FROM date) = 11
#        AND Extract(year FROM date) = 2020  '''

# second:
q='''
SELECT Sum(payout) AS total_nov_2020
FROM   (SELECT *
        FROM   payout
        WHERE  Extract(month FROM date) = 11
               AND Extract(year FROM date) = 2020) AS x  
     
'''
queryResult(q)

Unnamed: 0,total_nov_2020
0,166.0118


<div class="alert alert-success" role="alert">
Great!</div>

5. Which plan is the most popular?

In [11]:
q = '''
SELECT Count(user_id) AS users_number,
       plan."Plan"
FROM   PLAN
GROUP  BY plan."Plan"  
'''
queryResult(q)

Unnamed: 0,users_number,Plan
0,220,Free
1,213,Premium


<div class="alert alert-success" role="alert">
Great!</div>

Which plan is the most popular amongst Linux users?

In [12]:
q='''SELECT Count(user_id) AS users_number,
       plan."Plan"
FROM   PLAN
WHERE  plan."OS" = 'Linux'
GROUP  BY plan."Plan"  
'''
queryResult(q)

Unnamed: 0,users_number,Plan
0,68,Free
1,76,Premium


<div class="alert alert-success" role="alert">
Great!</div>

7. What is the percentage of payout between the different plans?

In [14]:
q='''
SELECT DISTINCT "Plan",
                Sum(payout) OVER(partition BY "Plan") AS payout_plan,
                Sum(payout) OVER() AS total_payout,
                Round(Sum(payout) OVER(partition BY "Plan") / Sum(payout) OVER() * 100, 2) AS percentage
FROM payout
       INNER JOIN plan
               ON payout.user_id = plan.user_id  
'''
queryResult(q)

Unnamed: 0,Plan,payout_plan,total_payout,percentage
0,Premium,416.76178,905.45177,46.03
1,Free,488.68999,905.45177,53.97


<div class="alert alert-success" role="alert">
Great! - but without a round will be better solution in general for percentage as we want more accuarcy</div>

* corrected

8. Users of which operating system earned more in payouts?

In [15]:
q='''
SELECT Sum(payout) AS total_payout,
       "OS"
FROM   payout
       INNER JOIN PLAN
               ON payout.user_id = PLAN.user_id
GROUP  BY "OS"
ORDER  BY Sum(payout) DESC  
'''
queryResult(q)

Unnamed: 0,total_payout,OS
0,322.00353,Linux
1,316.03701,Windows
2,267.41123,MAC


<div class="alert alert-success" role="alert">
Great! - might be good to show the rest here to know by how much</div>

* corrected

9. What is the average payout amount per user for each of the OS in July 2020?

In [16]:
q='''
SELECT AVG(payout) AS avg_payout,
       "OS"
FROM   payout
       INNER JOIN plan
               ON payout.user_id = plan.user_id
WHERE extract(month from date)=7 and extract(year from date)=2020
GROUP  BY "OS"
'''
queryResult(q)

Unnamed: 0,avg_payout,OS
0,1.931114,Linux
1,1.640537,MAC
2,1.5906,Windows


<div class="alert alert-success" role="alert">
Great!</div>

10. What is the daily share of ether earned by users from Linux that are in the free plan in this data?

In [17]:
q='''
select distinct x.user_id,x.date, x.user_sum,y.total_date,
round(x.user_sum/y.total_date,2) as daily_share 
from
(select payout.user_id,date,
sum(payout) over (partition by payout.user_id,date) as user_sum
from payout 
inner join plan
on plan.user_id= payout.user_id
where "Plan"='Free' and "OS"='Linux') as x
inner join 
(select date,
sum(payout) over(partition by date) as total_date
from payout 
inner join plan
on plan.user_id= payout.user_id
) as y 
on x.date=y.date
order by x.date

'''

queryResult(q)

Unnamed: 0,user_id,date,user_sum,total_date,daily_share
0,3415,2020-07-07,2.50566,6.94641,0.36
1,3146,2020-07-08,1.39229,4.45935,0.31
2,2154,2020-07-11,1.72359,4.72946,0.36
3,4235,2020-07-13,3.13763,7.24881,0.43
4,3542,2020-07-14,0.67543,5.77610,0.12
...,...,...,...,...,...
94,2935,2021-01-02,0.30163,3.23708,0.09
95,1330,2021-01-03,1.51269,5.85108,0.26
96,2128,2021-01-04,0.68610,7.50206,0.09
97,4131,2021-01-05,2.36296,9.85304,0.24


<div class="alert alert-success" role="alert">
Great!  
</div>