Itay Koren

In [1]:
pip install psycopg2 -U

Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
from sqlalchemy import create_engine
from sqlalchemy.engine import reflection
import psycopg2

### Access to the DB <a class="anchor" id="chapter2"></a>

In [3]:
db_config = {'user': 'practicum_student',         # username
             'pwd': 's65BlTKV3faNIGhmvJVzOqhs', # password
             'host': 'rc1b-wcoijxj3yxfsf3fs.mdb.yandexcloud.net',
             'port': 6432,              # connection port
             'db': 'data-analyst-eth-payouts-db'}          # the name of the database

connection_string = 'postgresql://{}:{}@{}:{}/{}'.format(db_config['user'],
                                                                     db_config['pwd'],
                                                                       db_config['host'],
                                                                       db_config['port'],
                                                                       db_config['db'])

engine = create_engine(connection_string, connect_args={'sslmode':'require'})
inspector = reflection.Inspector.from_engine(engine)


  inspector = reflection.Inspector.from_engine(engine)


payout table:

        user_id: user's id

        eth_address: the eth address that the user used to get the payout in Ethernium. (users can have multiple addresses)

        date: date of payout to user

        payout: amount that was paid to user

plan table:

        user_id: user's id

        "OS": Operating system of user

        "Plan": user's plan on the site

#  Table Queries <a class="anchor" id="chapter3"></a>

function that takes a query and return dataframe for general use

In [4]:
def queryResult(q):
    return pd.io.sql.read_sql(q, con = engine)
    

In [5]:
inspector.get_table_names()

['payout', 'plan']

1. How many users got paid?

user_id
payout


In [6]:
query = '''
SELECT  count(DISTINCT user_id)
FROM payout
WHERE
    payout > 0
'''
print(queryResult(query))


   count
0    433


2. Show the 5 users with the highest payouts


In [7]:
query = '''
SELECT  *
FROM payout


'''
print(queryResult(query))

     user_id                                 eth_address       date   payout
0       1005  0x4f9117b14426ac44ead4eaef0223830cb16bdb07 2020-07-27  1.68113
1       1005  0xccbe3f17a61f0e8ef05b04dc0c3f510522df6cc8 2020-09-04  2.27831
2       1020  0x2e22950c26899fd0dc0c593b147f6beb7f1c0c37 2020-11-11  0.25199
3       1020  0x2081974d67307ec737dd13c60b524ca4d644ec1d 2020-09-30  2.21076
4       1021  0x370ff36440a36825279db341c6b85bc0371e7c3f 2020-12-10  1.47999
..       ...                                         ...        ...      ...
563     4848  0x72c65e71cc7a2dc71096b64f7a42f8b3192539ab 2020-10-15  2.02452
564     4848  0xd8444ab60e7078d163ecc3ff65fbe01e8770e72d 2020-07-15  3.11877
565     4862  0xbdacd0eca1bfa3b2aabe9687bd3b65969227af34 2020-07-22  1.83956
566     4862  0x446447a208c6e2809f605906a4683e61335e8024 2020-08-13  1.44528
567     4862  0x72b246f6576a320cbcfd9ffe0a6f37f23219ba2b 2020-10-11  1.03908

[568 rows x 4 columns]


In [8]:
query = '''
SELECT  DISTINCT (user_id),
    SUM(payout)
FROM payout
GROUP BY
    user_id
ORDER BY
    SUM(payout) DESC
LIMIT 5

'''
print(queryResult(query))

   user_id       sum
0     1537  10.74169
1     3051   9.73121
2     1512   8.80816
3     1127   8.78861
4     4848   8.42445


3. Show the 5 users with the lowest payouts


In [9]:
query = '''
SELECT  DISTINCT (user_id),
    SUM(payout)
FROM payout
GROUP BY
    user_id
ORDER BY
    SUM(payout) ASC
LIMIT 5

'''
print(queryResult(query))

   user_id      sum
0     2003  0.00775
1     3410  0.00818
2     2813  0.02914
3     2462  0.03381
4     4467  0.05056



4. How much ether was paid out in November 2020?


In [10]:
query = '''
SELECT
    SUM(payout)
FROM
    payout
WHERE
    date::date BETWEEN '2020-11-01' AND '2020-11-30'
'''

queryResult(query)

Unnamed: 0,sum
0,166.0118



5. Which plan is the most popular?


In [14]:
query = '''
SELECT
    plan."Plan",
    COUNT(plan."Plan")
FROM
    payout
    LEFT JOIN plan ON plan.user_id = payout.user_id
GROUP BY 
    plan."Plan"
ORDER BY
    plan."Plan"
'''

queryResult(query)

Unnamed: 0,Plan,count
0,Free,300
1,Premium,268



6. Which plan is the most popular amongst Linux users?


In [17]:
query = '''
SELECT
    plan."Plan",
    COUNT(plan."Plan")
FROM
    payout
    LEFT JOIN plan ON plan.user_id = payout.user_id
WHERE
    plan."OS" = 'Linux'
GROUP BY 
    plan."Plan"
ORDER BY
    count DESC
'''

queryResult(query)

Unnamed: 0,Plan,count
0,Premium,102
1,Free,99



7. What is the percentage of payout between the different plans?


In [38]:
query = '''
SELECT
    DISTINCT plan."Plan",
     SUM(payout) OVER(PARTITION BY plan."Plan") / SUM(payout) OVER()  as percentage_payout

FROM
    payout
    LEFT JOIN plan ON plan.user_id = payout.user_id
'''

queryResult(query)

Unnamed: 0,Plan,percentage_payout
0,Free,0.53972
1,Premium,0.46028



8. Users of which operating system earned more in payouts?


In [39]:
query = '''
SELECT
    plan."OS",
    SUM(payout.payout)
FROM
    payout
    LEFT JOIN plan ON plan.user_id = payout.user_id
GROUP BY 
    plan."OS"
'''

queryResult(query)

Unnamed: 0,OS,sum
0,Linux,322.00353
1,Windows,316.03701
2,MAC,267.41123



9. What is the average payout amount per user for each of the OS in July 2020?

In [46]:

query = '''
SELECT
    plan."OS",
    SUM(payout.payout) / COUNT(payout.user_id) as avg_payout_per_user
FROM
    payout
    LEFT JOIN plan ON plan.user_id = payout.user_id
WHERE
    payout.date::date BETWEEN '2020-07-01' AND '2020-07-31'
GROUP BY 
    plan."OS"
'''

queryResult(query)

Unnamed: 0,OS,avg_payout_per_user
0,Linux,1.931114
1,MAC,1.640537
2,Windows,1.5906



10. What is the daily share of ether earned by users from Linux that are in the free plan in this data?

In [65]:
query = '''
SELECT
    DISTINCT payout.date :: date,
    payout.user_id,
    payout.payout * 100 / SUM(payout.payout) OVER (PARTITION BY payout.date :: date) as ratio
FROM
    payout LEFT JOIN plan ON payout.user_id = plan.user_id
WHERE
    plan."OS" LIKE '%Linux%' and  plan."Plan" LIKE '%Free%'
ORDER BY
    payout.date :: date
'''
queryResult(query)

ProgrammingError: (psycopg2.errors.SyntaxError) syntax error at or near ")"
LINE 5:     EXTRACT(DAY) OVER(PARTITION BY date) as daily_share
                       ^

[SQL: 
SELECT
    user_id,
    payout,
    EXTRACT(DAY) OVER(PARTITION BY date) as daily_share
FROM
    payout
    INNER 
]
(Background on this error at: http://sqlalche.me/e/14/f405)