#### Importing libraries

In [1]:
!pip install psycopg2 



In [2]:
import pandas as pd
from sqlalchemy import create_engine
import psycopg2

#### Access to the DB

In [3]:
db_config = {'user': 'practicum_student',         # username
             'pwd': 's65BlTKV3faNIGhmvJVzOqhs', # password
             'host': 'rc1b-wcoijxj3yxfsf3fs.mdb.yandexcloud.net',
             'port': 6432,              # connection port
             'db': 'data-analyst-eth-payouts-db'}          # the name of the database

connection_string = 'postgresql://{}:{}@{}:{}/{}'.format(db_config['user'],
                                                                     db_config['pwd'],
                                                                       db_config['host'],
                                                                       db_config['port'],
                                                                       db_config['db'])

engine = create_engine(connection_string, connect_args={'sslmode':'require'})

### Tables 

We want to see how our tables looks. Let's make a function that takes a query and return a dataframe.

In [4]:
def printTable(q):
    return pd.io.sql.read_sql(q, con = engine)

**payout table**

In [5]:
q = '''
    SELECT
        *
    FROM
        payout
'''
data_payout = printTable(q)
data_payout

Unnamed: 0,user_id,eth_address,date,payout
0,1005,0x4f9117b14426ac44ead4eaef0223830cb16bdb07,2020-07-27,1.68113
1,1005,0xccbe3f17a61f0e8ef05b04dc0c3f510522df6cc8,2020-09-04,2.27831
2,1020,0x2e22950c26899fd0dc0c593b147f6beb7f1c0c37,2020-11-11,0.25199
3,1020,0x2081974d67307ec737dd13c60b524ca4d644ec1d,2020-09-30,2.21076
4,1021,0x370ff36440a36825279db341c6b85bc0371e7c3f,2020-12-10,1.47999
...,...,...,...,...
563,4848,0x72c65e71cc7a2dc71096b64f7a42f8b3192539ab,2020-10-15,2.02452
564,4848,0xd8444ab60e7078d163ecc3ff65fbe01e8770e72d,2020-07-15,3.11877
565,4862,0xbdacd0eca1bfa3b2aabe9687bd3b65969227af34,2020-07-22,1.83956
566,4862,0x446447a208c6e2809f605906a4683e61335e8024,2020-08-13,1.44528


#### plan table

In [6]:
q = '''
    SELECT
        *
    FROM
        plan
'''
data_plan = printTable(q)
data_plan

Unnamed: 0,user_id,OS,Plan
0,1005,Windows,Free
1,1020,MAC,Free
2,1021,Linux,Premium
3,1041,Linux,Free
4,1048,Windows,Free
...,...,...,...
428,4762,MAC,Premium
429,4765,Windows,Free
430,4798,MAC,Premium
431,4848,Linux,Premium


### Our queries for today are: ###

#### 1. How many users got paid? #### 

* Python

In [7]:
#Using count() to count the differents user_id in data_payout table
data_payout.user_id.value_counts().count()

433

* SQL

In [8]:
q = '''
    SELECT
         COUNT(DISTINCT user_id)  as cnt
    FROM
        payout
'''
r1 = printTable(q)
r1

Unnamed: 0,cnt
0,433


#### 2. Show the 5 users with the highest payouts ####

* Python

In [9]:
#Group the data by user_id 
top5_payout = data_payout.groupby('user_id')['payout'].agg(lambda s: s.sum()).sort_values(ascending=False)

top5_payout.head(5)

user_id
1537    10.74169
3051     9.73121
1512     8.80816
1127     8.78861
4848     8.42445
Name: payout, dtype: float64

* SQL

In [10]:
q = '''
    SELECT
        user_id as user_id,
        SUM(payout) AS sum
    FROM
        payout
    GROUP BY
        user_id
    ORDER BY
        sum DESC
    LIMIT 5;
'''
r2 = printTable(q)
r2

Unnamed: 0,user_id,sum
0,1537,10.74169
1,3051,9.73121
2,1512,8.80816
3,1127,8.78861
4,4848,8.42445


#### 3. Show the 5 users with the lowest payouts ####

* Python

In [11]:
#Group the data by user_id 
top5_less_payout = data_payout.groupby('user_id')['payout'].agg(lambda s: s.sum()).sort_values()

top5_less_payout.head(5)

user_id
2003    0.00775
3410    0.00818
2813    0.02914
2462    0.03381
4467    0.05056
Name: payout, dtype: float64

* SQL

In [12]:
q = '''
    SELECT
        user_id as user_id,
        SUM(payout) AS sum
    FROM
        payout
    GROUP BY
        user_id
    ORDER BY
        sum 
    LIMIT 5;
'''
r3 = printTable(q)
r3

Unnamed: 0,user_id,sum
0,2003,0.00775
1,3410,0.00818
2,2813,0.02914
3,2462,0.03381
4,4467,0.05056


#### 4. How much ether was paid out in November 2020?

* Python

In [13]:
# locate rows from month 11 and sum()
data_payout.loc[data_payout['date'].dt.month == 11,'payout'].sum()

166.01180000000002

* SQL

In [14]:
q = '''
    SELECT
        SUM(payout) AS sum
    FROM
        payout
    WHERE
        date BETWEEN '2020-11-01' AND '2020-11-30'
'''
r4 = printTable(q)
r4

Unnamed: 0,sum
0,166.0118


#### 5. Which plan is the most popular?

* Python

In [15]:
# count how many users have each plan
data_plan.Plan.value_counts()

Free       220
Premium    213
Name: Plan, dtype: int64

Free plan is the most popular.

* SQL

In [16]:
q = '''
    SELECT
        "Plan" AS plan,
        COUNT("Plan") AS cnt
    FROM
        plan
    GROUP BY
        plan
    ORDER BY
        cnt DESC
'''
r5 = printTable(q)
r5

Unnamed: 0,plan,cnt
0,Free,220
1,Premium,213


Free plan is the most popular.

#### 6. Which plan is the most popular amongst Linux users?

* Python

In [17]:
# select Linux users and count how many users have each plan 
data_plan.query('OS == "Linux"').Plan.value_counts()

Premium    76
Free       68
Name: Plan, dtype: int64

The Premium plan is the most popular amongst Linux users

* SQL

In [27]:
q = '''
    SELECT
        "Plan" AS plan,
        COUNT("Plan") AS cnt
    FROM
        plan
    WHERE
        "OS" = 'Linux'
    GROUP BY
        plan
    ORDER BY
        cnt DESC
'''
r6 = printTable(q)
r6

Unnamed: 0,plan,cnt
0,Premium,76
1,Free,68


The Premium plan is the most popular amongst Linux users

#### 7. What is the percentage of payout between the different plans?

* Python

In [20]:
# merging the tables with the merge() method
data_union = data_payout.merge(data_plan, on=['user_id'])
data_union.head()

Unnamed: 0,user_id,eth_address,date,payout,OS,Plan
0,1005,0x4f9117b14426ac44ead4eaef0223830cb16bdb07,2020-07-27,1.68113,Windows,Free
1,1005,0xccbe3f17a61f0e8ef05b04dc0c3f510522df6cc8,2020-09-04,2.27831,Windows,Free
2,1020,0x2e22950c26899fd0dc0c593b147f6beb7f1c0c37,2020-11-11,0.25199,MAC,Free
3,1020,0x2081974d67307ec737dd13c60b524ca4d644ec1d,2020-09-30,2.21076,MAC,Free
4,1021,0x370ff36440a36825279db341c6b85bc0371e7c3f,2020-12-10,1.47999,Linux,Premium


In [21]:
# create totals for plans
total_payout = data_union.pivot_table(index='Plan', values='payout',aggfunc='sum')
#Resent index
total_payout = total_payout.reset_index(level=0)
total_payout

Unnamed: 0,Plan,payout
0,Free,488.68999
1,Premium,416.76178


Let's check if we have missing values after all this merging, if we have we will need to take care of them.

In [22]:
total_payout.isnull().sum()

Plan      0
payout    0
dtype: int64

No missing values, we can continue with our calculations.

In [25]:
#adding a new row for the total payout of the two plans toguether
total_payout.loc['total'] = total_payout[['payout']].sum()
#removing missing values - we dont want Nan in the plan column with the total row
total_payout.loc['total'] = total_payout.loc['total'].fillna('') 
#adding a new row for precents and calculating the percents
total_payout['percent'] = total_payout.payout / total_payout.loc['total','payout'] * 100
total_payout.head()

Unnamed: 0,Plan,payout,percent
0,Free,488.68999,17.99065
1,Premium,416.76178,15.342683
total,,2716.35531,100.0


* SQL

In [29]:
q = '''
    SELECT
        plan."Plan" AS plan,
        SUM(payout.payout) / SUM(SUM(payout.payout)) OVER () * 100 AS precent_pay
    FROM
        payout
        LEFT JOIN plan ON plan.user_id = payout.user_id
    GROUP BY
        plan."Plan"
        
'''
r7 = printTable(q)
r7

Unnamed: 0,plan,precent_pay
0,Free,53.971951
1,Premium,46.028049


#### 8. Users of which operating system earned more in payouts?

* Python

Using the data_union table : the merge of the two originals tables

In [33]:
#Creating total payout by operating system
total_os = data_union.pivot_table(index='OS', values='payout',aggfunc='sum')
#Resent index
total_os = total_os.reset_index(level=0)
total_os

Unnamed: 0,OS,payout
0,Linux,322.00353
1,MAC,267.41123
2,Windows,316.03701


The users of the Linux operating system earned more in payouts.

* SQL

In [48]:
q = '''
    SELECT
        plan."OS" AS OS,
        SUM(payout.payout) as total
    FROM
        payout
        LEFT JOIN plan ON plan.user_id = payout.user_id
    GROUP BY
        plan."OS"
    ORDER BY
        total DESC;
        
'''
r8 = printTable(q)
r8

Unnamed: 0,os,total
0,Linux,322.00353
1,Windows,316.03701
2,MAC,267.41123


The users of the Linux operating system earned more in payouts.

#### 9. What is the average payout amount per user for each of the OS in July 2020?

* Python

Using the data_union table : the merge of the two originals tables

In [42]:
#Making a new table : all the rows with month 11 in the date column 
month_filter = data_union.loc[(data_union['date'].dt.month == 7)]
month_filter.head()

Unnamed: 0,user_id,eth_address,date,payout,OS,Plan
0,1005,0x4f9117b14426ac44ead4eaef0223830cb16bdb07,2020-07-27,1.68113,Windows,Free
5,1021,0x4371ac68001b311c44f8839e7fc73588f2a5191d,2020-07-19,2.94432,Linux,Premium
15,1089,0xd78cc55ad71a8bf693c2c9642562266529f9f3cd,2020-07-25,1.14186,Windows,Free
26,1172,0xa30353f9358934fc39f77655c53478faa0bcfbd6,2020-07-22,1.29065,Linux,Premium
27,1175,0x0d9f143c6dd42b000bd34dfa9eccc0d35a0512d2,2020-07-20,0.26432,Linux,Free


In [47]:
#Making a pivot table, grouping the OS and calculating the payout average for each
res_9 = month_filter.pivot_table(index=['OS'],values='payout', aggfunc='mean')
#Resent index
res_9 = res_9.reset_index(level=0)
res_9.head()

Unnamed: 0,OS,payout
0,Linux,1.931114
1,MAC,1.640537
2,Windows,1.5906


* SQL

In [50]:
q = '''
    SELECT
        plan."OS" AS OS,
        AVG(payout.payout) as average
    FROM
        payout
        LEFT JOIN plan ON plan.user_id = payout.user_id
    WHERE
        date BETWEEN '2020-07-01' AND '2020-07-31'   
    GROUP BY
        plan."OS"
    ORDER BY
        average DESC;
        
'''
r9 = printTable(q)
r9

Unnamed: 0,os,average
0,Linux,1.931114
1,MAC,1.640537
2,Windows,1.5906


#### 10. What is the daily share of ether earned by users from Linux that are in the free plan in this data?

* Python

Using the data_union table : the merge of the two originals tables

In [57]:
#Making a new table :  Linux users and free plan users 
filter_table_r10 = data_union.query('OS == "Linux" and Plan == "Free"')
filter_table_r10.head()

Unnamed: 0,user_id,eth_address,date,payout,OS,Plan
6,1041,0xb66c964af2c2f242d1cdc3a45cb6382270e37689,2020-11-13,2.52993,Linux,Free
7,1041,0x410afec0e85f58b040c93bc0cb7c18e3222003e3,2020-08-06,0.93378,Linux,Free
11,1068,0x9b607aea15cf1614b31f65294aa561eab0cfd4e7,2020-08-01,2.5038,Linux,Free
12,1068,0x6d02bcb01c3f968a7c8b63a16d90f15a01265311,2020-10-07,1.50503,Linux,Free
13,1088,0x50fcc1329c331d41f63a074f90b33cb14726f7bf,2020-10-07,2.75562,Linux,Free


In [101]:
#Making a pivot table, grouping by date and calculating the payout average for each date 
res_10 = filter_table_r10.pivot_table(index=['date'],values='payout', aggfunc='mean')
#Resent index 
res_10 = res_10.reset_index(level=0)
res_10 = res_10.sort_values(by = 'payout', ascending = False).reset_index()
#Drop index columns
res_10.drop(columns=['index'] , inplace=True)
res_10

Unnamed: 0,date,payout
0,2020-12-28,3.153180
1,2020-07-13,3.137630
2,2020-07-27,2.986270
3,2020-09-12,2.949820
4,2020-09-04,2.850285
...,...,...
73,2021-01-02,0.301630
74,2020-10-23,0.295190
75,2020-11-24,0.229610
76,2020-10-14,0.138000


**Average of daily share of ether earned by users from Linux that are in the free plan**

In [81]:
print(res_10.payout.mean())

1.601676837606837


* SQL

In [84]:
q = '''
    SELECT
        payout."date" AS date,
        AVG(payout.payout) as average
    FROM
        payout
        LEFT JOIN plan ON plan.user_id = payout.user_id
    WHERE
        ("OS" = 'Linux') AND ("Plan" = 'Free')
    GROUP BY
        payout."date"
    ORDER BY
        average DESC;
        
'''
r10 = printTable(q)
r10

Unnamed: 0,date,average
0,2020-12-28,3.153180
1,2020-07-13,3.137630
2,2020-07-27,2.986270
3,2020-09-12,2.949820
4,2020-09-04,2.850285
...,...,...
73,2021-01-02,0.301630
74,2020-10-23,0.295190
75,2020-11-24,0.229610
76,2020-10-14,0.138000


**Average of daily share of ether earned by users from Linux that are in the free plan**

Now we call the table that we did before like a sub table

In [107]:
q = '''
    SELECT 
     AVG(sub.average) as average
From (   
SELECT
        payout."date" AS date,
        AVG(payout.payout) as average
    FROM
        payout
        LEFT JOIN plan ON plan.user_id = payout.user_id
    WHERE
        ("OS" = 'Linux') AND ("Plan" = 'Free')
    GROUP BY
        payout."date"
    ORDER BY
        average DESC
) as sub;
        
'''
r10_mean = printTable(q)
r10_mean       

Unnamed: 0,average
0,1.601677
