In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# we don't like warnings
# you can comment the following 2 lines if you'd like to
import warnings
warnings.filterwarnings('ignore')

sns.set()

# Graphics in SVG format are more sharp and legible
#%config InlineBackend.figure_format = 'svg'

### code 

In [2]:
def init_data(file_path):
    max_citi = 3600 * 24 * 3
    max_fraud_ctit = 30
    df = pd.read_csv(file_path)
    # print('列参数 = {}'.format(df.columns.values))

    df['fraud_payout'] = (df['payout'] * df['is_fraud']).round(decimals=2)
    df['ctit'] = ((pd.to_datetime(df['postback_datetime']) - pd.to_datetime(df['click_datetime'])) / np.timedelta64(1, 's')).astype(int)
    df['ctit'] = df['ctit'].where(df['ctit'] <= max_citi, other=max_citi)
    df['is_ctit'] = (df['ctit'] <= max_fraud_ctit).astype(np.bool).astype(np.int0)
    df = df.drop('clickid', axis=1)
    return df

In [3]:
def _summary_data(dd, need_cumsum=True):
    dd['payout'] = dd['payout'].round(decimals=1)
    dd['fraud_payout_p'] = (dd['fraud_payout'] / dd['payout']).round(decimals=2)
    dd['is_fraud_p'] = (dd['is_fraud'] / dd['is_valid_pb']).round(decimals=2)
    dd['is_ctit_p'] = (dd['is_ctit'] / dd['is_valid_pb']).round(decimals=2)
    
    if need_cumsum:
        cumsum_payout = dd['payout'].cumsum()
        cumsum_is_fraud = dd['is_fraud'].cumsum()
        cumsum_is_valid_pb = dd['is_valid_pb'].cumsum()
        cumsum_fraud_payout = dd['fraud_payout'].cumsum()
        cumsum_is_ctit = dd['is_ctit'].cumsum()
        dd['cumsum_fraud_payout_p'] = (cumsum_fraud_payout / cumsum_payout).round(decimals=2)
        dd['cumsum_is_fraud_p'] = (cumsum_is_fraud / cumsum_is_valid_pb).round(decimals=2)
        dd['cumsum_is_ctit_p'] = (cumsum_is_ctit / cumsum_is_valid_pb).round(decimals=2)
    return dd


def get_summary_by_date(df):
    dd = df.groupby(['date'])['payout', 'is_valid_pb', 'fraud_payout', 'is_fraud', 'is_ctit'].aggregate(np.sum)
    return _summary_data(dd)


def get_tops(df, df_summary, group_by=['pubid']):
    dd = df.groupby(group_by)['payout', 'is_valid_pb', 'fraud_payout', 'is_fraud', 'is_ctit'].aggregate(np.sum)
    dd = _summary_data(dd, False)
    query_list = []
    for (date, series) in df_summary.iterrows():
        query_list.append('(date == "{}" and (fraud_payout_p > {} or is_fraud_p > {} or is_ctit_p > {}))'.format(date, series.fraud_payout_p, series.is_fraud_p, series.is_ctit_p))
    # print(str_query)
    dd = dd.query(' or '.join(query_list))
    return dd

### init load

In [4]:
file_path = './data/analy_2018-10-28_28_246.csv.gz'
#file_path = './data/analy_2018-10-28_28_.csv.gz'
#file_path = './data/analy_2018-09-30_19_246.csv.gz'
org_df = init_data(file_path)
org_df.head()

Unnamed: 0,payout,pubid,postback_datetime,sub_campid,publisher_payout,click_ip,is_valid_pb,campid,subid,source,is_fraud,pubpb_filter,fraud_note,date,geo,click_datetime,fraud_payout,ctit,is_ctit
0,1.26,10397,2018-10-28T00:08:18Z,,1.76,79.235.92.203,True,52552750,544537,246,0,0,ok,2018-10-28,DE,2018-10-28T00:07:22Z,0.0,56,0
1,0.64,10548,2018-10-28T00:14:26Z,,0.64,126.218.77.50,True,58335214,795ccad65c4fc27a99e2af38484942ce,246,0,900,ok,2018-10-28,JP,2018-10-27T23:07:39Z,0.0,4007,0
2,0.75,10548,2018-10-28T00:23:11Z,,0.75,49.97.92.120,True,57174138,faffa5e44e4304775f0af7a47fa25334,246,0,900,ok,2018-10-28,JP,2018-10-27T18:18:04Z,0.0,21907,0
3,3.0,10288,2018-10-28T00:23:50Z,,3.9,66.250.143.158,True,58801908,2507,246,0,900,ok,2018-10-28,US,2018-10-28T00:19:38Z,0.0,252,0
4,1.75,10291,2018-10-28T00:28:15Z,,2.71,172.58.87.13,True,58333030,20197908,246,1,800,blacklist,2018-10-28,US,2018-10-28T00:26:33Z,1.75,102,0


### 计算每天的fraud百分比平均值

In [5]:
df_summary = get_summary_by_date(org_df)
df_summary

Unnamed: 0_level_0,payout,is_valid_pb,fraud_payout,is_fraud,is_ctit,fraud_payout_p,is_fraud_p,is_ctit_p,cumsum_fraud_payout_p,cumsum_is_fraud_p,cumsum_is_ctit_p
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2018-10-01,2034.4,753.0,750.03,242,10,0.37,0.32,0.01,0.37,0.32,0.01
2018-10-02,1133.6,441.0,304.23,108,7,0.27,0.24,0.02,0.33,0.29,0.01
2018-10-03,1914.5,434.0,622.59,122,10,0.33,0.28,0.02,0.33,0.29,0.02
2018-10-04,1680.3,539.0,596.16,202,7,0.35,0.37,0.01,0.34,0.31,0.02
2018-10-05,4744.7,1395.0,2663.08,708,5,0.56,0.51,0.0,0.43,0.39,0.01
2018-10-06,3869.3,972.0,1840.02,450,2,0.48,0.46,0.0,0.44,0.4,0.01
2018-10-07,2447.9,812.0,683.08,199,1,0.28,0.25,0.0,0.42,0.38,0.01
2018-10-08,2339.2,801.0,947.13,293,5,0.4,0.37,0.01,0.42,0.38,0.01
2018-10-09,1727.7,666.0,365.39,169,9,0.21,0.25,0.01,0.4,0.37,0.01
2018-10-10,362.3,243.0,140.95,75,8,0.39,0.31,0.03,0.4,0.36,0.01


### 按'date+campid'分组，每天payout >= 100或is_valid_pb >= 100的campid

In [28]:
def top(df, n=5, column='payout'):
    return df.sort_values(by=column)[-n:]

dd = get_tops(org_df, df_summary, group_by=['date', 'campid'])
dd = dd.reset_index()
dd = dd.groupby(['date']).apply(top, n=10, column='fraud_payout')
#dd = dd.sort_values(by=['date','payout'], ascending=[True,False]).query('payout >= 100 or is_valid_pb >= 100')
#print('campid {} = {}'.format(dd['campid'].nunique(), dd['campid'].unique()))
dd

Unnamed: 0_level_0,Unnamed: 1_level_0,date,campid,payout,is_valid_pb,fraud_payout,is_fraud,is_ctit,fraud_payout_p,is_fraud_p,is_ctit_p
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2018-10-01,6,2018-10-01,56163256,2.8,3.0,0.94,1,0,0.34,0.33,0.00
2018-10-01,2,2018-10-01,52552744,4.0,1.0,4.00,1,0,1.00,1.00,0.00
2018-10-01,5,2018-10-01,54300674,20.0,5.0,8.00,2,0,0.40,0.40,0.00
2018-10-01,7,2018-10-01,56163257,33.6,32.0,11.55,11,0,0.34,0.34,0.00
2018-10-01,3,2018-10-01,52552750,100.0,119.0,11.76,14,6,0.12,0.12,0.05
2018-10-01,8,2018-10-01,56163258,35.0,50.0,16.10,23,3,0.46,0.46,0.06
2018-10-01,9,2018-10-01,57222357,33.9,1.0,33.90,1,0,1.00,1.00,0.00
2018-10-01,4,2018-10-01,54114288,75.4,55.0,35.62,26,1,0.47,0.47,0.02
2018-10-01,0,2018-10-01,45096114,76.0,19.0,36.00,9,0,0.47,0.47,0.00
2018-10-01,1,2018-10-01,48886806,1292.0,323.0,540.00,135,0,0.42,0.42,0.00


### 按'date+pubid'分组，每天payout >= 100的pubid

In [7]:
dd = get_tops(org_df, df_summary, group_by=['date', 'pubid'])
dd = dd.reset_index()
dd = dd.sort_values(by=['date', 'payout'], ascending=[True, False]).query('payout >= 100')
print('pubid {} = {}'.format(dd['pubid'].nunique(), dd['pubid'].unique()))
dd

pubid 11 = [10396 10291 10487 10397 10522 10488 10224 10491 10288 10408 10236]


Unnamed: 0,date,pubid,payout,is_valid_pb,fraud_payout,is_fraud,is_ctit,fraud_payout_p,is_fraud_p,is_ctit_p
5,2018-10-01,10396,392.0,98.0,372.0,93,0,0.95,0.95,0.0
3,2018-10-01,10291,122.0,56.0,12.6,7,1,0.1,0.12,0.02
15,2018-10-02,10487,115.0,23.0,85.0,17,0,0.74,0.74,0.0
14,2018-10-02,10397,113.4,105.0,7.56,9,4,0.07,0.09,0.04
17,2018-10-02,10522,100.3,31.0,46.35,13,0,0.46,0.42,0.0
26,2018-10-03,10488,451.7,44.0,354.9,37,0,0.79,0.84,0.0
19,2018-10-03,10224,153.5,9.0,0.7,1,1,0.0,0.11,0.11
33,2018-10-04,10491,448.3,129.0,61.62,21,2,0.14,0.16,0.02
32,2018-10-04,10488,181.0,34.0,176.67,31,0,0.98,0.91,0.0
31,2018-10-04,10487,173.0,173.0,102.0,102,0,0.59,0.59,0.0


### 按'date+source+campid'分组，前15的campid

In [8]:
dd = get_tops(org_df, df_summary, group_by=['date', 'source', 'campid'])
dd = dd['payout'].groupby(level=['date', 'source'], group_keys=False).nlargest(15)

#top_source_campid = dd['campid'].unique()
#print('source campid {} = {}'.format(len(top_source_campid), top_source_campid))
dd

date        source  campid  
2018-10-01  246     48886806    1292.0
                    52552750     100.0
                    45096114      76.0
                    54114288      75.4
                    56163258      35.0
                    57222357      33.9
                    56163257      33.6
                    54300674      20.0
                    52552744       4.0
                    56163256       2.8
2018-10-02  246     54829945     150.0
                    52552750      90.7
                    57554425      90.0
                    54114288      64.4
                    45096114      64.0
                    56163255      31.5
                    56163258      27.3
                    56163257      21.0
                    54300674      16.0
2018-10-03  246     57554425     960.0
                    54829945     150.0
                    52552750      94.9
                    54114288      67.1
                    45096114      44.0
                    56163258      3

In [9]:
dd = get_tops(org_df, df_summary, group_by=['date', 'pubid', 'campid'])
#dd = dd.sort_values(by=['date', 'source', 'payout'], ascending=[True, True, False]).query('payout >= 100')
dd = dd['payout'].groupby(level=['date', 'pubid'], group_keys=False).nlargest(15)
dd = dd.reset_index()
top_pubid_campid = dd['campid'].unique()
print('pubid campid {} = {}'.format(len(top_pubid_campid), top_pubid_campid))

pubid campid 33 = [45096114 56163258 54114288 57222357 56163256 48886806 52552750 55334293
 52552744 56163257 54300674 54829945 57554425 56163255 57660503 57536774
 55684530 57823422 57554428 57174137 58333030 58427059 58335210 58335214
 58365313 58801908 58335212 53350854 57174138 54580865 59137467 58670895
 57222360]


In [10]:
top_mix_campid = list(set(top_source_campid.tolist()+top_pubid_campid.tolist()))
print('mix campid {} = {}'.format(len(top_mix_campid), top_mix_campid))

NameError: name 'top_source_campid' is not defined