In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Silence every Python warning for the rest of the session.
# NOTE(review): this blanket filter also hides deprecation warnings raised by
# numpy / pandas; comment out the following 2 lines to see them again.
import warnings
warnings.filterwarnings('ignore')

# Apply seaborn's default plot theme to all matplotlib figures.
sns.set()

# Graphics in SVG format are more sharp and legible
#%config InlineBackend.figure_format = 'svg'

In [2]:
def init_data(file_path, max_ctit=3600 * 24 * 3, max_fraud_ctit=30):
    """Load the postback CSV and add the derived fraud / ctit columns.

    Parameters
    ----------
    file_path : str or file-like
        Anything accepted by ``pandas.read_csv``. The data must contain the
        columns ``payout``, ``is_fraud``, ``postback_datetime``,
        ``click_datetime`` and ``clickid``.
    max_ctit : int, optional
        Upper cap (in seconds) applied to ``ctit``; defaults to three days.
    max_fraud_ctit : int, optional
        Rows whose ``ctit`` is at or below this many seconds get
        ``is_ctit == 1``; defaults to 30.

    Returns
    -------
    pandas.DataFrame
        The loaded rows without ``clickid`` and with three extra columns:
        ``fraud_payout`` (``payout * is_fraud`` rounded to 1 decimal),
        ``ctit`` (whole seconds between click and postback, capped at
        ``max_ctit``) and ``is_ctit`` (0/1 flag).

    Raises
    ------
    KeyError
        If one of the required columns is missing.
    """
    df = pd.read_csv(file_path)
    print('列参数 = {}'.format(df.columns.values))

    df['fraud_payout'] = (df['payout'] * df['is_fraud']).round(decimals=1)

    # Seconds elapsed between the click and its postback, truncated to int.
    elapsed = pd.to_datetime(df['postback_datetime']) - pd.to_datetime(df['click_datetime'])
    df['ctit'] = elapsed.dt.total_seconds().astype(int)
    # Cap long-tail values so they cannot dominate later aggregates.
    df['ctit'] = df['ctit'].where(df['ctit'] <= max_ctit, other=max_ctit)

    # np.bool / np.int0 were removed from NumPy; the comparison is already
    # boolean, and np.intp is the integer type np.int0 used to alias.
    df['is_ctit'] = (df['ctit'] <= max_fraud_ctit).astype(np.intp)

    return df.drop('clickid', axis=1)

In [3]:
def _summary_data(dd, need_cumsum=True):
    dd['payout'] = dd['payout'].round(decimals=1)
    dd['fraud_payout_p'] = (dd['fraud_payout'] / dd['payout']).round(decimals=2)
    dd['is_fraud_p'] = (dd['is_fraud'] / dd['is_valid_pb']).round(decimals=2)
    dd['is_ctit_p'] = (dd['is_ctit'] / dd['is_valid_pb']).round(decimals=2)
    
    if need_cumsum:
        cumsum_payout = dd['payout'].cumsum()
        cumsum_is_fraud = dd['is_fraud'].cumsum()
        cumsum_is_valid_pb = dd['is_valid_pb'].cumsum()
        cumsum_fraud_payout = dd['fraud_payout'].cumsum()
        cumsum_is_ctit = dd['is_ctit'].cumsum()
        dd['cumsum_fraud_payout_p'] = (cumsum_fraud_payout / cumsum_payout).round(decimals=2)
        dd['cumsum_is_fraud_p'] = (cumsum_is_fraud / cumsum_is_valid_pb).round(decimals=2)
        dd['cumsum_is_ctit_p'] = (cumsum_is_ctit / cumsum_is_valid_pb).round(decimals=2)
    return dd


def get_summary_by_date(df):
    """Sum the payout / fraud counters per ``date`` and add ratio columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Row-level data containing ``date`` plus the columns ``payout``,
        ``is_valid_pb``, ``fraud_payout``, ``is_fraud`` and ``is_ctit``.

    Returns
    -------
    pandas.DataFrame
        One row per date (index), as produced by ``_summary_data`` with the
        cumulative ratios enabled.
    """
    # Columns must be selected with a list: tuple-style selection on a GroupBy
    # (``gb['a', 'b']``) raises on pandas >= 2.0.
    metric_cols = ['payout', 'is_valid_pb', 'fraud_payout', 'is_fraud', 'is_ctit']
    dd = df.groupby(['date'])[metric_cols].sum()
    return _summary_data(dd)

In [4]:
# Input extract to analyse; the commented-out paths are alternative extracts.
#file_path = './data/analy_2018-10-28_28_246.csv.gz'
file_path = './data/analy_2018-10-28_28_.csv.gz'
#file_path = './data/analy_2018-09-30_19_246.csv.gz'
# Load the CSV and add the derived fraud_payout / ctit / is_ctit columns.
org_df = init_data(file_path)
# Preview the first rows.
org_df.head()

列参数 = ['payout' 'clickid' 'pubid' 'postback_datetime' 'sub_campid'
 'publisher_payout' 'click_ip' 'is_valid_pb' 'campid' 'subid' 'source'
 'is_fraud' 'pubpb_filter' 'fraud_note' 'date' 'geo' 'click_datetime']


Unnamed: 0,payout,pubid,postback_datetime,sub_campid,publisher_payout,click_ip,is_valid_pb,campid,subid,source,is_fraud,pubpb_filter,fraud_note,date,geo,click_datetime,fraud_payout,ctit,is_ctit
0,0.5814,10294,2018-10-28T00:04:01Z,,0.5814,112.215.200.122,True,58966896,1525,81,0,900,ok,2018-10-28,ID,2018-10-27T23:52:07Z,0.0,714,0
1,1.3,10572,2018-10-28T00:04:05Z,,1.69,174.194.12.96,True,56164481,32816,233,0,0,ok,2018-10-28,US,2018-10-28T00:02:17Z,0.0,108,0
2,0.2,10236,2018-10-28T00:04:03Z,,0.24,115.178.253.97,True,59589430,544537,150,1,0,blacklist,2018-10-28,ID,2018-10-28T00:03:46Z,0.2,17,1
3,0.6,10250,2018-10-28T00:04:04Z,,0.6,63.78.215.163,True,49245488,1525,279,1,900,blacklist,2018-10-28,US,2018-10-28T00:00:24Z,0.6,220,0
4,0.7,10541,2018-10-28T00:04:05Z,,1.05,120.188.64.14,True,48570647,aa5204341f2a50747dadf1c80fa6fd92,162,1,0,blacklist,2018-10-28,ID,2018-10-27T23:54:49Z,0.7,556,0


In [5]:
# Per-date totals plus per-day and cumulative fraud / ctit ratios.
df_summary = get_summary_by_date(org_df)
df_summary

Unnamed: 0_level_0,payout,is_valid_pb,fraud_payout,is_fraud,is_ctit,fraud_payout_p,is_fraud_p,is_ctit_p,cumsum_fraud_payout_p,cumsum_is_fraud_p,cumsum_is_ctit_p
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2018-10-01,128657.4,170643.0,17052.8,22696,1709,0.13,0.13,0.01,0.13,0.13,0.01
2018-10-02,129800.2,170021.0,17114.8,23128,1812,0.13,0.14,0.01,0.13,0.13,0.01
2018-10-03,129614.1,161756.0,17424.1,22780,1535,0.13,0.14,0.01,0.13,0.14,0.01
2018-10-04,116117.9,153113.0,16198.9,22232,1457,0.14,0.15,0.01,0.13,0.14,0.01
2018-10-05,135187.5,175365.0,20492.8,25927,1849,0.15,0.15,0.01,0.14,0.14,0.01
2018-10-06,129695.0,164078.0,18903.3,24378,1517,0.15,0.15,0.01,0.14,0.14,0.01
2018-10-07,135451.8,171002.0,19315.3,25749,1724,0.14,0.15,0.01,0.14,0.14,0.01
2018-10-08,135843.4,171132.0,20610.3,26228,1147,0.15,0.15,0.01,0.14,0.14,0.01
2018-10-09,135046.7,172516.0,19841.0,27224,1140,0.15,0.16,0.01,0.14,0.15,0.01
2018-10-10,136808.7,172581.0,20785.2,27039,1444,0.15,0.16,0.01,0.14,0.15,0.01


In [33]:
import random

# Synthetic postings used only to demonstrate a per-group top-N query.
# A dedicated, seeded generator makes the sample identical on every run and
# leaves the global `random` state alone.
demo_rng = random.Random(42)
df = pd.DataFrame(
    (
        (
            demo_rng.randint(2012, 2016),
            demo_rng.choice(['tech', 'art', 'office']),
            '%dk-%dk' % (demo_rng.randint(2, 10), demo_rng.randint(10, 20)),
            'tt',
        )
        for _ in range(10000)
    ),
    columns=['publish_time', 'classf', 'salary', 'title'],
)
df.head()

Unnamed: 0,publish_time,classf,salary,title
0,2015,office,10k-18k,tt
1,2015,office,2k-16k,tt
2,2016,tech,6k-15k,tt
3,2015,art,6k-10k,tt
4,2012,art,5k-14k,tt


In [34]:
# Count postings per (year, class, salary band), then keep the ten most
# frequent combinations within each year (index level 0).
title_counts = df.groupby(['publish_time', 'classf', 'salary'])['title'].count()
title_counts.groupby(level=0, group_keys=False).nlargest(10)

publish_time  classf  salary 
2012          office  3k-13k     18
                      9k-14k     16
              tech    3k-15k     16
              art     5k-17k     13
                      8k-12k     12
                      8k-19k     12
              office  3k-18k     12
                      5k-12k     12
                      5k-13k     12
              tech    4k-10k     12
2013          office  3k-14k     15
              art     8k-16k     14
                      9k-14k     14
                      3k-10k     13
                      5k-17k     13
              tech    6k-20k     13
              art     10k-20k    12
                      4k-13k     12
                      4k-20k     12
              tech    3k-16k     12
2014          tech    2k-17k     14
              art     3k-14k     13
              tech    10k-11k    13
                      3k-12k     13
                      7k-11k     13
                      7k-13k     13
                      8k-18k     1

In [67]:
def get_tops(df, df_summary, group_by=['pubid']):
    """Return the groups whose fraud ratios exceed that day's overall ratios.

    Parameters
    ----------
    df : pandas.DataFrame
        Row-level data containing the ``group_by`` columns plus ``payout``,
        ``is_valid_pb``, ``fraud_payout``, ``is_fraud`` and ``is_ctit``.
    df_summary : pandas.DataFrame
        Per-date thresholds: indexed by date, with the columns
        ``fraud_payout_p``, ``is_fraud_p`` and ``is_ctit_p``.
    group_by : list of str, optional
        Grouping keys. The filter compares against a ``date`` key, so the
        keys must include ``'date'`` (e.g. ``['date', 'pubid']``).
        NOTE(review): the default ``['pubid']`` does not, and the query then
        fails on the unknown name ``date``.

    Returns
    -------
    pandas.DataFrame
        Aggregated rows (indexed by ``group_by``) where at least one of the
        three ratios is strictly greater than the same-date value in
        ``df_summary``. Empty when ``df_summary`` has no rows.
    """
    # Columns must be selected with a list: tuple-style selection on a GroupBy
    # raises on pandas >= 2.0.
    metric_cols = ['payout', 'is_valid_pb', 'fraud_payout', 'is_fraud', 'is_ctit']
    dd = df.groupby(group_by)[metric_cols].sum()
    dd = _summary_data(dd, False)

    # One clause per date: keep a group when any of its ratios beats that day's.
    clauses = [
        '(date == "{}" and (fraud_payout_p > {} or is_fraud_p > {} or is_ctit_p > {}))'.format(
            date, series.fraud_payout_p, series.is_fraud_p, series.is_ctit_p)
        for (date, series) in df_summary.iterrows()
    ]
    if not clauses:
        # No thresholds means nothing can exceed them; query('') would raise.
        return dd.iloc[0:0]
    return dd.query(' or '.join(clauses))

def top(df, n=5, column='tip_pct'):
    """Return the first ``n`` rows of ``df`` ordered by ``column``, largest first.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to rank.
    n : int, optional
        Number of rows to keep (default 5).
    column : str, optional
        Column to sort on (default ``'tip_pct'``).

    Returns
    -------
    pandas.DataFrame
        At most ``n`` rows, sorted descending by ``column``.
    """
    # sort_index(by=...) was removed from pandas; sort_values is its replacement.
    return df.sort_values(by=column, ascending=False)[:n]

# Campaigns that beat the daily fraud ratios, restricted to those with
# meaningful volume and listed per day from highest payout down.
dd = (
    get_tops(org_df, df_summary, group_by=['date', 'campid'])
    .reset_index()
    .sort_values(by=['date', 'payout'], ascending=[True, False])
    .query('payout >= 100 or is_valid_pb >= 100')
)

print('campid = {}'.format(dd['campid'].nunique()))
dd

campid = 1082


Unnamed: 0,date,campid,payout,is_valid_pb,fraud_payout,is_fraud,is_ctit,fraud_payout_p,is_fraud_p,is_ctit_p
40,2018-10-01,36221642,1603.5,1069.0,493.5,329,0,0.31,0.31,0.00
800,2018-10-01,55668799,1382.4,768.0,196.2,109,0,0.14,0.14,0.00
215,2018-10-01,48886806,1292.0,323.0,540.0,135,0,0.42,0.42,0.00
641,2018-10-01,55106605,1037.1,238.0,184.8,42,0,0.18,0.18,0.00
1090,2018-10-01,56512909,976.2,888.0,271.7,247,0,0.28,0.28,0.00
740,2018-10-01,55489154,676.5,835.0,180.8,226,0,0.27,0.27,0.00
569,2018-10-01,54786933,648.0,997.0,156.0,260,46,0.24,0.26,0.05
282,2018-10-01,51024185,591.3,438.0,93.8,67,11,0.16,0.15,0.03
753,2018-10-01,55513686,590.0,472.0,134.4,112,0,0.23,0.24,0.00
766,2018-10-01,55559689,512.6,233.0,72.6,33,0,0.14,0.14,0.00


In [79]:
# Publishers that beat the daily fraud ratios, restricted to those with a
# payout of at least 500 and listed per day from highest payout down.
dd = (
    get_tops(org_df, df_summary, group_by=['date', 'pubid'])
    .reset_index()
    .sort_values(by=['date', 'payout'], ascending=[True, False])
    .query('payout >= 500')
)

print('pubid {} = {}'.format(dd['pubid'].nunique(), dd['pubid'].unique()))
dd

pubid 47 = [10236 10418 10491 10488 10396 10305 10294 10367 10460 10436 10292 10419
 10454 10522 10266 10317 10272 10289 10362 10457 10392 10487 10489 10219
 10408 10316 10318 10302 10297 10250 10217 10569 10524 10535 10387 10555
 10224 10291 10310 10546 10541 10570 10425 10273 10572 10397 10307]


Unnamed: 0,date,pubid,payout,is_valid_pb,fraud_payout,is_fraud,is_ctit,fraud_payout_p,is_fraud_p,is_ctit_p
3,2018-10-01,10236,8184.7,15678.0,813.7,1540,263,0.10,0.10,0.02
38,2018-10-01,10418,7944.4,5746.0,500.1,414,87,0.06,0.07,0.02
52,2018-10-01,10491,4382.3,3521.0,988.9,711,13,0.23,0.20,0.00
50,2018-10-01,10488,4140.2,4145.0,2087.8,2134,108,0.50,0.51,0.03
34,2018-10-01,10396,3880.0,3763.0,1994.3,1684,44,0.51,0.45,0.01
17,2018-10-01,10305,3619.7,5731.0,345.0,563,90,0.10,0.10,0.02
13,2018-10-01,10294,3556.0,5300.0,369.6,607,90,0.10,0.11,0.02
29,2018-10-01,10367,2571.2,5232.0,323.1,832,6,0.13,0.16,0.00
44,2018-10-01,10460,2474.2,2591.0,303.8,361,0,0.12,0.14,0.00
40,2018-10-01,10436,2320.8,3883.0,385.4,595,30,0.17,0.15,0.01
