In [1]:
import pandas as pd
import numpy as np
import gc
from base import Cache
from tqdm import tqdm

def reduce_mem(df, use_float16=False):
    """Downcast the numeric columns of ``df`` in place to narrower dtypes.

    Every column except ``'label'`` is inspected:

    * signed integer columns are cast to the smallest of int8/16/32/64 whose
      range contains the column's min and max (limits inclusive);
    * unsigned integer columns are cast to the smallest fitting
      uint8/16/32/64 instead of being pushed through the float branch;
    * float columns are cast to float16 (only when ``use_float16`` is true
      and the values fit), otherwise float32 when the values fit, otherwise
      float64. Columns whose min/max are NaN or infinite end up as float64.

    Columns of any other dtype (object, bool, datetime, timedelta, complex,
    and pandas extension dtypes such as category) are left untouched.

    Args:
        df: DataFrame to shrink. It is modified in place.
        use_float16: Allow float16 as a target for float columns.

    Returns:
        The same ``df`` object, after downcasting. A one-line summary of the
        memory usage before/after (in Mb) and the percentage saved is printed.
    """
    signed_targets = (np.int8, np.int16, np.int32, np.int64)
    unsigned_targets = (np.uint8, np.uint16, np.uint32, np.uint64)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        if col == 'label':
            continue
        col_type = df[col].dtype
        # Only plain NumPy integer/float columns can be downcast safely.
        # Extension dtypes (category, nullable ints, tz-aware datetimes) and
        # bool/object/datetime/timedelta/complex columns are skipped.
        if not isinstance(col_type, np.dtype) or col_type.kind not in 'iuf':
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if col_type.kind in 'iu':
            if c_min != c_min:
                # min() of an empty column is NaN: nothing to measure.
                continue
            lowest, highest = int(c_min), int(c_max)
            targets = signed_targets if col_type.kind == 'i' else unsigned_targets
            for target in targets:
                limits = np.iinfo(target)
                # Inclusive bounds: a value equal to the limit still fits.
                if limits.min <= lowest and highest <= limits.max:
                    df[col] = df[col].astype(target)
                    break
        else:
            half = np.finfo(np.float16)
            single = np.finfo(np.float32)
            # NaN/inf extremes fail both range checks and fall to float64.
            if use_float16 and half.min <= c_min and c_max <= half.max:
                target = np.float16
            elif single.min <= c_min and c_max <= single.max:
                target = np.float32
            else:
                target = np.float64
            df[col] = df[col].astype(target)

    end_mem = df.memory_usage().sum() / 1024**2
    # Guard the ratio so a frame reporting zero bytes does not divide by zero.
    saved_pct = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('{:.2f} Mb, {:.2f} Mb ({:.2f} %)'.format(
        start_mem, end_mem, saved_pct))
    return df
	
##############################################################################################################
# datatrain = pd.read_csv('../data/train_data.csv')
# datatest = pd.read_csv('../data/test_data_A.csv')
# columns_str = datatrain.columns[0]
# gc.collect()
# dflist = []
# for i in tqdm(range(datatrain.shape[0])):
#     dflist.append([ int(j) if index!=32 else j for index,j in enumerate(datatrain[columns_str].iloc[i].split('|'))])
# dflist = pd.DataFrame(dflist,columns = columns_str.split('|'))
# del datatrain
# gc.collect()
# columns_str = datatest.columns[0]
# dflisttst = []
# for i in tqdm(range(datatest.shape[0])):
#     dflisttst.append([ int(j) if index!=32 else j for index,j in enumerate(datatest[columns_str].iloc[i].split('|'))])
# del datatest
# gc.collect()
# dflisttst = pd.DataFrame(dflisttst,columns = columns_str.split('|'))
# dflist['id'] = -1  # set the id of every train row to -1
# dataall = pd.concat([dflist,dflisttst],ignore_index=True)
# del dflist,dflisttst
# gc.collect()
# dataall = reduce_mem(dataall, use_float16=False)
# Cache.cache_data(dataall, nm_marker='dataall0831')

##############################################################################################################
# # This step is quite slow!
# route = []
# for i in tqdm(range(dataall.shape[0])):
#     route.append(dataall['communication_onlinerate'].iloc[i].split('^'))
# route = pd.DataFrame(route)
# route = route.fillna(-1).astype(int)
# routes = []
# for i in tqdm(range(route.shape[0])):
#     routes.append(np.sum(np.eye(25)[route.iloc[i,:]],axis=0))
# del route
# gc.collect()
# routes = pd.DataFrame(routes,columns=['cmr_'+str(i) for i in range(24)]+['cmr_None'])
# routes = routes.astype(int)
# routes = reduce_mem(routes, use_float16=False)
# Cache.cache_data(routes, nm_marker='cmr0831')
# del dataall,routes
# gc.collect()
##############################################################################################################

# Reload the cached full table and show its column dtypes.
data = Cache.reload_cache('CACHE_dataall0816.pkl')
print(data.dtypes)

# Replace every '^' in the online-rate strings with a space.
data['communication_onlinerate'] = data['communication_onlinerate'].map(
    lambda rate: rate.replace('^', ' '))

# Reload the cached cmr_* table and append its columns side by side.
route = Cache.reload_cache('CACHE_cmr0816.pkl')
route_columns = list(route.columns)
data = pd.concat([data, route], axis=1)

# Drop exact duplicate rows among pt_d < 8 only; pt_d == 8 rows are kept as-is.
data = pd.concat(
    [data.query('pt_d<8').drop_duplicates(), data.query('pt_d==8')],
    ignore_index=True,
)
data = data.sort_values(
    by=['uid', 'pt_d', 'task_id', 'adv_id', 'slot_id', 'net_type'],
    ascending=False,
).reset_index(drop=True)
gc.collect()

# Cast the appended route columns to the platform default int.
for var in route_columns:
    data[var] = data[var].astype(int)
print(data.dtypes)

# The standalone route frame is no longer needed once merged into data.
del route
gc.collect()

[2020-09-11 14:41:14] - __init__.py[line:127] - INFO: Successfully Reload: /home/tione/notebook/huawei/cached_data/CACHE_dataall0816.pkl


label                          float64
uid                              int32
task_id                          int16
adv_id                           int16
creat_type_cd                     int8
adv_prim_id                      int16
dev_id                            int8
inter_type_cd                     int8
slot_id                           int8
spread_app_id                     int8
tags                              int8
app_first_class                   int8
app_second_class                  int8
age                               int8
city                             int16
city_rank                         int8
device_name                       int8
device_size                      int16
career                            int8
gender                            int8
net_type                          int8
residence                         int8
his_app_size                      int8
his_on_shelf_time                 int8
app_score                         int8
emui_dev                 

[2020-09-11 14:42:07] - __init__.py[line:127] - INFO: Successfully Reload: /home/tione/notebook/huawei/cached_data/CACHE_cmr0816.pkl


label            float64
uid                int32
task_id            int16
adv_id             int16
creat_type_cd       int8
                  ...   
cmr_20             int64
cmr_21             int64
cmr_22             int64
cmr_23             int64
cmr_None           int64
Length: 62, dtype: object


0

In [2]:
# Display the (rows, columns) shape of the prepared frame.
data.shape

(36056562, 62)

In [3]:
# Build a reduced data set: a 5% random sample of each pt_d group with
# pt_d < 8, plus every row with pt_d == 8, sorted by the same keys (descending)
# as the full frame, then written to the cache as 'data_sample005'.
# NOTE: this cell rebinds `data`, so re-running it without re-running the
# loading cell samples from the already reduced frame.
SAMPLE_SEED = 2020  # fixed seed so the cached subsample is identical on every run
data1 = (
    data.query('pt_d<8')
    .groupby('pt_d')
    .sample(frac=0.05, random_state=SAMPLE_SEED)
)
print(data1.shape)
data2 = data.query('pt_d==8')
data = pd.concat([data1, data2], ignore_index=True)
data = data.sort_values(
    ['uid', 'pt_d', 'task_id', 'adv_id', 'slot_id', 'net_type'],
    ascending=False,
).reset_index(drop=True)
Cache.cache_data(data, nm_marker='data_sample005')

(1752827, 62)


[2020-09-11 15:01:48] - __init__.py[line:112] - INFO: Cache Successfully! File name: /home/tione/notebook/huawei/cached_data/CACHE_data_sample005.pkl


In [1]:
import pandas as pd
