In [1]:
import datetime
import gc
import os
import random

import numpy as np
import pandas as pd

In [2]:
# Read the four raw CSV extracts, one DataFrame per file.
users_info, users_info_dim, cp_role_info, role_info = [
    pd.read_csv(csv_path)
    for csv_path in (
        'users_info.csv',
        'users_info_dim.csv',
        './cp_role_info.csv',
        './role_info.csv',
    )
]

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
# Shrink a DataFrame's memory footprint by downcasting its numeric columns.
def reduce_mem_usage(df, verbose=True):
    """Downcast the numeric columns of ``df`` in place and return it.

    Integer columns are cast to the narrowest of int8/int16/int32/int64 whose
    range contains the column's min and max (bounds are inclusive, so a column
    whose max is exactly 127 still becomes int8).

    Float columns are cast to float16 only when every value survives the cast
    unchanged; float16 keeps roughly three significant digits, so e.g.
    20021.0 would otherwise silently become 20016.0. Columns that fit the
    float32 range become float32; everything else becomes float64.

    Columns whose dtype is not one of the listed numeric dtypes are left
    untouched, as are integer columns whose min/max cannot be compared
    (e.g. an empty column, where min/max are NaN).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to compress. It is modified in place.
    verbose : bool, default True
        When true, print the resulting size in MB and the percentage saved.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after downcasting.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    half_limits = np.finfo(np.float16)
    single_limits = np.finfo(np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if str(col_type)[:3] == 'int':
            # Candidates are ordered narrowest first, so the first fit wins
            # and a column is never widened.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            continue

        if half_limits.min <= c_min and c_max <= half_limits.max:
            as_half = df[col].astype(np.float16)
            # Series.equals treats NaNs in matching positions as equal, so
            # missing values do not block the downcast.
            if as_half.astype(np.float64).equals(df[col].astype(np.float64)):
                df[col] = as_half
                continue

        if single_limits.min <= c_min and c_max <= single_limits.max:
            df[col] = df[col].astype(np.float32)
        else:
            # Also reached when min/max are NaN or infinite.
            df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the percentage against a zero-byte starting size.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

In [4]:
# Downcast every loaded frame; each call prints its own memory report.
users_info, users_info_dim, cp_role_info, role_info = [
    reduce_mem_usage(frame)
    for frame in (users_info, users_info_dim, cp_role_info, role_info)
]

Mem. usage decreased to 276.76 Mb (44.8% reduction)
Mem. usage decreased to 33.21 Mb (70.8% reduction)
Mem. usage decreased to 124.20 Mb (40.0% reduction)
Mem. usage decreased to 124.36 Mb (54.2% reduction)


In [5]:
# Show (rows, columns) for each of the four frames, one per line.
print(
    *(frame.shape for frame in (users_info, users_info_dim, cp_role_info, role_info)),
    sep='\n',
)

(5475477, 12)
(4974324, 3)
(5426230, 5)
(5927469, 6)


In [6]:
# Neither users_info nor users_info_dim contains a duplicated user_id:
# the distinct counts printed here equal the row counts shown above.
print(
    *(frame['user_id'].nunique() for frame in (users_info, users_info_dim)),
    sep='\n',
)

5475477
4974324


In [7]:
# user_id values that occur in users_info_dim but not in users_info
# (np.setdiff1d returns them unique and sorted).
a = np.setdiff1d(users_info_dim['user_id'], users_info['user_id'])
a

array([107467275, 107490790, 107492079, 107499803, 107547671, 107555258,
       107555862, 107562097, 107564203, 107565691, 107582732, 107597791,
       107606012, 107681724, 111849757])

In [8]:
# Merge users_info with users_info_dim. Note: users_info_dim only holds data
# for middleware users, so the remaining rows get NaN in its columns.
users_info_merge = pd.merge(users_info, users_info_dim, on='user_id', how='left')
# Flag whether the user has bound a phone number: 1 = bound, 0 = not bound.
# Values that cannot be parsed as a number are coerced to NaN and count as
# "not bound". The flag is written to the merged frame rather than back into
# users_info, so the raw `mobile` column survives and re-running this cell
# gives the same result instead of turning every flag into 1.
users_info_merge['mobile'] = (
    pd.to_numeric(users_info_merge['mobile'], errors='coerce')
    .notna()
    .astype('int64')
)

In [9]:
# Remove the user_name, game_id and dt columns from the merged user table.
users_info_merge = users_info_merge.drop(columns=['user_name', 'game_id', 'dt'])

In [10]:
# Flag server hopping ("gunfu"): is_gunfu is 0 when a cp_role_id is seen on
# exactly one distinct server_id and 1 otherwise.
# Note: cp_role_info only holds data for middleware users.
cp_role_info = (
    cp_role_info
    .dropna(subset=['cp_role_id'])
    # Discard the results of a previous run; otherwise the merge below would
    # emit server_sum_x / server_sum_y and the is_gunfu line would fail.
    .drop(columns=['server_sum', 'is_gunfu'], errors='ignore')
)
groups = cp_role_info.groupby('cp_role_id')
# One row per cp_role_id with the number of distinct servers it appears on.
temp = groups['server_id'].nunique().rename('server_sum').reset_index()
cp_role_info = pd.merge(cp_role_info, temp, on='cp_role_id', how='left')
cp_role_info['is_gunfu'] = cp_role_info['server_sum'].ne(1).astype('int64')
cp_role_info.head()

Unnamed: 0,cp_role_id,game_id,user_id,role_id,server_id,server_sum,is_gunfu
0,13024407,1000840,93449068,134292984,3244828,1,0
1,13024860,1000840,93459494,134303214,3244828,1,0
2,13021554,1000840,93328889,134173020,3244665,1,0
3,13024408,1000840,93449212,134292987,3244828,1,0
4,13025583,1000840,93474066,134318040,3244828,1,0


In [11]:
# Preview the first rows of role_info.
role_info.head()

Unnamed: 0,user_id,role_id,server_id,is_create_role,is_create_no,create_role_time
0,104544600,11653263,26208,1,1,2020-02-26 00:00:08
1,104544609,11653264,26208,1,1,2020-02-26 00:00:18
2,104544763,11653265,26208,1,1,2020-02-26 00:00:18
3,104544765,11653267,26208,1,1,2020-02-26 00:00:19
4,104274333,11653269,26215,0,0,2020-02-26 00:00:28


In [12]:
# Compare the sizes of role_info and the filtered cp_role_info.
print(*(frame.shape for frame in (role_info, cp_role_info)), sep='\n')

(5927469, 6)
(5408952, 7)


In [13]:
# Number of distinct role_id values in each role table.
print(
    *(frame['role_id'].nunique() for frame in (role_info, cp_role_info)),
    sep='\n',
)

5927469
5408952


In [14]:
# role_id values that occur in cp_role_info but not in role_info
# (np.setdiff1d returns them unique and sorted).
b = np.setdiff1d(cp_role_info['role_id'], role_info['role_id'])
b

array([], dtype=int32)

In [15]:
# Left-join cp_role_info onto role_info, matching on (user_id, role_id).
role_all = role_info.merge(
    cp_role_info,
    how='left',
    on=['user_id', 'role_id'],
)

In [16]:
# Remove both server_id copies plus game_id and cp_role_id, then preview.
role_all = role_all.drop(
    columns=['server_id_x', 'server_id_y', 'game_id', 'cp_role_id'],
)
role_all.head()

Unnamed: 0,user_id,role_id,is_create_role,is_create_no,create_role_time,server_sum,is_gunfu
0,104544600,11653263,1,1,2020-02-26 00:00:08,1.0,0.0
1,104544609,11653264,1,1,2020-02-26 00:00:18,2.0,1.0
2,104544763,11653265,1,1,2020-02-26 00:00:18,1.0,0.0
3,104544765,11653267,1,1,2020-02-26 00:00:19,1.0,0.0
4,104274333,11653269,0,0,2020-02-26 00:00:28,10.0,1.0


In [17]:
# Number of distinct role_id values in role_all.
role_all['role_id'].nunique()

5927469

In [18]:
# Preview the first rows of users_info_merge.
users_info_merge.head()

Unnamed: 0,user_id,platform,user_type,user_flag,mobile,channel_id,source_id,device_no,reg_date,game_pay_num,chmoney_pay_num
0,93329024,1,2,1,1,6770,208821,3F3EA505C0B3408E3569BAC7E47591FC,2019-09-13 00:05:32,0.0,0.0
1,93335936,1,18,1,0,20540,200648,97233F607B1BF960B6BB939D44EDFD00,2019-09-13 01:34:37,0.0,0.0
2,93340416,1,1,1,0,6770,198778,AC70F781E19A6EDF119CD2BA70BA7FA0,2019-09-13 03:34:51,0.0,0.0
3,93349248,1,1,1,0,6770,208816,93D7E53BA301DA5A57444F0EF17A5590,2019-09-13 07:42:17,0.0,0.0
4,93358336,1,18,1,0,20540,198786,802AC04F2B80E47234700AB7CE8C954B,2019-09-13 08:44:19,0.0,0.0


In [19]:
# (rows, columns) of users_info_merge.
users_info_merge.shape

(5475477, 11)

In [20]:
# Number of distinct user_id values in users_info_merge.
users_info_merge['user_id'].nunique()

5475477

In [21]:
# Merge on the full set of user_id values: every role row is kept and gets
# its user's attributes; validate= raises if user_id repeats on the right.
users_info_all = role_all.merge(
    users_info_merge,
    how='left',
    on='user_id',
    validate='many_to_one',
)

In [25]:
# Number of distinct role_id values in users_info_all.
users_info_all['role_id'].nunique()

5927469

In [26]:
# Preview the first rows of users_info_all.
users_info_all.head()

Unnamed: 0,user_id,role_id,is_create_role,is_create_no,create_role_time,server_sum,is_gunfu,platform,user_type,user_flag,mobile,channel_id,source_id,device_no,reg_date,game_pay_num,chmoney_pay_num
0,104544600,11653263,1,1,2020-02-26 00:00:08,1.0,0.0,2.0,1.0,1.0,0.0,20021.0,22791.0,0333F64B-9FA2-4EB8-A3A4-1833648F17D6,2020-02-26 00:00:03,0.0,0.0
1,104544609,11653264,1,1,2020-02-26 00:00:18,2.0,1.0,2.0,1.0,1.0,0.0,20021.0,22791.0,8D1014C5-CFCD-489F-BC08-95489D4BB70E,2020-02-26 00:00:14,6.0,0.0
2,104544763,11653265,1,1,2020-02-26 00:00:18,1.0,0.0,2.0,1.0,1.0,0.0,20541.0,226963.0,33EB241B-B83B-455D-B60D-4BEFDB489A97,2020-02-26 00:00:15,0.0,0.0
3,104544765,11653267,1,1,2020-02-26 00:00:19,1.0,0.0,2.0,1.0,1.0,0.0,20144.0,266475.0,62CCEDD0-BEBA-4D91-A1E8-3FDF902A69FA,2020-02-26 00:00:16,0.0,0.0
4,104274333,11653269,0,0,2020-02-26 00:00:28,10.0,1.0,2.0,1.0,1.0,0.0,20144.0,250625.0,2DD2BE9E-250C-455F-854B-3A5E75292C42,2020-02-24 00:51:57,0.0,0.0


In [27]:
# Create the output directory if needed so the export cannot fail on a
# missing folder, then write the table without the index column.
os.makedirs('./FE_data', exist_ok=True)
users_info_all.to_csv('./FE_data/users_info_all.csv', index=False)

In [28]:
# Release the large intermediate frames. `groups` is deleted as well: the
# GroupBy object still references the pre-merge cp_role_info frame, so that
# memory would otherwise stay allocated.
del users_info, users_info_dim, users_info_merge, role_info, role_all, cp_role_info, temp, groups
gc.collect()

18608