In [1]:
import numpy as np
import pandas as pd
%matplotlib inline
from IPython.display import display

In [2]:
# Load the four training tables. Every file is tab-separated and has no header
# row, so the column names are supplied explicitly. Time/date columns are read
# as strings rather than being parsed.
uid_train = pd.read_csv(
    '../../trainData/uid_train.txt',
    sep='\t',
    header=None,
    names=('uid', 'label'),
)
voice_train = pd.read_csv(
    '../../trainData/voice_train.txt',
    sep='\t',
    header=None,
    names=('uid', 'opp_num', 'opp_head', 'opp_len',
           'start_time', 'end_time', 'call_type', 'in_out'),
    dtype={'start_time': str, 'end_time': str},
)
sms_train = pd.read_csv(
    '../../trainData/sms_train.txt',
    sep='\t',
    header=None,
    names=('uid', 'opp_num', 'opp_head', 'opp_len', 'start_time', 'in_out'),
    dtype={'start_time': str},
)
wa_train = pd.read_csv(
    '../../trainData/wa_train.txt',
    sep='\t',
    header=None,
    names=('uid', 'wa_name', 'visit_cnt', 'visit_dura',
           'up_flow', 'down_flow', 'wa_type', 'date'),
    dtype={'date': str},
)

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
# Load the three test-set (B) tables. They are tab-separated with no header
# row; column names are supplied explicitly and time/date columns are read as
# strings.
voice_test = pd.read_csv(
    '../../testData_B/voice_test_b.txt',
    sep='\t',
    header=None,
    names=('uid', 'opp_num', 'opp_head', 'opp_len',
           'start_time', 'end_time', 'call_type', 'in_out'),
    dtype={'start_time': str, 'end_time': str},
)
sms_test = pd.read_csv(
    '../../testData_B/sms_test_b.txt',
    sep='\t',
    header=None,
    names=('uid', 'opp_num', 'opp_head', 'opp_len', 'start_time', 'in_out'),
    dtype={'start_time': str},
)
wa_test = pd.read_csv(
    '../../testData_B/wa_test_b.txt',
    sep='\t',
    header=None,
    names=('uid', 'wa_name', 'visit_cnt', 'visit_dura',
           'up_flow', 'down_flow', 'wa_type', 'date'),
    dtype={'date': str},
)

In [4]:
# Build the list of test users from the distinct uids seen in wa_test
# (order of first appearance) and persist it as a one-column CSV.
# NOTE(review): this assumes every test user appears in wa_test -- confirm
# against the official test uid list if one is available.
uid_test = pd.DataFrame({'uid': pd.unique(wa_test['uid'])})
# index=False is the documented way to omit the row index from the file.
uid_test.to_csv('../data/uid_test_b.txt', index=False)

# 检查缺失值

In [5]:
# Check every table for NA values: print one True/False flag per table.
for table_label, table in (
    ('voice_train:', voice_train),
    ('sms_train:', sms_train),
    ('wa_train:', wa_train),
    ('voice_test:', voice_test),
    ('sms_test:', sms_test),
    ('wa_test:', wa_test),
):
    # True as soon as any single cell of the table is missing.
    print(table_label, table.isnull().values.any())

voice_train: False
sms_train: False
wa_train: True
voice_test: False
sms_test: False
wa_test: True


In [6]:
# Drill into wa_train and wa_test: for each column, flag whether it contains
# any missing value.
# display() is called explicitly because, by default, IPython only echoes the
# last expression of a cell; a bare first expression would be silently dropped.
display(pd.isnull(wa_train).any())
display(pd.isnull(wa_test).any())

uid           False
wa_name        True
visit_cnt      True
visit_dura     True
up_flow        True
down_flow      True
wa_type        True
date           True
dtype: bool

uid           False
wa_name        True
visit_cnt      True
visit_dura     True
up_flow        True
down_flow      True
wa_type        True
date           True
dtype: bool

In [7]:
# Count the missing values per column in wa_train and wa_test.
# Both results are passed to display() so that each one is rendered, not only
# the last expression of the cell.
display(wa_train.isnull().sum())
display(wa_test.isnull().sum())

uid             0
wa_name       801
visit_cnt     801
visit_dura    801
up_flow       801
down_flow     801
wa_type       801
date          801
dtype: int64

uid             0
wa_name       497
visit_cnt     497
visit_dura    497
up_flow       497
down_flow     497
wa_type       497
date          497
dtype: int64

In [8]:
# Inspect the rows that carry missing values. They are concentrated in a set of
# users whose rows have every field except uid missing.
# display() is used for both frames so that each one is rendered, not only the
# last expression of the cell.
display(wa_train[wa_train['visit_cnt'].isnull()])
display(wa_test[wa_test['visit_cnt'].isnull()])


Unnamed: 0,uid,wa_name,visit_cnt,visit_dura,up_flow,down_flow,wa_type,date
24674,u0035,,,,,,,
24675,u0036,,,,,,,
36156,u0052,,,,,,,
37016,u0055,,,,,,,
38729,u0060,,,,,,,
40355,u0066,,,,,,,
41250,u0069,,,,,,,
53367,u0090,,,,,,,
56421,u0096,,,,,,,
75788,u0128,,,,,,,


Unnamed: 0,uid,wa_name,visit_cnt,visit_dura,up_flow,down_flow,wa_type,date
1226,u7001,,,,,,,
14536,u7021,,,,,,,
37236,u7048,,,,,,,
37262,u7051,,,,,,,
37342,u7053,,,,,,,
39449,u7059,,,,,,,
52275,u7077,,,,,,,
59789,u7092,,,,,,,
62347,u7096,,,,,,,
65137,u7102,,,,,,,


In [9]:
# Collect the distinct users that own at least one all-missing wa row
# (visit_cnt is used as the indicator column), for train and test.
NA_wa_train_users = wa_train.loc[wa_train['visit_cnt'].isnull(), 'uid'].unique()
NA_wa_test_users = wa_test.loc[wa_test['visit_cnt'].isnull(), 'uid'].unique()

# Number of affected users in each set. display() is explicit so these
# intermediate results are not dropped when only the last expression is echoed.
display(NA_wa_train_users.shape)
display(NA_wa_test_users.shape)

# Are the affected training users risk users? Count them per label.
display(uid_train[uid_train['uid'].isin(NA_wa_train_users)].groupby('label').count())

(747,)

(474,)

Unnamed: 0_level_0,uid
label,Unnamed: 1_level_1
0,617
1,130


可以看到801条的wa_train缺失记录中，包含747名用户，其中617为非风险用户，130为风险用户
497条的wa_test缺失记录中，包含474名用户

In [17]:
# For every training user that owns at least one missing wa row, count
#   count_uid     -- total wa rows of that user (group size)
#   count_wa_name -- wa rows of that user with a non-missing wa_name
# The counts are taken with agg(['size', 'count']) on the wa_name column.
# Selecting several columns from a groupby with a tuple (gb['uid', 'wa_name'])
# is rejected by current pandas, and selecting the grouping key itself makes a
# later sort by 'uid' ambiguous between the index level and the column.
c = (
    wa_train[wa_train['uid'].isin(NA_wa_train_users)]
    .groupby('uid')['wa_name']
    .agg(['size', 'count'])
    .rename(columns={'size': 'count_uid', 'count': 'count_wa_name'})
    .reset_index()
)

# Attach the label of each affected user and order users by their row count.
l = uid_train[uid_train['uid'].isin(NA_wa_train_users)]
m = pd.merge(l, c, how='left', on='uid')
m = m.sort_values(by='count_uid')

# Users with at most 5 wa rows in total.
m[m['count_uid'] <= 5]

Unnamed: 0,uid,label,count_uid,count_wa_name
296,u1907,0,2,1
276,u1808,0,2,0
279,u1828,0,2,0
280,u1837,0,2,0
281,u1841,0,2,1
738,u4963,1,2,0
100,u0708,0,2,0
290,u1883,0,2,1
299,u1926,0,2,0
302,u1938,0,2,0


# 简单统计voice_train各项基本特征（类别特征）的数目（除去时间）

In [25]:
# Number of distinct values of each categorical column in voice_train.
for column in ('uid', 'opp_num', 'opp_head', 'opp_len'):
    print('total {}: '.format(column), len(pd.unique(voice_train[column])))

total uid:  4987
total opp_num:  221942
total opp_head:  550
total opp_len:  18


由上述基本特征可以看到：
+ 有12人没有通话记录
+ opp_head和opp_len的种类相对其他特征较少，可以作为天然的类别特征提取出来进行处理，而其余的特征可能更多的进行计数统计

In [33]:
# no_voice_train_users holds the uids of the training users that never appear
# in voice_train (the 12 users without any call record).
has_voice_train = uid_train['uid'].isin(pd.unique(voice_train['uid']))
no_voice_train_users = uid_train.loc[~has_voice_train, 'uid']

# Show those users together with their labels.
uid_train.loc[uid_train['uid'].isin(no_voice_train_users)]


Unnamed: 0,uid,label
353,u0354,0
421,u0422,0
642,u0643,0
1292,u1293,0
1462,u1463,0
1617,u1618,0
3069,u3070,0
3287,u3288,0
3790,u3791,0
4189,u4190,1


这12个没有通话记录的人中，9人为非风险用户，3人为风险用户

# 简单统计sms_train各项基本特征（类别特征）的数目（除去时间）

In [36]:
# Number of distinct values of each categorical column in sms_train.
for column in ('uid', 'opp_num', 'opp_head', 'opp_len'):
    print('total {}: '.format(column), len(pd.unique(sms_train[column])))

total uid:  4959
total opp_num:  18577
total opp_head:  69
total opp_len:  18


由上述基本特征可以看到：
+ 有40人没有短信记录
+ opp_head和opp_len的种类相对其他特征较少，可以作为天然的类别特征提取出来进行处理，而其余的特征可能更多的进行计数统计

In [40]:
# no_sms_train_users holds the uids of the training users that never appear in
# sms_train (the 40 users without any SMS record).
has_sms_train = uid_train['uid'].isin(pd.unique(sms_train['uid']))
no_sms_train_users = uid_train.loc[~has_sms_train, 'uid']

# Count those users per label.
uid_train.loc[uid_train['uid'].isin(no_sms_train_users)].groupby('label').count()

Unnamed: 0_level_0,uid
label,Unnamed: 1_level_1
0,33
1,7


这40个没有短信记录的人中，33人为非风险用户，7人为风险用户

# 简单统计wa_train各项基本特征（类别特征）的数目（除去时间）

In [41]:
# Number of distinct users and distinct site/app names in wa_train.
# NOTE: a missing value (NaN) is counted as one distinct value here.
for column in ('uid', 'wa_name'):
    print('total {}: '.format(column), len(pd.unique(wa_train[column])))

total uid:  4999
total wa_name:  12782


由上述基本特征可以看到：
+ 所有用户都出现在wa_train中（共4999个uid），但其中部分用户的记录全部为缺失值（例如u1808的wa_name计数为0），因此并非所有人都有有效的上网记录
+ 网站/app名字种类较多

# 简单统计voice_test各项基本特征（类别特征）的数目（除去时间）

In [43]:
# Number of distinct values of each categorical column in voice_test.
for column in ('uid', 'opp_num', 'opp_head', 'opp_len'):
    print('total {}: '.format(column), len(pd.unique(voice_test[column])))

total uid:  2992
total opp_num:  122584
total opp_head:  333
total opp_len:  19


有8人没有通话记录

In [45]:
# no_voice_test_users holds the uids of the test users that never appear in
# voice_test (the 8 users without any call record).
has_voice_test = uid_test['uid'].isin(pd.unique(voice_test['uid']))
no_voice_test_users = uid_test.loc[~has_voice_test, 'uid']

# Show those users.
uid_test.loc[uid_test['uid'].isin(no_voice_test_users)]


Unnamed: 0,uid
169,u7169
469,u7469
601,u7601
1114,u8114
1681,u8681
1772,u8772
2165,u9165
2988,u9988


# 简单统计sms_test各项基本特征（类别特征）的数目（除去时间）

In [46]:
# Number of distinct values of each categorical column in sms_test.
for column in ('uid', 'opp_num', 'opp_head', 'opp_len'):
    print('total {}: '.format(column), len(pd.unique(sms_test[column])))

total uid:  2965
total opp_num:  11102
total opp_head:  63
total opp_len:  19


有35人没有短信记录

In [48]:
# no_sms_test_users holds the uids of the test users that never appear in
# sms_test (the 35 users without any SMS record).
has_sms_test = uid_test['uid'].isin(pd.unique(sms_test['uid']))
no_sms_test_users = uid_test.loc[~has_sms_test, 'uid']

# Count those users.
uid_test.loc[uid_test['uid'].isin(no_sms_test_users)].count()

uid    35
dtype: int64

# 简单统计wa_test各项基本特征（类别特征）的数目（除去时间）

In [49]:
# Number of distinct users and distinct site/app names in wa_test.
# NOTE: a missing value (NaN) is counted as one distinct value here.
for column in ('uid', 'wa_name'):
    print('total {}: '.format(column), len(pd.unique(wa_test[column])))

total uid:  3000
total wa_name:  10030


每个用户都出现在wa_test中（共3000个uid），但其中474名用户含有除uid外全部缺失的记录

# 统计正负样本比例

In [51]:
# Class balance of the training labels: row counts of the positive (label 1)
# and negative (label 0) users, then the positive share in percent.
positive_users = uid_train[uid_train['label'] == 1]
negative_users = uid_train[uid_train['label'] == 0]

# display() is explicit so both counts are rendered; by default IPython would
# only echo the last bare expression of the cell.
display(positive_users.count())
display(negative_users.count())

print(positive_users.shape[0] / uid_train.shape[0] * 100, '%')

uid      900
label    900
dtype: int64

uid      4099
label    4099
dtype: int64

18.00360072014403 %


# 查看这些没有上网，通话，短信记录的用户

In [58]:
# Overlaps between the groups of training users that lack records.
# NA_wa_train_users are the users owning at least one all-missing wa row, so
# "no web record" below means membership in that group.
# Each result goes through display() so that all three are rendered, not only
# the last expression of the cell.

# Training users with neither voice nor SMS records.
display(no_voice_train_users[no_voice_train_users.isin(no_sms_train_users)])
# Training users with no voice records that also have missing wa rows.
display(no_voice_train_users[no_voice_train_users.isin(NA_wa_train_users)])
# Training users with no SMS records that also have missing wa rows.
display(no_sms_train_users[no_sms_train_users.isin(NA_wa_train_users)])


642    u0643
Name: uid, dtype: object

353     u0354
421     u0422
642     u0643
3287    u3288
3790    u3791
Name: uid, dtype: object

361     u0362
374     u0375
403     u0404
591     u0592
642     u0643
1102    u1103
1530    u1531
1906    u1907
1937    u1938
1959    u1960
2088    u2089
2814    u2815
2893    u2894
2996    u2997
3431    u3432
3945    u3946
Name: uid, dtype: object