In [7]:
""" 导入基本库 """
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import os
import warnings
# NOTE(review): with no category/module arguments this ignores every warning
# for the rest of the kernel session, so deprecation or FutureWarning messages
# raised by the libraries used below will never be shown.
warnings.filterwarnings('ignore')

%matplotlib inline

# Plot appearance: the "bmh" stylesheet with a 13pt SimHei font
# (presumably chosen so the Chinese column names render in figures).
plt.style.use("bmh")
plt.rc('font', **{'family': 'SimHei', 'size': 13})

# Raise pandas' display limits (column count, total width, per-cell width)
# to 1000 each so wide frames are printed in full.
for option_name in ('display.max_columns', 'display.width', 'display.max_colwidth'):
    pd.set_option(option_name, 1000)

In [8]:
""" 导入数据 """
# Directory containing the two competition CSV files. The default is the
# original local path; set the CREDIT_SCORE_DATA_DIR environment variable to
# point the notebook at another location without editing this cell.
data_path = os.environ.get(
    'CREDIT_SCORE_DATA_DIR',
    r'D:\dataset\中国移动消费者人群画像—信用智能评分',
)

test_data = pd.read_csv(os.path.join(data_path, 'test_dataset.csv'))
train_data = pd.read_csv(os.path.join(data_path, 'train_dataset.csv'))

# Stack the training rows on top of the test rows with a fresh 0..n-1 index.
# sort=True makes the alphabetical column order explicit: older pandas sorted
# non-aligned columns implicitly (with a FutureWarning), newer pandas does not,
# so without it the column order depends on the installed pandas version.
# (Presumably the test file has no 信用分 column, which is what makes the two
# column sets differ -- confirm against the CSV headers.)
df_data = pd.concat([train_data, test_data], ignore_index=True, sort=True)
df_data.head()

Unnamed: 0,信用分,当月旅游资讯类应用使用次数,当月是否体育场馆消费,当月是否到过福州山姆会员店,当月是否景点游览,当月是否看电影,当月是否逛过福州仓山万达,当月火车类应用使用次数,当月物流快递类应用使用次数,当月网购类应用使用次数,当月视频播放类应用使用次数,当月通话交往圈人数,当月金融理财类应用使用总次数,当月飞机类应用使用次数,是否4G不健康客户,是否大学生客户,是否经常逛商场的人,是否黑名单客户,用户实名制是否通过核实,用户年龄,用户当月账户余额（元）,用户最近一次缴费距今时长（月）,用户编码,用户网龄（月）,用户话费敏感度,用户账单当月总费用（元）,用户近6个月平均消费值（元）,缴费用户当前是否欠费缴费,缴费用户最近一次缴费金额（元）,近三个月月均商场出现次数
0,664.0,30,1,0,1,0,0,0,0,713,7145,83,2740,0,0,0,1,0,1,44,180,1,a4651f98c82948b186bdcdc8108381b4,186,3,159.2,163.86,0,99.8,75
1,530.0,0,0,0,0,0,0,0,0,414,44862,21,2731,0,1,0,1,0,1,18,110,1,aeb10247db4e4d67b2550bbc42ff9827,5,3,145.1,153.28,0,29.94,16
2,643.0,1,0,0,0,0,0,0,0,3391,4804,59,0,0,0,0,0,0,1,47,70,1,5af23a1e0e77410abb25e9a7eee510aa,145,1,120.2,109.64,0,49.9,1
3,649.0,5,1,0,1,0,0,0,0,500,3141,78,1931,0,0,0,1,0,1,55,90,1,43c64379d3c24a15b8478851b22049e4,234,3,167.42,92.97,0,99.8,26
4,648.0,0,0,0,1,0,0,0,0,522,59,70,64,0,0,0,1,0,1,40,80,1,f1687f3b8a6f4910bd0b13eb634056e2,76,3,101.0,95.47,0,49.9,44


In [9]:

""" 数据属性 """
# Column dtypes and non-null counts of the combined frame.
df_data.info()

print()
# Row counts, in order: combined frame, test split, training split.
for label, frame in (
    ("共有数据集：", df_data),
    ("共有测试集：", test_data),
    ("共有训练集：", train_data),
):
    print(label, len(frame))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 30 columns):
信用分                50000 non-null float64
当月旅游资讯类应用使用次数      100000 non-null int64
当月是否体育场馆消费         100000 non-null int64
当月是否到过福州山姆会员店      100000 non-null int64
当月是否景点游览           100000 non-null int64
当月是否看电影            100000 non-null int64
当月是否逛过福州仓山万达       100000 non-null int64
当月火车类应用使用次数        100000 non-null int64
当月物流快递类应用使用次数      100000 non-null int64
当月网购类应用使用次数        100000 non-null int64
当月视频播放类应用使用次数      100000 non-null int64
当月通话交往圈人数          100000 non-null int64
当月金融理财类应用使用总次数     100000 non-null int64
当月飞机类应用使用次数        100000 non-null int64
是否4G不健康客户          100000 non-null int64
是否大学生客户            100000 non-null int64
是否经常逛商场的人          100000 non-null int64
是否黑名单客户            100000 non-null int64
用户实名制是否通过核实        100000 non-null int64
用户年龄               100000 non-null int64
用户当月账户余额（元）        100000 non-null int64
用户最近一次缴费距今时长（月）    100000 no

In [10]:
""" 数据类别 """
# Print, for every column, its 1-based position, its name and the number of
# distinct non-null values it holds.
row_template = "{:2}、{:15}      The number of types of features is：{}"
for position, column in enumerate(df_data.columns, start=1):
    distinct_count = df_data[column].nunique()
    print(row_template.format(position, column, distinct_count))

 1、信用分                  The number of types of features is：278
 2、当月旅游资讯类应用使用次数        The number of types of features is：934
 3、当月是否体育场馆消费           The number of types of features is：2
 4、当月是否到过福州山姆会员店        The number of types of features is：2
 5、当月是否景点游览             The number of types of features is：2
 6、当月是否看电影              The number of types of features is：2
 7、当月是否逛过福州仓山万达         The number of types of features is：2
 8、当月火车类应用使用次数          The number of types of features is：180
 9、当月物流快递类应用使用次数        The number of types of features is：239
10、当月网购类应用使用次数          The number of types of features is：8382
11、当月视频播放类应用使用次数        The number of types of features is：16067
12、当月通话交往圈人数            The number of types of features is：554
13、当月金融理财类应用使用总次数       The number of types of features is：7232
14、当月飞机类应用使用次数          The number of types of features is：209
15、是否4G不健康客户            The number of types of features is：2
16、是否大学生客户              The number of types of features is：2
17

In [11]:
""" 数据统计 """
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# columns of the combined train + test frame.
df_data.describe()

Unnamed: 0,信用分,当月旅游资讯类应用使用次数,当月是否体育场馆消费,当月是否到过福州山姆会员店,当月是否景点游览,当月是否看电影,当月是否逛过福州仓山万达,当月火车类应用使用次数,当月物流快递类应用使用次数,当月网购类应用使用次数,当月视频播放类应用使用次数,当月通话交往圈人数,当月金融理财类应用使用总次数,当月飞机类应用使用次数,是否4G不健康客户,是否大学生客户,是否经常逛商场的人,是否黑名单客户,用户实名制是否通过核实,用户年龄,用户当月账户余额（元）,用户最近一次缴费距今时长（月）,用户网龄（月）,用户话费敏感度,用户账单当月总费用（元）,用户近6个月平均消费值（元）,缴费用户当前是否欠费缴费,缴费用户最近一次缴费金额（元）,近三个月月均商场出现次数
count,50000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0
mean,618.05306,19.39465,0.37473,0.02705,0.47546,0.2438,0.03923,0.56459,1.02586,1161.14261,3386.321,48.21111,975.36609,0.64976,0.08869,0.00362,0.33072,0.0485,0.99124,37.90791,115.6846,0.70142,96.27158,3.35298,99.709021,98.983241,0.05183,53.721932,26.50703
std,42.443022,312.587384,0.484056,0.16223,0.4994,0.429376,0.194143,7.973381,37.482212,4300.092242,10744.17,54.867465,2965.36056,22.299903,0.284297,0.060058,0.470475,0.214821,0.093184,11.625008,448.684984,0.457637,59.112782,1.241129,65.314169,61.002422,0.221685,62.214807,32.739661
min,422.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,594.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,18.0,10.0,16.0,6.0,0.0,0.0,0.0,0.0,0.0,1.0,30.0,40.0,0.0,48.0,2.0,53.0,54.32,0.0,0.0,1.0
50%,627.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,248.0,334.0,32.0,265.0,0.0,0.0,0.0,0.0,0.0,1.0,36.0,70.0,1.0,94.0,4.0,90.0,89.67,0.0,49.9,8.0
75%,649.0,4.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,934.0,2440.0,62.0,1145.0,0.0,0.0,0.0,1.0,0.0,1.0,45.0,130.0,1.0,139.0,4.0,134.6275,131.56,0.0,99.8,49.0
max,719.0,87681.0,1.0,1.0,1.0,1.0,1.0,775.0,8235.0,417536.0,1382227.0,1906.0,496238.0,5856.0,1.0,1.0,1.0,1.0,1.0,111.0,109090.0,1.0,288.0,5.0,2117.01,1792.74,1.0,1000.0,92.0


In [12]:
# Summary statistics restricted to rows whose 信用分 (credit score) is missing
# -- presumably the unlabeled test rows; confirm against the source files.
(
    df_data
    .loc[df_data['信用分'].isnull()]
    .describe()
)

Unnamed: 0,信用分,当月旅游资讯类应用使用次数,当月是否体育场馆消费,当月是否到过福州山姆会员店,当月是否景点游览,当月是否看电影,当月是否逛过福州仓山万达,当月火车类应用使用次数,当月物流快递类应用使用次数,当月网购类应用使用次数,当月视频播放类应用使用次数,当月通话交往圈人数,当月金融理财类应用使用总次数,当月飞机类应用使用次数,是否4G不健康客户,是否大学生客户,是否经常逛商场的人,是否黑名单客户,用户实名制是否通过核实,用户年龄,用户当月账户余额（元）,用户最近一次缴费距今时长（月）,用户网龄（月）,用户话费敏感度,用户账单当月总费用（元）,用户近6个月平均消费值（元）,缴费用户当前是否欠费缴费,缴费用户最近一次缴费金额（元）,近三个月月均商场出现次数
count,0.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0
mean,,19.67218,0.37534,0.02698,0.4765,0.24532,0.03976,0.57584,0.85388,1173.46996,3406.12244,48.36772,979.2291,0.5951,0.0888,0.00352,0.33194,0.0482,0.99226,37.93238,117.1968,0.70274,96.09448,3.35404,99.842912,99.234402,0.05112,54.027936,26.44154
std,,408.041808,0.484215,0.162027,0.499452,0.430281,0.195397,8.20404,28.848873,4586.71334,9919.40536,55.518686,2924.008879,13.025441,0.284458,0.059226,0.470914,0.214191,0.087637,11.636829,556.938946,0.457057,59.048962,1.241067,65.301379,61.245686,0.220245,62.614124,32.690192
min,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,17.0,10.0,16.0,5.0,0.0,0.0,0.0,0.0,0.0,1.0,30.0,40.0,0.0,48.0,2.0,53.2,54.45,0.0,0.0,1.0
50%,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,246.0,333.0,32.0,263.0,0.0,0.0,0.0,0.0,0.0,1.0,36.0,70.0,1.0,94.0,4.0,90.0,90.0,0.0,49.9,8.0
75%,,4.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,935.0,2455.0,62.0,1144.0,0.0,0.0,0.0,1.0,0.0,1.0,45.0,130.0,1.0,139.0,4.0,135.235,132.0,0.0,99.8,49.0
max,,87681.0,1.0,1.0,1.0,1.0,1.0,775.0,5462.0,417536.0,295210.0,1633.0,329767.0,1645.0,1.0,1.0,1.0,1.0,1.0,108.0,109090.0,1.0,288.0,5.0,2117.01,1792.74,1.0,1000.0,92.0


In [13]:
# Summary statistics restricted to rows that do have a 信用分 (credit score)
# -- presumably the labeled training rows; confirm against the source files.
(
    df_data
    .loc[df_data['信用分'].notnull()]
    .describe()
)

Unnamed: 0,信用分,当月旅游资讯类应用使用次数,当月是否体育场馆消费,当月是否到过福州山姆会员店,当月是否景点游览,当月是否看电影,当月是否逛过福州仓山万达,当月火车类应用使用次数,当月物流快递类应用使用次数,当月网购类应用使用次数,当月视频播放类应用使用次数,当月通话交往圈人数,当月金融理财类应用使用总次数,当月飞机类应用使用次数,是否4G不健康客户,是否大学生客户,是否经常逛商场的人,是否黑名单客户,用户实名制是否通过核实,用户年龄,用户当月账户余额（元）,用户最近一次缴费距今时长（月）,用户网龄（月）,用户话费敏感度,用户账单当月总费用（元）,用户近6个月平均消费值（元）,缴费用户当前是否欠费缴费,缴费用户最近一次缴费金额（元）,近三个月月均商场出现次数
count,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0
mean,618.05306,19.11712,0.37412,0.02712,0.47442,0.24228,0.0387,0.55334,1.19784,1148.81526,3366.519,48.0545,971.50308,0.70442,0.08858,0.00372,0.3295,0.0488,0.99022,37.88344,114.1724,0.7001,96.44868,3.35192,99.57513,98.732081,0.05254,53.415929,26.57252
std,42.443022,170.074772,0.4839,0.162435,0.49935,0.428467,0.192881,7.735913,44.469584,3992.957952,11510.06,54.208524,3006.16776,28.721302,0.284139,0.060879,0.470036,0.215452,0.09841,11.613239,304.063961,0.458218,59.176593,1.241202,65.327335,60.757758,0.223116,61.812022,32.789251
min,422.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,594.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,18.0,10.0,16.0,6.0,0.0,0.0,0.0,0.0,0.0,1.0,30.0,40.0,0.0,48.0,2.0,52.675,54.18,0.0,0.0,1.0
50%,627.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,250.0,335.0,32.0,267.0,0.0,0.0,0.0,0.0,0.0,1.0,36.0,70.0,1.0,94.0,4.0,89.62,89.32,0.0,49.9,8.0
75%,649.0,4.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,932.0,2423.25,61.0,1147.25,0.0,0.0,0.0,1.0,0.0,1.0,45.0,130.0,1.0,139.0,4.0,133.945,131.16,0.0,99.8,50.0
max,719.0,13965.0,1.0,1.0,1.0,1.0,1.0,474.0,8235.0,234336.0,1382227.0,1906.0,496238.0,5856.0,1.0,1.0,1.0,1.0,1.0,111.0,49040.0,1.0,288.0,5.0,1164.29,840.57,1.0,998.0,92.0
