-
Notifications
You must be signed in to change notification settings - Fork 8
/
feature_engineering.py
143 lines (118 loc) · 5.72 KB
/
feature_engineering.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
# -*- coding: utf-8 -*-
# 特征工程
# author = 'huangth'
import os
import time
import gc
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
BASE_PATH = '/home/wolfkin/ImageOfConsumers/'
RAW_DATA_PATH = os.path.join(BASE_PATH, "RawData")
ETL_DATA_PATH = os.path.join(BASE_PATH, "EtlData")
class Processing(object):
# 读取数据
@staticmethod
def _get_data_(name):
data_name = os.path.join(RAW_DATA_PATH, '{}.csv'.format(name))
df = pd.read_csv(data_name)
return df
# 改变变量类型节省内存空间
@staticmethod
def _reduce_mem_usage_(df, verbose=True):
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
start_mem = df.memory_usage().sum() / 1024**2
for col in df.columns:
col_type = df[col].dtypes
if col_type in numerics:
c_min = df[col].min()
c_max = df[col].max()
if str(col_type)[:3] == 'int':
if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
df[col] = df[col].astype(np.int8)
elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
df[col] = df[col].astype(np.int16)
elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
df[col] = df[col].astype(np.int32)
elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
df[col] = df[col].astype(np.int64)
else:
if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
df[col] = df[col].astype(np.float16)
elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
df[col] = df[col].astype(np.float32)
else:
df[col] = df[col].astype(np.float64)
end_mem = df.memory_usage().sum() / 1024**2
if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
return df
# 众数填充异常值
@staticmethod
def _mode_fill_(df, col):
df.loc[df[col] == 0, col] = df[col].mode()
return df
# 疯狂找特征呀呀呀
@staticmethod
def _feature_(df):
df['缴费金额是否能覆盖当月账单'] = df['缴费用户最近一次缴费金额(元)'] - df['用户账单当月总费用(元)']
df['最近一次缴费是否超过平均消费额'] = df['缴费用户最近一次缴费金额(元)'] - df['用户近6个月平均消费值(元)']
df['当月账单是否超过平均消费额'] = df['用户账单当月总费用(元)'] - df['用户近6个月平均消费值(元)']
df['是否去过高档商场'] = df['当月是否逛过福州仓山万达'] * df['当月是否到过福州山姆会员店']
df['交通类应用使用次数'] = df['当月飞机类应用使用次数'] + df['当月火车类应用使用次数']
# 缴费方式
df['缴费方式'] = 0
df['缴费方式'][(df['缴费用户最近一次缴费金额(元)'] % 10 == 0) & df['缴费用户最近一次缴费金额(元)'] != 0] = 1
# 消费稳定性 话费/余额
df['缴费稳定性'] = df['用户账单当月总费用(元)'] / (df['用户近6个月平均消费值(元)'] + 1)
df['当月话费使用率'] = df['用户账单当月总费用(元)'] / (df['用户当月账户余额(元)'] + 1)
return df
# 年龄分箱
@staticmethod
def _group_age_(x):
if x <= 18:
return 1
elif x <= 30:
return 2
elif x <= 35:
return 3
elif x <= 45:
return 4
else:
return 5
# 长尾数据处理
@staticmethod
def _log_feature_(df):
user_bill_features = ['缴费用户最近一次缴费金额(元)', '用户近6个月平均消费值(元)',
'用户账单当月总费用(元)', '用户当月账户余额(元)']
log_features = ['当月网购类应用使用次数', '当月金融理财类应用使用总次数', '当月视频播放类应用使用次数']
for col in user_bill_features + log_features:
df[col] = df[col].map(lambda x: np.log1p(x))
return df
def get_processing(self):
train_df = self._get_data_('train_dataset')
test_df = self._get_data_('test_dataset')
train_df = self._reduce_mem_usage_(train_df)
test_df = self._reduce_mem_usage_(test_df)
test_df['信用分'] = -1
data = pd.concat([train_df, test_df], axis=0, ignore_index=True)
del train_df, test_df
gc.collect()
data = self._mode_fill_(data, '用户年龄')
data = self._feature_(data)
# data['年龄段'] = data['用户年龄'].apply(self._group_age_)
data = self._log_feature_(data)
train, test = data[:50000], data[50000:]
test = test.drop(['信用分'], axis=1)
train_data_name = os.path.join(ETL_DATA_PATH, 'train_data.csv')
test_data_name = os.path.join(ETL_DATA_PATH, 'test_data.csv')
train.to_csv(train_data_name, index=False)
test.to_csv(test_data_name, index=False)
print('Gen train shape: {}, test shape: {}'.format(train.shape, test.shape))
print('features num: ', test.shape[1] - 1)
if __name__ == "__main__":
t0 = time.time()
processing = Processing()
processing.get_processing()
print("Feature engineering has finished!")
print("Cost {} s.".format(time.time() - t0))