In [1]:
import pandas as pd
import numpy as np
import gc
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans

In [2]:
import pickle
def savePickle(target, filename):
    """Pickle ``target`` and write it to the file at ``filename``.

    The file is opened in binary write mode, so any existing file at that
    path is overwritten.
    """
    with open(filename, "wb") as sink:
        pickle.Pickler(sink).dump(target)
        
def loadPickle(filename):
    """Open ``filename`` in binary mode and return the object unpickled from it.

    NOTE: unpickling can execute arbitrary code, so this should only be used
    on files that come from a trusted source.
    """
    with open(filename, "rb") as source:
        restored = pickle.Unpickler(source).load()
    return restored
    
def getLabelDict(labelDataFrame):
    """Build a two-way lookup between column positions and trimmed column names.

    For every column of ``labelDataFrame`` the first three characters of the
    column name are dropped (in this notebook the frame comes from
    ``pd.get_dummies`` on a column named ``cs``, so presumably that strips
    the ``cs_`` prefix). The returned dict maps

    * position (int)  -> trimmed name, and
    * trimmed name    -> position (int).
    """
    lookup = {}
    for position, column in enumerate(labelDataFrame.columns):
        trimmed = column[3:]
        lookup[position] = trimmed
        lookup[trimmed] = position
    return lookup

In [3]:
# train = pd.read_csv("./data/train_all.csv",low_memory =False)
# test = pd.read_csv("./data/republish_test.csv", low_memory =False)
# train = pd.read_csv("../data/train.csv",low_memory =False)
# test = pd.read_csv("../data/test.csv", low_memory =False)

In [4]:
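# # Test set: the missing values are the fees of the two middle months, so they can be estimated from the fees of the months before and after.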
# test.loc[test['2_total_fee'] == '\\N','3_total_fee'] = test.loc[test['2_total_fee'] == '\\N','1_total_fee']*0.25 + test.loc[test['2_total_fee'] == '\\N','4_total_fee']*0.75
# test.loc[test['2_total_fee'] == '\\N','2_total_fee'] = test.loc[test['2_total_fee'] == '\\N','1_total_fee']*0.75 + test.loc[test['2_total_fee'] == '\\N','4_total_fee']*0.25
# test.loc[test['3_total_fee'] == '\\N','3_total_fee'] = test.loc[test['3_total_fee'] == '\\N','1_total_fee']*0.2 + test.loc[test['3_total_fee'] == '\\N','4_total_fee']*0.75
# # Training set: simply drop every record that has a missing value; there are very few of them, so the result is not affected.
# train = train[train['2_total_fee'] != '\\N']
# train = train[train['3_total_fee'] != '\\N']
# train = train[train['gender'] != '\\N']
# train = train[train['age'] != '\\N'].copy()

# train['2_total_fee'] = train['2_total_fee'].apply(float)
# train['3_total_fee'] = train['3_total_fee'].apply(float)
# train['gender'] = train['gender'].apply(int)
# train['age'] = train['age'].apply(int)
# test['2_total_fee'] = test['2_total_fee'].apply(float)
# test['3_total_fee'] = test['3_total_fee'].apply(float)

In [5]:
    
# savePickle(train, "../data/train.pkl")
# savePickle(test, "../data/test.pkl")

    
# Reload the cached train/test frames (presumably written by the commented-out
# savePickle calls above). These are pickle files: only load them from a
# trusted location, since unpickling can execute arbitrary code.
train, test = loadPickle("../data/train.pkl"), loadPickle("../data/test.pkl")

In [6]:
# Merge the training and test sets so that feature processing is applied to
# both in exactly the same way. Test rows are tagged with current_service == -1
# so they can be told apart from training rows after the merge.
test['current_service'] = -1
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0;
# pd.concat is the supported replacement. sort=True keeps the column ordering
# the old call produced (columns sorted when the two frames are not aligned)
# and silences the FutureWarning about the default changing.
data = pd.concat([train, test], sort=True)
# These columns may still hold string values after loading, so cast them to
# numeric types before any arithmetic is done on them.
data['2_total_fee'] = data['2_total_fee'].apply(float)
data['3_total_fee'] = data['3_total_fee'].apply(float)
data['gender'] = data['gender'].apply(int)
data['age'] = data['age'].apply(int)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.


  sort=sort)


In [7]:
# Derived fee / traffic features, added to `data` in place.
fee_columns = ['1_total_fee', '2_total_fee', '3_total_fee', '4_total_fee']
month1_fee = np.array(data['1_total_fee'])

# same_fee: the month-1 fee when (a) months 1 and 2 have the same fee and
# (b) the month-1 fee truncated to an integer still equals the month-2 fee;
# 0 otherwise.
fees_equal = np.array(data['1_total_fee'] == data['2_total_fee']).astype(int)
truncated_matches = month1_fee.astype(int) == np.array(data['2_total_fee'])
data['same_fee'] = fees_equal * truncated_matches * month1_fee

# Row-wise minimum, maximum and spread over the four monthly fees.
data['min_fee'] = data[fee_columns].min(axis=1)
data['max_fee'] = data[fee_columns].max(axis=1)
data['range_fee'] = data['max_fee'] - data['min_fee']

# Monthly traffic minus local monthly traffic (the column spelling with three
# f's mirrors the existing 'local_trafffic_month' column name).
data['non_local_trafffic'] = (
    np.array(data['month_traffic']) - np.array(data['local_trafffic_month'])
)

In [8]:
# Column groups used by the encoding step below.

# Columns treated as categorical (cast to int, then one-hot encoded).
category_features = [
    'complaint_level',
    'contract_type',
    'gender',
    'is_mix_service',
    'is_promise_low_consume',
    'many_over_bill',
    'net_service',
    'service_type',
    'same_fee',
]

# Continuous columns holding float values.
floatcontinous_features = [
    '1_total_fee',
    '2_total_fee',
    '3_total_fee',
    '4_total_fee',
    'former_complaint_fee',
    'last_month_traffic',
    'local_caller_time',
    'local_trafffic_month',
    'month_traffic',
    'pay_num',
    'service1_caller_time',
    'service2_caller_time',
    'min_fee',
    'max_fee',
    'range_fee',
    'non_local_trafffic',
]

# Continuous columns holding integer values.
intcontinous_features = [
    'age',
    'contract_time',
    'former_complaint_num',
    'online_time',
    'pay_times',
]
# Alternative, smaller selection kept for reference:
# intcontinous_features = ['contract_time', 'former_complaint_num']

In [9]:
# kmeans1 = KMeans(n_clusters=40, random_state=0).fit(data[floatcontinous_features])
# kmeans2 = KMeans(n_clusters=12, random_state=0).fit(data[intcontinous_features])

In [10]:
def _log_standardize(values):
    """Log-smooth one numeric column and standardize it to zero mean, unit std.

    Steps (all in float32):
      1. shift so the minimum becomes 1, then take the natural log;
      2. divide by the maximum, squeezing the values into [0, 1];
      3. subtract the mean and divide by the standard deviation.

    Note: a constant column makes step 2 divide 0 by 0, so the result is NaN.
    """
    v = np.array(values).astype(np.float32)
    v = np.log(v - v.min() + 1)
    v = v / v.max()
    return (v - v.mean()) / v.std()


data_encode = pd.DataFrame()

# Categorical columns: cast to int, then to object dtype so that
# pd.get_dummies below one-hot encodes them. The builtin `object` is used
# because the `np.object` alias was removed in NumPy 1.24.
for feature in category_features:
    data_encode[feature] = np.array(data[feature]).astype(int).astype(object)

# Continuous columns (float-valued first, then int-valued) all go through the
# same log + standardize transform.
for feature in [*floatcontinous_features, *intcontinous_features]:
    data_encode[feature] = _log_standardize(data[feature])

# Idea kept for reference: cluster the continuous variables to mine further
# information from them.
# data_encode['kmeans1'] = np.array(kmeans1.labels_).astype(object)
# data_encode['kmeans2'] = np.array(kmeans2.labels_).astype(object)

# One-hot encode the object-dtype (categorical) columns; numeric columns are
# passed through unchanged.
data_encode = pd.get_dummies(data_encode)

# Re-attach the identifier and the target, positionally (via plain arrays).
data_encode['user_id'] = np.array(data['user_id'])
data_encode['current_service'] = np.array(data['current_service'])

In [11]:
# data_encode = pd.DataFrame()
# for feature in [*category_features, *intcontinous_features]:
#     data_encode[feature] = np.array(data[feature]).astype(int).astype(np.object)

# for feature in floatcontinous_features:
#     v = np.array(data[feature]).astype(np.float32)
#     v = np.log(v - v.min() + 1) # 平移到 大于1 的正整数空间，然后取对数平滑。
#     v = np.array(v/v.max()*100).astype(int) #转换成0-100的整数
#     for i in range(100):
#         data_encode['feature_%d'%i] = np.array(v == i).astype(int)
    
    

# data_encode = pd.get_dummies(data_encode)
# data_encode['user_id'] = np.array(data['user_id'])
# data_encode['current_service'] = np.array(data['current_service'])

In [12]:
# Display the (rows, columns) shape of the encoded frame as a quick sanity check.
data_encode.shape

(943986, 483)

In [13]:
# def print_stats(tdf):
#     dct = Counter(tdf)
#     cn = len(tdf)
#     for key in dct.keys():
#         print(key, round(dct[key]/cn, 3))
#     print(cn, end = "\n\n")
    
# odf = train['current_service']
# print_stats(odf)
# # tdf = train[train['former_complaint_fee'] > 0][train['former_complaint_fee'] < 100]['current_service']
# tdf = train[train['3_total_fee'] < 0 ]['current_service']
# print_stats(tdf)

In [14]:
# Test data: rows that were tagged with current_service == -1 before merging.
testData = data_encode[data_encode.current_service == -1]
featureTest = testData.drop(['current_service', 'user_id'], axis = 1)
# Keep the user ids of the test rows in their own frame.
TestResult = testData[['user_id']].copy()

# Training data: every remaining row.
trainData = data_encode[data_encode.current_service != -1]
feature = trainData.drop(['current_service', 'user_id'],axis = 1)

# Label encoding: one-hot encode current_service and build the lookup between
# one-hot column position and service id. The builtins `int` / `object` are
# used because the `np.int` / `np.object` aliases were removed in NumPy 1.24.
label = pd.DataFrame()
label['cs'] = np.array(trainData['current_service']).astype(int).astype(object)
label = pd.get_dummies(label)
label_dict = getLabelDict(label)

In [15]:
# Split the train and test feature matrices into their continuous columns and
# the remaining (one-hot) columns.
continous_col = floatcontinous_features + intcontinous_features

train_x_continuous = feature[continous_col]
train_x_onehot = feature.drop(columns=continous_col)

test_x_continous = featureTest[continous_col]
test_x_onehot = featureTest.drop(columns=continous_col)

In [16]:
# Persist every artefact of this preprocessing run, in a fixed order.
for artefact, path in [
    (train_x_continuous, "../data/normaldata/train_x_continuous.pkl"),
    (train_x_onehot, "../data/normaldata/train_x_onehot.pkl"),
    (test_x_continous, "../data/normaldata/test_x_continous.pkl"),
    (test_x_onehot, "../data/normaldata/test_x_onehot.pkl"),
    (feature, "../data/normaldata/train_x.pkl"),
    (featureTest, "../data/normaldata/test_x.pkl"),
    (label, "../data/normaldata/train_y.pkl"),
    (label_dict, "../data/normaldata/label_dict.pkl"),
    (TestResult, "../data/normaldata/TestResult.pkl"),
]:
    savePickle(artefact, path)

In [17]:
# savePickle(feature, "./data/onehot/train_x.pkl")
# savePickle(featureTest, "./data/onehot/test_x.pkl")
# savePickle(label, "./data/onehot/train_y.pkl")
# savePickle(label_dict, "./data/onehot/label_dict.pkl")
# savePickle(TestResult, "./data/onehot/TestResult.pkl")