In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import random
import math
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
def _split_date_column(dataframe: pd.DataFrame, column: str) -> pd.DataFrame:
    """
    Replace a YYYYMMDD-style column with separate year/month/day columns.

    Each value is converted with ``str`` and sliced at fixed positions
    (characters 0-4, 4-6 and 6-8). The three new integer columns are named
    ``<column>_Year``, ``<column>_Mouth`` and ``<column>_Day`` (the
    ``Mouth`` spelling is kept because it is part of the output schema).

    :param dataframe: pd.DataFrame,
        frame to extend; the new columns are added to it in place
    :param column: str,
        name of the date column to split
    :return: pd.DataFrame,
        the frame with the three new columns and without ``column``
    """
    date_parts = [
        [int(text[0:4]), int(text[4:6]), int(text[6:8])]
        for text in map(str, dataframe[column].values)
    ]
    # reshape keeps the array two-dimensional even when there are no rows,
    # so the column slicing below also works for an empty frame.
    date_array = np.array(date_parts, dtype=int).reshape(-1, 3)
    dataframe[column + '_Year'] = date_array[:, 0]
    dataframe[column + '_Mouth'] = date_array[:, 1]
    dataframe[column + '_Day'] = date_array[:, 2]
    return dataframe.drop(labels=column, axis=1)


def precess_null(dataframe: pd.DataFrame) -> pd.DataFrame:
    """
    Fill missing values and derive features for the data set.

    Steps, in order: drop ``SaleID`` and ``name``; split ``regDate`` into
    year/month/day columns; fill missing ``model``, ``bodyType``,
    ``fuelType`` and ``gearbox`` values; clamp ``power`` to [10, 600] and
    record which side was clamped in ``power_normal_label``; map
    ``notRepairedDamage`` to integers; replace ``regionCode`` with
    ``regionPriceLevel`` and ``regionBodyType_0`` .. ``regionBodyType_7``;
    drop ``offerType``; split ``creatDate`` into year/month/day columns.

    The region features are looked up in the module-level dictionaries
    ``region_tag_1_map`` and ``region_tag_2_map``, which must be defined
    before this function is called on a non-empty frame.

    :param dataframe: pd.DataFrame,
        data set to process; it is not modified
    :return: pd.DataFrame,
        new data set with missing values filled and features derived
    :raises KeyError:
        if an expected column is missing, or if ``notRepairedDamage``
        contains a value other than ``'0.0'``, ``'1.0'`` or ``'-'``
    """
    # drop() returns a new frame, so the caller's frame is left untouched by
    # the column assignments below.
    dataframe = dataframe.drop(labels='SaleID', axis=1)
    dataframe = dataframe.drop(labels='name', axis=1)

    # Split the registration date regDate into year / month / day columns.
    dataframe = _split_date_column(dataframe, 'regDate')

    # model: the original author notes a single missing value, filled with
    # 0, described as the most common model.
    # bodyType / fuelType / gearbox: the original author notes roughly
    # 5% / 8% / 5% missing values, so -1 is used as an explicit "missing"
    # marker instead of guessing a category.
    fill_values = {'model': 0, 'bodyType': -1, 'fuelType': -1, 'gearbox': -1}
    for column, fill_value in fill_values.items():
        dataframe[column] = dataframe[column].fillna(value=fill_value)

    # Engine power contains many outliers: clamp it to the accepted range
    # and flag clamped rows (-1 = below range, 1 = above range, 0 = within
    # range). NaN compares false on both sides, so it stays NaN with flag 0.
    power_lower_bound = 10
    power_upper_bound = 600
    power_values = dataframe['power'].values
    power_labels = np.where(
        power_values < power_lower_bound,
        -1,
        np.where(power_values > power_upper_bound, 1, 0),
    )
    dataframe['power'] = np.clip(
        power_values, power_lower_bound, power_upper_bound
    )
    dataframe['power_normal_label'] = power_labels

    # Replace the string codes of notRepairedDamage with integers; an
    # unexpected value raises KeyError rather than being silently kept.
    notRepairedDamage_replace_map = {'0.0': 0, '1.0': 1, '-': -1}
    dataframe['notRepairedDamage'] = [
        notRepairedDamage_replace_map[value]
        for value in dataframe['notRepairedDamage'].values
    ]

    # TODO(Zhang Weijian): convert the region codes.
    # Region codes absent from the lookup tables get 0 / eight zeros.
    region_codes = dataframe['regionCode'].values
    default_tag_2 = [0] * 8
    tag_1_list = [region_tag_1_map.get(code, 0) for code in region_codes]
    tag_2_list = [
        region_tag_2_map.get(code, default_tag_2) for code in region_codes
    ]
    dataframe['regionPriceLevel'] = tag_1_list
    if tag_2_list:
        tag_2_array = np.array(tag_2_list)
    else:
        # np.array([]) would be one-dimensional and break the slicing below.
        tag_2_array = np.zeros((0, 8), dtype=int)
    for i in range(8):
        dataframe['regionBodyType_%d' % i] = tag_2_array[:, i]
    dataframe = dataframe.drop(labels='regionCode', axis=1)

    # Drop offerType: the original author notes every row holds the same
    # value, so the column carries no information.
    dataframe = dataframe.drop(labels='offerType', axis=1)

    # Split the creation date creatDate into year / month / day columns.
    dataframe = _split_date_column(dataframe, 'creatDate')
    return dataframe