# 天池大赛·印象盐城

## 特征工程

初赛提供2012年1月-2017年10月盐城分车型销量配置数据。
第一阶段需要参赛者预测2017年11月盐城分车型销量数据，第二阶段需要参赛者预测2017年12月盐城分车型销量数据。

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler

In [2]:
train = pd.read_csv('../[new] yancheng_train_20171226.csv', low_memory=False)
test = pd.read_csv('../yancheng_testA_20171225.csv')

### 一、异常值处理

1.1 发现数据里相同车型在同一月份里有两条记录，这里做简单加和

In [3]:
labels = ['sale_date','class_id','brand_id','compartment','type_id','level_id','department_id','TR','gearbox_type','displacement','if_charging',
          'price_level','driven_type_id','fuel_type_id','newenergy_type_id','emission_standards_id','if_MPV_id','if_luxurious_id','power',
          'cylinder_number','engine_torque','car_length','car_width','car_height','total_quality','equipment_quality','rated_passenger',
          'wheelbase','front_track','rear_track']
train = train.groupby(labels).agg('sum').reset_index()
train['brand_id'].unique()

array([761, 106,  98, 836,  12, 814, 831, 750, 537, 450, 692, 985, 841,
       638, 872, 953, 304, 783, 637,  75, 923, 497, 813, 290, 807, 864,
       498, 236, 542, 512, 294,  49, 126, 682,  68,  76], dtype=int64)

1.2 三个功率为81/70的条目里共有两款车

    其中一款车有记载的功率为66和81，我们令值为81
    
    另一款车有记载的功率为66，70,81和96，也令值为81

In [4]:
train.loc[12382,'power']=81
train.loc[13478,'power']=81
train.loc[13450,'power']=81
train[['power']] = train[['power']].astype('float32')

1.3 发动机扭矩

    有三个功率是'155/140'，就当是155吧
    
    还有一个车型是'-'，给个平均值吧

In [5]:
train.loc[12382,'engine_torque']=155
train.loc[13478,'engine_torque']=155
train.loc[13450,'engine_torque']=155
train['engine_torque'][train['engine_torque']=='-']=201.8
train['engine_torque'] = train['engine_torque'].astype('float32')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


### 二、特征处理

**训练数据**

In [6]:
labels = ['class_id','brand_id','compartment','type_id','level_id','department_id','TR','gearbox_type','displacement','if_charging',
          'price_level','driven_type_id','fuel_type_id','newenergy_type_id','emission_standards_id','if_MPV_id','if_luxurious_id','power',
          'cylinder_number','engine_torque','car_length','car_width','car_height','total_quality','equipment_quality','rated_passenger',
          'wheelbase','front_track','rear_track']
trainLabels = train[labels].drop_duplicates().reset_index(drop=True)

In [7]:
trainDate = train
trainDate['year'] = trainDate['sale_date'].apply(lambda x: int(str(x)[0:4]))
trainDate['mouth'] = trainDate['sale_date'].apply(lambda x: int(str(x)[4:6]))
trainDate = trainDate.drop(['sale_date'], axis=1)

In [8]:
trainSaleDate = trainLabels

# 2012年1月的销量信息
trainTmp = trainDate[(trainDate['year']==2012)&(trainDate['mouth']==1)]
# trainTmp = trainTmp[trainTmp['mouth']==1]
trainTmp.loc[:, 'how_many_mouth_has_sell'] = 1
sale_class_count = len(trainTmp)

trainSaleDate = pd.merge(trainSaleDate, trainTmp, how='left')
trainSaleDate['year'].fillna(2012.0, inplace=True)
trainSaleDate['mouth'].fillna(1.0, inplace=True)
# trainSaleDate['how_many_mouth_has_sell'].fillna(0.0, inplace=True)
trainSaleDate.loc[:, 'sale_quantity_total_last_mouth'] = 0
trainSaleDate.loc[:, 'sale_quantity_this_last_mouth'] = 0
trainSaleDate.loc[:, 'this_mouth_sale_class_count'] = sale_class_count
# trainSaleDate

labels = ['class_id','sale_quantity','brand_id','compartment','type_id','level_id','department_id','TR','gearbox_type','displacement',
          'if_charging','price_level','driven_type_id','fuel_type_id','newenergy_type_id','emission_standards_id','if_MPV_id','if_luxurious_id',
          'power','cylinder_number','engine_torque','car_length','car_width','car_height','total_quality','equipment_quality','rated_passenger',
          'wheelbase','front_track','rear_track','how_many_mouth_has_sell']
labels_year = ['class_id','sale_quantity','brand_id','compartment','type_id','level_id','department_id','TR','gearbox_type','displacement','if_charging',
               'price_level','driven_type_id','fuel_type_id','newenergy_type_id','emission_standards_id','if_MPV_id','if_luxurious_id','power',
               'cylinder_number','engine_torque','car_length','car_width','car_height','total_quality','equipment_quality','rated_passenger',
               'wheelbase','front_track','rear_track']
for mouth in range(2, 13):
    trainSaleDateTmp = trainLabels
    trainTmp = trainDate[(trainDate['year']==2012)&(trainDate['mouth']==mouth)]
    
    sellMouth = trainSaleDate[labels][(trainSaleDate['year']==2012)&(trainSaleDate['mouth']==(mouth-1))]
    sellMouth.dropna(inplace=True)
    sellMouth['how_many_mouth_has_sell'] = sellMouth['how_many_mouth_has_sell']+1
    sale_quantity_total_last_mouth = sellMouth['sale_quantity'].sum()
    sellMouth.rename(columns={'sale_quantity': 'sale_quantity_this_last_mouth'}, inplace=True)
    
    trainTmp = pd.merge(trainTmp, sellMouth, how='left')
    trainTmp['how_many_mouth_has_sell'].fillna(1.0, inplace=True)
    sale_class_count = len(trainTmp)
    
    trainSaleDateTmp = pd.merge(trainSaleDateTmp, trainTmp, how='left')
    trainSaleDateTmp['year'].fillna(2012.0, inplace=True)
    trainSaleDateTmp['mouth'].fillna(mouth, inplace=True)
    trainSaleDateTmp.loc[:, 'sale_quantity_total_last_mouth'] = sale_quantity_total_last_mouth
    trainSaleDateTmp.loc[:, 'this_mouth_sale_class_count'] = sale_class_count
    
    trainSaleDate = pd.concat([trainSaleDate, trainSaleDateTmp])
    
trainSaleDate.loc[:, 'sale_quantity_total_last_year'] = 0
trainSaleDate.loc[:, 'sale_quantity_this_last_year'] = 0

for year in range(2013, 2017):
    for mouth in range(1, 13):
        trainSaleDateTmp = trainLabels
        trainTmp = trainDate[(trainDate['year']==year)&(trainDate['mouth']==mouth)]
        
        if mouth == 1:
            sellMouth = trainSaleDate[labels][(trainSaleDate['year']==(year-1))&(trainSaleDate['mouth']==12)]
        else:
            sellMouth = trainSaleDate[labels][(trainSaleDate['year']==year)&(trainSaleDate['mouth']==(mouth-1))]
        sellMouth.dropna(inplace=True)
        sellMouth['how_many_mouth_has_sell'] = sellMouth['how_many_mouth_has_sell']+1
        sale_quantity_total_last_mouth = sellMouth['sale_quantity'].sum()
        sellMouth.rename(columns={'sale_quantity': 'sale_quantity_this_last_mouth'}, inplace=True)
        
        sellYear = trainSaleDate[labels_year][((trainSaleDate['year']==(year-1))&(trainSaleDate['mouth']>=mouth))|((trainSaleDate['year']==year)&(trainSaleDate['mouth']<mouth))]
        sellYear.dropna(inplace=True)
        sellYear = sellYear.groupby('class_id').agg('sum').reset_index()
        sale_quantity_total_last_year = sellYear['sale_quantity'].sum()
        sellYear.rename(columns={'sale_quantity': 'sale_quantity_this_last_year'}, inplace=True)
        
        trainTmp = pd.merge(trainTmp, sellMouth, how='left')
        trainTmp['how_many_mouth_has_sell'].fillna(1.0, inplace=True)
        sale_class_count = len(trainTmp)
        
        trainTmp = pd.merge(trainTmp, sellYear, how='left')
        
        trainSaleDateTmp = pd.merge(trainSaleDateTmp, trainTmp, how='left')
        trainSaleDateTmp['year'].fillna(year, inplace=True)
        trainSaleDateTmp['mouth'].fillna(mouth, inplace=True)
        trainSaleDateTmp.loc[:, 'sale_quantity_total_last_mouth'] = sale_quantity_total_last_mouth
        trainSaleDateTmp.loc[:, 'sale_quantity_total_last_year'] = sale_quantity_total_last_year
        trainSaleDateTmp.loc[:, 'this_mouth_sale_class_count'] = sale_class_count
        
        trainSaleDate = pd.concat([trainSaleDate, trainSaleDateTmp])
        
for year in range(2017, 2018):
    for mouth in range(1, 11):
        trainSaleDateTmp = trainLabels
        trainTmp = trainDate[(trainDate['year']==year)&trainDate['mouth']==mouth]
        
        if mouth == 1:
            sellMouth = trainSaleDate[labels][(trainSaleDate['year']==(year-1))&(trainSaleDate['mouth']==12)]
        else:
            sellMouth = trainSaleDate[labels][(trainSaleDate['year']==year)&(trainSaleDate['mouth']==(mouth-1))]
        sellMouth.dropna(inplace=True)
        sellMouth['how_many_mouth_has_sell'] = sellMouth['how_many_mouth_has_sell']+1
        sale_quantity_total_last_mouth = sellMouth['sale_quantity'].sum()
        sellMouth.rename(columns={'sale_quantity': 'sale_quantity_this_last_mouth'}, inplace=True)
        
        sellYear = trainSaleDate[labels_year][((trainSaleDate['year']==(year-1))&(trainSaleDate['mouth']>=mouth))|((trainSaleDate['year']==year)&(trainSaleDate['mouth']<mouth))]
        sellYear.dropna(inplace=True)
        sellYear = sellYear.groupby('class_id').agg('sum').reset_index()
        sale_quantity_total_last_year = sellYear['sale_quantity'].sum()
        sellYear.rename(columns={'sale_quantity': 'sale_quantity_this_last_year'}, inplace=True)
        
        trainTmp = pd.merge(trainTmp, sellMouth, how='left')
        trainTmp['how_many_mouth_has_sell'].fillna(1.0, inplace=True)
        sale_class_count = len(trainTmp)
        
        trainTmp = pd.merge(trainTmp, sellYear, how='left')
        
        trainSaleDateTmp = pd.merge(trainSaleDateTmp, trainTmp, how='left')
        trainSaleDateTmp['year'].fillna(year, inplace=True)
        trainSaleDateTmp['mouth'].fillna(mouth, inplace=True)
        trainSaleDateTmp.loc[:, 'sale_quantity_total_last_mouth'] = sale_quantity_total_last_mouth
        trainSaleDateTmp.loc[:, 'sale_quantity_total_last_year'] = sale_quantity_total_last_year
        trainSaleDateTmp.loc[:, 'this_mouth_sale_class_count'] = sale_class_count
        
        trainSaleDate = pd.concat([trainSaleDate, trainSaleDateTmp])
        
    for mouth in range(11, 12):
        trainSaleDateTmp = trainLabels
        
        sellMouth = trainSaleDate[labels][(trainSaleDate['year']==year)&(trainSaleDate['mouth']==(mouth-1))]
        sellMouth.dropna(inplace=True)
        sellMouth['how_many_mouth_has_sell'] = sellMouth['how_many_mouth_has_sell']+1
        sale_quantity_total_last_mouth = sellMouth['sale_quantity'].sum()
        sellMouth.rename(columns={'sale_quantity': 'sale_quantity_this_last_mouth'}, inplace=True)
        
        sellYear = trainSaleDate[labels_year][((trainSaleDate['year']==(year-1))&(trainSaleDate['mouth']>=mouth))|((trainSaleDate['year']==year)&(trainSaleDate['mouth']<mouth))]
        sellYear.dropna(inplace=True)
        sellYear = sellYear.groupby('class_id').agg('sum').reset_index()
        sale_quantity_total_last_year = sellYear['sale_quantity'].sum()
        sellYear.rename(columns={'sale_quantity': 'sale_quantity_this_last_year'}, inplace=True)
        
        trainSaleDateTmp = pd.merge(trainSaleDateTmp, sellMouth, how='left')
        trainSaleDateTmp['how_many_mouth_has_sell'].fillna(1.0, inplace=True)
        sale_class_count = len(trainSaleDateTmp)
        
        trainSaleDateTmp = pd.merge(trainSaleDateTmp, sellYear, how='left')
        
        trainSaleDateTmp['year'].fillna(year, inplace=True)
        trainSaleDateTmp['mouth'].fillna(mouth, inplace=True)
        trainSaleDateTmp.loc[:, 'sale_quantity_total_last_mouth'] = sale_quantity_total_last_mouth
        trainSaleDateTmp.loc[:, 'sale_quantity_total_last_year'] = sale_quantity_total_last_year
        trainSaleDateTmp.loc[:, 'this_mouth_sale_class_count'] = sale_class_count
        trainSaleDateTmp.loc[:, 'sale_quantity'] = 0
        
        trainSaleDate = pd.concat([trainSaleDate, trainSaleDateTmp])
        

trainSaleDate.sort(['year', 'mouth', 'class_id'])
trainSaleDate.fillna(0.0, inplace=True)
trainSaleDate.to_csv('trainSaleDate.csv', index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


KeyError: 'year'

2.1 品牌ID共有36个，进行One-hot编码

In [7]:
brand_id_dummies = pd.get_dummies(trainLabels['brand_id'], prefix='brand_id')
trainLabels = pd.concat([trainLabels, brand_id_dummies], axis=1)
trainLabels = trainLabels.drop(['brand_id'], axis=1)
trainLabels.head()

Unnamed: 0,class_id,compartment,type_id,level_id,department_id,TR,gearbox_type,displacement,if_charging,price_level,...,brand_id_813,brand_id_814,brand_id_831,brand_id_836,brand_id_841,brand_id_864,brand_id_872,brand_id_923,brand_id_953,brand_id_985
0,125403,2,3,2,2,6,AT,2.4,L,35-50W,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,125403,2,3,2,2,6,AT,2.4,L,35-50W,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,125403,2,3,2,2,6,AT,3.0,L,35-50W,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,136916,3,2,2,5,4,AT,2.0,L,15-20W,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,136916,3,2,2,5,4,AT,2.0,L,15-20W,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


2.2 厢数有三种取值，进行One-hot编码

In [8]:
compartment_dummies = pd.get_dummies(trainLabels['compartment'], prefix='compartment')
trainLabels = pd.concat([trainLabels, compartment_dummies], axis=1)
trainLabels = trainLabels.drop(['compartment'], axis=1)
trainLabels.head()

Unnamed: 0,class_id,type_id,level_id,department_id,TR,gearbox_type,displacement,if_charging,price_level,driven_type_id,...,brand_id_836,brand_id_841,brand_id_864,brand_id_872,brand_id_923,brand_id_953,brand_id_985,compartment_1,compartment_2,compartment_3
0,125403,3,2,2,6,AT,2.4,L,35-50W,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,125403,3,2,2,6,AT,2.4,L,35-50W,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,125403,3,2,2,6,AT,3.0,L,35-50W,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,136916,2,2,5,4,AT,2.0,L,15-20W,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,136916,2,2,5,4,AT,2.0,L,15-20W,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


2.3 车型类别有四种

In [9]:
type_id_dummies = pd.get_dummies(trainLabels['type_id'], prefix='type_id')
trainLabels = pd.concat([trainLabels, type_id_dummies], axis=1)
trainLabels = trainLabels.drop(['type_id'], axis=1)
trainLabels.head()

Unnamed: 0,class_id,level_id,department_id,TR,gearbox_type,displacement,if_charging,price_level,driven_type_id,fuel_type_id,...,brand_id_923,brand_id_953,brand_id_985,compartment_1,compartment_2,compartment_3,type_id_1,type_id_2,type_id_3,type_id_4
0,125403,2,2,6,AT,2.4,L,35-50W,1,1,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
1,125403,2,2,6,AT,2.4,L,35-50W,1,1,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
2,125403,2,2,6,AT,3.0,L,35-50W,1,1,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
3,136916,2,5,4,AT,2.0,L,15-20W,1,1,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
4,136916,2,5,4,AT,2.0,L,15-20W,1,1,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0


2.4 车型级别有六种

In [10]:
level_id_dummies = pd.get_dummies(trainLabels['level_id'], prefix='level_id')
trainLabels = pd.concat([trainLabels, level_id_dummies], axis=1)
trainLabels = trainLabels.drop(['level_id'], axis=1)
trainLabels.head()

Unnamed: 0,class_id,department_id,TR,gearbox_type,displacement,if_charging,price_level,driven_type_id,fuel_type_id,newenergy_type_id,...,type_id_1,type_id_2,type_id_3,type_id_4,level_id_-,level_id_1,level_id_2,level_id_3,level_id_4,level_id_5
0,125403,2,6,AT,2.4,L,35-50W,1,1,1,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,125403,2,6,AT,2.4,L,35-50W,1,1,1,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,125403,2,6,AT,3.0,L,35-50W,1,1,1,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,136916,5,4,AT,2.0,L,15-20W,1,1,1,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,136916,5,4,AT,2.0,L,15-20W,1,1,1,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


2.5 车型系别有七种

In [11]:
department_id_dummies = pd.get_dummies(trainLabels['department_id'], prefix='department_id')
trainLabels = pd.concat([trainLabels, department_id_dummies], axis=1)
trainLabels = trainLabels.drop(['department_id'], axis=1)
trainLabels.head()

Unnamed: 0,class_id,TR,gearbox_type,displacement,if_charging,price_level,driven_type_id,fuel_type_id,newenergy_type_id,emission_standards_id,...,level_id_3,level_id_4,level_id_5,department_id_1,department_id_2,department_id_3,department_id_4,department_id_5,department_id_6,department_id_7
0,125403,6,AT,2.4,L,35-50W,1,1,1,3,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,125403,6,AT,2.4,L,35-50W,1,1,1,3,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,125403,6,AT,3.0,L,35-50W,1,1,1,3,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,136916,4,AT,2.0,L,15-20W,1,1,1,3,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,136916,4,AT,2.0,L,15-20W,1,1,1,3,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


2.6 变速器档位有十种

In [12]:
TR_dummies = pd.get_dummies(trainLabels['TR'], prefix='TR')
trainLabels = pd.concat([trainLabels, TR_dummies], axis=1)
trainLabels = trainLabels.drop(['TR'], axis=1)
trainLabels.head()

Unnamed: 0,class_id,gearbox_type,displacement,if_charging,price_level,driven_type_id,fuel_type_id,newenergy_type_id,emission_standards_id,if_MPV_id,...,TR_0,TR_1,TR_4,TR_5,TR_5;4,TR_6,TR_7,TR_8,TR_8;7,TR_9
0,125403,AT,2.4,L,35-50W,1,1,1,3,2,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,125403,AT,2.4,L,35-50W,1,1,1,3,2,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,125403,AT,3.0,L,35-50W,1,1,1,3,2,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,136916,AT,2.0,L,15-20W,1,1,1,3,2,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,136916,AT,2.0,L,15-20W,1,1,1,3,2,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


2.7 变速器形式有七种

In [13]:
gearbox_type_dummies = pd.get_dummies(trainLabels['gearbox_type'], prefix='gearbox_type')
trainLabels = pd.concat([trainLabels, gearbox_type_dummies], axis=1)
trainLabels = trainLabels.drop(['gearbox_type'], axis=1)
trainLabels.head()

Unnamed: 0,class_id,displacement,if_charging,price_level,driven_type_id,fuel_type_id,newenergy_type_id,emission_standards_id,if_MPV_id,if_luxurious_id,...,TR_8,TR_8;7,TR_9,gearbox_type_AMT,gearbox_type_AT,gearbox_type_AT;DCT,gearbox_type_CVT,gearbox_type_DCT,gearbox_type_MT,gearbox_type_MT;AT
0,125403,2.4,L,35-50W,1,1,1,3,2,1,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,125403,2.4,L,35-50W,1,1,1,3,2,1,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,125403,3.0,L,35-50W,1,1,1,3,2,1,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,136916,2.0,L,15-20W,1,1,1,3,2,1,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,136916,2.0,L,15-20W,1,1,1,3,2,1,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


2.8 排量

    区间缩放

In [14]:
trainLabels.loc[:,'no_displacement']=0
trainLabels['no_displacement'][trainLabels['displacement']==0]=1
trainLabels['displacement'] = trainLabels['displacement'].apply(lambda x: (x-1.0)/(3.6-1.0))
trainLabels.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


Unnamed: 0,class_id,displacement,if_charging,price_level,driven_type_id,fuel_type_id,newenergy_type_id,emission_standards_id,if_MPV_id,if_luxurious_id,...,TR_8;7,TR_9,gearbox_type_AMT,gearbox_type_AT,gearbox_type_AT;DCT,gearbox_type_CVT,gearbox_type_DCT,gearbox_type_MT,gearbox_type_MT;AT,no_displacement
0,125403,0.538462,L,35-50W,1,1,1,3,2,1,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0
1,125403,0.538462,L,35-50W,1,1,1,3,2,1,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0
2,125403,0.769231,L,35-50W,1,1,1,3,2,1,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0
3,136916,0.384615,L,15-20W,1,1,1,3,2,1,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0
4,136916,0.384615,L,15-20W,1,1,1,3,2,1,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0


2.9 是否增压

In [None]:
if_charging_dummies = pd.get_dummies(trainLabels['if_charging'], prefix='if_charging')
trainLabels = pd.concat([trainLabels, if_charging_dummies], axis=1)
trainLabels = trainLabels.drop(['if_charging'], axis=1)
trainLabels.head()

2.10 成交段

In [None]:
price_level_dummies = pd.get_dummies(trainLabels['price_level'], prefix='price_level')
trainLabels = pd.concat([trainLabels, price_level_dummies], axis=1)
trainLabels = trainLabels.drop(['price_level'], axis=1)
trainLabels.head()

2.11 驱动形式

In [None]:
driven_type_id_dummies = pd.get_dummies(trainLabels['driven_type_id'], prefix='driven_type_id')
trainLabels = pd.concat([trainLabels, driven_type_id_dummies], axis=1)
trainLabels = trainLabels.drop(['driven_type_id'], axis=1)
trainLabels.head()

2.12 燃料种类

In [None]:
fuel_type_id_dummies = pd.get_dummies(trainLabels['fuel_type_id'], prefix='fuel_type_id')
trainLabels = pd.concat([trainLabels, fuel_type_id_dummies], axis=1)
trainLabels = trainLabels.drop(['fuel_type_id'], axis=1)
trainLabels.head()

2.13 新能源类型

In [None]:
newenergy_type_id_dummies = pd.get_dummies(trainLabels['newenergy_type_id'], prefix='newenergy_type_id')
trainLabels = pd.concat([trainLabels, newenergy_type_id_dummies], axis=1)
trainLabels = trainLabels.drop(['newenergy_type_id'], axis=1)
trainLabels.head()

2.14 排放标准

In [None]:
emission_standards_id_dummies = pd.get_dummies(trainLabels['emission_standards_id'], prefix='emission_standards_id')
trainLabels = pd.concat([trainLabels, emission_standards_id_dummies], axis=1)
trainLabels = trainLabels.drop(['emission_standards_id'], axis=1)
trainLabels.head()

2.15 是否微客MPV

In [None]:
if_MPV_id_dummies = pd.get_dummies(trainLabels['if_MPV_id'], prefix='if_MPV_id')
trainLabels = pd.concat([trainLabels, if_MPV_id_dummies], axis=1)
trainLabels = trainLabels.drop(['if_MPV_id'], axis=1)
trainLabels.head()

2.16 是否豪华

In [None]:
if_luxurious_id_dummies = pd.get_dummies(trainLabels['if_luxurious_id'], prefix='if_luxurious_id')
trainLabels = pd.concat([trainLabels, if_luxurious_id_dummies], axis=1)
trainLabels = trainLabels.drop(['if_luxurious_id'], axis=1)
trainLabels.head()

2.17 功率

    区间缩放

In [None]:
trainLabels['power'] = MinMaxScaler().fit_transform(trainLabels['power'].reshape(-1, 1))
trainLabels.head()

2.17 缸数

In [15]:
cylinder_number_dummies = pd.get_dummies(trainLabels['cylinder_number'], prefix='cylinder_number')
trainLabels = pd.concat([trainLabels, cylinder_number_dummies], axis=1)
trainLabels = trainLabels.drop(['cylinder_number'], axis=1)
trainLabels.head()

Unnamed: 0,class_id,displacement,if_charging,price_level,driven_type_id,fuel_type_id,newenergy_type_id,emission_standards_id,if_MPV_id,if_luxurious_id,...,gearbox_type_AT;DCT,gearbox_type_CVT,gearbox_type_DCT,gearbox_type_MT,gearbox_type_MT;AT,no_displacement,cylinder_number_0,cylinder_number_3,cylinder_number_4,cylinder_number_6
0,125403,0.538462,L,35-50W,1,1,1,3,2,1,...,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,1.0,0.0
1,125403,0.538462,L,35-50W,1,1,1,3,2,1,...,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,1.0,0.0
2,125403,0.769231,L,35-50W,1,1,1,3,2,1,...,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,1.0
3,136916,0.384615,L,15-20W,1,1,1,3,2,1,...,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,1.0,0.0
4,136916,0.384615,L,15-20W,1,1,1,3,2,1,...,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,1.0,0.0


2.18 发动机扭矩

    区间缩放

In [None]:
trainLabels['engine_torque'] = MinMaxScaler().fit_transform(trainLabels['engine_torque'].reshape(-1, 1))
trainLabels.head()

2.19 车长

    区间缩放

In [None]:
trainLabels['car_length'] = MinMaxScaler().fit_transform(trainLabels['car_length'].reshape(-1, 1))
trainLabels.head()

2.20 车宽

    区间缩放

In [None]:
trainLabels['car_width'] = MinMaxScaler().fit_transform(trainLabels['car_width'].reshape(-1, 1))
trainLabels.head()

2.21 车高

    区间缩放

In [None]:
trainLabels['car_height'] = MinMaxScaler().fit_transform(trainLabels['car_height'].reshape(-1, 1))
trainLabels.head()

2.22 质量

    区间缩放

In [None]:
trainLabels['total_quality'] = MinMaxScaler().fit_transform(trainLabels['total_quality'].reshape(-1, 1))
trainLabels.head()

2.23 装备质量

    区间缩放

In [None]:
trainLabels['equipment_quality'] = MinMaxScaler().fit_transform(trainLabels['equipment_quality'].reshape(-1, 1))
trainLabels.head()

2.24 轴距

    区间缩放

In [None]:
trainLabels['wheelbase'] = MinMaxScaler().fit_transform(trainLabels['wheelbase'].reshape(-1, 1))
trainLabels.head()

2.25 前轮距

    区间缩放

In [None]:
trainLabels['front_track'] = MinMaxScaler().fit_transform(trainLabels['front_track'].reshape(-1, 1))
trainLabels.head()

2.26 后轮距

    区间缩放

In [None]:
trainLabels['rear_track'] = MinMaxScaler().fit_transform(trainLabels['rear_track'].reshape(-1, 1))
trainLabels.head()

2.27 额定载客

In [None]:
rated_passenger_dummies = pd.get_dummies(trainLabels['rated_passenger'], prefix='rated_passenger')
trainLabels = pd.concat([trainLabels, rated_passenger_dummies], axis=1)
trainLabels = trainLabels.drop(['rated_passenger'], axis=1)
trainLabels.head()

### 三、写入文件

In [None]:
trainLabels.to_csv('train_feature.csv', index=False)