In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
#import missingno as msno
# Load the training and test sets; both files are read as space-delimited (sep=' ').
train_data = pd.read_csv(r'D:/jupyterfile/used_car_train_20200313.csv', sep=' ')
test_data = pd.read_csv(r'D:/jupyterfile/used_car_testB_20200421.csv', sep=' ')

数据探索

In [None]:
# Per-column overview of the training set: number of distinct values, percentage of
# missing values, percentage taken by the most frequent value, and dtype.
stats = [
    (
        feature,
        train_data[feature].nunique(),
        train_data[feature].isnull().sum() * 100 / train_data.shape[0],
        train_data[feature].value_counts(normalize=True, dropna=False).values[0] * 100,
        train_data[feature].dtype,
    )
    for feature in train_data.columns
]
stats_df = pd.DataFrame(
    stats,
    columns=['Feature', 'Unique_values', 'Percentage of missing values',
             'Percentage of values in the biggest category', 'type'],
)
# Most incomplete columns first.
stats_df = stats_df.sort_values('Percentage of missing values', ascending=False)
stats_df

In [None]:
# Inspect the object-typed field `notRepairedDamage` in both data sets.
# A notebook cell only renders its last expression, so evaluating the two
# value_counts() calls one after another silently discarded the training-set table.
# Put them side by side in one frame so both are displayed.
pd.concat(
    {
        'train': train_data['notRepairedDamage'].value_counts(),
        'test': test_data['notRepairedDamage'].value_counts(),
    },
    axis=1,
)

In [None]:
# Treat the placeholder '-' as a missing value.
# The result is assigned back to the column: calling replace(..., inplace=True) on a
# column selection is chained assignment, which is not guaranteed to modify the parent
# frame (it leaves the frame unchanged when pandas Copy-on-Write is active).
train_data['notRepairedDamage'] = train_data['notRepairedDamage'].replace('-', np.nan)
test_data['notRepairedDamage'] = test_data['notRepairedDamage'].replace('-', np.nan)

In [None]:
# Frequency of each `seller` value in the training set, most frequent first
train_data['seller'].value_counts().sort_values(ascending=False)

In [None]:
# Frequency of each `offerType` value in the training set, most frequent first
train_data["offerType"].value_counts().sort_values(ascending=False)

In [None]:
# Drop `seller` and `offerType` from the training frame.
# errors='ignore' keeps the cell re-runnable: the in-place drop raised KeyError on a
# second execution because the columns were already gone.
train_data = train_data.drop(columns=['seller', 'offerType'], errors='ignore')

In [None]:
# Bar chart of the number of missing values per column (only columns with any missing).
# The frame is named `train_data`; the original `Train_data` is never defined in this
# notebook and raised NameError.
missing = train_data.isnull().sum()
missing = missing[missing > 0].sort_values()
missing.plot.bar()

In [None]:
# Distribution of the price column, with three candidate distributions fitted on top.
import scipy.stats as st

# The frame is named `train_data`; the original `Train_data` is never defined in this
# notebook and raised NameError.
y = train_data['price']
candidate_fits = [
    ('Johnson SU', st.johnsonsu),
    ('Normal', st.norm),
    ('Log Normal', st.lognorm),
]
# One figure per candidate, numbered 1..3 as before.
for figure_no, (fit_title, fit_dist) in enumerate(candidate_fits, start=1):
    plt.figure(figure_no)
    plt.title(fit_title)
    sns.distplot(y, kde=False, fit=fit_dist)

In [None]:
# Skewness and kurtosis of the price column
print("Skewness: %f" % train_data['price'].skew())
print("Kurtosis: %f" % train_data['price'].kurt())
# Box plot of price
train_data['price'].plot(kind='box')

In [None]:
# Histogram of log(price), restricted to rows with price <= 20000
np.log(train_data[train_data['price']<=20000]['price']).hist()

In [None]:
# Correlation between the continuous features (and price)
num_features = ['power', 'kilometer', 'v_0', 'v_1', 'v_2', 'v_3', 'v_4', 'v_5', 'v_6', 'v_7', 'v_8', 'v_9', 'v_10', 'v_11', 'v_12', 'v_13','v_14','price' ]
cat_features = ['name', 'model', 'brand', 'bodyType', 'fuelType', 'gearbox', 'notRepairedDamage', 'regionCode']
num_feature_price=train_data[num_features]
colormap = plt.cm.magma
plt.figure(figsize=(16,12))
plt.title('Pearson correlation of continuous features', y=1.05, size=15)
# Annotated heatmap of the pairwise correlation matrix returned by DataFrame.corr()
sns.heatmap(num_feature_price.corr(),linewidths=0.1,vmax=1.0, square=True, 
            cmap=colormap, linecolor='white', annot=True)

In [None]:
# Distribution plots of the continuous features
num_features = ['power', 'kilometer', 'v_0', 'v_1', 'v_2', 'v_3', 'v_4', 'v_5', 'v_6', 'v_7', 'v_8', 'v_9', 'v_10', 'v_11', 'v_12', 'v_13','v_14','price' ]
cat_features = ['name', 'model', 'brand', 'bodyType', 'fuelType', 'gearbox', 'notRepairedDamage', 'regionCode']
num_feature_price=train_data[num_features]
# Print skewness and kurtosis for every continuous feature
for col in num_features:
    print('{:15}'.format(col), 
          '特征偏度: {:05.2f}'.format(num_feature_price[col].skew()) , 
          '   ' ,
          '特征峰度: {:06.2f}'.format(num_feature_price[col].kurt())  
         )
# Reshape to long format (variable, value) so FacetGrid can draw one panel per feature
f = pd.melt(train_data, value_vars=num_features)
g = sns.FacetGrid(f, col="variable",  col_wrap=3, sharex=False, sharey=False)
g = g.map(sns.distplot, "value")

In [None]:
# Three views of `power` restricted to values <= 600:
# box plot, histogram, and histogram of log(power + 1).
power_capped = train_data[train_data['power']<=600]['power']
plt.figure(figsize=[16,4])
plt.subplot(1,3,1)
power_capped.plot(kind='box')
plt.subplot(1,3,2)
power_capped.hist()
plt.subplot(1,3,3)
np.log(power_capped+1).hist()

In [None]:
# Prepare the categorical features for inspection:
# cast each one to the pandas `category` dtype and label missing entries with the
# explicit category 'nan'.
for feature in cat_features:
    as_category = train_data[feature].astype('category')
    if as_category.isnull().any():
        # The new level has to be registered before it can be used as a fill value.
        as_category = as_category.cat.add_categories(['nan']).fillna('nan')
    train_data[feature] = as_category

In [None]:
# Relationship between categorical features and price (rows with price <= 20000 only)
cat_features = ['name', 'model', 'brand', 'bodyType', 'fuelType', 'gearbox', 'notRepairedDamage', 'regionCode']
train_data_copy=train_data[train_data['price']<=20000]
plt.figure(figsize=[16,10])
# One box plot per feature on a 2x2 grid, filled row by row.
for panel_no, feature in enumerate(["bodyType", "fuelType", "gearbox", "notRepairedDamage"], start=1):
    plt.subplot(2,2,panel_no)
    ax = sns.boxplot(x=feature, y="price", data=train_data_copy)

In [None]:
# Split the date fields into year and month strings to study their relation to price
df_train=train_data.loc[:,['regDate','creatDate','price']]
# Work on the textual form of both dates
for date_col in ('regDate', 'creatDate'):
    df_train[date_col] = df_train[date_col].astype(str)
# Characters 0-3 are taken as the year, 4-5 as the month; years first, then months,
# so the resulting column order is regyear, creatyear, regmonth, creatmonth.
for part_name, char_span in (('year', slice(0, 4)), ('month', slice(4, 6))):
    for prefix in ('reg', 'creat'):
        df_train[prefix + part_name] = df_train[prefix + 'Date'].str[char_span]

In [None]:
# Number of rows per registration-year string
df_train['regyear'].value_counts()

In [None]:
# Price by registration year (top) and registration month (bottom), price <= 20000 only
df_train_copy=df_train[df_train['price']<=20000]
plt.figure(figsize=[16,10])
for panel_no, date_part in enumerate(["regyear", "regmonth"], start=1):
    plt.subplot(2,1,panel_no)
    ax = sns.boxplot(x=date_part, y="price", data=df_train_copy)

In [None]:
# Number of rows per creation-year string
df_train['creatyear'].value_counts()

In [None]:
# Number of rows per creation-month string
df_train['creatmonth'].value_counts()

In [None]:
# Price by creation year (left) and creation month (right), price <= 20000 only
df_train_copy=df_train[df_train['price']<=20000]
plt.figure(figsize=[16,4])
for panel_no, date_part in enumerate(["creatyear", "creatmonth"], start=1):
    plt.subplot(1,2,panel_no)
    ax = sns.boxplot(x=date_part, y="price", data=df_train_copy)

特征工程 建模调参 模型融合

In [None]:
from xgboost import XGBRegressor as XGBR
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
#import missingno as msno
import sys
import importlib
importlib.reload(sys)
import seaborn as sns
#import missingno as msno
from operator import itemgetter
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn import linear_model
importlib.reload(sys)
#import missingno as msno
from operator import itemgetter
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
# Reload the raw training and test sets (space-delimited) for the modelling part.
# NOTE(review): these are absolute paths on one machine and differ from the paths
# used in the exploration section — confirm they point at the same files.
Train_data2 = pd.read_csv(r'C:\Users\刘浩宇\Desktop\机器学习\used_car_train_20200313.csv', sep=' ')
test_data = pd.read_csv(r'C:\Users\刘浩宇\Desktop\机器学习\used_car_testB_20200421.csv', sep=' ')

In [None]:
# Treat the placeholder '-' in `notRepairedDamage` as a missing value.
# Assign the result back: replace(..., inplace=True) on a column selection is chained
# assignment and leaves the frame unchanged when pandas Copy-on-Write is active.
Train_data2['notRepairedDamage'] = Train_data2['notRepairedDamage'].replace('-', np.nan)
test_data['notRepairedDamage'] = test_data['notRepairedDamage'].replace('-', np.nan)

In [None]:
## Mode imputation: every column that has missing values is filled with that
## column's own most frequent value, separately for the training and the test frame.
for frame in (Train_data2, test_data):
    null_counts = frame.isnull().sum()
    for column_name in null_counts[null_counts > 0].index.tolist():
        most_frequent = frame[column_name].mode()[0]
        frame[column_name] = frame[column_name].fillna(most_frequent)

In [None]:
# Outlier handling for `power`: zeros and values above 600 are treated as missing and
# re-estimated with IterativeImputer from brand / bodyType / fuelType / gearbox.
#
# Every `replace` below is assigned back to its column. The previous
# `frame['power'].replace(..., inplace=True)` form is chained assignment, which leaves
# the frame unchanged when pandas Copy-on-Write is active.
Train_data2['power'] = Train_data2['power'].replace(0, np.nan)
Train_data2.loc[Train_data2['power']>600,'power']=np.nan
# Training set: impute `power` using the training rows only.
train_data_cut=Train_data2.loc[:,['brand','bodyType','fuelType','gearbox','power']]
it_imputer=IterativeImputer(max_iter=10, random_state=0)
train_data_it_imputed=pd.DataFrame(it_imputer.fit_transform(train_data_cut),columns=train_data_cut.columns)
Train_data2['power']=train_data_it_imputed['power']
# Test set: mark the rows to be imputed with the sentinel -1.
test_data['power'] = test_data['power'].replace(0, np.nan)
test_data.loc[test_data['power']>600,'power']=np.nan
test_data['power']=test_data['power'].fillna(-1)
test_data_fillna=test_data[test_data['power'].isin([-1])]
# Stack train (already imputed) and test so the imputer is fitted on both.
train_data_power_info=Train_data2.loc[:,['SaleID','brand','bodyType','fuelType','gearbox','power']]
test_data_power_info=test_data.loc[:,['SaleID','brand','bodyType','fuelType','gearbox','power']]
power_info=pd.concat([train_data_power_info,test_data_power_info],axis=0)
power_info.reset_index(drop=True,inplace=True)
power_info_cut=power_info.loc[:,['brand','bodyType','fuelType','gearbox','power']]
# Turn the sentinel back into NaN so the imputer fills it.
power_info_cut['power'] = power_info_cut['power'].replace(-1, np.nan)
it_imputer=IterativeImputer(max_iter=10, random_state=0)
power_info_it_imputed=pd.DataFrame(it_imputer.fit_transform(power_info_cut),columns=power_info_cut.columns)
# Re-attach SaleID (both frames share the reset positional index) and keep only the
# imputed value under the name `fixpower`.
power_info_it_imputed['SaleID']=power_info['SaleID']
power_info_it_imputed.rename(columns={'power':'fixpower'},inplace=True)
power_info_it_imputed=power_info_it_imputed.loc[:,['SaleID','fixpower']]
# NOTE(review): the merge assumes SaleID is unique across train and test — confirm.
test_data=pd.merge(test_data,power_info_it_imputed,on='SaleID',how='left')
# Only the sentinel rows take the imputed value; observed powers are kept.
test_data['power']=test_data['power'].mask(test_data['power']==-1,test_data['fixpower'])
del test_data['fixpower']

In [None]:
# Build usage-duration features (days, years, months between registration and listing).
df_train = Train_data2.copy()
df_test = test_data.copy()

# Split regDate (read as YYYYMMDD text) into year / month / day strings.
for _frame in (df_train, df_test):
    _reg_text = _frame['regDate'].astype(str)
    _frame['regyear'] = _reg_text.str[:4]
    _frame['regmonth'] = _reg_text.str[4:6]
    _frame['regday'] = _reg_text.str[6:8]

# Registration month '00' is invalid. Replace it with the most frequent valid month of
# the same registration year, learned from the training set only. '00' rows are filtered
# out explicitly instead of round-tripping through NaN with chained in-place `replace`
# calls, which do not modify the frame under pandas Copy-on-Write. Filtering first also
# means a year that only has '00' months is simply absent from the lookup instead of
# raising IndexError.
_valid_months = df_train.loc[df_train['regmonth'] != '00', ['regyear', 'regmonth']]
_month_mode_by_year = _valid_months.groupby('regyear')['regmonth'].agg(
    lambda months: months.value_counts().index[0]
)
year_month_info = _month_mode_by_year.reset_index().rename(columns={'regmonth': 'fixregmonth'})


def _add_usage_features(frame):
    """Repair '00' months, add used_day / used_year / used_month, drop the helper columns.

    `frame` is modified in place for the intermediate columns; the returned frame is the
    one without the raw date columns and `regionCode`.
    """
    frame['regmonth'] = frame['regmonth'].mask(
        frame['regmonth'] == '00', frame['regyear'].map(_month_mode_by_year)
    )
    frame['fix_regdate'] = frame['regyear'] + frame['regmonth'] + frame['regday']
    # Parse each date once. errors='coerce' turns unparseable dates into NaT for all
    # three features (previously only used_day was guarded and the others could raise).
    created = pd.to_datetime(frame['creatDate'], format='%Y%m%d', errors='coerce')
    registered = pd.to_datetime(frame['fix_regdate'], format='%Y%m%d', errors='coerce')
    frame['used_day'] = (created - registered).dt.days
    frame['used_year'] = created.dt.year - registered.dt.year
    # Both operands come from the SAME frame. The test-set version used to take the
    # creation month from df_train, mixing rows of two unrelated data sets.
    frame['used_month'] = (created.dt.year - registered.dt.year) * 12 + (created.dt.month - registered.dt.month)
    return frame.drop(
        ['regDate', 'regionCode', 'creatDate', 'regyear', 'regmonth', 'regday', 'fix_regdate'],
        axis=1,
    )


df_train = _add_usage_features(df_train)
df_test = _add_usage_features(df_test)

In [None]:
import numpy as np
from sklearn.preprocessing import QuantileTransformer
# Map v_1, v_6 and v_10 of the training set onto a normal distribution.
# The transformer is re-fitted for every column in turn.
rng=np.random.RandomState(304)
q_t=QuantileTransformer(n_quantiles=500,output_distribution='normal',random_state=rng)
for feature in ['v_1','v_6','v_10']:
    # scikit-learn expects a 2-D array of shape (n_samples, 1)
    column_2d = np.array(df_train[feature]).reshape(-1,1)
    df_train[feature]=q_t.fit_transform(column_2d)

In [None]:
import numpy as np
from sklearn.preprocessing import QuantileTransformer
# Map v_1, v_6 and v_10 of the test set onto a normal distribution.
rng=np.random.RandomState(304)
q_t=QuantileTransformer(n_quantiles=500,output_distribution='normal',random_state=rng)
for i in ['v_1','v_6','v_10']:
    X = np.array(df_test[i]).reshape(-1,1)
    # The assignment belongs inside the loop. It used to sit after it, so only the
    # last column (v_10) was transformed and v_1 / v_6 stayed on their raw scale.
    df_test[i]=q_t.fit_transform(X)
# NOTE(review): the transformer is fitted on the test column itself, so the mapping is
# not the one learned on the training set — consider fitting on train and calling
# transform() on test.

In [None]:
# Build new features from per-group price statistics.
# The same block used to be copy-pasted once per grouping column; it is now one helper
# applied to each column in the original order, producing the same columns.


def _group_price_stats(frame, key):
    """Return one row of price statistics per distinct value of `frame[key]`.

    Only rows with price > 0 are counted. The result has the column `key` plus
    `<key>_amount`, `<key>_price_max`, `<key>_price_median`, `<key>_price_min`,
    `<key>_price_sum`, `<key>_price_std` and `<key>_price_average`.
    """
    all_info = {}
    for kind, kind_data in frame.groupby(key):
        kind_data = kind_data[kind_data['price'] > 0]
        all_info[kind] = {
            key + '_amount': len(kind_data),
            key + '_price_max': kind_data.price.max(),
            key + '_price_median': kind_data.price.median(),
            key + '_price_min': kind_data.price.min(),
            key + '_price_sum': kind_data.price.sum(),
            key + '_price_std': kind_data.price.std(),
            # The denominator is the group size plus one, as in the original code.
            key + '_price_average': round(kind_data.price.sum() / (len(kind_data) + 1), 2),
        }
    # Groups are the columns of the intermediate frame; transpose to one row per group.
    return pd.DataFrame(all_info).T.reset_index().rename(columns={"index": key})


df_train2 = df_train
df_test2 = df_test
# Statistics are always computed on the training frame and joined onto both frames.
for group_key in ['brand', 'used_month', 'model', 'bodyType', 'fuelType', 'gearbox']:
    brand_fe = _group_price_stats(df_train, group_key)
    df_train2 = df_train2.merge(brand_fe, how='left', on=group_key)
    df_test2 = df_test2.merge(brand_fe, how='left', on=group_key)

In [None]:
# Continuous variables used for the following feature-engineering steps.
numeric_features = ['power', 'kilometer','used_day', 'v_0', 'v_1', 'v_2', 'v_3', 'v_4', 'v_5', 'v_6', 'v_7', 'v_8', 'v_9', 'v_10', 'v_11', 'v_12', 'v_13','v_14','price' ]
# For each feature in turn, drop the rows whose standardised value lies more than three
# standard deviations from the mean. Mean and std are recomputed on the remaining rows
# before each feature is processed.
for feature in numeric_features:
    z_score = (df_train2[feature] - df_train2[feature].mean()) / df_train2[feature].std()
    outlier_labels = df_train2.index[z_score.abs() > 3]
    df_train2 = df_train2.drop(outlier_labels)

In [None]:
# Flag the origin of every row (1 = training, 0 = test), then stack both frames with a
# fresh 0..n-1 index so they can be encoded together.
df_train2['train']=1
df_test2['train']=0
df_data = pd.concat([df_train2, df_test2], ignore_index=True)

In [None]:
# Natural-log transform of power and price
# NOTE(review): np.log returns -inf / NaN for non-positive input — confirm that the
# imputed `power` is always > 0. Rows without a price get NaN in `log_price`.
df_data['log_power']=np.log(df_data['power'])
df_data['log_price']=np.log(df_data['price'])

In [None]:
# One-hot encode the discrete variables
from sklearn.preprocessing  import OneHotEncoder 
import pandas as pd
classfiy_features=['model','brand','bodyType','fuelType','gearbox','notRepairedDamage']
df_data1=df_data[classfiy_features]
# Fit on train and test together so both share the same set of indicator columns
enc=OneHotEncoder(categories='auto').fit(df_data1)
result=enc.transform(df_data1).toarray()
# The indicator columns are appended with integer column labels (0, 1, 2, ...)
newdata=pd.concat([df_data,pd.DataFrame(result)],axis=1)
newdata.head()

In [None]:
# Drop redundant columns: the raw categorical columns (now one-hot encoded) and `price`.
# `price` must not stay in the frame: later cells take every column except `log_price`
# as model input, so keeping the raw price alongside its own logarithm hands the target
# to the model. errors='ignore' keeps the cell re-runnable.
# NOTE(review): `SaleID`, `name`, `train` and `power` are still present (the earlier
# drop of these columns was commented out) — decide whether they should be features.
newdata = newdata.drop(columns=classfiy_features + ['price'], errors='ignore')

In [None]:
# Reorder the columns so that the target `log_price` comes first; the relative order of
# all other columns is unchanged.
col = ['log_price'] + [label for label in newdata.columns if label != 'log_price']
newdata=newdata[col]
newdata.head()

In [None]:
# Split the combined frame back into training and test rows.
# `newdata` was built by stacking df_train2 on top of df_test2 with a fresh index, so the
# first len(df_train2) rows are the training rows. The previous hard-coded bounds
# ([0:135481] and [135482:]) skipped row 135481 entirely and would silently go stale
# whenever the number of rows surviving outlier removal changes.
n_train_rows = len(df_train2)
df_train_1 = newdata.iloc[:n_train_rows]
# Independent copy so that later in-place edits of the test frame do not touch `newdata`.
df_test_1 = newdata.iloc[n_train_rows:].copy()

In [None]:
# The test frame carries no usable target: remove `log_price`.
# Re-binding with errors='ignore' makes the cell safe to re-run (the in-place drop raised
# KeyError the second time) and avoids mutating a slice of `newdata`.
df_test_1 = df_test_1.drop(columns=['log_price'], errors='ignore')

In [None]:
# Continuous features (last entry is the target) used by the filter-style selection below
numeric_features1 = ['log_power', 'kilometer','used_day', 'v_0', 'v_1', 'v_2', 'v_3', 'v_4', 'v_5', 'v_6', 'v_7', 'v_8', 'v_9', 'v_10', 'v_11', 'v_12', 'v_13','v_14','log_price' ]
data=df_train_1[numeric_features1]
# Everything but the last listed column is input; the last one (log_price) is the target.
x = data[numeric_features1[:-1]]
y = data[numeric_features1[-1]]

In [None]:
# Filter by each feature's own variance
from sklearn.feature_selection import VarianceThreshold
selector=VarianceThreshold()
# NOTE: `data` is rebound from a DataFrame to the array returned by fit_transform
data=selector.fit_transform(data)
data.shape

In [None]:
# F-test between each feature and the target
from sklearn.feature_selection import f_regression
F,pvalues_f=f_regression(x,y)
F
pvalues_f
# Number of features whose p-value is not above 0.05
k=F.shape[0]-(pvalues_f>0.05).sum()
k

In [None]:
# Filter by mutual information with the target
from sklearn.feature_selection import mutual_info_regression as MIC
result=MIC(x,y)
# Number of features with a strictly positive mutual-information estimate
k=result.shape[0]-sum(result<=0)
k

In [None]:
# First pass of embedded feature selection (model-based)
from sklearn.feature_selection import SelectFromModel
from xgboost import XGBRegressor as XGBR
#data1=df_train_1(frac=0.5,axis=0)
# Column 0 of df_train_1 is the target; all remaining columns are candidate features
x1=df_train_1.iloc[:,1:]
y1=df_train_1.iloc[:,0]
rfr=XGBR(n_estimators=100)
x_embedded=SelectFromModel(rfr,threshold=0.00002).fit_transform(x1,y1)

In [None]:
# Display the selected feature matrix
x_embedded

In [None]:
# Sweep the importance threshold of the embedded selector to find the best value
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_selection import SelectFromModel
from xgboost import XGBRegressor as XGBR
from sklearn.model_selection import cross_val_score
# Half-size random sample of the training rows. The original line called the DataFrame
# itself (`df_train_1(frac=0.5, axis=0)`), which raises TypeError; `.sample` is the
# method that takes `frac`.
# NOTE(review): `data1` is not used below — the sweep still runs on the full frame.
data1=df_train_1.sample(frac=0.5,axis=0)
x1= df_train_1.iloc[:,1:]
y1=df_train_1.iloc[:,0]
rfr=XGBR(n_estimators=100)
x_embedded=SelectFromModel(rfr,threshold=0.00002).fit_transform(x1,y1)
# Fit once and reuse the importances (the model used to be fitted twice in a row).
importances = rfr.fit(x1,y1).feature_importances_
# 20 evenly spaced thresholds between 0 and the largest importance
threshold = np.linspace(0,importances.max(),20)
score = []
for i in threshold:
    X_embedded = SelectFromModel(rfr,threshold=i).fit_transform(x1,y1)
    # Mean 5-fold cross-validation score on the features kept at this threshold
    once = cross_val_score(rfr,X_embedded,y1,cv=5).mean()
    score.append(once)
plt.plot(threshold,score)
plt.show()


In [None]:
# Display the candidate thresholds
threshold

In [None]:
from xgboost import XGBRegressor as XGBR
#data1=df_train_1(frac=0.5,axis=0)
# Final embedded selection with importance threshold 0.0005
# (SelectFromModel must already be imported by an earlier cell)
x1= df_train_1.iloc[:,1:]
y1=df_train_1.iloc[:,0]
rfr=XGBR(n_estimators=100)
x_embedded=SelectFromModel(rfr,threshold=0.0005).fit_transform(x1,y1)

In [None]:
# Wrap the selected feature array in a DataFrame (columns get positional labels)
x_embedded=pd.DataFrame(x_embedded)

In [None]:
# Two-column table from a fresh fit of `rfr`: column 0 = feature name, column 1 = importance
a=pd.DataFrame([*zip(x1.columns,rfr.fit(x1,y1).feature_importances_)])

In [None]:
# Names of the features whose importance exceeds 0.0005, applied to the test frame
# NOTE(review): this uses a strict '>' while SelectFromModel is documented to keep
# importances greater than or equal to the threshold — confirm that the training matrix
# (x_embedded) and df_test_2 end up with the same columns in the same order.
column = a[a[1]>0.0005][0].values
df_test_2=df_test_1[column]

In [None]:
# Display the reduced test frame
df_test_2

In [None]:
#处理数据中字段的格式，通过改变数据格式的方式减小内存，方便后续运算
def reduce_mem_usage(df):
    """Downcast the columns of ``df`` in place to reduce its memory footprint.

    * NumPy integer columns are cast to the first of int8/int16/int32/int64 whose range
      strictly contains the column's min and max.
    * NumPy float columns are cast to float16 or float32 when the range allows it,
      otherwise to float64.
    * ``object`` columns are converted to ``category``.
    * Every other dtype (bool, datetime, category, pandas extension dtypes) is left
      untouched. Such columns used to fall into the float branch (bool/unsigned became
      float16) or crash on ``min()`` (unordered category).

    Memory before and after is printed in megabytes; previously the raw byte count was
    printed with an "MB" label.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink; modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.
    """
    bytes_per_mb = 1024 ** 2
    start_mem = df.memory_usage().sum() / bytes_per_mb
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == object:
            df[col] = df[col].astype('category')
            continue

        # Only plain NumPy dtypes are downcast; extension dtypes are skipped.
        if not isinstance(col_type, np.dtype):
            continue

        if np.issubdtype(col_type, np.integer):
            c_min = df[col].min()
            c_max = df[col].max()
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if c_min > limits.min and c_max < limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        elif np.issubdtype(col_type, np.floating):
            c_min = df[col].min()
            c_max = df[col].max()
            # Falls back to float64 when nothing narrower fits (including all-NaN columns,
            # for which both comparisons are False).
            target = np.float64
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if c_min > limits.min and c_max < limits.max:
                    target = candidate
                    break
            df[col] = df[col].astype(target)

    end_mem = df.memory_usage().sum() / bytes_per_mb
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    if start_mem > 0:
        print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    return df

# Shrink the selected training features and the reduced test frame
df_train_x= reduce_mem_usage(x_embedded)
df_test_x= reduce_mem_usage(df_test_2)

In [None]:
# Run several model families to compare their out-of-the-box performance
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_absolute_error,  make_scorer
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from lightgbm.sklearn import LGBMRegressor
from xgboost import XGBRegressor as XGBR
# Candidate regressors, each evaluated by cross-validation in the next cell
models = [LinearRegression(),
          DecisionTreeRegressor(),
          RandomForestRegressor(),
          GradientBoostingRegressor(),
          LGBMRegressor(n_estimators = 100),
          XGBR(n_estimators=100)]

In [None]:
# 5-fold cross-validated mean absolute error for every candidate model.
# Keys are the estimator names (the text before the first '(' of the estimator's repr).
mae_scorer = make_scorer(mean_absolute_error)
result = {}
for model in models:
    model_name = str(model).split('(')[0]
    result[model_name] = cross_val_score(model, X=df_train_x, y=y1, verbose=0, cv = 5, scoring=mae_scorer)
    print(model_name + ' is finished')

In [None]:
# Show the results: one column per model, one row per fold (cv1 .. cv5)
result = pd.DataFrame(result)
result.index = ['cv' + str(x) for x in range(1, 6)]
result

In [None]:
# Grid search for LightGBM
from lightgbm.sklearn import LGBMRegressor
from sklearn.model_selection import GridSearchCV
objective = ['regression', 'regression_l1', 'mape', 'huber', 'fair']
num_leaves = [3,5,10,15,20,40, 55]
max_depth = [3,5,10,15,20,40, 55]
# `np.arrage(0.200,10)` was a typo (no such function; '.' instead of ','), so the cell
# raised AttributeError. The grid is the intended range in steps of 10, starting at 10:
# a model with zero trees is not a useful candidate.
n_estimators = np.arange(10, 200, 10).tolist()
parameters = {'objective': objective , 'num_leaves': num_leaves, 'max_depth': max_depth,'n_estimators':n_estimators}
model = LGBMRegressor()
# NOTE(review): no `scoring` is passed, so the search ranks candidates by the estimator's
# default score rather than the MAE used elsewhere in this notebook — confirm intent.
clf1 = GridSearchCV(model, parameters, cv=5)
clf1 = clf1.fit(df_train_x,y1)

In [None]:
# Best parameter combination found by the LightGBM grid search
clf1.best_params_

In [None]:
# Evaluate LightGBM with the tuned parameters
from lightgbm.sklearn import LGBMRegressor
from sklearn.metrics import mean_absolute_error,  make_scorer
model = LGBMRegressor(n_estimators=120,
                    objective='regression',
                           num_leaves=55,
                           max_depth=15)
# Mean of the 5-fold cross-validated mean absolute error
np.mean(cross_val_score(model, X=df_train_x, y=y1, verbose=0, cv = 5, scoring=make_scorer(mean_absolute_error)))

In [None]:
#贝叶斯调参 lightgbm
from bayes_opt import BayesianOptimization
def rf_cv(num_leaves, max_depth, subsample, min_child_samples):
    """Objective for Bayesian optimisation of LightGBM hyper-parameters.

    Evaluates an ``LGBMRegressor`` built from the given hyper-parameters with 5-fold
    cross-validation on the notebook globals ``df_train_x`` / ``y1`` and returns the
    NEGATIVE mean absolute error.

    The optimiser in this notebook calls ``maximize()``. Returning the raw MAE (as
    before) made it search for the configuration with the largest error; negating
    the value turns "maximise" into "minimise the MAE".

    Parameters
    ----------
    num_leaves, max_depth, min_child_samples : float
        Proposed by the optimiser as real numbers; truncated to int before use.
    subsample : float
        Passed through unchanged.

    Returns
    -------
    float
        ``-mean(MAE over the 5 folds)``; larger is better.
    """
    val = cross_val_score(
        LGBMRegressor(objective = 'regression',
            num_leaves=int(num_leaves),
            max_depth=int(max_depth),
            subsample = subsample,
            min_child_samples = int(min_child_samples)
        ),
        X=df_train_x, y=y1, verbose=0, cv = 5, scoring=make_scorer(mean_absolute_error)
    ).mean()
    return -val


In [None]:
# Bayesian optimiser over rf_cv; each entry gives the (lower, upper) bounds of one parameter
rf_bo = BayesianOptimization(
    rf_cv,
    {
    'num_leaves': (2, 100),
    'max_depth': (2, 100),
    'subsample': (0.1, 1),
    'min_child_samples' : (2, 100)
    }
)

In [None]:
# Run the search; the optimiser looks for the parameters that MAXIMISE rf_cv's return value
rf_bo.maximize()

In [None]:
# Grid search for XGBoost
from lightgbm.sklearn import LGBMRegressor
from sklearn.model_selection import GridSearchCV
# The list needs commas between its items (it was a SyntaxError without them).
eta = [0.1, 0.2, 0.3, 0.4, 0.5]
max_depth = [3,5,10,15,20,40, 55]
# `np.arrage(0.200,10)` was a typo for np.arange; the grid starts at 10 because a model
# with zero trees is not a useful candidate.
n_estimators = np.arange(10, 200, 10).tolist()
# The 'eta' entry used ';' instead of ':' (SyntaxError).
parameters = {'max_depth': max_depth, 'n_estimators': n_estimators, 'eta': eta}
model =XGBR()
# NOTE(review): no `scoring` is passed, so the search ranks candidates by the estimator's
# default score rather than the MAE used elsewhere in this notebook — confirm intent.
clf2 = GridSearchCV(model, parameters, cv=5)
clf2 = clf2.fit(df_train_x,y1)

In [None]:
# Best parameter combination found by the XGBoost grid search
clf2.best_params_

In [None]:
# Cross-validate XGBoost with the tuned parameters
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_absolute_error,  make_scorer
model = XGBR(n_estimators=130,
             eta=0.3,
             max_depth=15)
# Mean of the 5-fold cross-validated mean absolute error
np.mean(cross_val_score(model, X=df_train_x, y=y1, verbose=0, cv = 5, scoring=make_scorer(mean_absolute_error)))

In [None]:
#构建函数方便实施各个方法
def build_model_lgb(x_train,y_train):
    """Fit a LightGBM regressor with the tuned settings on the given data and return it."""
    regressor = LGBMRegressor(
        objective='regression',
        num_leaves=55,
        max_depth=15,
        n_estimators=120,
    )
    regressor.fit(x_train, y_train)
    return regressor
def build_model_xgb(x_train,y_train):
    """Fit an XGBoost regressor (130 trees, depth 15) on the given data and return it."""
    # NOTE(review): XGBoost documents `eta` as an alias of `learning_rate`, yet both are
    # passed here with different values (0.3 vs 0.08) — confirm which one takes effect
    # and remove the other.
    settings = dict(n_estimators=130, learning_rate=0.08, eta=0.3, max_depth=15)
    regressor = XGBR(**settings)
    regressor.fit(x_train, y_train)
    return regressor
def build_model_xgb1(x_train,y_train):
    """Fit a shallower XGBoost regressor (130 trees, depth 8) on the given data and return it."""
    # NOTE(review): XGBoost documents `eta` as an alias of `learning_rate`, yet both are
    # passed here with different values (0.3 vs 0.08) — confirm which one takes effect
    # and remove the other.
    settings = dict(n_estimators=130, learning_rate=0.08, eta=0.3, max_depth=8)
    regressor = XGBR(**settings)
    regressor.fit(x_train, y_train)
    return regressor
def build_model_lr(x_train,y_train):
    """Fit an ordinary least-squares linear regression on the given data and return it."""
    regressor = linear_model.LinearRegression()
    regressor.fit(x_train,y_train)
    return regressor

In [None]:
# The prepared data was exported to CSV earlier in the workflow; re-import it here.
# NOTE(review): the export step is not part of this notebook, so these files cannot be
# regenerated by running the cells above.
import pandas as pd
import numpy as np
df_train_x= pd.read_csv(r'C:\Users\刘浩宇\Desktop\df_train_x.csv')
df_train_y= pd.read_csv(r'C:\Users\刘浩宇\Desktop\df_train_y.csv')
df_test_x= pd.read_csv(r'C:\Users\刘浩宇\Desktop\df_test_x.csv')

In [None]:
# Split the training data into a training part and a validation part
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
# Remove the 'Unnamed: 0' column (presumably the index written on export) BEFORE
# splitting. The split used to run first, so x_train / x_val kept that column while the
# test features lost it in the next cell, and y_train carried it as a second target
# column. errors='ignore' makes this a no-op when the column is already gone.
x_train_11 = df_train_x.drop(columns=['Unnamed: 0'], errors='ignore')
y_train_11 = df_train_y.drop(columns=['Unnamed: 0'], errors='ignore')
x_test_1 = df_test_x.drop(columns=['Unnamed: 0'], errors='ignore')
# 70 % training / 30 % validation; a fixed seed makes the split reproducible.
x_train,x_val,y_train,y_val = train_test_split(x_train_11,y_train_11,test_size=0.3,random_state=42)

In [None]:
# Drop the unused 'Unnamed: 0' column from the re-imported frames.
# errors='ignore' keeps the cell re-runnable; the plain drop raised KeyError on a second
# execution because the column was already gone.
df_train_y.drop('Unnamed: 0',axis=1,inplace=True,errors='ignore')
df_train_x.drop('Unnamed: 0',axis=1,inplace=True,errors='ignore')
df_test_x.drop('Unnamed: 0',axis=1,inplace=True,errors='ignore')

In [None]:
# Column labels of the training feature frame
df_train_x.columns

In [None]:
# Label of the first training feature column
df_train_x.columns[0]

In [None]:
#分别用xgboost，lightgbm预测数据
print('predict XGB')
model_xgb = build_model_xgb(x_train,y_train)
val_xgb = model_xgb.predict(x_val)
subA_xgb = model_xgb.predict(x_test_1)

print('predict lgb')
model_lgb = build_model_lgb(x_train,y_train)
val_lgb = model_lgb.predict(x_val)
subA_lgb = model_lgb.predict(x_test_1)
print('Sta inf of lgb:')
Sta_inf(subA_lgb)

In [None]:
def Sta_inf(data):
    print('_min',np.min(data))
    print('_max:',np.max(data))
    print('_mean',np.mean(data))
    print('_ptp',np.ptp(data))
    print('_std',np.std(data))
    print('_var',np.var(data))

In [None]:
# Model fusion (stacking)
# First layer: the base models' predictions become the inputs of the second layer.
train_lgb_pred = model_lgb.predict(x_train)
train_xgb_pred = model_xgb.predict(x_train)

# Method_1 = LightGBM prediction, Method_2 = XGBoost prediction, for each data part.
Strak_X_train = pd.DataFrame({'Method_1': train_lgb_pred, 'Method_2': train_xgb_pred})
Strak_X_val = pd.DataFrame({'Method_1': val_lgb, 'Method_2': val_xgb})
Strak_X_test = pd.DataFrame({'Method_1': subA_lgb, 'Method_2': subA_xgb})

In [None]:
## Second-layer fusion, using either a linear model or XGBoost
# NOTE(review): the variable and the printed labels say "LR", but the active line builds
# the second layer with build_model_xgb1 (XGBoost); the linear variant is commented out.
#model_lr_Stacking = build_model_lr(Strak_X_train,y_train)
model_lr_Stacking = build_model_xgb1(Strak_X_train,y_train)
## Training part
train_pre_Stacking = model_lr_Stacking.predict(Strak_X_train)
print('MAE of Stacking-LR:',mean_absolute_error(y_train,train_pre_Stacking))

## Validation part
val_pre_Stacking = model_lr_Stacking.predict(Strak_X_val)
print('MAE of Stacking-LR:',mean_absolute_error(y_val,val_pre_Stacking))

## Test set
print('Predict Stacking-LR...')
subA_Stacking = model_lr_Stacking.predict(Strak_X_test)

In [None]:
# Smallest stacked prediction
subA_Stacking.min()

In [None]:
# Sample submission file; its SaleID column is reused for the output below
submit_data2 = pd.read_csv(r'C:\Users\刘浩宇\Desktop\机器学习\used_car_sample_submit.csv')

In [None]:
subA_Stacking[subA_Stacking<4.5]=4.5## floor overly small predictions at 4.5 (modifies the array in place)
# Exponentiate the stacked predictions before writing them out as `price`
subA_Stacking_1=np.exp(subA_Stacking)
sub = pd.DataFrame()
# NOTE(review): assumes the rows of the sample submission are in the same order as the
# test features — confirm.
sub['SaleID'] =submit_data2['SaleID']# SaleID taken from the sample-submission file
sub['price'] = subA_Stacking_1
sub.to_csv(r'C:\Users\刘浩宇\Desktop\submit_data_3.csv',index=False)

In [None]:
# Summary statistics of the stacked predictions (after the in-place floor applied above)
print('Sta inf:')
Sta_inf(subA_Stacking)
