In [1]:
import numpy as np
import pandas as pd
from xgboost.sklearn import XGBClassifier,XGBRegressor
pd.set_option('display.max_columns', None)
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error

def rmse(y_true,y_pred):
    
    return mean_squared_error(y_true=y_true,y_pred=y_pred) ** 0.5

# IPRC模型

## 特征构建

In [2]:
train = pd.read_csv('data/train.csv')
test  = pd.read_csv('data/test.csv')
###########################################################################
# 分析数据 发现该数据更适合先用回归模型
df = pd.concat([train,test]).reset_index(drop=True)
###########################################################################

In [3]:
# 计算每个IAQI
def IAQI(x,bins):
    bins_AQI = [0, 50, 100, 150, 200, 300, 400, 500, 500]
    for i in range(len(bins)):
        if x < bins[i]:
            return (bins_AQI[i]-bins_AQI[i-1])/(bins[i]-bins[i-1])*(x-bins[i-1])+bins_AQI[i-1]
    return 500

bins = [0, 35, 75, 115, 150, 250, 350, 500, 1000]
df['PM2_5_IAQI'] = df['PM2_5'].apply(lambda x:IAQI(x,bins))

bins = [0, 50, 150, 250, 350, 420, 500]
df['PM10_IAQI'] = df['PM10'].apply(lambda x:IAQI(x,bins))

bins = [0,50,150,475]
df['SO2_IAQI'] = df['SO2'].apply(lambda x:IAQI(x,bins))

bins = [0,2,4,14]
df['CO_IAQI'] = df['CO'].apply(lambda x:IAQI(x,bins))

bins = [0,40,80,180,280]
df['NO2_IAQI'] = df['NO2'].apply(lambda x:IAQI(x,bins))

bins = [0,100,160,215,265,300]
df['O3_IAQI'] = df['O3'].apply(lambda x:IAQI(x,bins))

In [4]:
# 计算每个IAQI排序
df['list_IAQI'] = df.apply(lambda x:sorted([x['PM2_5_IAQI'],x['PM10_IAQI'],x['SO2_IAQI'],x['CO_IAQI'],x['NO2_IAQI'],x['O3_IAQI']],reverse=True),axis=1)

df['AQI_max'] =  df['list_IAQI'].apply(lambda x:x[0])
df['AQI_max_ceil'] =  df['AQI_max'].apply(np.ceil)
for i in range(1,6):
    df['AQI_max'+str(i+1)] =  df['list_IAQI'].apply(lambda x:x[i])
    df['AQI_max_ceil'+str(i+1)] =  df['AQI_max'+str(i+1)].apply(np.ceil)

df = df.drop('list_IAQI',axis=1)

In [5]:
# 计算IAQI的sun
df['sum1'] = df[['AQI_max2','AQI_max3','AQI_max4','AQI_max5','AQI_max6']].sum(axis=1)
df['sum2'] = df[['AQI_max_ceil2','AQI_max_ceil3','AQI_max_ceil4','AQI_max_ceil5','AQI_max_ceil6']].sum(axis=1)
df['A1_H'] = df[['PM2_5_IAQI','PM10_IAQI']].max(axis=1).apply(np.ceil)

## 训练

In [6]:
train = df[:train.shape[0]]
test = df[train.shape[0]:].reset_index(drop=True)
train = train.drop(index=[487]).reset_index(drop=True)

In [7]:
label = 'IPRC'
def get_l(train, test, feature):
    c = []
    oof_train = np.zeros((train.shape[0],))
    oof_test  = np.zeros((test.shape[0],))

    kf = KFold(n_splits=5,random_state=2020,shuffle=True)
    for index,(tr_index,vl_index) in enumerate(kf.split(train)):
        X_train,X_valid = train.iloc[tr_index][feature].values,train.iloc[vl_index][feature].values
        y_train,y_valid = train.iloc[tr_index][label],train.iloc[vl_index][label]

        lf = LinearRegression()
        lf.fit(X_train,y_train)

        oof_train[vl_index] = lf.predict(X_valid)
        oof_test = oof_test + lf.predict(test[feature].values) / kf.n_splits
        c1 = lf.coef_
        c.append(c1)
    r = rmse(train[label],oof_train)

    return  oof_test, r, c

In [8]:
sub = pd.DataFrame()
sub['date'] = test['date']
sub['AQI'] = 0
sub['IPRC'] = 0

x = train[(train.AQI_max_ceil <= 50)]
l = test[(test.AQI_max_ceil <= 50)]

col = ['AQI_max_ceil','AQI_max','AQI_max_ceil2','sum1','sum2','A1_H']
answers0, score0, c0 = get_l(x,l,col)
sub.loc[l.index,'IPRC'] = answers0

print("*" * 50)
print(c0)
print('评分为:', score0)
###########################################################################
x = train[(train.AQI_max_ceil > 50)&(train.AQI_max_ceil <= 100)]
l = test[(test.AQI_max_ceil > 50)&(test.AQI_max_ceil <= 100)]

answers0, score0, c0 = get_l(x,l,col)
sub.loc[l.index,'IPRC'] = answers0

print("*" * 50)
print(c0)
print('评分为:', score0)

###########################################################################

x = train[(train.AQI_max_ceil > 100)&(train.AQI_max_ceil <= 150)]
l = test[(test.AQI_max_ceil > 100)&(test.AQI_max_ceil <= 150)]

answers0, score0, c0 = get_l(x,l,col)
sub.loc[l.index,'IPRC'] = answers0

print("*" * 50)
print(c0)
print('评分为:', score0)

###########################################################################
col = ['AQI_max_ceil','AQI_max','sum1','sum2','A1_H']
x = train[(train.AQI_max_ceil > 150)&(train.AQI_max_ceil <= 200)]
l = test[(test.AQI_max_ceil > 150)&(test.AQI_max_ceil <= 200)]

answers0, score0, c0 = get_l(x,l,col)
sub.loc[l.index,'IPRC'] = answers0

print("*" * 50)
print(c0)
print('评分为:', score0)

###########################################################################
x = train[(train.AQI_max_ceil > 200)&(train.AQI_max_ceil <= 300)]
l = test[(test.AQI_max_ceil > 200)&(test.AQI_max_ceil <= 300)]

answers0, score0, c0 = get_l(x,l,col)
sub.loc[l.index,'IPRC'] = answers0

print("*" * 50)
print(c0)
print('评分为:', score0)

###########################################################################
x = train[(train.AQI_max_ceil > 300)]
l = test[(test.AQI_max_ceil > 300)]

s = pd.DataFrame()

l['I'] = 0.002054733*l['AQI_max_ceil']+0.000554733*l['sum2']+0.002609878*l['AQI_max']+0.001109878*l['sum1']+0.0012
sub.loc[l.index,'IPRC'] = l['I'].values

print("*" * 50)
print(c0)
print('评分为:', score0)

**************************************************
[array([ 1.58002892e-03,  2.16309523e-03, -2.84758007e-06,  6.58973595e-04,
        8.15437097e-05,  3.90138764e-06]), array([ 1.56573748e-03,  2.17698381e-03, -3.21303033e-06,  6.51195025e-04,
        8.90860157e-05,  4.22341782e-06]), array([ 1.58376028e-03,  2.15833677e-03, -3.28845697e-06,  6.60555448e-04,
        7.98823876e-05,  5.04970920e-06]), array([ 1.57596450e-03,  2.16656987e-03, -2.89636183e-06,  6.55260745e-04,
        8.52300735e-05,  4.18863239e-06]), array([ 1.59386520e-03,  2.14688647e-03, -6.92348756e-07,  6.44930888e-04,
        9.53724984e-05,  4.21165779e-06])]
评分为: 2.5937053592274796e-05
**************************************************
[array([1.68518180e-03, 2.23940301e-03, 4.62434299e-07, 7.40467389e-04,
       1.84464736e-04, 5.37333859e-06]), array([1.68338201e-03, 2.24041917e-03, 4.97982314e-07, 7.40272489e-04,
       1.84662867e-04, 6.15235449e-06]), array([1.68103637e-03, 2.24333832e-03, 4.26526662e-07,

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  l['I'] = 0.002054733*l['AQI_max_ceil']+0.000554733*l['sum2']+0.002609878*l['AQI_max']+0.001109878*l['sum1']+0.0012


# AQI模型1

## 特征构建

In [34]:
train = pd.read_csv('./data/train.csv')
test  = pd.read_csv('./data/test.csv')

df = pd.concat([train,test]).reset_index(drop=True)

df['AQI2'] = df['AQI']

####################### IAQI ##########################

def IAQI(x,bins):
    for i in range(len(bins)):
        if x < bins[i]:
            return (bins2[i]-bins2[i-1])/(bins[i]-bins[i-1])*(x-bins[i-1])+bins2[i-1]
    return 500

bins2 = [0, 50, 100, 150, 200, 300, 400, 500, 500]

bins = [0, 35, 75, 115, 150, 250, 350, 500, 1000]
df['PM2_5_IAQI'] = df['PM2_5'].apply(lambda x:IAQI(x,bins))

bins = [0, 50, 150, 250, 350, 420, 500]
df['PM10_IAQI'] = df['PM10'].apply(lambda x:IAQI(x,bins))

bins = [0,50,150,475]
df['SO2_IAQI'] = df['SO2'].apply(lambda x:IAQI(x,bins))

bins = [0,2,4,14]
df['CO_IAQI'] = df['CO'].apply(lambda x:IAQI(x,bins))

bins = [0,40,80,180,280]
df['NO2_IAQI'] = df['NO2'].apply(lambda x:IAQI(x,bins))

bins = [0,100,160,215,265,300]
df['O3_IAQI'] = df['O3'].apply(lambda x:IAQI(x,bins))

####################### order ##########################

df['list_IAQI'] = df.apply(lambda x:sorted([x['PM2_5_IAQI'],x['PM10_IAQI'],x['SO2_IAQI'],x['CO_IAQI'],x['NO2_IAQI'],x['O3_IAQI']],reverse=True),axis=1)

df['AQI_max'] =  df['list_IAQI'].apply(lambda x:x[0])
df['AQI'] =  df['AQI_max'].apply(np.ceil)
for i in range(1,6):
    df['AQI_max'+str(i)] =  df['list_IAQI'].apply(lambda x:x[i])#.apply(np.ceil)
#     df['AQI_max_ratio'+str(i)] =  df['AQI_max'+str(i)]/df['AQI_max']
df = df.drop('list_IAQI',axis=1)

######################  AQI  ##########################
bins = [0, 50,100,150, 200,500]
df['AQI_cut'] = pd.cut(df['AQI'], bins,labels=[i for i in range(len(bins)-1)])
df_onehot = pd.get_dummies(df['AQI_cut'])
df_onehot.columns = ['AQI_cut_'+str(i) for i in range(len(bins)-1)]
df = pd.concat([df,df_onehot],axis=1).reset_index(drop=True)
df = df.drop('AQI_cut',axis=1)

#######################  PM2_5  ##########################
bins = [-1, 35, 75, 115, 150, 250, 350, 500, 1000]
df['PM2_5_cut2'] = pd.cut(df['PM2_5'], bins,labels=[i for i in range(len(bins)-1)])
df_onehot = pd.get_dummies(df['PM2_5_cut2'])
df_onehot.columns = ['PM2_5_cut_'+str(i) for i in range(len(bins)-1)]
df = pd.concat([df,df_onehot],axis=1)

#######################  PM10  ##########################
bins = [-1, 50, 150, 250, 350, 420, 500, 600, 1000]
df['PM10_cut2'] = pd.cut(df['PM10'], bins,labels=[i for i in range(len(bins)-1)])
df_onehot = pd.get_dummies(df['PM10_cut2'])
df_onehot.columns = ['PM10_cut_'+str(i) for i in range(len(bins)-1)]
df = pd.concat([df,df_onehot],axis=1).reset_index(drop=True)

df['min_IAQI'] = df[['PM2_5_IAQI','PM10_IAQI','SO2_IAQI','CO_IAQI','NO2_IAQI','O3_IAQI']].min(axis=1)

In [12]:
df['AQI3'] = df[['PM2_5_IAQI','PM10_IAQI']].max(axis=1)
df['AQI4'] = df[['SO2_IAQI','CO_IAQI','NO2_IAQI','O3_IAQI']].max(axis=1)
df['AQI42'] = df[['SO2_IAQI','CO_IAQI','NO2_IAQI','O3_IAQI']].min(axis=1)

df['AQI32'] = df[['PM2_5_IAQI','PM10_IAQI']].min(axis=1)
df['ratio'] = df['AQI3']/df['AQI32']
df['ratio2'] = df['AQI32']/df['AQI3']

In [13]:
df['AQI323'] = (df['AQI3'] - df['AQI32'])/df['AQI3']
df['AQI32_ratio'] = df['AQI323'].apply(lambda x: 1 if x <=0.02 else 0)
df['AQI32_ratio2'] = df['AQI323'].apply(lambda x: 1 if (x >0.45)&(x<=0.55) else 0)
df['AQI32_ratio4'] = df['AQI323'].apply(lambda x: 1 if x >=0.55 else 0)

In [14]:
df['NO2O3'] = (df['NO2_IAQI']+df['O3_IAQI'])-df['AQI']
df['NO2O3_2ratio'] = df['NO2O3'].apply(lambda x: 1 if (x >30)&(x<=35) else 0)
df['NO2O3_2ratio2'] = df['NO2O3'].apply(lambda x: 1 if x >40 else 0)
df['NO2O3_2ratio3'] = df['NO2O3'].apply(lambda x: 1 if (x >20)&(x<=30) else 0)
df['NO2O3_2ratio4'] = df['NO2O3'].apply(lambda x: 1 if (x >35)&(x<=40) else 0)
# df['NO2O3_2ratio3'] = df['NO2O3'].apply(lambda x: 1 if x<-200 else 0)

In [15]:
train = df[df['AQI2'].notnull()]
test  = df[df['AQI2'].isnull()]

In [16]:
# train['diff'] = train['AQI2'] - train['AQI3']
# train = train[train['diff'].abs()<=6]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['diff'] = train['AQI2'] - train['AQI3']


In [17]:
train = train.drop(index=[487]).reset_index(drop=True)

## 训练

In [18]:
label = 'AQI2'
def get_l(train, test, feature):
    c = []
    d = []
    oof_train = np.zeros((train.shape[0],))
    oof_test  = np.zeros((test.shape[0],))

    kf = KFold(n_splits=10,random_state=2020,shuffle=True)
    for index,(tr_index,vl_index) in enumerate(kf.split(train)):
        X_train,X_valid = train.iloc[tr_index][feature].values,train.iloc[vl_index][feature].values
        y_train,y_valid = train.iloc[tr_index][label],train.iloc[vl_index][label]

        lf = LinearRegression()
#         lf = XGBRegressor()
        lf.fit(X_train,y_train)

        oof_train[vl_index] = lf.predict(X_valid)
        oof_test = oof_test + lf.predict(test[feature].values) / kf.n_splits
        
        d.append(lf.intercept_)
        c1 = lf.coef_
        c.append(c1)
    r = rmse(train[label],oof_train)
    return  oof_test, r , c,d, oof_train

###########################################################################
# for k in range(80,130):
for i,j in [(0,500)]:
    x = train[(train.AQI3>i)&(train.AQI3<=j)]
    l = test[(test.AQI3>i)&(test.AQI3<=j)]
# x = train
# l = test

    col = [i for i in train.columns if i not in ['diff','CO','SO2','NO2','PM2_5','PM10','NO2O3','AQI323','NO2O3_2',
                                                 'PM2_5_cut2','PM10_cut2','SO2_cut','CO_cut','NO2_cut','O3_8h_cut','ratio2_cut',
                                                 'pre','质量等级','date','IPRC','AQI2','AQI_cut']]
    # col = col2
    answers0, score0, c0, d0,oof_train = get_l(x,l,col)

    train.loc[x.index,'pre'] = oof_train
    test.loc[l.index,'pre'] = answers0

    print("*" * 50)
    # print('quality为:', i)
    # print(c0)
    print('评分为:', score0)
    # 评分为: 1.7115533079812493

**************************************************
评分为: 1.1854982964151928


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(ilocs[0], value, pi)


In [19]:
# c1 = np.mean(c0,axis=0)
# d1 = np.mean(d0,axis=0)
# test['pre'] = np.dot(test[col],c1)+d1

In [20]:
coe = pd.DataFrame({'coef':np.mean(c0,axis=0)},index=col)
coe.abs().sort_values('coef')

Unnamed: 0,coef
PM10_cut_6,0.0
PM10_cut_7,0.0
NO2O3_2ratio2,0.0
PM2_5_cut_7,5.662137e-16
PM10_cut_5,6.938894e-16
PM2_5_cut_5,4.549139e-15
PM2_5_cut_6,1.827982e-14
SO2_IAQI,0.005117782
AQI_max3,0.0133844
AQI42,0.0164406


In [21]:
# for i,j in [(104,147)]:
#     x = train[(train.AQI3>i)&(train.AQI3<=j)]
#     l = test[(test.AQI3 >=i)&(test.AQI3<=j)]
    
# rmse(x[label],x['pre'] )

## 输出

In [22]:
# sub = pd.read_csv('FINAL1.csv')
sub['date'] = pd.to_datetime(sub['date'])
sub = sub.sort_values('date').reset_index(drop=True)
sub['AQI'] = test['pre'].values
sub['AQI'] = sub['AQI'].apply(np.ceil)
# sub.to_csv('h.csv',index=False)

# AQI模型2

## 特征构建

In [23]:
train = pd.read_csv('./data/train.csv')
test  = pd.read_csv('./data/test.csv')

df = pd.concat([train,test]).reset_index(drop=True)

df['AQI2'] = df['AQI']

def IAQI(x,bins):
    for i in range(len(bins)):
        if x < bins[i]:
            return (bins2[i]-bins2[i-1])/(bins[i]-bins[i-1])*(x-bins[i-1])+bins2[i-1]
    return 500

bins2 = [0, 50, 100, 150, 200, 300, 400, 500, 500]

bins = [0, 35, 75, 115, 150, 250, 350, 500, 1000]
df['PM2_5_IAQI'] = df['PM2_5'].apply(lambda x:IAQI(x,bins))

bins = [0, 50, 150, 250, 350, 420, 500]
df['PM10_IAQI'] = df['PM10'].apply(lambda x:IAQI(x,bins))

bins = [0,50,150,475]
df['SO2_IAQI'] = df['SO2'].apply(lambda x:IAQI(x,bins))

bins = [0,2,4,14]
df['CO_IAQI'] = df['CO'].apply(lambda x:IAQI(x,bins))

bins = [0,40,80,180,280]
df['NO2_IAQI'] = df['NO2'].apply(lambda x:IAQI(x,bins))

bins = [0,100,160,215,265,300]
df['O3_IAQI'] = df['O3'].apply(lambda x:IAQI(x,bins))

In [24]:
df['PM10_IAQI'] = df['PM10_IAQI'].apply(np.ceil)
df['PM2_5_IAQI'] = df['PM2_5_IAQI'].apply(np.ceil)

df['AQI3'] = df[['PM2_5_IAQI','PM10_IAQI']].max(axis=1)
df['AQI32'] = df[['PM2_5_IAQI','PM10_IAQI']].min(axis=1)

df['ratio'] = df['AQI3']/df['AQI32']
df['ratio2'] = df['AQI32']/df['AQI3']

In [25]:
train = df[df['AQI2'].notnull()]
test  = df[df['AQI2'].isnull()]

In [26]:
train['diff'] = train['AQI2'] - train['AQI3']
# train = train[train['diff'].abs()<=5.9]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['diff'] = train['AQI2'] - train['AQI3']


In [27]:
train = train.drop(index=[487]).reset_index(drop=True)
# train

## 训练

In [28]:
label = 'AQI2'
def get_l(train, test, feature):
    c = []
    oof_train = np.zeros((train.shape[0],))
    oof_test  = np.zeros((test.shape[0],))

    kf = KFold(n_splits=5,random_state=2020,shuffle=True)
    for index,(tr_index,vl_index) in enumerate(kf.split(train)):
        X_train,X_valid = train.iloc[tr_index][feature].values,train.iloc[vl_index][feature].values
        y_train,y_valid = train.iloc[tr_index][label],train.iloc[vl_index][label]

        lf = LinearRegression()
#         lf = XGBRegressor()
        lf.fit(X_train,y_train)

        oof_train[vl_index] = lf.predict(X_valid)
        oof_test = oof_test + lf.predict(test[feature].values) / kf.n_splits
        c1 = lf.coef_
        c.append(c1)
    r = rmse(train[label],oof_train)
    return  oof_test, r , c, oof_train

###########################################################################

for i,j in [(145,500)]:
    x = train[(train.AQI3>i)&(train.AQI3<=j)]
    l = test[(test.AQI3 >=i)&(test.AQI3<=j)]

    col = [i for i in train.columns if i not in ['AQI','SO2','NO2','CO','O3','PM10', 
                                                 'SO2_IAQI','CO_IAQI','NO2_IAQI','O3_IAQI',
                                                 'pre','质量等级','date','IPRC','AQI2','diff','diff2']]
#     col = ['AQI3','PM2_5_IAQI','PM10_IAQI','PM2_5']
    answers0, score0, c0, oof_train = get_l(x,l,col)

    train.loc[x.index,'pre'] = oof_train
    test.loc[l.index,'pre'] = answers0

    print("*" * 50)
    # print('quality为:', i)
    # print(c0)
    print('评分为:', score0)
    # 评分为: 1.7115533079812493

**************************************************
评分为: 1.2462650124145012


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(ilocs[0], value, pi)


In [29]:
# c1 = np.mean(c0,axis=0)
# d1 = np.mean(d0,axis=0)
# test['pre'] = np.dot(test[col],c1)+d1

In [30]:
test = test.reset_index(drop=True)
l = test[(test.AQI3 >=145)&(test.AQI3<=500)]

sub.loc[l.index,'AQI'] = test.loc[l.index,'pre']
sub['AQI'] = sub['AQI'].apply(np.ceil)
sub['date'] = test['date']

sub.to_csv('sub.csv',index=False)

In [32]:
df2 =  pd.read_csv('1.0351.csv')

In [33]:
len(sub[sub['AQI']!=df2['AQI']])

54