In [3]:
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler

In [4]:
origin=pd.read_csv('./data/harbin_2019_aqi.csv')

In [5]:
# origin.head()

In [6]:
origin.isnull().sum()

ymd          0
bWendu       0
yWendu       0
tianqi       0
fengxiang    0
fengli       0
aqi          0
aqiInfo      0
aqiLevel     0
dtype: int64

#### 1. feature_engineering

In [7]:
# 把温度 风力 aqi转化为浮点数
origin['max_temp']=origin['bWendu'].map(lambda x:float(x[:-1]))
origin['min_temp']=origin['yWendu'].map(lambda x:float(x[:-1]))
origin['wind_power']=origin['fengli'].map(lambda x:float(x[:1]))
origin['aqi']=origin['aqi'].astype('float32')

In [8]:
# 删除原有的数据列
origin.drop(['bWendu','yWendu','fengli','ymd'],axis=1,inplace=True)
# origin.drop(['ymd'],axis=1,inplace=True)

In [9]:
# 统计有多少种aqiLevel
origin[['aqiInfo','aqiLevel']].drop_duplicates(['aqiInfo'])

Unnamed: 0,aqiInfo,aqiLevel
0,良,2
2,轻度污染,3
6,中度污染,4
12,重度污染,5
15,优,1
57,严重污染,6


In [10]:
origin.drop(['aqiInfo'],axis=1,inplace=True)

In [11]:
origin['aqiLevel']=origin['aqiLevel'].astype('float32')

### 定类等级编码

In [12]:
origin.head()

Unnamed: 0,tianqi,fengxiang,aqi,aqiLevel,max_temp,min_temp,wind_power
0,多云,西南风,63.0,2.0,-12.0,-21.0,3.0
1,晴,西南风,73.0,2.0,-10.0,-19.0,2.0
2,多云~小雪,西南风,126.0,3.0,-10.0,-21.0,3.0
3,多云,西南风,58.0,2.0,-9.0,-21.0,3.0
4,晴,西北风,55.0,2.0,-10.0,-19.0,2.0


In [13]:
data_digited=pd.get_dummies(origin,columns=['tianqi','fengxiang'],prefix_sep="_")

In [14]:
# data_digited.head()

In [15]:
# data_digited.describe()

### 因为污染物浓度和污染指数有直接的关系，所以把该列去除

In [16]:
corr=data_digited.corr()

In [17]:
# corr

In [18]:
data_digited.drop(['aqi'],axis=1,inplace=True)

In [19]:
data_digited.head()

Unnamed: 0,aqiLevel,max_temp,min_temp,wind_power,tianqi_中雨,tianqi_中雨~多云,tianqi_中雨~大雨,tianqi_中雨~小雨,tianqi_中雨~雷阵雨,tianqi_中雪~多云,...,tianqi_雷阵雨~阵雨,tianqi_雾~小雪,tianqi_霾~多云,fengxiang_东北风,fengxiang_东南风,fengxiang_东风,fengxiang_南风,fengxiang_西北风,fengxiang_西南风,fengxiang_西风
0,2.0,-12.0,-21.0,3.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,2.0,-10.0,-19.0,2.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,3.0,-10.0,-21.0,3.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,2.0,-9.0,-21.0,3.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,2.0,-10.0,-19.0,2.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [20]:

standarScaler = StandardScaler()

std_data = standarScaler.fit_transform(data_digited)
std_data=pd.DataFrame(std_data,columns=data_digited.columns)


In [21]:
corr_2=std_data.corr()
corr_2
# f, ax = plt.subplots(figsize=(18, 6))
# sns.heatmap(corr_2, square=True)

Unnamed: 0,aqiLevel,max_temp,min_temp,wind_power,tianqi_中雨,tianqi_中雨~多云,tianqi_中雨~大雨,tianqi_中雨~小雨,tianqi_中雨~雷阵雨,tianqi_中雪~多云,...,tianqi_雷阵雨~阵雨,tianqi_雾~小雪,tianqi_霾~多云,fengxiang_东北风,fengxiang_东南风,fengxiang_东风,fengxiang_南风,fengxiang_西北风,fengxiang_西南风,fengxiang_西风
aqiLevel,1.000000,-0.391921,-0.446463,-0.039275,-0.039074,-0.039074,-0.039074,-0.067865,-0.055335,0.012044,...,-0.055335,0.063163,0.165400,-0.148100,-0.019240,-0.039074,0.114282,-0.062132,0.156216,0.012044
max_temp,-0.391921,1.000000,0.979039,0.082435,0.021057,0.036069,0.032316,0.071336,0.077653,-0.046495,...,0.082967,-0.084024,-0.065259,0.167500,0.173736,0.058586,-0.057753,-0.159260,-0.073897,-0.087776
min_temp,-0.446463,0.979039,1.000000,0.057815,0.029961,0.037259,0.033610,0.102737,0.091520,-0.046666,...,0.083769,-0.086804,-0.061261,0.204154,0.194881,0.062801,-0.064910,-0.164819,-0.107241,-0.083155
wind_power,-0.039275,0.082435,0.057815,1.000000,0.017125,0.017125,0.087356,-0.010917,0.024251,0.087356,...,-0.025478,0.017125,-0.053106,-0.142512,-0.205154,-0.053106,-0.053106,0.059849,0.187820,0.017125
tianqi_中雨,-0.039074,0.021057,0.029961,0.017125,1.000000,-0.002747,-0.002747,-0.004772,-0.003891,-0.002747,...,-0.003891,-0.002747,-0.002747,-0.019154,0.133105,-0.002747,-0.002747,-0.033086,-0.047871,-0.002747
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
fengxiang_东风,-0.039074,0.058586,0.062801,-0.053106,-0.002747,-0.002747,-0.002747,-0.004772,-0.003891,-0.002747,...,-0.003891,-0.002747,-0.002747,-0.019154,-0.020640,1.000000,-0.002747,-0.033086,-0.047871,-0.002747
fengxiang_南风,0.114282,-0.057753,-0.064910,-0.053106,-0.002747,-0.002747,-0.002747,-0.004772,-0.003891,-0.002747,...,-0.003891,-0.002747,-0.002747,-0.019154,-0.020640,-0.002747,1.000000,-0.033086,-0.047871,-0.002747
fengxiang_西北风,-0.062132,-0.159260,-0.164819,0.059849,-0.033086,-0.033086,-0.033086,0.009762,-0.046855,-0.033086,...,-0.046855,-0.033086,-0.033086,-0.230676,-0.248571,-0.033086,-0.033086,1.000000,-0.576533,-0.033086
fengxiang_西南风,0.156216,-0.073897,-0.107241,0.187820,-0.047871,0.057388,-0.047871,-0.022205,0.006739,-0.047871,...,0.006739,-0.047871,0.057388,-0.333760,-0.359652,-0.047871,-0.047871,-0.576533,1.000000,-0.047871


### KNN

In [22]:
y=data_digited['aqiLevel']
x=data_digited.drop('aqiLevel',axis=1)

In [23]:
y=np.where(y>=3,1,0)

In [24]:
pd.Series(y).value_counts()

0    309
1     56
dtype: int64

In [34]:
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2)
standarScaler = StandardScaler()
standarScaler.fit(X_train)

X_train_std = standarScaler.transform(X_train)
X_test_std = standarScaler.transform(X_test)
# 模型训练和测试
for i in range(2,9):
    knn_clf = KNeighborsRegressor(n_neighbors=i)
    knn_clf.fit(X_train,y_train)
    score = knn_clf.score(X_test, y_test)

    print(score,i)

-0.15873015873015883 2
-0.17160493827160495 3
-0.14424603174603168 4
-0.06139682539682534 5
-0.01710758377425048 6
-0.00029154518950447184 7
0.0784474206349206 8


In [26]:
# knn_clf.predict(X_test_std[:10])

In [24]:
# y_test[:10]

### 线性回归

In [118]:
from sklearn.linear_model import LinearRegression
X_train, X_test, y_train, y_test = \
    train_test_split(x, y, test_size=0.2)

lin_reg = LinearRegression()

# standardScaler = StandardScaler()

# X_standard = standardScaler.fit_transform(X_train)

# standardScaler.fit(y_test)
# x_test_standard = standardScaler.transform(X_test)

lin_reg.fit(X_train, y_train)

# 打印结果
# print(lin_reg.coef_)
# print(lin_reg.intercept_)
print(lin_reg.score(X_test, y_test))

-4.6170925010678546e+20


In [26]:
y.value_counts(normalize=True)

1.0    0.509589
2.0    0.336986
3.0    0.071233
4.0    0.046575
5.0    0.032877
6.0    0.002740
Name: aqiLevel, dtype: float64

## svm

In [117]:
from sklearn.svm import LinearSVC,SVC
from matplotlib.colors import ListedColormap
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2)
# 归一化尺度
standardScaler = StandardScaler()
X_standard = standardScaler.fit_transform(X_train)

# standardScaler.fit(y_test)
x_test_standard = standardScaler.transform(X_test)

model = LinearSVC(C=1) #C=1:0.9333333333333333   C=1000000 0.9666666666666667
model.fit(X_standard, y_train)
model.score(x_test_standard,y_test)

0.9863013698630136

In [28]:
# model.predict(x[:20])

In [29]:
# y[:20]

### 多项式特征

In [94]:
x2=x.copy()
y2=y.copy()

In [95]:
from sklearn.preprocessing import PolynomialFeatures
poly=PolynomialFeatures(degree=2,include_bias=False,interaction_only=True)
x2_ploy=poly.fit_transform(x2)
x2_ploy.shape

(365, 2080)

In [114]:
X_train, X_test, y_train, y_test = train_test_split(x2_ploy, y2, test_size=0.2)
# 归一化尺度
standardScaler = StandardScaler()
X_standard = standardScaler.fit_transform(X_train)

# standardScaler.fit(y_test)
x_test_standard = standardScaler.transform(X_test)

model = LinearSVC(C=1) #C=1:0.9333333333333333   C=1000000 0.9666666666666667
model.fit(X_standard, y_train)
model.score(x_test_standard,y_test)



0.9726027397260274

In [104]:
from sklearn.linear_model import LogisticRegression

X_train, X_test, y_train, y_test = train_test_split(x2_ploy, y2, test_size=0.2)
# 归一化尺度
standardScaler = StandardScaler()
X_standard = standardScaler.fit_transform(X_train)

# standardScaler.fit(y_test)
x_test_standard = standardScaler.transform(X_test)

model = LogisticRegression()
model.fit(X_standard, y_train)
model.score(x_test_standard,y_test)

0.9315068493150684