In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

### 1. 处理数据

In [3]:
origin_data=pd.read_csv('../data/per_month_sale_and_risk.csv')

In [4]:
data=origin_data[['开始时间','风险值','地区名','销售额']]
data['销售额']=data['销售额'].astype(np.int32)
data

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['销售额']=data['销售额'].astype(np.int32)


Unnamed: 0,开始时间,风险值,地区名,销售额
0,2019-01-01,0.147242,滨湖区,313
1,2019-02-01,0.187812,滨湖区,376
2,2019-03-01,0.313739,滨湖区,501
3,2019-04-01,0.462676,滨湖区,627
4,2019-05-01,0.523725,滨湖区,752
...,...,...,...,...
70938,2019-08-01,0.635473,梁溪区,1349
70939,2019-09-01,0.549814,梁溪区,1141
70940,2019-10-01,0.372998,梁溪区,1038
70941,2019-11-01,0.268777,梁溪区,934


#### 把地区名转化为数值

先记住对离散数据进行编码的总原则：

    离散特征的取值之间没有大小的意义，比如color：[red,blue], 性别的男女等，那么就使用OneHot编码
    离散特征的取值有大小的意义，比如size:[X,XL,XXL],身高的高，中，低等，那么就使用数值的映射（数字）{X:1,XL:2,XXL:3}进行编码

In [5]:
data_del=pd.get_dummies(data,columns=['地区名'])

#### 新建月份列

In [6]:
data_del['month']=data_del['开始时间'].map(lambda t:int(t.split('-')[1]))

data_del.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70943 entries, 0 to 70942
Data columns (total 10 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   开始时间     70943 non-null  object 
 1   风险值      70943 non-null  float64
 2   销售额      70943 non-null  int32  
 3   地区名_宜兴市  70943 non-null  uint8  
 4   地区名_新吴区  70943 non-null  uint8  
 5   地区名_梁溪区  70943 non-null  uint8  
 6   地区名_江阴市  70943 non-null  uint8  
 7   地区名_滨湖区  70943 non-null  uint8  
 8   地区名_锡山区  70943 non-null  uint8  
 9   month    70943 non-null  int64  
dtypes: float64(1), int32(1), int64(1), object(1), uint8(6)
memory usage: 2.3+ MB


In [7]:
data_del=data_del.drop('开始时间',axis=1)
data_del.head()

Unnamed: 0,风险值,销售额,地区名_宜兴市,地区名_新吴区,地区名_梁溪区,地区名_江阴市,地区名_滨湖区,地区名_锡山区,month
0,0.147242,313,0,0,0,0,1,0,1
1,0.187812,376,0,0,0,0,1,0,2
2,0.313739,501,0,0,0,0,1,0,3
3,0.462676,627,0,0,0,0,1,0,4
4,0.523725,752,0,0,0,0,1,0,5


#### sklearn 预测

In [8]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
# from xgboost import XGBClassifier 
from sklearn.ensemble import RandomForestRegressor



In [10]:
y=data_del['销售额']

x=data_del.drop('销售额',axis=1)



### 1. 随机深林 (score=83.6%,n_estimators=100)

In [25]:
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

不同参数结果

    score          n_estimators
    
0.7680222001122863 1

0.8308109412227868 10

0.835274281001485 100

0.8366202323912915 1000

### 结论 最佳参数100

In [26]:
for i in [0.1,1,10,100,1000]:
    rgs_model = RandomForestRegressor(n_estimators=i)  ##随机森林模型
    rgs_model = rgs_model.fit(X_train, y_train)
    score=rgs_model.score(X_test,y_test)
    print(score,i)

0.8361325140380509 100
0.8361919165080738 200
0.837251716837464 300


### 2. KNN(score=88.2%,n_neighbors=14)

In [27]:
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2)
standarScaler = StandardScaler()
standarScaler.fit(X_train)
X_train_std = standarScaler.transform(X_train)
X_test_std = standarScaler.transform(X_test)
for i in range(1,30):
    knn_clf = KNeighborsRegressor(n_neighbors=i)
    knn_clf.fit(X_train_std,y_train)
    score = knn_clf.score(X_test_std, y_test)
    knn_clf.predict(X_test_std[:1])
    print(score,i)

0.768191550585448 1
0.8280784015862065 2
0.8494303986778422 3
0.8579214553517528 4
0.863635308872755 5
0.866919350264251 6
0.8710142594226077 7
0.8724439393599112 8
0.8735370091682908 9
0.8744477430327371 10
0.8760696834252879 11
0.8767593330841652 12
0.877738599490365 13
0.8779711217845223 14
0.8782086123788285 15
0.8783878118123347 16
0.8788880217025855 17
0.8792908850805572 18
0.8793764553234711 19
0.8797348783823326 20
0.8799227193403324 21
0.8802392671105502 22
0.8800774965398545 23
0.8800710553477319 24
0.8801665377240305 25
0.8801759539882903 26
0.8803104408767148 27
0.8801852338612004 28
0.8805096172474947 29


In [13]:
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2)
standarScaler = StandardScaler()
standarScaler.fit(X_train)
X_train_std = standarScaler.transform(X_train)
X_test_std = standarScaler.transform(X_test)
knn_clf = KNeighborsRegressor(n_neighbors=14)
knn_clf.fit(X_train_std,y_train)
score = knn_clf.score(X_test_std, y_test)
r=knn_clf.predict(X_test_std[:10])
print(score)
print(r)

0.8788317801564748
[1927.57142857  506.42857143  393.14285714  693.85714286 1215.
 1669.21428571  535.78571429  669.07142857  463.78571429  256.57142857]


In [14]:
print(y_test[:10])

64505    2080
31958     394
19445     360
19929     784
40047    1328
35535    2358
64135     435
10634     706
62800     325
33761     182
Name: 销售额, dtype: int32


### 3. 线性回归(score=51.9%)  多项式回归（score=86.3,degree=4）

In [29]:
from sklearn.linear_model import LinearRegression

In [30]:
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2)
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)
lr_model.score(X_test, y_test)

0.762943601983751

In [31]:
from sklearn.preprocessing import PolynomialFeatures



In [32]:
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2)
X_train.shape

(56754, 8)

In [33]:
poly_features_2=PolynomialFeatures(degree=4,include_bias=False)

# 构建训练数据的特征矩阵
poly_train_2=poly_features_2.fit_transform(np.array(X_train).reshape(len(X_train),8))

# 构建测试数据的特征矩阵
poly_test_2=poly_features_2.fit_transform(np.array(X_test).reshape(len(X_test),8))

model_2=LinearRegression()

model_2.fit(poly_train_2,np.array(y_train).reshape(len(y_train),1))
score=model_2.score(poly_test_2,np.array(y_test).reshape(len(y_test),1))

print(score)

0.8576458898832867


### 4. SVM (适合分类)

In [10]:
from sklearn.svm import LinearSVC,SVC
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2)
svc_model=SVC(C=1,kernel='linear')
svc_model.fit(X_train,y_train)
score = svc_model.score(X_test, y_test)
print(score)