# 预测城市自行车共享系统的使用情况

&emsp;&emsp;共享单车系统是一种自行车租赁方式，通过遍布城市的自助服务亭网络自动完成获取会员资格、租赁和归还自行车的过程。使用这些系统，人们可以根据需要在某个地方租用自行车，然后将其归还到另一个地方。目前，全球有 500 多个共享单车项目。

&emsp;&emsp;这些系统生成的数据对研究人员很有吸引力，因为出行时间、出发地点、到达地点和所用时间都被明确记录下来。因此，共享单车系统充当传感器网络，可用于研究城市的流动性。在本次比赛中，参赛者需要将历史使用模式与天气数据结合起来，以预测华盛顿特区 Capital Bikeshare 计划中的自行车租赁需求

![](data_describe.png)

In [358]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score, KFold
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import make_scorer
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler, MinMaxScaler

pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)

## 1 数据预处理

### 1.1 读取数据

In [320]:
train = pd.read_csv("train.csv")
result_features = pd.read_csv("test.csv")
sampleSubmission = pd.read_csv("sampleSubmission.csv")

In [359]:
print("温度的0：",len(train[train['temp']==0]))
print("体感温度的0：",len(train[train['atemp']==0]))
print("湿度的0：",len(train[train['humidity']==0]))
print("风力的0：",len(train[train['windspeed']==0]))

温度的0： 7
体感温度的0： 2
湿度的0： 0
风力的0： 767


In [360]:
train.head()

Unnamed: 0,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count,season_spring,season_summer,season_autumn,season_winter,hour,year,Monday,Tuesday,Wednesday,Thursday,Friday,Saturday,Sunday
0,0,0,1,0.22449,0.305068,0.81,0.05665,3,13,16,1,0,0,0,0,2011,0,0,0,0,0,1,0
1,0,0,1,0.204082,0.288064,0.8,0.063316,8,32,40,1,0,0,0,1,2011,0,0,0,0,0,1,0
2,0,0,1,0.204082,0.288064,0.8,0.061945,5,27,32,1,0,0,0,2,2011,0,0,0,0,0,1,0
3,0,0,1,0.22449,0.305068,0.75,0.063126,3,10,13,1,0,0,0,3,2011,0,0,0,0,0,1,0
4,0,0,1,0.22449,0.305068,0.75,0.061948,0,1,1,1,0,0,0,4,2011,0,0,0,0,0,1,0


In [322]:
# one-hot一下
train = pd.get_dummies(train, prefix='season', columns=['season'])
result_features = pd.get_dummies(result_features, prefix='season', columns=['season'])
# 改个名
train = train.rename(columns={'season_1':'season_spring', 'season_2':'season_summer', 'season_3':'season_autumn', 'season_4':'season_winter'})
result_features = result_features.rename(columns={'season_1':'season_spring', 'season_2':'season_summer', 'season_3':'season_autumn', 'season_4':'season_winter'})

In [323]:
# 看一下
#train.head()

### 1.2数据处理

#### ~~1.2.1 删掉两列没什么影响的标签（会员和非会员）~~
#### 1.2.1 这两列删不得
我们需要对把这两个标签再摘出来，对这两个标签分别做预测，然后加起来就是我们要的count

In [324]:
#train_features = train_features.drop(columns=['casual','registered'])
#train_features.head()
#test.head()

#### 1.2.2 对季节做独热编码

本来打算把天气也处理了，但是想想觉得天气的1-4可以代表天气的恶劣程度，于是不做处理。

#### 1.2.3 日期处理

In [325]:
train['datetime'] = pd.to_datetime(train['datetime'])
result_features['datetime'] = pd.to_datetime(result_features['datetime'])
# 提取时段
train['hour'] = train['datetime'].dt.hour
result_features['hour'] = result_features['datetime'].dt.hour
# 提取年份
train['year'] = train['datetime'].dt.year
result_features['year'] = result_features['datetime'].dt.year
# 提取月份
#train['month'] = train['datetime'].dt.month
#result_features['month'] = result_features['datetime'].dt.month
# 提取天
train['day'] = train['datetime'].dt.day
result_features['day'] = result_features['datetime'].dt.day
# 提取星期几（0=周一, 6=周日）
train['weekday'] = train['datetime'].dt.weekday
result_features['weekday'] = result_features['datetime'].dt.weekday

接下来删掉datetime，因为有用的东西已经提出来了。

In [326]:
train = train.drop(columns=['datetime'])
result_features = result_features.drop(columns=['datetime'])

In [327]:
# 看看
#train.head()

对星期几做一个独热编码，因为这个特征其实是一个离散的东西

In [328]:
# 再one-hot一下
train = pd.get_dummies(train, prefix='weekday', columns=['weekday'])
result_features = pd.get_dummies(result_features, prefix='weekday', columns=['weekday'])
# 改个名
train = train.rename(columns={'weekday_0':'Monday', 'weekday_1':'Tuesday', 'weekday_2':'Wednesday', 'weekday_3':'Thursday', 'weekday_4':'Friday', 'weekday_5':'Saturday', 'weekday_6':'Sunday'})
result_features = result_features.rename(columns={'weekday_0':'Monday', 'weekday_1':'Tuesday', 'weekday_2':'Wednesday', 'weekday_3':'Thursday', 'weekday_4':'Friday', 'weekday_5':'Saturday', 'weekday_6':'Sunday'})

#### 1.2.3 ~~标准化，~~归一化

In [329]:
features_to_scale = ['temp', 'atemp', 'humidity']

In [330]:
#scaler = StandardScaler()
#train[features_to_scale] = scaler.fit_transform(train[features_to_scale])
#result_features[features_to_scale] = scaler.fit_transform(result_features[features_to_scale])

In [331]:
normalizer = MinMaxScaler()
train[features_to_scale] = normalizer.fit_transform(train[features_to_scale])
result_features[features_to_scale] = normalizer.fit_transform(result_features[features_to_scale])

#### 1.2.4 补全风速的缺省值

In [332]:
zero_windspeed = train[train['windspeed'] == 0].copy()
non_zero_windspeed = train[train['windspeed'] != 0].copy()

In [333]:
# 把风速一栏删去做标签
X_wind_train = non_zero_windspeed.drop(columns=['windspeed','casual','registered','count'])
y_wind_train = non_zero_windspeed['windspeed']
X_wind_test = zero_windspeed.drop(columns=['windspeed','casual','registered','count'])

In [334]:
rf_wind = RandomForestRegressor(random_state=713)
# 训练
rf_wind.fit(X_wind_train, y_wind_train)
# 预测
windspeed_pred_train = rf_wind.predict(X_wind_test)
windspeed_pred_test = rf_wind.predict(result_features[result_features['windspeed'] == 0].drop(columns=['windspeed']))
# 用预测值替换零值
train.loc[train['windspeed'] == 0, 'windspeed'] = windspeed_pred_train
result_features.loc[result_features['windspeed'] == 0, 'windspeed'] = windspeed_pred_test

对风速也做~~标准化和~~归一化

In [335]:
#scaler = StandardScaler()
#train[['windspeed']] = scaler.fit_transform(train[['windspeed']])
#result_features[['windspeed']] = scaler.fit_transform(result_features[['windspeed']])

In [336]:
normalizer = MinMaxScaler()
train[['windspeed']] = normalizer.fit_transform(train[['windspeed']])
result_features[['windspeed']] = normalizer.fit_transform(result_features[['windspeed']])

#### 数据处理结束，存csv里面

In [337]:
train.to_csv("train_new.csv", index=False)
result_features.to_csv("result_features.csv", index=False)

In [338]:
train.head()

Unnamed: 0,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count,season_spring,season_summer,season_autumn,season_winter,hour,year,day,Monday,Tuesday,Wednesday,Thursday,Friday,Saturday,Sunday
0,0,0,1,0.22449,0.305068,0.81,0.05665,3,13,16,1,0,0,0,0,2011,1,0,0,0,0,0,1,0
1,0,0,1,0.204082,0.288064,0.8,0.063316,8,32,40,1,0,0,0,1,2011,1,0,0,0,0,0,1,0
2,0,0,1,0.204082,0.288064,0.8,0.061945,5,27,32,1,0,0,0,2,2011,1,0,0,0,0,0,1,0
3,0,0,1,0.22449,0.305068,0.75,0.063126,3,10,13,1,0,0,0,3,2011,1,0,0,0,0,0,1,0
4,0,0,1,0.22449,0.305068,0.75,0.061948,0,1,1,1,0,0,0,4,2011,1,0,0,0,0,0,1,0


## 2 训练、预测、评估

In [339]:
#读一下存好的训练集，等等用来划分
train_new = pd.read_csv("train_new.csv")
#train_new.info()
train = train_new[train_new['day'].isin([1, 2, 3, 4, 5, 6, 7, 8, 9, 12, 13, 14, 15, 16, 17, 18, 19])]
test = train_new[train_new['day'].isin([10, 11])]

删掉day

In [340]:
train = train.drop(columns=['day'])
test = test.drop(columns=['day'])
result_features = result_features.drop(columns=['day'])

In [341]:
# 拆数据集
#X_train, X_test, y_train, y_test = train_test_split(train_features, train_labels, test_size=0.1, random_state=229003001)
X_train = train.drop(columns=['casual', 'registered', 'count'])
X_test = test.drop(columns=['casual', 'registered', 'count'])
y_casual_train = train['casual']
y_registered_train = train['registered']
y_casual_test = test['casual']
y_registered_test = test['registered']

In [342]:
y_casual_train.head()

0    3
1    8
2    5
3    3
4    0
Name: casual, dtype: int64

这里要做一个自定义评估函数RMSLE

In [343]:
def rmsle(y_true, y_pred):
    assert len(y_true) == len(y_pred), "Length of y_true and y_pred must be the same."
    y_true = np.clip(y_true, a_min=0, a_max=None)
    y_pred = np.clip(y_pred, a_min=0, a_max=None)
    rmsle = np.sqrt(np.mean((np.log1p(y_pred) - np.log1p(y_true))**2))  
    return rmsle

### 2.1 随机森林

In [344]:
# 自定义一个评分器
rmsle_scorer = make_scorer(rmsle, greater_is_better=False)

rf = RandomForestRegressor(n_estimators=1000, min_samples_split = 11)

#### ~~2.1.1 K折交叉验证找参数再训练~~
#### 2.1.1 用自己拆的训练集训练并预测

In [345]:
# 定义交叉验证策略
#kf = KFold(n_splits=5, shuffle=True, random_state=229003001)
# 使用 cross_val_score 进行交叉验证
#scores = cross_val_score(model, X_train, y_train, cv=kf, scoring=make_scorer(rmsle, greater_is_better=False))
#best_model.fit(X_train, y_train, cat_features=categorical_features_indices)
#y_pred = best_model.predict(X_test)

先做casual的

In [346]:
# 训练模型
rf.fit(X_train, y_casual_train)
# 预测
y_casual_pred = rf.predict(X_test)
rmsle_score = rmsle(y_casual_test, y_casual_pred)
print("RMSLE Score (Casual) on Test Set:", rmsle_score)

RMSLE Score (Casual) on Test Set: 0.5815020243597163


In [347]:
y_casual_result = rf.predict(result_features)
y_casual_result

array([1.59613726, 0.67496292, 0.74712064, ..., 4.30289573, 2.71965659,
       2.41010144])

再做registered的

In [348]:
rf.fit(X_train, y_registered_train)
y_registered_pred = rf.predict(X_test)
rmsle_score_registered = rmsle(y_registered_test, y_registered_pred)
print("RMSLE Score (Registered) on Test Set:", rmsle_score_registered)

RMSLE Score (Registered) on Test Set: 0.3571507661534743


In [349]:
y_registered_result = rf.predict(result_features)
y_registered_result

array([10.56348365,  4.92515479,  2.05818106, ..., 96.65455136,
       82.63013004, 49.66932516])

加起来就是我们要的count

In [350]:
y_result = y_casual_result + y_registered_result

In [351]:
sampleSubmission['count'] = pd.DataFrame({'count': y_result})
sampleSubmission.to_csv("sampleSubmission_randomforest_new_new.csv", index=False)
print("OK")

OK


最终得分0.41多

#### 关于提交
不能有负值，日期格式得在excel里改为“yyyy-mm-dd hh:mm:ss”

In [352]:
y_result[y_result<0].sum()

0.0