In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import xgboost as xgb
import seaborn as sns
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
import sklearn.metrics as metrics
import missingno as msno
from sklearn.ensemble import RandomForestRegressor

custom_style = {'axes.labelcolor': 'white',
                'xtick.color': 'white',
                'ytick.color': 'white'}
sns.set_style("darkgrid", rc=custom_style)
plt.style.use('dark_background')

%precision 3
%matplotlib inline

#### データの読み込み

In [2]:
# Load both CSV files, parsing the "datetime" column into timestamps.
df = pd.read_csv("./train.csv", parse_dates=["datetime"])
df_test = pd.read_csv("./test.csv", parse_dates=["datetime"])
# Stack the two frames so the feature engineering below is applied to both.
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported
# equivalent. Row labels are kept as-is (no ignore_index), matching what
# append did, and sort=False keeps the column order of the first frame.
data = pd.concat([df, df_test], sort=False)

#### feature engineering

In [3]:
# Derive calendar features from the timestamp column through the .dt accessor.
# Each name below is both the accessor attribute and the new column name.
data = data.assign(**{
    part: getattr(data["datetime"].dt, part)
    for part in ("year", "month", "day", "dayofweek", "hour")
})

#### RandomForestを使って windspeed = 0 を穴埋め

In [4]:
# Rows with windspeed == 0 are filled in with predictions from a random forest
# trained on the rows whose windspeed is non-zero.
pick = ['season', 'weather', 'temp', 'atemp', 'humidity', 'month', 'year']
# Fixed seed: without it the imputed values (and every downstream score)
# change on each Restart & Run All.
reg = RandomForestRegressor(random_state=42)
win0 = data[data["windspeed"] == 0].copy()
winNot0 = data[data["windspeed"] != 0].copy()
reg.fit(X=winNot0[pick], y=winNot0["windspeed"])
win0Value = reg.predict(win0[pick])
win0.loc[:, "windspeed"] = win0Value
# DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent.
# The result lists the non-zero rows first, then the imputed rows, with the
# original row labels preserved.
data = pd.concat([winNot0, win0], sort=False)

#### データ種別の定義

In [5]:
# Feature groupings by role.
categoricalFeatureNames = [
    "season", "holiday", "workingday", "weather",
    "dayofweek", "month", "year", "hour",
]
numericalFeatureNames = ["temp", "humidity", "windspeed", "atemp"]
dropFeatures = ['casual', "count", "datetime", "registered"]
# Cast every categorical column to pandas' "category" dtype in one call.
data = data.astype({name: "category" for name in categoricalFeatureNames})

#### Train / Test に分割

In [6]:
# Rows with a known "count" become the training set; rows where it is missing
# become the test set (presumably the rows that came from test.csv).
# Both are ordered chronologically.
dataTrain = data[data["count"].notna()].sort_values(by="datetime")
dataTest = data[data["count"].isna()].sort_values(by="datetime")

In [7]:
# Keep the test timestamps and the three training targets as standalone Series.
datatimecol = dataTest["datetime"]
yLabels, yLabelsRegistered, yLabelCasual = (
    dataTrain[target] for target in ("count", "registered", "casual")
)

#### 不要なデータを削除

In [8]:
# Remove the columns listed in dropFeatures from both feature matrices.
dataTrain = dataTrain.drop(columns=dropFeatures)
dataTest = dataTest.drop(columns=dropFeatures)

#### 評価指標としてRMSLE(Root Mean Squared Log Error)を追加
$RMSLE = \sqrt{\frac{1}{N} \sum_{i=1}^{N} \left(\log(y_i + 1) - \log(y'_i + 1)\right)^2}$

In [9]:
def rmsle(y, y_, convertExp=True):
    """Root Mean Squared Logarithmic Error between ``y`` and ``y_``.

    Computes ``sqrt(mean((log(a + 1) - log(p + 1)) ** 2))`` where ``a`` and
    ``p`` are the actual and predicted values on the original scale.

    Parameters
    ----------
    y : array-like
        Actual values.
    y_ : array-like
        Predicted values, same length as ``y``.
    convertExp : bool, default True
        When True, both inputs are treated as ``log1p``-transformed and are
        mapped back to the original scale first. When False, the inputs are
        used as given.

    Returns
    -------
    float
        The RMSLE. Non-finite log terms are replaced by ``np.nan_to_num``
        before averaging.
    """
    actual = np.asarray(y, dtype=float)
    predicted = np.asarray(y_, dtype=float)
    if convertExp:
        # expm1 is the exact inverse of log1p. Plain exp() returns value + 1,
        # which made the metric compare log(value + 2) instead of the
        # documented log(value + 1).
        actual = np.expm1(actual)
        predicted = np.expm1(predicted)

    # Vectorized log(v + 1); nan_to_num keeps the original handling of
    # NaN/inf terms (e.g. from values at or below -1).
    log1 = np.nan_to_num(np.log1p(actual))
    log2 = np.nan_to_num(np.log1p(predicted))
    return np.sqrt(np.mean((log1 - log2) ** 2))
        

#### Linear Regression Model

In [10]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
import warnings
# Ignore DeprecationWarning messages and turn off pandas' chained-assignment
# check (setting the option to None disables it).
warnings.filterwarnings(action="ignore", category=DeprecationWarning)
pd.set_option("mode.chained_assignment", None)

In [None]:
# Baseline: linear regression fitted on log1p(count).
# The score below is computed on the same rows the model was trained on,
# so it is an in-sample RMSLE rather than a validation score.
y = np.log1p(yLabels)
lr = LinearRegression().fit(X=dataTrain, y=y)
pred = lr.predict(dataTrain)
lr_rmsle = rmsle(y, pred, convertExp=True)
print("RMSLE = {}".format(lr_rmsle))