In [0]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
import seaborn as sns
from scipy import stats

%matplotlib inline

plt.style.use('ggplot')

  import pandas.util.testing as tm


In [0]:
# Load the training data; the "datetime" column is parsed into datetime values.
# NOTE(review): the saved output of this cell is a FileNotFoundError, so
# train.csv must be present in the working directory before running.
train = pd.read_csv("train.csv", parse_dates=["datetime"])

FileNotFoundError: ignored

In [0]:
# Show the (rows, columns) size of the training frame.
train.shape

In [0]:
# Print the per-column summary of the training frame.
train.info()

In [0]:
# Preview the first rows of the training frame.
train.head()

In [0]:
# List the dtype of every column.
train.dtypes

In [0]:
# Count the missing values in each column.
train.isnull().sum()

In [0]:
# Draw missingno's matrix plot for the training frame at 12x5 inches.
import missingno as msno
msno.matrix(train, figsize=(12,5))

In [0]:
# Split the timestamp into one column per calendar / clock component.
for part in ("year", "month", "day", "hour", "minute", "second"):
    train[part] = getattr(train["datetime"].dt, part)
train.shape

In [0]:
# Preview the frame again, now including the columns derived from "datetime".
train.head()

In [0]:
from IPython.display import set_matplotlib_formats

## Korean font setting (the original comment says this is for Windows)
plt.rc('font', family='NanumGothic')

# Turn off the Unicode minus sign for axis labels (axes.unicode_minus=False),
# presumably because the font above lacks that glyph -- TODO confirm.
plt.rc('axes', unicode_minus=False)

# Make the fonts render sharply
set_matplotlib_formats('retina')

In [0]:
# One bar chart of 'count' per time unit, laid out on a 2x3 grid.
figure, ((ax1, ax2, ax3), (ax4, ax5, ax6)) = plt.subplots(nrows=2, ncols=3)
figure.set_size_inches(18, 8)

time_units = ('year', 'month', 'day', 'hour', 'minute', 'second')
for unit, panel in zip(time_units, (ax1, ax2, ax3, ax4, ax5, ax6)):
    sns.barplot(data=train, x=unit, y='count', ax=panel)

In [0]:
# Box plots of 'count': overall, then grouped by season, hour and workingday.
fig, axes = plt.subplots(nrows=2, ncols=2)
fig.set_size_inches(12, 10)

# x=None (the seaborn default) draws a single box for the whole column.
groupings = (None, 'season', 'hour', 'workingday')
for panel, grouping in zip(axes.ravel(), groupings):
    sns.boxplot(data=train, y='count', x=grouping, orient='v', ax=panel)

In [0]:
# Add the day-of-week number taken from the timestamp.
train['dayofweek']=train['datetime'].dt.dayofweek
train.shape

In [0]:
# Number of rows for each day-of-week value.
train['dayofweek'].value_counts()

In [0]:
# 'count' by hour of day: first overall, then split by four different columns.
fig, (ax1, ax2, ax3, ax4, ax5) = plt.subplots(nrows=5)
fig.set_size_inches(18, 25)

# hue=None (the seaborn default) draws a single line with no split.
hue_by_panel = (
    (ax1, None),
    (ax2, 'workingday'),
    (ax3, 'dayofweek'),
    (ax4, 'weather'),
    (ax5, 'season'),
)
for panel, hue_column in hue_by_panel:
    sns.pointplot(data=train, x='hour', y='count', hue=hue_column, ax=panel)

In [0]:
# Pairwise correlations between the weather readings and the rental counts.
corr_columns = ['temp','atemp','casual','registered','humidity','windspeed','count']
corrMatt = train[corr_columns].corr()
print(corrMatt)

# Boolean mask for the heatmap: True hides a cell. Only the cells strictly
# above the diagonal are hidden, so the diagonal and lower triangle are drawn.
# An explicit boolean array is used instead of reusing the correlation values
# as truth values, which would un-hide any cell whose correlation is exactly 0.
mask = np.zeros(corrMatt.shape, dtype=bool)
mask[np.triu_indices_from(mask, k=1)] = True

In [0]:
# Heatmap of the correlation matrix, with the cells selected by `mask` hidden.
fig, ax = plt.subplots()
fig.set_size_inches(20, 10)
# Correlations are bounded by 1, so the colour scale is capped at 0.8; the
# previous cap of 8 compressed every cell into a sliver of the colour range.
# The Axes created above is passed explicitly rather than relying on the
# implicit "current axes".
sns.heatmap(corrMatt, mask=mask, vmax=.8, square=True, annot=True, ax=ax)

In [0]:
# Scatter plot with fitted regression line of 'count' against three readings.
fig, (ax1, ax2, ax3) = plt.subplots(ncols=3)
fig.set_size_inches(12, 5)

for predictor, panel in zip(('temp', 'windspeed', 'humidity'), (ax1, ax2, ax3)):
    sns.regplot(x=predictor, y='count', data=train, ax=panel)

In [0]:
def concatenate_year_month(datetime):
    """Return a "<year>-<month>" label for a timestamp-like object.

    Only the ``year`` and ``month`` attributes of ``datetime`` are read.
    The month is not zero-padded: year 2011 and month 1 give "2011-1".
    """
    year, month = datetime.year, datetime.month
    return f"{year}-{month}"
# Add a "year-month" label column built from each timestamp.
train['year_month']=train['datetime'].apply(concatenate_year_month)

print(train.shape)
train[['datetime','year_month']].head()

In [0]:
# First figure: 'count' by year and by month, side by side.
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2)
fig.set_size_inches(18, 4)

for period, panel in (('year', ax1), ('month', ax2)):
    sns.barplot(data=train, x=period, y='count', ax=panel)

# Second figure: 'count' for every year-month label on one wide axis.
fig, ax3 = plt.subplots(nrows=1, ncols=1)
fig.set_size_inches(18, 4)

sns.barplot(data=train, x='year_month', y='count', ax=ax3)

In [0]:
# Keep only the rows whose 'count' lies within three standard deviations
# of the mean 'count'.
count_mean = train['count'].mean()
count_std = train['count'].std()
within_three_sigma = (train['count'] - count_mean).abs() <= 3 * count_std
trainWithoutOutliers = train[within_three_sigma]

print(train.shape)
print(trainWithoutOutliers.shape)

In [0]:
# Distribution of 'count' (left) next to its normal probability plot (right).
figure, axes = plt.subplots(ncols=2, nrows=2)
figure.set_size_inches(12, 10)

# Top row: raw counts from the full training frame.
sns.distplot(train['count'], ax=axes[0][0])
stats.probplot(train['count'], dist='norm', fit=True, plot=axes[0][1])

# Bottom row: log-transformed counts with the outliers removed. Both panels
# now use the same log1p transform; previously the histogram used log while
# the probability plot used log1p, so the pair did not describe the same data.
log_count = np.log1p(trainWithoutOutliers['count'])
sns.distplot(log_count, ax=axes[1][0])
stats.probplot(log_count, dist='norm', fit=True, plot=axes[1][1])

In [0]:
# Load the test data; the "datetime" column is parsed into datetime values.
test=pd.read_csv('test.csv', parse_dates=["datetime"])

In [0]:
# Show the (rows, columns) size of the test frame.
test.shape

In [0]:
# Derive the same calendar / clock columns on the test frame as on train.
datetime_parts = ("year", "month", "day", "hour", "minute", "second", "dayofweek")
for part in datetime_parts:
    test[part] = getattr(test["datetime"].dt, part)
test.shape

In [0]:
# Row counts per distinct windspeed value, train on top and test below.
fig, axes = plt.subplots(nrows=2)
fig.set_size_inches(18, 10)

windspeed_panels = (
    (axes[0], train, 'train windspeed'),
    (axes[1], test, 'test windspeed'),
)
for panel, frame, heading in windspeed_panels:
    # Make this panel the current axes so plt.xticks applies to it.
    plt.sca(panel)
    plt.xticks(rotation=30, ha='right')
    panel.set(ylabel='Count', title=heading)
    sns.countplot(data=frame, x='windspeed', ax=panel)

In [0]:
# Split the training rows into zero and non-zero windspeed and compare sizes.
trainWind0=train.loc[train['windspeed']==0]
trainWindNot0=train.loc[train['windspeed']!=0]
print(trainWind0.shape)
print(trainWindNot0.shape)

In [0]:
# 그래서 머신러닝으로 예측을 해서 풍속을 넣어주도록 한다.
from sklearn.ensemble import RandomForestClassifier

def predict_windspeed(data, random_state=None):
    """Replace zero wind-speed readings with values predicted by a random forest.

    Rows whose ``windspeed`` is non-zero train a ``RandomForestClassifier`` on
    the feature columns listed in ``wCol``; each distinct non-zero wind speed,
    converted to a string, is one class label. The fitted model then predicts
    a wind speed for every row recorded as 0.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``windspeed`` and the feature columns ``season``,
        ``weather``, ``humidity``, ``month``, ``temp``, ``year`` and ``atemp``.
        The frame passed in is not modified.
    random_state : int or None, optional
        Forwarded to ``RandomForestClassifier``. Pass an integer to make the
        imputed values reproducible; the default ``None`` leaves the
        classifier unseeded, as before.

    Returns
    -------
    pandas.DataFrame
        A new frame with a fresh 0..n-1 index and a float ``windspeed``
        column. Rows whose wind speed was originally non-zero come first,
        followed by the rows whose wind speed was imputed. When there is
        nothing to impute (no zero rows) or nothing to learn from (no
        non-zero rows), the rows are returned in their original order with
        ``windspeed`` unchanged.
    """
    # Features used to predict the wind speed.
    wCol = ["season", "weather", "humidity", "month", "temp", "year", "atemp"]

    # Split into rows recorded as 0 and all the others. Explicit copies are
    # taken so the assignments below never write into a view of `data`.
    is_zero = data["windspeed"] == 0
    dataWind0 = data.loc[is_zero].copy()
    dataWindNot0 = data.loc[~is_zero].copy()

    # Without zero rows there is nothing to predict, and without non-zero rows
    # there is nothing to fit on. Returning early also keeps the function safe
    # to call again on a frame that has already been imputed.
    if dataWind0.empty or dataWindNot0.empty:
        result = data.reset_index(drop=True)
        result["windspeed"] = result["windspeed"].astype("float")
        return result

    # The classifier needs discrete labels, so the wind speeds are passed as
    # strings.
    rfModel_wind = RandomForestClassifier(random_state=random_state)
    rfModel_wind.fit(dataWindNot0[wCol], dataWindNot0["windspeed"].astype("str"))

    # Predict a label for every zero row and convert it back to a number.
    wind0Values = rfModel_wind.predict(X=dataWind0[wCol])
    dataWind0["windspeed"] = wind0Values.astype("float")

    # Non-zero rows first, imputed rows after, then renumber the index.
    data = pd.concat([dataWindNot0, dataWind0])
    data["windspeed"] = data["windspeed"].astype("float")

    return data.reset_index(drop=True)

In [0]:
# Adjust the zero values.
# NOTE(review): only train is imputed here; the matching call for test is
# commented out, so test keeps its zero windspeed readings -- confirm that
# this is intentional.
train = predict_windspeed(train)
# test = predict_windspeed(test)

# Visualize the data after adjusting the zero windspeed values
fig, ax1 = plt.subplots()
fig.set_size_inches(18,6)

plt.sca(ax1)
plt.xticks(rotation=30, ha='right')
ax1.set(ylabel='Count',title="train windspeed")
sns.countplot(data=train, x="windspeed", ax=ax1)

In [0]:
# Columns to store as categorical rather than numeric.
categorical_feature_names=['season','holiday','workingday','weather','dayofweek','month','year','hour']

# Cast each of them to the 'category' dtype in both frames.
for var in categorical_feature_names:
    train[var]=train[var].astype('category')
    test[var]=test[var].astype('category')

In [0]:
# Columns used as model inputs.
feature_names=['season','weather','temp','atemp','humidity','windspeed','year','hour','dayofweek','holiday','workingday']
feature_names

In [0]:
# Training inputs: the selected feature columns of train.
X_train=train[feature_names]
print(X_train.shape)
X_train.head()

In [0]:
# Test inputs: the same feature columns taken from test.
X_test=test[feature_names]
print(X_test.shape)
X_test.head()

In [0]:
# Training target: the 'count' column of train.
label_name='count'
y_train=train[label_name]
print(y_train.shape)
y_train.head()

In [0]:
from sklearn.metrics import make_scorer

In [0]:
def rmsle(predicted_values, actual_values):
    """Root mean squared logarithmic error between predictions and targets.

    Computes ``sqrt(mean((log(p + 1) - log(a + 1)) ** 2))``.

    Parameters
    ----------
    predicted_values : array-like
        Predicted values. Lists, NumPy arrays and pandas Series are accepted.
    actual_values : array-like
        Observed values, with the same length as ``predicted_values``.

    Returns
    -------
    float
        The RMSLE. The formula is symmetric in its two arguments, so swapping
        them gives the same score.
    """
    # Convert BOTH inputs to arrays. A misspelt variable name previously left
    # ``actual_values`` unconverted, so a plain list raised a TypeError at the
    # ``+ 1`` step.
    predicted = np.asarray(predicted_values, dtype=float)
    actual = np.asarray(actual_values, dtype=float)

    # log1p(x) is log(x + 1).
    squared_log_error = np.square(np.log1p(predicted) - np.log1p(actual))

    return np.sqrt(squared_log_error.mean())

# Wrap rmsle so it can be passed as a `scoring` argument.
# NOTE(review): make_scorer is called with its defaults here; confirm the sign
# convention before using this scorer to rank or select models.
rmsle_scorer=make_scorer(rmsle)
rmsle_scorer

In [0]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

# 10-fold splitter with shuffling and a fixed seed.
k_fold=KFold(n_splits=10, shuffle=True, random_state=29)

### RandomForest

In [0]:
from sklearn.ensemble import RandomForestRegressor

# NOTE(review): max_depth_list is not used by any cell visible here.
max_depth_list=[]
# Random forest regressor with 200 trees and a fixed seed.
model=RandomForestRegressor(n_estimators=200, n_jobs=-1, random_state=28)
model

In [0]:
# Cross-validate the model with the RMSLE scorer over the folds in k_fold;
# the %time magic reports how long the call takes.
%time score = cross_val_score(model, X_train, y_train, cv=k_fold, scoring=rmsle_scorer)
# Reduce the per-fold scores to their mean; `score` is rebound from the array
# of fold scores to a single number.
score=score.mean()
print('score={0:.5f}'.format(score))

In [0]:
# Fit the model on the full training set.
model.fit(X_train, y_train)

In [0]:
# Predict 'count' for the test inputs and preview the first ten values.
predictions=model.predict(X_test)
print(predictions.shape)
predictions[0:10]

In [0]:
# Compare the distribution of the training target with that of the predictions.
fig, (ax1, ax2) = plt.subplots(ncols=2)
fig.set_size_inches(12, 5)

for values, panel, heading in ((y_train, ax1, 'train'), (predictions, ax2, 'test')):
    sns.distplot(values, ax=panel, bins=50)
    panel.set(title=heading)

### Submit

In [0]:
# Load Submission.csv and overwrite its 'count' column with the predictions.
submission=pd.read_csv('Submission.csv')
# NOTE(review): a bare expression that is not the last line of a cell is not
# displayed, so the next line has no visible effect.
submission
submission['count']=predictions
print(submission.shape)
submission.head()

In [0]:
# Write the submission with the cross-validation score in the file name.
# The previous name 'submission.csv' had no placeholder, so .format(score)
# did nothing; it also differed only by case from the 'Submission.csv' file
# read earlier, which a case-insensitive file system would overwrite.
submission.to_csv('Score_{0:.5f}_submission.csv'.format(score), index=False)