In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt
import scipy
import warnings
warnings.filterwarnings('ignore')

In [2]:
train = pd.read_csv('dataset/train.csv')
test = pd.read_csv('dataset/test.csv')
submission = pd.read_csv('dataset/sampleSubmission.csv')

In [3]:
train['datetime'] = pd.to_datetime(train['datetime'])
test['datetime'] = pd.to_datetime(test['datetime'])

In [4]:
# datatime 쪼개기
train['year'] = train['datetime'].dt.year
train['month'] = train['datetime'].dt.month
train['day'] = train['datetime'].dt.day
train['hour'] = train['datetime'].dt.hour
train['minute'] = train['datetime'].dt.minute
train['second'] = train['datetime'].dt.second
# dayofweek 는 요일을 가져오는 말
#월(0) 화(1) 수(2) 목(3) 금(4) 토(5) 일(6)
train['dayofweek'] = train['datetime'].dt.dayofweek

test['year'] = test['datetime'].dt.year
test['month'] = test['datetime'].dt.month
test['day'] = test['datetime'].dt.day
test['hour'] = test['datetime'].dt.hour
test['minute'] = test['datetime'].dt.minute
test['second'] = test['datetime'].dt.second
# dayofweek 는 요일을 가져오는 말
#월(0) 화(1) 수(2) 목(3) 금(4) 토(5) 일(6)
test['dayofweek'] = test['datetime'].dt.dayofweek

In [5]:
from collections import Counter

def detect_outliers(data, n, cols):
    outlier_indices = []
    for col in cols:
        Q1 = np.percentile(data[col], 25)
        Q3 = np.percentile(data[col], 75)
        IQR = Q3 - Q1
        
        outlier_step = 1.5 * IQR
        
        outlier_list_col = data[(data[col] < Q1 - outlier_step) | (data[col] > Q3 + outlier_step)].index
        outlier_indices.extend(outlier_list_col)
    outlier_indices = Counter(outlier_indices)
    multiple_outliers = list(k for k, v in outlier_indices.items() if v > n)
    
    return multiple_outliers

In [6]:
outliers_to_drop = detect_outliers(train, 2, ["temp", "atemp", "casual", "registered", "humidity", "windspeed", "count"])

In [8]:
# Randomforest로 대체
from sklearn.ensemble import RandomForestClassifier

def predict_windspeed(data):
    wind0 = data.loc[data['windspeed'] == 0]
    windnot0 = data.loc[data['windspeed'] != 0]
    
    #풍속이 날씨변수이기 때문에 날씨변수를 활용해서 windspeed를 예측해줄 것
    col = ['season', 'weather', 'temp', 'humidity', 'atemp', 'day']
    windnot0['windspeed'] = windnot0['windspeed'].astype('str')
    
    rf = RandomForestClassifier()
    #windspeed가 0이 아닌 컬럼으로 fit 해줌
    #model.fit(X_train, y_train)
    rf.fit(windnot0[col], windnot0['windspeed'])
 
    #windspeed가 0인 부분을 예측
    #model.predict(X_test)
    pred_wind0 = rf.predict(X = wind0[col])
    
    #wind0의 windspeed 값을 pred_wind0으로 바꿔주고
    wind0['windspeed'] = pred_wind0
    
    #windnot0과 wind0을 합쳐준다
    data = windnot0.append(wind0)
    data['windspeed'] = data['windspeed'].astype('float')
    
    data.reset_index(inplace = True)
    data.drop("index", inplace = True, axis = 1)
   
    return data

In [9]:
train = predict_windspeed(train)
test = predict_windspeed(test)

In [10]:
train = pd.get_dummies(train, columns = ['weather'], prefix = 'weather')
test = pd.get_dummies(test, columns = ['weather'], prefix = 'weather')

train = pd.get_dummies(train, columns = ['season'], prefix = 'season')
test = pd.get_dummies(test, columns = ['season'], prefix = 'season')

train = pd.get_dummies(train, columns = ['holiday'], prefix = 'holiday')
test = pd.get_dummies(test, columns = ['holiday'], prefix = 'holiday')

In [11]:
test_datetime = test['datetime']
train.drop(['datetime', 'workingday', 'minute', 'second', 'atemp', 'casual', 'registered'], axis=1, inplace=True)
test.drop(['datetime', 'workingday', 'minute', 'second', 'atemp'], axis=1, inplace=True)

In [12]:
from sklearn.model_selection import train_test_split
from sklearn import metrics

In [18]:
x_train = train.drop('count', axis=1).values
target_label = train['count'].values
x_test = test.values
x_train, x_val, y_train, y_val = train_test_split(x_train, target_label, test_size=0.2, random_state=2000)

In [52]:
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

In [29]:
clf = RandomForestClassifier(max_depth=10, random_state=0)
clf.fit(x_train, y_train)

RandomForestClassifier(max_depth=10, random_state=0)

In [30]:
clf.score(x_train, y_train), clf.score(x_val, y_val)

(0.7687184198438217, 0.01652892561983471)

In [53]:
reg = RandomForestRegressor(max_depth=10, random_state=0)
reg.fit(x_train, y_train)
reg.score(x_train, y_train), reg.score(x_train, y_train)

(0.9522337442492639, 0.9522337442492639)

In [31]:
from sklearn.ensemble import GradientBoostingRegressor

In [32]:
regressor = GradientBoostingRegressor(n_estimators = 2000
					, learning_rate = 0.05
                                    , max_depth = 5
                                    , min_samples_leaf = 15
                                    , min_samples_split = 10
                                    , random_state = 42)
regressor.fit(x_train, y_train)

GradientBoostingRegressor(learning_rate=0.05, max_depth=5, min_samples_leaf=15,
                          min_samples_split=10, n_estimators=2000,
                          random_state=42)

In [33]:
regressor.score(x_train, y_train), regressor.score(x_val, y_val)

(0.9860940926033575, 0.9605989819774398)

In [34]:
from sklearn.metrics import classification_report

In [39]:
# classification_report는 사용 불가한듯
classification_report(y_val, regressor.predict(x_val))

ValueError: Classification metrics can't handle a mix of multiclass and continuous targets

In [40]:
regressor.predict(x_val)

array([174.40942014, 361.48759285, 145.0008908 , ..., 310.3025176 ,
        13.48644902, 103.54655007])

In [41]:
y_val

array([165, 361, 114, ..., 301,   5, 135])

In [49]:
dir(GradientBoostingRegressor)

['_SUPPORTED_LOSS',
 '__abstractmethods__',
 '__annotations__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__iter__',
 '__le__',
 '__len__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_abc_impl',
 '_check_feature_names',
 '_check_initialized',
 '_check_n_features',
 '_check_params',
 '_clear_state',
 '_compute_partial_dependence_recursion',
 '_estimator_type',
 '_fit_stage',
 '_fit_stages',
 '_get_param_names',
 '_get_tags',
 '_init_state',
 '_is_initialized',
 '_make_estimator',
 '_more_tags',
 '_raw_predict',
 '_raw_predict_init',
 '_repr_html_',
 '_repr_html_inner',
 '_repr_mimebundle_',
 '_required_parameters',
 '_resize_state',
 '_staged_raw_predict',
 '_valid

In [51]:
import inspect
inspect.signature(GradientBoostingRegressor)

<Signature (*, loss='squared_error', learning_rate=0.1, n_estimators=100, subsample=1.0, criterion='friedman_mse', min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_depth=3, min_impurity_decrease=0.0, init=None, random_state=None, max_features=None, alpha=0.9, verbose=0, max_leaf_nodes=None, warm_start=False, validation_fraction=0.1, n_iter_no_change=None, tol=0.0001, ccp_alpha=0.0)>