In [None]:
# Mount Google Drive into the Colab runtime at /content/drive so the CSV files read later are reachable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
cd drive/MyDrive/

/content/drive/MyDrive


In [None]:
import numpy as np
import pandas as pd
from statsmodels.regression.quantile_regression import QuantReg
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import ShuffleSplit
import lightgbm as lgb
import warnings
warnings.filterwarnings(action='ignore')

# Load the preprocessed train / test tables (paths are relative to the Drive folder entered above).
# Later cells use the last two columns of `train` as the two targets and every other column as a feature;
# `test` is the frame the fitted models predict on.
train = pd.read_csv('./dacon_태양광발전량예측/preprocessed_data/pre_train_0118.csv')
test = pd.read_csv('./dacon_태양광발전량예측/preprocessed_data/pre_test_0118.csv')

  import pandas.util.testing as tm


## 변수 선택

In [None]:
def lin_uni(q, train_x, train_y) :
    """Fit a quantile regression and return it with the p-value of its first coefficient.

    Parameters
    ----------
    q : float
        Quantile to fit; forwarded to ``QuantReg.fit``.
    train_x : array-like
        Regressors. No intercept is added here; the caller in this notebook
        passes one feature column at a time.
    train_y : array-like
        Response values.

    Returns
    -------
    model
        The fitted result returned by ``QuantReg(train_y, train_x).fit``.
    p_value
        The p-value of the first regressor.
    """
    # max_iter is set to 5000 on purpose (the original author flagged this value as important).
    model = QuantReg(train_y, train_x).fit(q = q, max_iter = 5000, kernel = 'gau')
    # With pandas inputs `pvalues` is indexed by column label, and `pvalues[0]` would then rely on
    # the deprecated positional fallback of Series.__getitem__. Reading through an ndarray is
    # always positional and works for both Series and ndarray results.
    p_value = np.asarray(model.pvalues)[0]
    return model, p_value

def feature_selection_pvalue(train_x, train_y, q=0.9, alpha=0.05) :
  """Select features by univariate quantile-regression p-value, then drop infinite-VIF columns.

  Each column of ``train_x`` is regressed on its own against ``train_y`` with ``lin_uni``;
  a column is kept when its p-value is below ``alpha``. Among the kept columns, those whose
  variance inflation factor equals ``inf`` are removed.

  Parameters
  ----------
  train_x : pandas.DataFrame
      Candidate feature columns.
  train_y : array-like
      Target values.
  q : float, default 0.9
      Quantile used for the univariate fits (the value this notebook originally hard-coded).
  alpha : float, default 0.05
      p-value threshold; columns with ``p_value < alpha`` are kept.

  Returns
  -------
  pandas.Series
      Names of the selected columns (the ``feature`` column of the internal VIF table).
  """
  # Univariate screening: one single-column quantile regression per feature.
  # A NaN p-value compares False and therefore excludes the column.
  include_column = [
      column
      for i, column in enumerate(train_x.columns)
      if lin_uni(q, train_x.iloc[:, i], train_y)[1] < alpha
  ]

  train_x_final = train_x[include_column]

  # Perfect multicollinearity among the selected variables can make the later fit fail:
  # remove every variable whose VIF is Inf.
  design = train_x_final.values  # extracted once instead of once per column
  vif = pd.DataFrame({
      "feature": train_x_final.columns,
      "VIF": [variance_inflation_factor(design, i) for i in range(design.shape[1])],
  })
  include_column = vif.loc[vif.VIF != float('inf'), 'feature']
  return include_column

In [None]:
# Hold out 30% of the rows for validation, once per target. The features are every column
# except the last two; target 1 is the second-to-last column and target 2 is the last one.
# Both calls use random_state=8 on the same rows, so the two splits differ only in the target.
X_train_1, X_valid_1, Y_train_1, Y_valid_1 = train_test_split(
    train.iloc[:, :-2],
    train.iloc[:, -2],
    test_size=0.3,
    random_state=8,
)
X_train_2, X_valid_2, Y_train_2, Y_valid_2 = train_test_split(
    train.iloc[:, :-2],
    train.iloc[:, -1],
    test_size=0.3,
    random_state=8,
)

# Screen the features separately for each target, using only the training part of the split.
include_column_1 = feature_selection_pvalue(X_train_1, Y_train_1)
include_column_2 = feature_selection_pvalue(X_train_2, Y_train_2)

## 선택된 변수로 fitting하는 함수

In [None]:
def quantile_loss(q, y, f):
    """Element-wise quantile loss of prediction ``f`` against target ``y`` at quantile ``q``.

    Returns ``max(q * (y - f), (q - 1) * (y - f))`` for every element; the result has
    whatever type ``np.maximum`` yields for the inputs.
    """
    residual = y - f
    scaled_by_q = q * residual
    scaled_by_q_minus_one = (q - 1) * residual
    return np.maximum(scaled_by_q, scaled_by_q_minus_one)
  
def lin(q, train_x, train_y, valid_x, valid_y, test) :
    """Fit one quantile regression and predict on the validation and test features.

    ``valid_y`` is accepted for signature symmetry but is not used here.
    Returns ``(model, val, pred)`` where ``val`` and ``pred`` are the predictions for
    ``valid_x`` and ``test`` respectively, rounded to 2 decimals and wrapped in ``pd.Series``.
    """
    fitted = QuantReg(train_y, train_x).fit(q = q, max_iter = 5000, kernel = 'gau')
    # Same post-processing for both frames: predict, round to 2 decimals, wrap in a Series.
    val, pred = (
        pd.Series(fitted.predict(frame).round(2))
        for frame in (valid_x, test)
    )
    return fitted, val, pred

def fit_lin_final(selected_list, train_x, train_y, valid_x, valid_y, test,
                  quantiles=(0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9)) :
  """Fit one quantile regression per quantile on the selected features.

  Parameters
  ----------
  selected_list : list-like of column labels
      Columns to keep from ``train_x``, ``valid_x`` and ``test``.
  train_x, train_y : training features / target.
  valid_x, valid_y : validation features / target, used only for scoring.
  test : pandas.DataFrame
      Frame to predict on; must contain the selected columns.
  quantiles : iterable of float, optional
      Quantiles to fit. Defaults to 0.1 ... 0.9, the list this notebook uses, so the
      function no longer depends on a global ``quantiles`` having been defined by another cell.

  Returns
  -------
  models : list
      Fitted models, one per quantile, in the order of ``quantiles``.
  preds : pandas.DataFrame
      Test predictions, one column per quantile (columns are labelled with the quantiles).
  val_scores : float
      Sum over quantiles of the mean quantile loss on the validation set.
  """
  quantiles = list(quantiles)

  train_x_final = train_x[selected_list]
  valid_x_final = valid_x[selected_list]
  test_final = test[selected_list]

  models = []
  pred_columns = []
  val_scores = 0

  for q in quantiles :
    print(q)  # progress indicator: the quantile currently being fitted
    model, val, pred = lin(q, train_x_final, train_y, valid_x_final, valid_y, test_final)
    models.append(model)
    val_scores += quantile_loss(q, valid_y, val).mean()
    pred_columns.append(pred)

  # Concatenate once at the end instead of re-copying the growing frame on every iteration.
  preds = pd.concat(pred_columns, axis=1) if pred_columns else pd.DataFrame()
  preds.columns = quantiles
  return models, preds, val_scores

## 위 함수로 cv 돌리는 코드

In [None]:
# CV
# Five random 70/30 shuffle splits; every fold fits both targets at each quantile and reports
# the quantile loss averaged over all (target, quantile) pairs.
quantiles = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
n_losses = 2 * len(quantiles)  # 2 targets x 9 quantiles = 18
spliter = ShuffleSplit(n_splits=5, test_size=0.3, random_state=8)
f_scores = 0

for train_index, valid_index in spliter.split(train):
  # Features: all columns but the last two. Target 1: second-to-last column. Target 2: last column.
  X_train_1 = train.iloc[train_index, :-2]
  Y_train_1 = train.iloc[train_index, -2]
  X_valid_1 = train.iloc[valid_index, :-2]
  Y_valid_1 = train.iloc[valid_index, -2]

  X_train_2 = train.iloc[train_index, :-2]
  Y_train_2 = train.iloc[train_index, -1]
  X_valid_2 = train.iloc[valid_index, :-2]
  Y_valid_2 = train.iloc[valid_index, -1]

  # Target1
  models_1, results_1, val_scores_1 = fit_lin_final(include_column_1, X_train_1, Y_train_1, X_valid_1, Y_valid_1, test)
  # Target2
  models_2, results_2, val_scores_2 = fit_lin_final(include_column_2, X_train_2, Y_train_2, X_valid_2, Y_valid_2, test)

  fold_score = (val_scores_1 + val_scores_2) / n_losses
  f_scores += fold_score
  print(fold_score)

# Note: this prints the SUM of the per-fold scores, not their mean.
print(f_scores)

## 위 함수로 최종 예측하는 코드(전체 train set 이용)

In [None]:
# Final Predict
# NOTE(review): the section heading says the full training set is used, but the models below
# are fitted on the 70% training part of a 70/30 split (random_state=8) — confirm the intent.
X_train_1, X_valid_1, Y_train_1, Y_valid_1 = train_test_split(
    train.iloc[:, :-2],
    train.iloc[:, -2],
    test_size=0.3,
    random_state=8,
)
X_train_2, X_valid_2, Y_train_2, Y_valid_2 = train_test_split(
    train.iloc[:, :-2],
    train.iloc[:, -1],
    test_size=0.3,
    random_state=8,
)

# results_1 / results_2 hold the test-set quantile predictions for target 1 / target 2.
models_1, results_1, val_scores_1 = fit_lin_final(
    include_column_1, X_train_1, Y_train_1, X_valid_1, Y_valid_1, test)
models_2, results_2, val_scores_2 = fit_lin_final(
    include_column_2, X_train_2, Y_train_2, X_valid_2, Y_valid_2, test)

# 음수로 나온 예측값을 0으로 바꾸는 함수
def get_positive(data) :
  for i in range(data.shape[0]) :
    for j in range(data.shape[1]) :
      if data.iloc[i,j] <= 0 : data.iloc[i,j] = 0
  return data

# Clip the predictions of both targets at zero (target 1 first, then target 2).
results_1, results_2 = (get_positive(frame) for frame in (results_1, results_2))