In [1]:
import time
import re
import datetime as dt
import random
import numpy as np
import pandas as pd
import os
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
from numpy import sin,cos,arccos,pi,round
import math
import warnings
warnings.filterwarnings('ignore')

In [2]:
CFG = {
    'IMG_SIZE':224,
    'EPOCHS':10,
    'LEARNING_RATE':3e-4,
    'BATCH_SIZE':32,
    'SEED':41
}



def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

seed_everything(CFG['SEED']) # Seed 고정

In [3]:
import torch
import torch.nn as nn
import torch.optim as optim

In [4]:
cur_path = "/content/drive/Shareddrives/대기오염_예측_경진대회/데이터"
loc_PM = pd.read_csv('./META/pmmap.csv', encoding = 'UTF-8')
loc_PM.head()

Unnamed: 0,Location,Latitude,Longitude,Description
0,아름동,36.512252,127.246789,세종특별자치시 보듬3로 114 아름동커뮤니티센터 옥상 (아름동)
1,신흥동,36.592887,127.29255,세종 조치원읍 군청로 87-16(신흥동) 세종특별자치시 조치원청사 옥상
2,노은동,36.368242,127.318498,대전 유성구 노은동로 87번길 89(노은1동 주민센터) 노은1동 주민센터 3층 옥상
3,문창동,36.317215,127.437825,대전 중구 보문로 20번길 38(문창동 주민센터) 문창동 주민센터
4,읍내동,36.372388,127.417714,대전 대덕구 대전로 1331번길 75(태아산업(주)) 태아산업(주)


In [5]:
loc_PM.sort_values(by=['Location'], ascending=True,inplace=True)
loc_PM.set_index('Location',inplace=True)

In [6]:
#Description Drop -> 스크린 공간을 너무 많이 잡아 먹어서 삭제 
loc_PM.drop('Description',axis = 1,inplace = True)

In [7]:
loc_PM.head()

Unnamed: 0_level_0,Latitude,Longitude
Location,Unnamed: 1_level_1,Unnamed: 2_level_1
공주,36.446951,127.119209
노은동,36.368242,127.318498
논산,36.199217,127.087021
대천2동,36.353148,126.589735
독곶리,36.987579,126.391672


In [8]:
loc_AWS = pd.read_csv('./META/awsmap.csv', index_col = 'Location', encoding = 'UTF-8')
loc_AWS.head()

Unnamed: 0_level_0,Latitude,Longitude,Description
Location,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
오월드,36.2913,127.3959,대전광역시 중구사정공원로 70 오월드 내 플라워랜드
세천,36.3402,127.4938,대전광역시 동구세천동 63-1
장동,36.4135,127.4382,대전광역시 대덕구장동 360-2
세종고운,36.5315,127.2406,세종특별자치시 고운동산 25번지
세종금남,36.4585,127.2688,세종특별자치시 시군구금남면 성덕리 77-3


In [9]:
#Description Drop -> 스크린 공간을 너무 많이 잡아 먹어서 삭제 
loc_AWS.drop('Description',axis = 1,inplace = True)

In [17]:
# Train data import
PM_loc = []

for nm_PM in loc_PM.index:
    pm = pd.read_csv(f'./TRAIN/{nm_PM}.csv',encoding = 'UTF-8',error_bad_lines=False)
    PM_loc.append(pm)
    globals()['PM_{}'.format(nm_PM)] = pm
    
for nm_AWS in loc_AWS.index:
    globals()['AWS_{}'.format(nm_AWS)]=pd.read_csv(f'./TRAIN_AWS/{nm_AWS}.csv',encoding = 'UTF-8',error_bad_lines=False)

In [18]:
# Test data import
TPM_loc = []

for nm_PM in loc_PM.index:
    tpm = pd.read_csv(f'./TEST_INPUT/{nm_PM}.csv',encoding = 'UTF-8',error_bad_lines=False)
    TPM_loc.append(tpm)
    globals()['TPM_{}'.format(nm_PM)] = tpm
    
for nm_AWS in loc_AWS.index:
    globals()['TAWS_{}'.format(nm_AWS)]=pd.read_csv(f'./TEST_AWS/{nm_AWS}.csv',encoding = 'UTF-8',error_bad_lines=False)

In [19]:
import math

def incenter(x_lst, y_lst):  # 내심
    p1_x = x_lst[0];p1_y = y_lst[0]
    p2_x = x_lst[1];p2_y = y_lst[1]
    p3_x = x_lst[2];p3_y = y_lst[2]
    l12 = math.sqrt((p1_x - p2_x)**2 + (p1_y - p2_y)**2)
    l13 = math.sqrt((p1_x - p3_x)**2 + (p1_y - p3_y)**2)
    l23 = math.sqrt((p2_x - p3_x)**2 + (p2_y - p3_y)**2)
    X = (l12*p3_x + l13*p2_x + l23*p1_x) / (l12 + l13 + l23)
    Y = (l12*p3_y + l13*p2_y + l23*p1_y) / (l12 + l13 + l23)
    return (X,Y)

def centroid(x_lst, y_lst):   # 무게중심
    n = len(x_lst)
    X = 0;Y = 0
    X = sum(x_lst);Y = sum(y_lst)
    X /= n;Y /= n
    return (X,Y)

## aws 와의 거리 계산 함수 -> 단순 유클리드 거리가 아니라 gps상의 거리를 이용

def rad2deg(radians):
    degrees = radians * 180 / pi
    return degrees

def deg2rad(degrees):
    radians = degrees * pi / 180
    return radians

def getDistanceBetweenPointsNew(latitude1, longitude1, latitude2, longitude2, unit = 'miles'):
    theta = longitude1 - longitude2
    dist = 60 * 1.1515 * rad2deg(
        arccos(
            (sin(deg2rad(latitude1)) * sin(deg2rad(latitude2))) + 
            (cos(deg2rad(latitude1)) * cos(deg2rad(latitude2)) * cos(deg2rad(theta)))
        )
    )
    if unit == 'miles':
        return round(dist, 2)
    if unit == 'kilometers':
        return round(dist * 1.609344, 3)
def distance(p1, p2):  #거리
    p1_x = p1[0];p1_y = p1[1]
    p2_x = p2[0];p2_y = p2[1]
    D = getDistanceBetweenPointsNew(p1_x, p1_y, p2_x,p2_y, unit = 'kilometers')
    return D


In [20]:
rads = []
Near_AWS = []
Count_AWS = []
lctn_AWS = {}
for i in range(len(loc_PM.index)): # 17개 PM 관측소
    lat_PM = loc_PM['Latitude'][i]
    lon_PM = loc_PM['Longitude'][i]
    
    lats_AWS = [];lons_AWS = []
    
    near_AWS = ''
    rad = 0
    count_AWS = 0
    
    while len(near_AWS) == 0:
        rad += 2.5     # 반경을 조금씩 넓혀가며
        
        for j in range(len(loc_AWS.index)):
            lat_AWS = loc_AWS['Latitude'][j]
            lon_AWS = loc_AWS['Longitude'][j]
            if distance((lat_AWS, lon_AWS), (lat_PM, lon_PM)) < rad:
                count_AWS += 1
                lats_AWS.append(lat_AWS);lons_AWS.append(lon_AWS);
                near_AWS += str('/') + loc_AWS.index[j]
    
    rads.append(rad)
    Near_AWS.append(near_AWS[1:])
    Count_AWS.append(count_AWS)
    lctn_AWS[f'{loc_PM.index[i]}'] = [lats_AWS,lons_AWS]
    
loc_PM['Rad'] = rads
loc_PM['Near_AWS'] = Near_AWS
loc_PM['Count_AWS'] = Count_AWS
loc_PM

Unnamed: 0_level_0,Latitude,Longitude,Rad,Near_AWS,Count_AWS,Cent_lat,Cent_lon,PM-Cent_d,sub_lat,sub_lon
Location,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
공주,36.446951,127.119209,5.0,공주,1,36.4828,127.1365,4.275,-0.035849,-0.017291
노은동,36.368242,127.318498,10.0,계룡,1,36.3132,127.2407,9.274,0.055042,0.077798
논산,36.199217,127.087021,2.5,논산,1,36.2116,127.1082,2.347,-0.012383,-0.021179
대천2동,36.353148,126.589735,10.0,대천항,1,36.3244,126.5021,8.475,0.028748,0.087635
독곶리,36.987579,126.391672,5.0,대산,1,37.0106,126.3881,2.579,-0.023021,0.003572
동문동,36.780158,126.455197,15.0,태안,1,36.7585,126.2964,14.347,0.021658,0.158797
모종동,36.7827,127.01461,17.5,성거/아산,2,36.862,127.01075,8.824,-0.0793,0.00386
문창동,36.317215,127.437825,5.0,오월드,1,36.2913,127.3959,4.735,0.025915,0.041925
성성동,36.840313,127.141777,5.0,성거,1,36.8782,127.1561,4.401,-0.037887,-0.014323
신방동,36.782355,127.120506,12.5,세종전의/성거,2,36.7845,127.1783,5.152,-0.002145,-0.057794


In [21]:
cent_lat = [];cent_lon=[];distance_PC = []   # 일단 무게중심, 위도경도만
for nm_PM in loc_PM.index:
    cent_point = centroid(lctn_AWS[nm_PM][0], lctn_AWS[nm_PM][1])
    distance_PC.append(distance(cent_point,(loc_PM.loc[nm_PM]['Latitude'],loc_PM.loc[nm_PM]['Longitude'])))
    cent_lat.append(cent_point[0]);cent_lon.append(cent_point[1])
    
loc_PM['Cent_lat'] = cent_lat
loc_PM['Cent_lon'] = cent_lon
loc_PM['PM-Cent_d'] = distance_PC

loc_PM

Unnamed: 0_level_0,Latitude,Longitude,Rad,Near_AWS,Count_AWS,Cent_lat,Cent_lon,PM-Cent_d,sub_lat,sub_lon
Location,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
공주,36.446951,127.119209,5.0,공주,1,36.4828,127.1365,4.275,-0.035849,-0.017291
노은동,36.368242,127.318498,10.0,계룡,1,36.3132,127.2407,9.274,0.055042,0.077798
논산,36.199217,127.087021,2.5,논산,1,36.2116,127.1082,2.347,-0.012383,-0.021179
대천2동,36.353148,126.589735,10.0,대천항,1,36.3244,126.5021,8.475,0.028748,0.087635
독곶리,36.987579,126.391672,5.0,대산,1,37.0106,126.3881,2.579,-0.023021,0.003572
동문동,36.780158,126.455197,15.0,태안,1,36.7585,126.2964,14.347,0.021658,0.158797
모종동,36.7827,127.01461,17.5,성거/아산,2,36.862,127.01075,8.824,-0.0793,0.00386
문창동,36.317215,127.437825,5.0,오월드,1,36.2913,127.3959,4.735,0.025915,0.041925
성성동,36.840313,127.141777,5.0,성거,1,36.8782,127.1561,4.401,-0.037887,-0.014323
신방동,36.782355,127.120506,12.5,세종전의/성거,2,36.7845,127.1783,5.152,-0.002145,-0.057794


In [22]:
df = pd.DataFrame()

In [23]:
'''
PM_loc = [PM_공주,PM_노은동, PM_논산,PM_대천2동,PM_독곶리,PM_동문동,PM_모종동, PM_문창동,PM_성성동,PM_신방동, PM_신흥동,
    PM_아름동,PM_예산군, PM_읍내동,PM_이원면, PM_정림동, PM_홍성읍]
'''
loc_PM['sub_lat'] = loc_PM['Latitude'] - loc_PM['Cent_lat']
loc_PM['sub_lon'] = loc_PM['Longitude'] - loc_PM['Cent_lon']

for i,nm_PM in enumerate(loc_PM.index):
    temp = PM_loc[i]
    #변경
    temp[['Rad','Count_AWS','sub_lat','sub_lon','PM-Cent_d']] = loc_PM.loc[nm_PM][['Rad','Count_AWS','sub_lat','sub_lon','PM-Cent_d']]
    connectaws = PM_loc[i].iloc[:,:2]
    listaws = list(loc_PM.iloc[i,3].split('/'))
    connectaws['기온(°C)'] = 0
    connectaws['풍향(deg)'] = 0
    connectaws['풍속(m/s)'] = 0
    connectaws['강수량(mm)'] = 0
    connectaws['습도(%)'] = 0
    for loc in listaws:
        a = pd.read_csv(f'./TRAIN_AWS/{loc}.csv',sep = ',|\n',encoding = 'UTF-8',error_bad_lines=False)
        connectaws['기온(°C)'] += a['기온(°C)']
        connectaws['풍향(deg)'] += a['풍향(deg)']
        connectaws['풍속(m/s)'] += a['풍속(m/s)']
        connectaws['강수량(mm)'] += a['강수량(mm)']
        connectaws['습도(%)'] += a['습도(%)']
    connectaws['기온(°C)'] /= len(listaws)
    connectaws['풍향(deg)']/= len(listaws)
    connectaws['풍속(m/s)'] /= len(listaws)
    connectaws['강수량(mm)']/= len(listaws)
    connectaws['습도(%)'] /= len(listaws)
    v = pd.merge(temp,connectaws,how='right',on=['연도','일시'])
    df = pd.concat([df,v], axis=0)
df.head()    

Unnamed: 0,연도,일시,측정소,PM2.5,Rad,Count_AWS,sub_lat,sub_lon,PM-Cent_d,기온(°C),풍향(deg),풍속(m/s),강수량(mm),습도(%)
0,0,01-01 00:00,공주,0.056,5.0,1,-0.035849,-0.017291,4.275,0.173776,0.201944,0.023018,0.0,0.828
1,0,01-01 01:00,공주,0.06,5.0,1,-0.035849,-0.017291,4.275,0.176935,0.168611,0.030691,0.0,0.831
2,0,01-01 02:00,공주,0.068,5.0,1,-0.035849,-0.017291,4.275,0.180095,0.087222,0.033248,0.0,0.784
3,0,01-01 03:00,공주,0.06,5.0,1,-0.035849,-0.017291,4.275,0.178515,0.087222,0.025575,0.0,0.745
4,0,01-01 04:00,공주,0.068,5.0,1,-0.035849,-0.017291,4.275,0.164297,0.113889,0.02046,0.0,0.75


In [24]:
df.tail()

Unnamed: 0,연도,일시,측정소,PM2.5,Rad,Count_AWS,sub_lat,sub_lon,PM-Cent_d,기온(°C),풍향(deg),풍속(m/s),강수량(mm),습도(%)
35059,3,12-31 19:00,홍성읍,0.06,5.0,1,-0.030027,0.009987,3.456,0.273302,0.832222,0.086957,0.0,0.671
35060,3,12-31 20:00,홍성읍,0.052,5.0,1,-0.030027,0.009987,3.456,0.271722,0.831667,0.043478,0.0,0.692
35061,3,12-31 21:00,홍성읍,0.044,5.0,1,-0.030027,0.009987,3.456,0.268562,0.8325,0.066496,0.0,0.706
35062,3,12-31 22:00,홍성읍,0.052,5.0,1,-0.030027,0.009987,3.456,0.262243,0.866944,0.043478,0.0,0.725
35063,3,12-31 23:00,홍성읍,0.06,5.0,1,-0.030027,0.009987,3.456,0.257504,0.0,0.0,0.0,0.71


In [25]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 596088 entries, 0 to 35063
Data columns (total 14 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   연도         596088 non-null  int64  
 1   일시         596088 non-null  object 
 2   측정소        596088 non-null  object 
 3   PM2.5      580546 non-null  float64
 4   Rad        596088 non-null  float64
 5   Count_AWS  596088 non-null  int64  
 6   sub_lat    596088 non-null  float64
 7   sub_lon    596088 non-null  float64
 8   PM-Cent_d  596088 non-null  float64
 9   기온(°C)     585026 non-null  float64
 10  풍향(deg)    585026 non-null  float64
 11  풍속(m/s)    585026 non-null  float64
 12  강수량(mm)    585026 non-null  float64
 13  습도(%)      585026 non-null  float64
dtypes: float64(10), int64(2), object(2)
memory usage: 68.2+ MB


In [26]:
## 달 ,날짜, 시간 구분해주기 -> 전부 int 정수형으로 
def time_spliter(df):
    df['달']=df['일시'].str.split("-").apply(lambda x: x[0]).apply(lambda x: int(x))
    
    
    df['날짜'] = df['일시'].str.split("-").apply(lambda x: x[1]).str.split(" ").apply(lambda x: x[0]).str.split(":").apply(lambda x: x[0]).apply(lambda x: int(x))
    df['시간'] = df['일시'].str.split("-").apply(lambda x: x[1]).str.split(" ").apply(lambda x: x[1]).str.split(":").apply(lambda x: x[0]).apply(lambda x: int(x))
    
    # 변경
    df['연간일자'] = df['달'] * 30 + df['날짜'] - 30
    
    #df.drop('일시',axis = 1,inplace = True)



## 주기성을 반영하기 위해서 달과 시간을 삼각함수에 집어 넣기 
# 참조
# https://dacon.io/competitions/official/235985/codeshare/7042?page=1&dtype=recent 

def sin_converter(df,col,a): ## 시간 변환 
    df[f"sin_{col}"] = df[col].apply(lambda x: math.sin(2*math.pi*x/a))

def cos_converter(df,col,a):
    df[f"cos_{col}"] = df[col].apply(lambda x: math.cos(2*math.pi*x/a))

def total_converter(df):
    sin_converter(df,'연간일자',365)
    cos_converter(df,'연간일자',365)

    sin_converter(df,'시간',24)
    cos_converter(df,'시간',24)
    
    sin_converter(df,'풍향(deg)',1)
    cos_converter(df,'풍향(deg)',1)


In [27]:
time_spliter(df)

In [29]:
total_converter(df)

df.drop(['달','날짜','시간'],axis=1, inplace=True)

In [30]:
df = df[['연도','sin_연간일자', 'cos_연간일자', 'sin_시간', 'cos_시간', '측정소', 'Rad', 'Count_AWS', 'sub_lat', 'sub_lon',
       'PM-Cent_d', '기온(°C)', 'sin_풍향(deg)', 'cos_풍향(deg)' , '풍속(m/s)', '강수량(mm)', '습도(%)','PM2.5']]
df.head()

Unnamed: 0,연도,sin_연간일자,cos_연간일자,sin_시간,cos_시간,측정소,Rad,Count_AWS,sub_lat,sub_lon,PM-Cent_d,기온(°C),sin_풍향(deg),cos_풍향(deg),풍속(m/s),강수량(mm),습도(%),PM2.5
0,0,0.017213,0.999852,0.0,1.0,공주,5.0,1,-0.035849,-0.017291,4.275,0.173776,0.954761,0.297375,0.023018,0.0,0.828,0.056
1,0,0.017213,0.999852,0.258819,0.965926,공주,5.0,1,-0.035849,-0.017291,4.275,0.176935,0.872069,0.489382,0.030691,0.0,0.831,0.06
2,0,0.017213,0.999852,0.5,0.866025,공주,5.0,1,-0.035849,-0.017291,4.275,0.180095,0.52101,0.853551,0.033248,0.0,0.784,0.068
3,0,0.017213,0.999852,0.707107,0.707107,공주,5.0,1,-0.035849,-0.017291,4.275,0.178515,0.52101,0.853551,0.025575,0.0,0.745,0.06
4,0,0.017213,0.999852,0.866025,0.5,공주,5.0,1,-0.035849,-0.017291,4.275,0.164297,0.656059,0.75471,0.02046,0.0,0.75,0.068


In [31]:
## 결측치 있는 자료 없애고 시작 
## df: 모든 자료가 있는 데이터
## full_data:  결측치가 없는 데이터  

In [32]:
df.isnull().sum()

연도                 0
sin_연간일자           0
cos_연간일자           0
sin_시간             0
cos_시간             0
측정소                0
Rad                0
Count_AWS          0
sub_lat            0
sub_lon            0
PM-Cent_d          0
기온(°C)         11062
sin_풍향(deg)    11062
cos_풍향(deg)    11062
풍속(m/s)        11062
강수량(mm)        11062
습도(%)          11062
PM2.5          15542
dtype: int64

In [33]:
df['wind_pow_yesterday'] = df.iloc[:,15].shift(axis=0)
df = df.iloc[1:,:]

In [34]:
full_data = df.dropna()

In [35]:
full_data['wind_pow_yesterday'] = full_data.iloc[:,15].shift(axis=0)


In [36]:
full_data = full_data.iloc[1:,:]


In [37]:
df.isnull().sum()

연도                        0
sin_연간일자                  0
cos_연간일자                  0
sin_시간                    0
cos_시간                    0
측정소                       0
Rad                       0
Count_AWS                 0
sub_lat                   0
sub_lon                   0
PM-Cent_d                 0
기온(°C)                11062
sin_풍향(deg)           11062
cos_풍향(deg)           11062
풍속(m/s)               11062
강수량(mm)               11062
습도(%)                 11062
PM2.5                 15542
wind_pow_yesterday    11062
dtype: int64

In [38]:
## wind_deg_predictor 

In [39]:
full_data.head()

Unnamed: 0,연도,sin_연간일자,cos_연간일자,sin_시간,cos_시간,측정소,Rad,Count_AWS,sub_lat,sub_lon,PM-Cent_d,기온(°C),sin_풍향(deg),cos_풍향(deg),풍속(m/s),강수량(mm),습도(%),PM2.5,wind_pow_yesterday
2,0,0.017213,0.999852,0.5,0.8660254,공주,5.0,1,-0.035849,-0.017291,4.275,0.180095,0.52101,0.853551,0.033248,0.0,0.784,0.068,0.0
3,0,0.017213,0.999852,0.707107,0.7071068,공주,5.0,1,-0.035849,-0.017291,4.275,0.178515,0.52101,0.853551,0.025575,0.0,0.745,0.06,0.0
4,0,0.017213,0.999852,0.866025,0.5,공주,5.0,1,-0.035849,-0.017291,4.275,0.164297,0.656059,0.75471,0.02046,0.0,0.75,0.068,0.0
5,0,0.017213,0.999852,0.965926,0.258819,공주,5.0,1,-0.035849,-0.017291,4.275,0.14218,0.716911,0.697165,0.033248,0.0,0.808,0.068,0.0
6,0,0.017213,0.999852,1.0,6.123234000000001e-17,공주,5.0,1,-0.035849,-0.017291,4.275,0.120063,0.0,1.0,0.007673,0.0,0.842,0.088,0.0


In [40]:
col = ['연도','sin 날짜','sin 달','sin 시간','cos 날짜','cos 달','cos 시간','풍속(m/s)','wind_pow_yesterday']
full_data[col]

KeyError: "['sin 날짜', 'sin 달', 'sin 시간', 'cos 날짜', 'cos 달', 'cos 시간'] not in index"

In [41]:
full_data.isna().sum()

연도                    0
sin_연간일자              0
cos_연간일자              0
sin_시간                0
cos_시간                0
측정소                   0
Rad                   0
Count_AWS             0
sub_lat               0
sub_lon               0
PM-Cent_d             0
기온(°C)                0
sin_풍향(deg)           0
cos_풍향(deg)           0
풍속(m/s)               0
강수량(mm)               0
습도(%)                 0
PM2.5                 0
wind_pow_yesterday    0
dtype: int64

In [None]:
x = full_data[['sin 달','cos 시간']]
y = full_data['풍속(m/s)']

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,MinMaxScaler,RobustScaler

#scaler = StandardScaler()
#x = scaler.fit_transform(x)
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size = 0.2, 
                                                 random_state = 42)

In [None]:
## Light GBM 

In [None]:
import lightgbm as lgb
from scipy.stats import randint, uniform
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split



# define hyperparameters
lgb_model = lgb.LGBMRegressor()

# RandomizedSearchCV를 이용한 하이퍼파라미터 튜닝
params = {
    'n_estimators': randint(50, 500),
    'max_depth': randint(2, 50),
    'learning_rate': uniform(0.01, 0.5),
    'num_leaves': randint(5, 50)
}

rs = RandomizedSearchCV(lgb_model, params, cv=5, n_iter=10, n_jobs=-1)

# 모델 훈련
rs.fit(x_train, y_train)

# 테스트 데이터 예측
y_pred = rs.predict(x_test)

# 결과 출력
print('Best params:', rs.best_params_)
print('Best score:', rs.best_score_)
print('Test score:', rs.score(x_test, y_test))

Best params: {'learning_rate': 0.12611128690024853, 'max_depth': 37, 'n_estimators': 182, 'num_leaves': 10}
Best score: 0.13204398786554078
Test score: 0.13751683428376715


In [None]:
x_final = df[['sin 달','cos 시간']]
wind_speed_answer = rs.predict(x_final)

In [None]:
from copy import deepcopy 
import joblib
wind_deg_protector = deepcopy(rs)

filename = 'wind_pow_estimator.joblib'
joblib.dump(rs, filename)



['wind_pow_estimator.joblib']

In [None]:
rs = joblib.load(filename)

In [None]:
len(x_final)
len(df)

596087

In [None]:
## 빈칸 채우기 
cond = (df['풍속(m/s)'].isnull())
answer_sheet = []
missing = list(df[cond].index)
for i in missing:
    answer_sheet.append(wind_speed_answer[i])


In [None]:
inputed_df = df.copy()
inputed_df.loc[inputed_df['풍속(m/s)'].isna(),'풍속(m/s)'] = answer_sheet

In [None]:
inputed_df.isnull().sum()

연도                        0
sin 달                     0
cos 달                     0
sin 날짜                    0
cos 날짜                    0
sin 시간                    0
cos 시간                    0
측정소                       0
Rad                       0
Count_AWS                 0
Cent_lat                  0
Cent_lon                  0
PM-Cent_d                 0
기온(°C)                11062
풍향(deg)               11062
풍속(m/s)                   0
강수량(mm)               11062
습도(%)                 11062
PM2.5                 15542
wind_pow_yesterday    11062
dtype: int64

In [None]:
## 기온 예측기 

x = full_data[['sin 달','cos 시간']]
y = full_data['기온(°C)']

## 

#scaler = StandardScaler()
#x = scaler.fit_transform(x)
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size = 0.2, 
                                                 random_state = 42)

## 


lgb_model1 = lgb.LGBMRegressor()


params1 = {
    'n_estimators': randint(50, 500),
    'max_depth': randint(2, 50),
    'learning_rate': uniform(0.01, 0.5),
    'num_leaves': randint(5, 50)
}

rs1 = RandomizedSearchCV(lgb_model1, params1, cv=5, n_iter=10, n_jobs=-1)


rs1.fit(x_train, y_train)


y_pred = rs1.predict(x_test)


print('Best params:', rs1.best_params_)
print('Best score:', rs1.best_score_)
print('Test score:', rs1.score(x_test, y_test))


##

x_final = df[['sin 달','cos 시간']]
temperature_answer = rs1.predict(x_final)

temperature = deepcopy(rs1)

filename = 'temperature_estimator.joblib'
joblib.dump(rs1, filename)


##

## 빈칸 채우기 
cond = (df['기온(°C)'].isnull())
answer_sheet = []
missing = list(df[cond].index)
for i in missing:
    answer_sheet.append(temperature_answer[i])



inputed_df.loc[inputed_df['기온(°C)'].isna(),'기온(°C)'] = answer_sheet

Best params: {'learning_rate': 0.12611128690024853, 'max_depth': 37, 'n_estimators': 182, 'num_leaves': 10}
Best score: 0.5999010762061389
Test score: 0.6015049250404575


In [None]:
inputed_df.isnull().sum()

연도                        0
sin 달                     0
cos 달                     0
sin 날짜                    0
cos 날짜                    0
sin 시간                    0
cos 시간                    0
측정소                       0
Rad                       0
Count_AWS                 0
Cent_lat                  0
Cent_lon                  0
PM-Cent_d                 0
기온(°C)                    0
풍향(deg)               11062
풍속(m/s)                   0
강수량(mm)               11062
습도(%)                 11062
PM2.5                 15542
wind_pow_yesterday    11062
dtype: int64

In [None]:
## 강수량 예측기 

x = full_data[['sin 달','cos 시간',]]
y = full_data['강수량(mm)']

## 

#scaler = StandardScaler()
#x = scaler.fit_transform(x)
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size = 0.2, 
                                                 random_state = 42)

## 


lgb_model2 = lgb.LGBMRegressor()


params2 = {
    'n_estimators': randint(50, 500),
    'max_depth': randint(2, 50),
    'learning_rate': uniform(0.01, 0.5),
    'num_leaves': randint(5, 50)
}

rs2 = RandomizedSearchCV(lgb_model2, params2, cv=5, n_iter=10, n_jobs=-1)


rs2.fit(x_train, y_train)


y_pred = rs2.predict(x_test)


print('Best params:', rs2.best_params_)
print('Best score:', rs2.best_score_)
print('Test score:', rs2.score(x_test, y_test))


##

x_final = df[['sin 달','cos 시간']]
percipitation_answer = rs2.predict(x_final)

percipitation = deepcopy(rs2)

filename = 'percipitation_estimator.joblib'
joblib.dump(rs2, filename)


##

## 빈칸 채우기 
cond = (df['강수량(mm)'].isnull())
answer_sheet = []
missing = list(df[cond].index)
for i in missing:
    answer_sheet.append(temperature_answer[i])



inputed_df.loc[inputed_df['강수량(mm)'].isna(),'강수량(mm)'] = answer_sheet

Best params: {'learning_rate': 0.4463005301820564, 'max_depth': 3, 'n_estimators': 262, 'num_leaves': 19}
Best score: 0.006421524273064594
Test score: 0.0071134796008824885


In [None]:
inputed_df.isnull().sum()

연도                        0
sin 달                     0
cos 달                     0
sin 날짜                    0
cos 날짜                    0
sin 시간                    0
cos 시간                    0
측정소                       0
Rad                       0
Count_AWS                 0
Cent_lat                  0
Cent_lon                  0
PM-Cent_d                 0
기온(°C)                    0
풍향(deg)               11062
풍속(m/s)                   0
강수량(mm)                   0
습도(%)                 11062
PM2.5                 15542
wind_pow_yesterday    11062
dtype: int64

In [None]:
## 습도 예측기 

x = full_data[['sin 달','cos 시간','강수량(mm)']]
y = full_data['습도(%)']

## 

#scaler = StandardScaler()
#x = scaler.fit_transform(x)
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size = 0.2, 
                                                 random_state = 42)

## 


lgb_model3 = lgb.LGBMRegressor()


params3 = {
    'n_estimators': randint(50, 500),
    'max_depth': randint(2, 50),
    'learning_rate': uniform(0.01, 0.5),
    'num_leaves': randint(5, 50)
}

rs3 = RandomizedSearchCV(lgb_model3, params3, cv=5, n_iter=10, n_jobs=-1)


rs3.fit(x_train, y_train)


y_pred = rs3.predict(x_test)


print('Best params:', rs3.best_params_)
print('Best score:', rs3.best_score_)
print('Test score:', rs3.score(x_test, y_test))


##

x_final = df[['sin 달','cos 시간',"강수량(mm)"]]
humidity_answer = rs3.predict(x_final)



filename = 'humidity_estimator.joblib'
joblib.dump(rs3, filename)


##

## 빈칸 채우기 
cond = (df['습도(%)'].isnull())
answer_sheet = []
missing = list(df[cond].index)
for i in missing:
    answer_sheet.append(humidity_answer[i])



inputed_df.loc[inputed_df['습도(%)'].isna(),'습도(%)'] = answer_sheet

Best params: {'learning_rate': 0.07203353864219907, 'max_depth': 35, 'n_estimators': 387, 'num_leaves': 25}
Best score: 0.45743562273401944
Test score: 0.457137486892247


In [None]:
inputed_df.isnull().sum()

연도                        0
sin 달                     0
cos 달                     0
sin 날짜                    0
cos 날짜                    0
sin 시간                    0
cos 시간                    0
측정소                       0
Rad                       0
Count_AWS                 0
Cent_lat                  0
Cent_lon                  0
PM-Cent_d                 0
기온(°C)                    0
풍향(deg)               11062
풍속(m/s)                   0
강수량(mm)                   0
습도(%)                     0
PM2.5                 15542
wind_pow_yesterday    11062
dtype: int64

In [None]:
## 풍향 예측기 

x = full_data[['sin 달','cos 시간','풍향(deg)']]
y = full_data['풍속(m/s)']

## 

#scaler = StandardScaler()
#x = scaler.fit_transform(x)
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size = 0.2, 
                                                 random_state = 42)

## 


lgb_model4 = lgb.LGBMRegressor()


params4 = {
    'n_estimators': randint(50, 500),
    'max_depth': randint(2, 50),
    'learning_rate': uniform(0.01, 0.5),
    'num_leaves': randint(5, 50)
}

rs4 = RandomizedSearchCV(lgb_model3, params3, cv=5, n_iter=10, n_jobs=-1)


rs4.fit(x_train, y_train)


y_pred = rs4.predict(x_test)


print('Best params:', rs4.best_params_)
print('Best score:', rs4.best_score_)
print('Test score:', rs4.score(x_test, y_test))


##

x_final = df[['sin 달','cos 시간','풍향(deg)']]
wind_deg_answer = rs4.predict(x_final)



filename = 'wind_deg_estimator.joblib'
joblib.dump(rs4, filename)


##

## 빈칸 채우기 
cond = (df['풍향(deg)'].isnull())
answer_sheet = []
missing = list(df[cond].index)
for i in missing:
    answer_sheet.append(wind_deg_answer[i])



inputed_df.loc[inputed_df['풍향(deg)'].isna(),'풍향(deg)'] = answer_sheet

Best params: {'learning_rate': 0.29963877469363287, 'max_depth': 15, 'n_estimators': 489, 'num_leaves': 6}
Best score: 0.3567869942898088
Test score: 0.3602701950759807


In [None]:
inputed_df.isnull().sum()

연도                        0
sin 달                     0
cos 달                     0
sin 날짜                    0
cos 날짜                    0
sin 시간                    0
cos 시간                    0
측정소                       0
Rad                       0
Count_AWS                 0
Cent_lat                  0
Cent_lon                  0
PM-Cent_d                 0
기온(°C)                    0
풍향(deg)                   0
풍속(m/s)                   0
강수량(mm)                   0
습도(%)                     0
PM2.5                 15542
wind_pow_yesterday    11062
dtype: int64

In [None]:
inputed_df.to_csv(os.path.join(cur_path,"inputed_data.csv"))