In [24]:
import pandas as pd
import statsmodels.api as sm
import warnings
warnings.filterwarnings('ignore')
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
import numpy as np
import time
from tqdm import tqdm
from lightgbm import LGBMClassifier

### 6개월치 proba 평균

In [25]:
df_weather=pd.read_csv('static/data/기상데이터 0510 index X.csv')
df_weather.head()

Unnamed: 0,관측일자,temperature,temp_max,temp_min,rainfall,humidity,조사지역,연,월,일
0,1988-01-01,-2.4,-0.8,-4.4,0.5,64.0,서울,1988,1,1
1,1988-01-01,-2.6,0.5,-6.2,1.9,73.0,관악산,1988,1,1
2,1988-01-01,-4.1,-0.7,-8.6,0.0,66.0,춘천,1988,1,1
3,1988-01-01,-3.1,-1.0,-8.6,0.4,79.0,철원,1988,1,1
4,1988-01-01,3.7,6.5,0.1,0.0,57.0,속초,1988,1,1


In [32]:
df_grain=pd.read_csv('static/data/0515_작물추가.csv')
df_grain.head()

Unnamed: 0,temperature,rainfall,temp_min,temp_max,label
0,20.879744,202.935536,18,28,rice
1,21.770462,226.655537,18,28,rice
2,23.004459,263.964248,18,28,rice
3,26.491096,242.864034,18,28,rice
4,20.130175,262.71734,18,28,rice


In [33]:
df_grain['label'].value_counts()

wheat          101
cocoa          101
rice           100
고추             100
감              100
레빗아이 블루베리      100
하이부시 블루베리      100
딸기             100
토마토            100
가지             100
오이             100
베              100
참외             100
멜론             100
호박             100
마늘             100
시금치            100
무              100
배추             100
상추             100
살구             100
복숭아            100
maize          100
매실             100
chickpea       100
kidneybeans    100
lentil         100
pomegranate    100
banana         100
mango          100
grapes         100
watermelon     100
muskmelon      100
apple          100
orange         100
papaya         100
coconut        100
cotton         100
coffee         100
양파             100
Name: label, dtype: int64

In [50]:
# 6개월치 데이터를 추합 return 습도 x
def predict_6m_proba(region, start_month, pred_year):
    df = pd.DataFrame()
    for mon in range(start_month, start_month+6):
        pred_data = OLS_model_predict(region, mon, pred_year)
        # print(pred_data)
        tmp_data =[]
        for data in pred_data :
            tmp_data.append(data[-1])
        # print(tmp_data)
        df_pred  = predict_crops_6m(tmp_data)
        df = pd.concat([df, df_pred])
    return df.groupby(df.index).mean().sort_values(by='확률', ascending=False)

def predict_crops_6m(data):
    tmp_grain = df_grain[['temperature', 'temp_max', 'temp_min',  'rainfall', 'label']]
    xgb = XGBClassifier(booster = 'gbtree', learning_rate = 0.01, n_estimators = 10)  
    labelencoder = LabelEncoder()
    target_lb = labelencoder.fit_transform(tmp_grain.label)
    scaler = MinMaxScaler()
    grain_data = tmp_grain[tmp_grain.columns[:-1]]

    grain_data= grain_data.append({'temperature' :  data[0], 'temp_max' : data[1], 'temp_min' : data[2], 'rainfall' : data[3]*30} , ignore_index=True)

    grain_mmx = scaler.fit_transform(grain_data)



    X_train, X_test, y_train, y_test = train_test_split(
        grain_mmx[:-1], target_lb, stratify=target_lb, test_size=0.2,
        random_state=2022
    )
    xgb.fit(X_train, y_train)

    print(grain_mmx[-1])
    print(labelencoder.classes_)
    df_pred = pd.DataFrame(xgb.predict_proba((grain_mmx[-1].reshape(1,-1))), columns=labelencoder.classes_).T
    df_pred.columns = ['확률']
    df_pred = df_pred.sort_values(by='확률',ascending=False)
    return_pred = df_pred
    return return_pred

def OLS_model_predict(region, month, pred_year):
    df_tmp_region = df_weather[df_weather['조사지역'] == region]
    df_weather_region = df_tmp_region.groupby(['조사지역', '관측일자']).mean()
    df_weather_region = df_weather_region.reset_index(drop=False)

    df_group_mean = df_weather_region.groupby(['연', '월']).mean()
    df_group_mean = df_group_mean.reset_index(drop=False)
    df_month_mean = df_group_mean[df_group_mean['월'] == month]

    for i in df_month_mean.index:
        df_tmp = df_weather_region.loc[(df_weather_region['연'] == df_month_mean.loc[i, '연']) & (df_weather_region['월'] == df_month_mean.loc[i, '월'])]
        temp_max = df_tmp['temp_max'].max()
        temp_min = df_tmp['temp_min'].min()
        df_month_mean.loc[i, 'temp_max'] = temp_max
        df_month_mean.loc[i, 'temp_min'] = temp_min


    pred_list = []
    for value in df_month_mean.columns[2: -1]:
        df_target = df_month_mean[value]
        X_train2 = pd.DataFrame(df_month_mean[['연']], columns = ['연'])
        y_train2 = df_target.values

        # import statsmodels.api as sm
        X_train = sm.add_constant(X_train2)
        model = sm.OLS(y_train2, X_train2).fit()
        pred_value = model.predict(pred_year)
        pred_list.append([value, float(pred_value)])

    # print(f'\t\t\t{region}의 {month}월 {pred_year}의 예측값:')
    # print(pred_list)
    return pred_list
    


In [51]:
predict_6m_proba('서울', 3, 2022)

[0.         0.14845221 0.39091017 0.02928312]
['apple' 'banana' 'chickpea' 'cocoa' 'coconut' 'coffee' 'cotton' 'grapes'
 'kidneybeans' 'lentil' 'maize' 'mango' 'muskmelon' 'orange' 'papaya'
 'pomegranate' 'rice' 'watermelon' 'wheat' '가지' '감' '고추' '딸기' '레빗아이 블루베리'
 '마늘' '매실' '멜론' '무' '배추' '베' '복숭아' '살구' '상추' '시금치' '양파' '오이' '참외' '토마토'
 '하이부시 블루베리' '호박']
[0.11298909 0.45742011 0.52368955 0.04863353]
['apple' 'banana' 'chickpea' 'cocoa' 'coconut' 'coffee' 'cotton' 'grapes'
 'kidneybeans' 'lentil' 'maize' 'mango' 'muskmelon' 'orange' 'papaya'
 'pomegranate' 'rice' 'watermelon' 'wheat' '가지' '감' '고추' '딸기' '레빗아이 블루베리'
 '마늘' '매실' '멜론' '무' '배추' '베' '복숭아' '살구' '상추' '시금치' '양파' '오이' '참외' '토마토'
 '하이부시 블루베리' '호박']
[0.26782186 0.642786   0.64693917 0.06576816]
['apple' 'banana' 'chickpea' 'cocoa' 'coconut' 'coffee' 'cotton' 'grapes'
 'kidneybeans' 'lentil' 'maize' 'mango' 'muskmelon' 'orange' 'papaya'
 'pomegranate' 'rice' 'watermelon' 'wheat' '가지' '감' '고추' '딸기' '레빗아이 블루베리'
 '마늘' '매실' '멜론' '무' '배추' '

Unnamed: 0,확률
멜론,0.03192
kidneybeans,0.031215
딸기,0.031196
매실,0.030872
grapes,0.030869
lentil,0.030866
maize,0.029268
rice,0.02383
cocoa,0.023822
chickpea,0.023801


In [22]:
df.groupby(df.index).mean().sort_values(by='확률', ascending=False).head(3)

NameError: name 'df' is not defined

In [23]:
for region in tqdm(df_weather['조사지역'].unique()) :
    print(f'{region}지역 추천 Top 3 작물')
    display(predict_6m_proba(region, 3, 2022).head(3))
    print('\n')

  0%|          | 0/73 [00:00<?, ?it/s]

서울지역 추천 Top 3 작물


  0%|          | 0/73 [00:03<?, ?it/s]






ValueError: ('Expecting 2 dimensional numpy.ndarray, got: ', (4,))

### 모든지역 데이터를 한개의 df로 리턴해주는 함수

In [52]:
# 모든지역 데이터 return
def predict_6m_all_region(start_month, pred_year):
    df = pd.DataFrame()
    for mon in range(start_month, start_month+6):
        region_data = []
        for region in df_weather['조사지역'].unique():
            pred_data = OLS_model_predict(region, mon, pred_year)
            tmp_data =[]
            for data in pred_data :
                tmp_data.append(data[-1])
            tmp_data.append(region)
            region_data.append(tmp_data)

        # print(tmp_data)
        df_pred  = predict_crops_6m(region_data)
        df = pd.concat([df, df_pred])
        df_gr = df.groupby(df.index).mean()
        for col in df_gr.columns :
            df_gr = df_gr.sort_values(by=col, ascending=False)
        
    return df_gr

def predict_crops_6m(data):
    tmp_grain = df_grain[['temperature', 'temp_max', 'temp_min',  'rainfall', 'label']]
    xgb = XGBClassifier(booster = 'gbtree', learning_rate = 0.01, n_estimators = 10)  
    labelencoder = LabelEncoder()
    target_lb = labelencoder.fit_transform(tmp_grain.label)
    scaler = MinMaxScaler()
    grain_data = tmp_grain[tmp_grain.columns[:-1]]

    for value in data:
        grain_data= grain_data.append({'temperature' :  value[0], 'temp_max' : value[1], 'temp_min' : value[2], 'rainfall' : value[3]*30} , ignore_index=True)

    grain_mmx = scaler.fit_transform(grain_data)

    # display(grain_data.tail())
    # display(grain_mmx[:-1])
    # display(grain_mmx[-1])

    X_train, X_test, y_train, y_test = train_test_split(
        grain_mmx[:-len(data)], target_lb, stratify=target_lb, test_size=0.2,
        random_state=2022
    )
    xgb.fit(X_train, y_train)

    # print(grain_mmx[-1])
    return_pred = pd.DataFrame()
    for i, value in enumerate(data):
        df_pred = pd.DataFrame(xgb.predict_proba(grain_mmx[-(len(data)-i)].reshape(1,-1)), columns=labelencoder.classes_).T
        df_pred.columns = [value[4]]
        # df_pred = df_pred.sort_values(by='확률',ascending=False)
        return_pred = pd.concat([return_pred, df_pred], axis=1)
    return return_pred

def OLS_model_predict(region, month, pred_year):
    df_tmp_region = df_weather[df_weather['조사지역'] == region]
    df_weather_region = df_tmp_region.groupby(['조사지역', '관측일자']).mean()
    df_weather_region = df_weather_region.reset_index(drop=False)

    df_group_mean = df_weather_region.groupby(['연', '월']).mean()
    df_group_mean = df_group_mean.reset_index(drop=False)
    df_month_mean = df_group_mean[df_group_mean['월'] == month]

    for i in df_month_mean.index:
        df_tmp = df_weather_region.loc[(df_weather_region['연'] == df_month_mean.loc[i, '연']) & (df_weather_region['월'] == df_month_mean.loc[i, '월'])]
        temp_max = df_tmp['temp_max'].max()
        temp_min = df_tmp['temp_min'].min()
        df_month_mean.loc[i, 'temp_max'] = temp_max
        df_month_mean.loc[i, 'temp_min'] = temp_min

    pred_list = []
    for value in df_month_mean.columns[2: -2]:
        df_target = df_month_mean[value]
        X_train2 = pd.DataFrame(df_month_mean[['연']], columns = ['연'])
        y_train2 = df_target.values

        # import statsmodels.api as sm
        X_train = sm.add_constant(X_train2)
        model = sm.OLS(y_train2, X_train2).fit()
        pred_value = model.predict(pred_year)
        pred_list.append([region, float(pred_value)])

    # print(f'\t\t\t{region}의 {month}월 {pred_year}의 예측값:')
    # print(pred_list)
    return pred_list






In [54]:
def top3(data):
    df_all = pd.DataFrame()
    for col in data.columns:  
        df_tmp = pred_list[[col]]
        # display(df_tmp)
        df_tmp= df_tmp.sort_values(by=col, ascending=False)
        # display(df_tmp)
        # display(df_tmp.head(3).index)
        df_co = pd.DataFrame(df_tmp.head(3).index, columns = [col])
        df_co.index = [1, 2, 3]
        df_all = pd.concat([df_all, df_co], axis=1)

    return df_all

: 

In [53]:
%time predict_6m_all_region(4, 2100)

CPU times: user 23min 11s, sys: 1min 44s, total: 24min 55s
Wall time: 27min 9s


Unnamed: 0,서울,관악산,춘천,철원,속초,원주,대관령,태백,인제,홍천,...,포항,울릉도,동해,영월,흑산도,동두천,문산,진도,상주,고창
coconut,0.031503,0.024379,0.024458,0.024446,0.024569,0.024279,0.024408,0.024484,0.024466,0.024279,...,0.039007,0.024409,0.024534,0.024279,0.032031,0.024375,0.024454,0.024604,0.024554,0.031986
pomegranate,0.024228,0.024378,0.024457,0.024444,0.024567,0.031899,0.024407,0.024482,0.024465,0.031899,...,0.039011,0.024407,0.024533,0.031898,0.024227,0.024374,0.024452,0.024603,0.032355,0.031806
딸기,0.032032,0.024385,0.024464,0.024452,0.032127,0.024285,0.032218,0.024531,0.024472,0.024286,...,0.024147,0.03221,0.032092,0.024285,0.024229,0.024381,0.02446,0.032409,0.024596,0.031745
maize,0.024276,0.029885,0.029711,0.029953,0.029822,0.029533,0.02446,0.029988,0.029719,0.029533,...,0.029611,0.024462,0.035254,0.029532,0.024279,0.029628,0.029967,0.024606,0.030055,0.029446
멜론,0.032049,0.028472,0.032644,0.032621,0.028671,0.032285,0.02441,0.024485,0.032657,0.032282,...,0.024137,0.02441,0.024557,0.032281,0.02423,0.032477,0.032636,0.024606,0.028667,0.028102
rice,0.024322,0.024427,0.024551,0.024538,0.024662,0.02437,0.02441,0.024485,0.024513,0.02437,...,0.024192,0.024417,0.024629,0.024369,0.024278,0.024467,0.024546,0.024697,0.024646,0.024276
chickpea,0.024264,0.024447,0.024491,0.02448,0.024637,0.024348,0.024443,0.024551,0.024535,0.024313,...,0.024176,0.024409,0.024602,0.024347,0.024295,0.024408,0.024488,0.024605,0.024624,0.024255
토마토,0.024235,0.024425,0.024505,0.024454,0.024574,0.024285,0.024461,0.024533,0.024474,0.024288,...,0.024111,0.024414,0.024539,0.024287,0.024233,0.024422,0.024499,0.02461,0.024559,0.024233
cocoa,0.02432,0.024423,0.024542,0.02453,0.024654,0.024362,0.02441,0.024485,0.024509,0.024362,...,0.024117,0.040052,0.02462,0.024361,0.024251,0.024458,0.024538,0.032467,0.024637,0.024233
banana,0.031504,0.024379,0.024458,0.024446,0.024569,0.024279,0.024408,0.024484,0.024466,0.024279,...,0.024183,0.024409,0.024534,0.024279,0.024237,0.024375,0.024454,0.024604,0.024554,0.024227


### xgbooster best_params 추출
- 시계열 데이터의 특성을 고려하여 해당월의 classifier를 다 따로 돌림

In [None]:
# OLS_model_predict(region, month, pred_year):
region = '서울'
month = 6
pred_year = 2022

df_tmp_region = df_weather[df_weather['조사지역'] == region]
df_weather_region = df_tmp_region.groupby(['조사지역', '관측일자']).mean()
df_weather_region = df_weather_region.reset_index(drop=False)

df_group_mean = df_weather_region.groupby(['연', '월']).mean()
df_group_mean = df_group_mean.reset_index(drop=False)
df_month_mean = df_group_mean[df_group_mean['월'] == month]

for i in df_month_mean.index:
    df_tmp = df_weather_region.loc[(df_weather_region['연'] == df_month_mean.loc[i, '연']) & (df_weather_region['월'] == df_month_mean.loc[i, '월'])]
    temp_max = df_tmp['temp_max'].max()
    temp_min = df_tmp['temp_min'].min()
    df_month_mean.loc[i, 'temp_max'] = temp_max
    df_month_mean.loc[i, 'temp_min'] = temp_min

pred_list = []
for value in df_month_mean.columns[2: -1]:
    df_target = df_month_mean[value]
    X_train2 = pd.DataFrame(df_month_mean[['연']], columns = ['연'])
    y_train2 = df_target.values

    # import statsmodels.api as sm
    X_train = sm.add_constant(X_train2)
    model = sm.OLS(y_train2, X_train2).fit()
    pred_value = model.predict(pred_year)
    pred_list.append([value, float(pred_value)])

# print(f'\t\t\t{region}의 {month}월 {pred_year}의 예측값:')
# print(pred_list)
pred_list

[['temperature', 22.80263004569429],
 ['temp_max', 32.66101643862828],
 ['temp_min', 14.510903067907272],
 ['rainfall', 4.682851271821746],
 ['humidity', 67.1027178427563]]

In [None]:
from sklearn.model_selection import GridSearchCV
XGBClassifier().get_params()

{'base_score': 0.5,
 'booster': 'gbtree',
 'colsample_bylevel': 1,
 'colsample_bynode': 1,
 'colsample_bytree': 1,
 'gamma': 0,
 'learning_rate': 0.1,
 'max_delta_step': 0,
 'max_depth': 3,
 'min_child_weight': 1,
 'missing': None,
 'n_estimators': 100,
 'n_jobs': 1,
 'nthread': None,
 'objective': 'binary:logistic',
 'random_state': 0,
 'reg_alpha': 0,
 'reg_lambda': 1,
 'scale_pos_weight': 1,
 'seed': None,
 'silent': None,
 'subsample': 1,
 'verbosity': 1}

In [None]:
pred_data = pred_list
tmp_data =[]
for data in pred_data :
    tmp_data.append(data[-1])

data = tmp_data
tmp_grain = df_grain[['temperature', 'temp_max', 'temp_min',  'rainfall', 'label']]



labelencoder = LabelEncoder()
target_lb = labelencoder.fit_transform(tmp_grain.label)
scaler = MinMaxScaler()
grain_data = tmp_grain[tmp_grain.columns[:-1]]

grain_data= grain_data.append({'temperature' :  data[0], 'temp_max' : data[1], 'temp_min' : data[2], 'rainfall' : data[3]*30} , ignore_index=True)

grain_mmx = scaler.fit_transform(grain_data)

# display(grain_data.tail())
# display(grain_mmx[:-1])
# display(grain_mmx[-1])

X_train, X_test, y_train, y_test = train_test_split(
    grain_mmx[:-1], target_lb, stratify=target_lb, test_size=0.2,
    random_state=2022
)

xgb = XGBClassifier()  
params = {
    'booster' : ['gbtree', 'gblinear'],
    'n_estimators' : [10, 100, 1000],
    'learning_rate' : [0.01, 0.05, 0.1]
}

xgb_grid = GridSearchCV(
    xgb, params, scoring='accuracy', cv=5
)


xgb_grid.fit(X_train, y_train)

print(xgb_grid.best_score_, xgb_grid.best_params_)
xgb_best = xgb_grid.best_estimator_

# print(grain_mmx[-1])
df_pred = pd.DataFrame(xgb_best.predict_proba(list(grain_mmx[-1])), columns=labelencoder.classes_).T
df_pred.columns = ['확률']
df_pred = df_pred.sort_values(by='확률',ascending=False)
return_pred = df_pred
return_pred

0.9953968938740294 {'booster': 'gbtree', 'learning_rate': 0.01, 'n_estimators': 10}


Unnamed: 0,확률
cocoa,0.076987
coconut,0.051366
cotton,0.051278
banana,0.051278
maize,0.051278
papaya,0.051275
kidneybeans,0.051274
watermelon,0.051274
chickpea,0.051274
lentil,0.051273


In [30]:
# xgbooster GridSearch 샘플 10개만 (너무 오래 걸림)
def xgboost_best_params(start_month, pred_year):
    sample_list = np.random.choice(72, 10)
    df = pd.DataFrame()
    for mon in range(start_month, start_month+6):
        region_data = []
        for region in df_weather['조사지역'].unique()[sample_list]:
            pred_data = OLS_model_predict(region, mon, pred_year)
            tmp_data =[]
            for data in pred_data :
                tmp_data.append(data[-1])
            tmp_data.append(region)
            region_data.append(tmp_data)

        # print(tmp_data)
        df_pred  = xgboost_params(region_data)
        df = pd.concat([df, df_pred])
        df_gr = df.groupby(df.index).mean()
        for col in df_gr.columns :
            df_gr = df_gr.sort_values(by=col, ascending=False)
        
    return df_gr

def xgboost_params(data):
    tmp_grain = df_grain[['temperature', 'temp_max', 'temp_min',  'rainfall', 'label']]

    labelencoder = LabelEncoder()
    target_lb = labelencoder.fit_transform(tmp_grain.label)
    scaler = MinMaxScaler()
    grain_data = tmp_grain[tmp_grain.columns[:-1]]

    for value in data:
        grain_data= grain_data.append({'temperature' :  value[0], 'temp_max' : value[1], 'temp_min' : value[2], 'rainfall' : value[3]*30} , ignore_index=True)

    grain_mmx = scaler.fit_transform(grain_data)

    # display(grain_data.tail())
    # display(grain_mmx[:-1])
    # display(grain_mmx[-1])

    X_train, X_test, y_train, y_test = train_test_split(
        grain_mmx[:-len(data)], target_lb, stratify=target_lb, test_size=0.2,
        random_state=2022
    )


    xgb = XGBClassifier()  
    params = {
        'booster' : ['gbtree', 'gblinear'],
        'n_estimators' : [10, 100, 1000],
        'learning_rate' : [0.01, 0.05, 0.1]
    }

    xgb_grid = GridSearchCV(
        xgb, params, scoring='accuracy', cv=5
    )

    xgb_grid.fit(X_train, y_train)

    print(xgb_grid.best_score_, xgb_grid.best_params_)
    xgb_best = xgb_grid.best_estimator_

    # print(grain_mmx[-1])
    return_pred = pd.DataFrame()
    for i, value in enumerate(data):
        df_pred = pd.DataFrame(xgb_best.predict_proba(list(grain_mmx[-(len(data)-i)])), columns=labelencoder.classes_).T
        df_pred.columns = [value[4]]
        # df_pred = df_pred.sort_values(by='확률',ascending=False)
        return_pred = pd.concat([return_pred, df_pred], axis=1)
    return return_pred

def OLS_model_predict(region, month, pred_year):
    df_tmp_region = df_weather[df_weather['조사지역'] == region]
    df_weather_region = df_tmp_region.groupby(['조사지역', '관측일자']).mean()
    df_weather_region = df_weather_region.reset_index(drop=False)

    df_group_mean = df_weather_region.groupby(['연', '월']).mean()
    df_group_mean = df_group_mean.reset_index(drop=False)
    df_month_mean = df_group_mean[df_group_mean['월'] == month]

    for i in df_month_mean.index:
        df_tmp = df_weather_region.loc[(df_weather_region['연'] == df_month_mean.loc[i, '연']) & (df_weather_region['월'] == df_month_mean.loc[i, '월'])]
        temp_max = df_tmp['temp_max'].max()
        temp_min = df_tmp['temp_min'].min()
        df_month_mean.loc[i, 'temp_max'] = temp_max
        df_month_mean.loc[i, 'temp_min'] = temp_min

    pred_list = []
    for value in df_month_mean.columns[2: -2]:
        df_target = df_month_mean[value]
        X_train2 = pd.DataFrame(df_month_mean[['연']], columns = ['연'])
        y_train2 = df_target.values

        # import statsmodels.api as sm
        X_train = sm.add_constant(X_train2)
        model = sm.OLS(y_train2, X_train2).fit()
        pred_value = model.predict(pred_year)
        pred_list.append([region, float(pred_value)])

    # print(f'\t\t\t{region}의 {month}월 {pred_year}의 예측값:')
    # print(pred_list)
    return pred_list






In [31]:
xgboost_best_params(3, 2022)

KeyboardInterrupt: 

### LGBMClassifier 사용

In [None]:
# lgbm GridSearch 샘플 10개만 (너무 오래 걸림)
def lgbm_best_params(start_month, pred_year):
    sample_list = np.random.choice(72, 10)
    df = pd.DataFrame()
    for mon in range(start_month, start_month+6):
        region_data = []
        for region in df_weather['조사지역'].unique()[sample_list]:
            pred_data = OLS_model_predict(region, mon, pred_year)
            tmp_data =[]
            for data in pred_data :
                tmp_data.append(data[-1])
            tmp_data.append(region)
            region_data.append(tmp_data)

        # print(tmp_data)
        df_pred  = lgbm_params(region_data)
        df = pd.concat([df, df_pred])
        df_gr = df.groupby(df.index).mean()
        for col in df_gr.columns :
            df_gr = df_gr.sort_values(by=col, ascending=False)
        
    return df_gr

def lgbm_params(data):
    tmp_grain = df_grain[['temperature', 'temp_max', 'temp_min',  'rainfall', 'label']]

    labelencoder = LabelEncoder()
    target_lb = labelencoder.fit_transform(tmp_grain.label)
    scaler = MinMaxScaler()
    grain_data = tmp_grain[tmp_grain.columns[:-1]]

    for value in data:
        grain_data= grain_data.append({'temperature' :  value[0], 'temp_max' : value[1], 'temp_min' : value[2], 'rainfall' : value[3]*30} , ignore_index=True)

    grain_mmx = scaler.fit_transform(grain_data)

    # display(grain_data.tail())
    # display(grain_mmx[:-1])
    # display(grain_mmx[-1])

    X_train, X_test, y_train, y_test = train_test_split(
        grain_mmx[:-len(data)], target_lb, stratify=target_lb, test_size=0.2,
        random_state=2022
    )


    lgb = LGBMClassifier() 
    params = {
        # 'boosting ' : ['gbdt', 'rf'],
        'n_estimators' : [100, 1000, 1000],
        'learning_rate' : [0.01, 0.05],
        'num_iteration ' : [5, 10, 50]
    }

    lgb_grid = GridSearchCV(
        lgb, params, scoring='accuracy', cv=5
    )



    evals = [(X_test, y_test)]
    lgb_grid.fit(X_train, y_train, eval_set=evals, eval_metric='logloss', verbose=False)

    print(lgb_grid.best_score_, lgb_grid.best_params_)
    lgb_best = lgb_grid.best_estimator_

    # print(grain_mmx[-1])
    return_pred = pd.DataFrame()
    for i, value in enumerate(data):
        df_pred = pd.DataFrame(lgb_best.predict_proba(list(grain_mmx[-(len(data)-i)])), columns=labelencoder.classes_).T
        df_pred.columns = [value[4]]
        # df_pred = df_pred.sort_values(by='확률',ascending=False)
        return_pred = pd.concat([return_pred, df_pred], axis=1)
    return return_pred

def OLS_model_predict(region, month, pred_year):
    df_tmp_region = df_weather[df_weather['조사지역'] == region]
    df_weather_region = df_tmp_region.groupby(['조사지역', '관측일자']).mean()
    df_weather_region = df_weather_region.reset_index(drop=False)

    df_group_mean = df_weather_region.groupby(['연', '월']).mean()
    df_group_mean = df_group_mean.reset_index(drop=False)
    df_month_mean = df_group_mean[df_group_mean['월'] == month]

    for i in df_month_mean.index:
        df_tmp = df_weather_region.loc[(df_weather_region['연'] == df_month_mean.loc[i, '연']) & (df_weather_region['월'] == df_month_mean.loc[i, '월'])]
        temp_max = df_tmp['temp_max'].max()
        temp_min = df_tmp['temp_min'].min()
        df_month_mean.loc[i, 'temp_max'] = temp_max
        df_month_mean.loc[i, 'temp_min'] = temp_min

    pred_list = []
    for value in df_month_mean.columns[2: -2]:
        df_target = df_month_mean[value]
        X_train2 = pd.DataFrame(df_month_mean[['연']], columns = ['연'])
        y_train2 = df_target.values

        # import statsmodels.api as sm
        X_train = sm.add_constant(X_train2)
        model = sm.OLS(y_train2, X_train2).fit()
        pred_value = model.predict(pred_year)
        pred_list.append([region, float(pred_value)])

    # print(f'\t\t\t{region}의 {month}월 {pred_year}의 예측값:')
    # print(pred_list)
    return pred_list






In [None]:
lgbm_best_params(4, 2022)

0.9980263157894737 {'learning_rate': 0.01, 'n_estimators': 1000, 'num_iteration ': 5}


ValueError: ignored

In [None]:
# 모든지역 데이터 return
def predict_6m_all_region_lgb(start_month, pred_year):
    df = pd.DataFrame()
    for mon in range(start_month, start_month+6):
        region_data = []
        for region in df_weather['조사지역'].unique():
            pred_data = OLS_model_predict(region, mon, pred_year)
            tmp_data =[]
            for data in pred_data :
                tmp_data.append(data[-1])
            tmp_data.append(region)
            region_data.append(tmp_data)

        # print(tmp_data)
        df_pred  = predict_crops_6m(region_data)
        df = pd.concat([df, df_pred])
        df_gr = df.groupby(df.index).mean()
        for col in df_gr.columns :
            df_gr = df_gr.sort_values(by=col, ascending=False)
        
    return df_gr

def predict_crops_6m_lgb(data):
    tmp_grain = df_grain[['temperature', 'temp_max', 'temp_min',  'rainfall', 'label']]

    labelencoder = LabelEncoder()
    target_lb = labelencoder.fit_transform(tmp_grain.label)
    scaler = MinMaxScaler()
    grain_data = tmp_grain[tmp_grain.columns[:-1]]

    for value in data:
        grain_data= grain_data.append({'temperature' :  value[0], 'temp_max' : value[1], 'temp_min' : value[2], 'rainfall' : value[3]*30} , ignore_index=True)

    grain_mmx = scaler.fit_transform(grain_data)

    # display(grain_data.tail())
    # display(grain_mmx[:-1])
    # display(grain_mmx[-1])

    X_train, X_test, y_train, y_test = train_test_split(
        grain_mmx[:-len(data)], target_lb, stratify=target_lb, test_size=0.2,
        random_state=2022
    )

    lgb = LGBMClassifier(learning_rate = 0.01, n_estimators = 1000, num_iteration = 5) 

    evals = [(X_test, y_test)]
    lgb.fit(X_train, y_train, eval_set=evals, eval_metric='logloss', verbose=False)


    # print(grain_mmx[-1])
    return_pred = pd.DataFrame()
    for i, value in enumerate(data):
        df_pred = pd.DataFrame(lgb.predict_proba(list(grain_mmx[-(len(data)-i)])), columns=labelencoder.classes_).T
        df_pred.columns = [value[4]]
        # df_pred = df_pred.sort_values(by='확률',ascending=False)
        return_pred = pd.concat([return_pred, df_pred], axis=1)
    return return_pred

def OLS_model_predict(region, month, pred_year):
    df_tmp_region = df_weather[df_weather['조사지역'] == region]
    df_weather_region = df_tmp_region.groupby(['조사지역', '관측일자']).mean()
    df_weather_region = df_weather_region.reset_index(drop=False)

    df_group_mean = df_weather_region.groupby(['연', '월']).mean()
    df_group_mean = df_group_mean.reset_index(drop=False)
    df_month_mean = df_group_mean[df_group_mean['월'] == month]

    for i in df_month_mean.index:
        df_tmp = df_weather_region.loc[(df_weather_region['연'] == df_month_mean.loc[i, '연']) & (df_weather_region['월'] == df_month_mean.loc[i, '월'])]
        temp_max = df_tmp['temp_max'].max()
        temp_min = df_tmp['temp_min'].min()
        df_month_mean.loc[i, 'temp_max'] = temp_max
        df_month_mean.loc[i, 'temp_min'] = temp_min

    pred_list = []
    for value in df_month_mean.columns[2: -2]:
        df_target = df_month_mean[value]
        X_train2 = pd.DataFrame(df_month_mean[['연']], columns = ['연'])
        y_train2 = df_target.values

        # import statsmodels.api as sm
        X_train = sm.add_constant(X_train2)
        model = sm.OLS(y_train2, X_train2).fit()
        pred_value = model.predict(pred_year)
        pred_list.append([region, float(pred_value)])

    # print(f'\t\t\t{region}의 {month}월 {pred_year}의 예측값:')
    # print(pred_list)
    return pred_list






In [None]:
%time df = predict_6m_all_region_lgb(3, 2100)
for col in df.columns:
    df = df.sort_values(by=col, ascending=False)
    print(col, '\n', df[col].head(3))

CPU times: user 1min 3s, sys: 441 ms, total: 1min 3s
Wall time: 1min 4s
서울 
 wheat      0.066671
coconut    0.058220
banana     0.058210
Name: 서울, dtype: float32
관악산 
 wheat          0.065890
rice           0.058279
kidneybeans    0.057813
Name: 관악산, dtype: float32
춘천 
 wheat     0.066810
lentil    0.058663
maize     0.057711
Name: 춘천, dtype: float32
철원 
 wheat     0.06640
grapes    0.05791
lentil    0.05791
Name: 철원, dtype: float32
속초 
 wheat    0.067379
cocoa    0.059515
maize    0.057883
Name: 속초, dtype: float32
원주 
 wheat     0.066593
lentil    0.058446
maize     0.057494
Name: 원주, dtype: float32
대관령 
 wheat     0.075296
grapes    0.058402
maize     0.057829
Name: 대관령, dtype: float32
태백 
 wheat          0.073527
cocoa          0.062328
kidneybeans    0.057393
Name: 태백, dtype: float32
인제 
 wheat     0.074589
lentil    0.058243
maize     0.057305
Name: 인제, dtype: float32
홍천 
 wheat     0.074350
lentil    0.058004
maize     0.057460
Name: 홍천, dtype: float32
대전 
 wheat      0.067166
co

In [None]:
%time df_xg = predict_6m_all_region(3, 2022)

for col in df_xg.columns:
    df_xg = df_xg.sort_values(by=col, ascending=False)
    print(col, '\n', df_xg[col].head(3))

CPU times: user 1min 9s, sys: 467 ms, total: 1min 9s
Wall time: 1min 15s
서울 
 wheat          0.065607
kidneybeans    0.057892
grapes         0.057516
Name: 서울, dtype: float32
관악산 
 wheat          0.065893
rice           0.057904
kidneybeans    0.057816
Name: 관악산, dtype: float32
춘천 
 wheat     0.066429
maize     0.064586
lentil    0.058283
Name: 춘천, dtype: float32
철원 
 wheat          0.073012
cocoa          0.057890
kidneybeans    0.057241
Name: 철원, dtype: float32
속초 
 wheat     0.067120
grapes    0.058626
maize     0.058033
Name: 속초, dtype: float32
원주 
 wheat     0.067121
lentil    0.058627
maize     0.058034
Name: 원주, dtype: float32
대관령 
 wheat    0.072875
maize    0.070929
rice     0.064342
Name: 대관령, dtype: float32
태백 
 wheat    0.074154
maize    0.063932
cocoa    0.058281
Name: 태백, dtype: float32
인제 
 wheat     0.074049
grapes    0.057503
lentil    0.057503
Name: 인제, dtype: float32
홍천 
 wheat     0.074283
maize     0.064440
lentil    0.058080
Name: 홍천, dtype: float32
대전 
 wheat    

In [None]:
%time df_xg = predict_6m_all_region(3, 2100)

for col in df_xg.columns:
    df_xg = df_xg.sort_values(by=col, ascending=False)
    print(col, '\n', df_xg[col].head(3))

CPU times: user 1min 6s, sys: 556 ms, total: 1min 6s
Wall time: 1min 13s
서울 
 wheat      0.066671
coconut    0.058220
banana     0.058210
Name: 서울, dtype: float32
관악산 
 wheat          0.065890
rice           0.058279
kidneybeans    0.057813
Name: 관악산, dtype: float32
춘천 
 wheat     0.066810
lentil    0.058663
maize     0.057711
Name: 춘천, dtype: float32
철원 
 wheat     0.06640
grapes    0.05791
lentil    0.05791
Name: 철원, dtype: float32
속초 
 wheat    0.067379
cocoa    0.059515
maize    0.057883
Name: 속초, dtype: float32
원주 
 wheat     0.066593
lentil    0.058446
maize     0.057494
Name: 원주, dtype: float32
대관령 
 wheat     0.075296
grapes    0.058402
maize     0.057829
Name: 대관령, dtype: float32
태백 
 wheat          0.073527
cocoa          0.062328
kidneybeans    0.057393
Name: 태백, dtype: float32
인제 
 wheat     0.074589
lentil    0.058243
maize     0.057305
Name: 인제, dtype: float32
홍천 
 wheat     0.074350
lentil    0.058004
maize     0.057460
Name: 홍천, dtype: float32
대전 
 wheat      0.067166
c