In [12]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [13]:
# 사용자 운영체제 확인
import os
os.name

# 운영체제별 한글 폰트 설정
if os.name == 'posix': # Mac 환경 폰트 설정
    plt.rc('font', family='AppleGothic')
elif os.name == 'nt': # Windows 환경 폰트 설정
    plt.rc('font', family='Malgun Gothic')

plt.rc('axes', unicode_minus=False) # 마이너스 폰트 설정


# 글씨 선명하게 출력하는 설정
%config InlineBackend.figure_format = 'retina'

In [14]:
train = pd.read_csv("data_dacon/train.csv", parse_dates=['일자'])
test = pd.read_csv("data_dacon/test.csv", parse_dates=['일자'])
submission = pd.read_csv("data_dacon/sample_submission.csv")

In [16]:
train['year'] = train['일자'].dt.year
train['month'] = train['일자'].dt.month
train['day']  = train['일자'].dt.day
train['dayofweek'] = train['일자'].dt.day_of_week

test['year'] = test['일자'].dt.year
test['month'] = test['일자'].dt.month
test['day']  = test['일자'].dt.day
test['dayofweek'] = test['일자'].dt.day_of_week

In [17]:
train['출근자수'] = train['본사정원수'] - train['본사휴가자수'] - train['본사출장자수'] - train['현본사소속재택근무자수']
test['출근자수'] = test['본사정원수'] - test['본사휴가자수'] - test['본사출장자수'] - test['현본사소속재택근무자수']

In [20]:
train['식사자수'] = train['출근자수'] * (1 + train['본사시간외근무명령서승인건수'] / train['출근자수'])
test['식사자수'] = test['출근자수'] * (1 + test['본사시간외근무명령서승인건수'] / test['출근자수'])

In [None]:
train.iloc[0]['중식계']/ train.iloc[0]['출근자수']

In [None]:
train.iloc[0]['석식계']/ train.iloc[0]['출근자수']

In [None]:
train['중식비율'] = train['중식계'] / train['출근자수']
train['석식비율'] = train['석식계'] / train['출근자수']

In [None]:
train[['출근자수', '중식비율', '석식비율', '중식계', '석식계']].describe()

In [None]:
train[(train['석식계'] == 0) & (train['요일'] =='화')]

In [None]:
tmp = train['중식메뉴'].iloc[0].split()
tmp

In [None]:
tmp.index('쌀밥/잡곡밥')

In [23]:
def sep_lunch(lunch):
    tmp = lunch.split()
    for menu in tmp:
        if "(" in menu:
            tmp.remove(menu)
        if "쌀밥" in menu:
            test1 = tmp.index(menu)
            tmp[test1] = "밥"
    return tmp

In [24]:
train['lunch_menu'] = train['중식메뉴'].apply(sep_lunch)
train['lunch_menu']

0                [밥, 오징어찌개, 쇠불고기, 계란찜, 청포묵무침, 요구르트, 포기김치]
1           [밥, 김치찌개, 가자미튀김, 모둠소세지구이, 마늘쫑무침, 요구르트, 배추겉절이]
2           [카레덮밥, 팽이장국, 치킨핑거, 쫄면야채무침, 견과류조림, 요구르트, 포기김치]
3               [밥, 쇠고기무국, 주꾸미볶음, 부추전, 시금치나물, 요구르트, 포기김치]
4                [밥, 떡국, 돈육씨앗강정, 우엉잡채, 청경채무침, 요구르트, 포기김치]
                              ...                        
1200    [밥, 아욱국, 수제함박스테이크, 견과류마카로니범벅, 생깻잎지, 단호박물김치, 양상...
1201    [밥, 냉이된장국, 동파육, 봄동전, 청경채/버섯숙회*초장, 무생채, 양상추샐러드*...
1202    [전주비빔밥*약고추장, 계란파국, 요거닭, 올방개묵무침, 파프리카해초무침, 포기김치...
1203    [밥, 전주식콩나물해장국, 돈육간장불고기, 깐풍연근, 연두부*달래양념장, 봄동겉절이...
1204    [밥, 들깨미역국, 교촌간장치킨, 옥수수콘치즈구이, 가지고추장무침, 포기김치/요구르...
Name: lunch_menu, Length: 1205, dtype: object

In [25]:
test['lunch_menu'] = test['중식메뉴'].apply(sep_lunch)
test['lunch_menu']

0     [밥, 대구지리, 매운돈갈비찜, 오꼬노미계란말이, 상추무침, 포기김치, 양상추샐러드...
1     [밥, 우렁된장찌개, 오리주물럭, 청양부추전, 수제삼색무쌈, 겉절이김치, 양상추샐러...
2     [밥, 팽이장국, 수제돈까스*소스, 가자미조림, 동초나물무침, 포기김치, 양상추샐러...
3     [밥, 배추들깨국, 오리대패불고기, 시금치프리타타, 부추고추장무침, 포기김치, 양상...
4     [밥, 부대찌개, 닭살데리야끼조림, 버섯탕수, 세발나물무침, 알타리김치/사과푸딩, ...
5         [밥, 아욱국, 매콤해물볶음, 감자조림, 미나리나물, 포기김치, 콥샐러드*렌치D]
6     [밥, 설렁탕, 고등어김치말이찜, 볼어묵굴소스볶음, 브로콜리숙회*초장, 석박지, 양...
7     [밥, 북엇국, 닭볶음탕, 채소전*장, 솎음열무나물무침, 포기김치, 양상추샐러드*황도D]
8     [밥, 감자양파국, 돈수육*씨앗쌈장, 매콤어묵볶음, 콩나물파채무침, 포기김치, 양상...
9     [밥, 장각백숙, 적어양념장구이, 채소스틱*쌈장, 도라지오이초무침, 겉절이김치, 양...
10    [유니짜장밥, 짬뽕국, 수제찹쌀꿔바로우, 계란후라이, 단무지락교무침, 포기김치, 그...
11    [밥, 떡국, 소갈비찜, 한식잡채, 참나물겉절이, 포기김치, 양상추샐러드*블루베리요...
12    [밥, 육개장, 닭살겨자냉채, 오이스틱*쌈장, 탕평채, 깍두기/수박, 양상추샐러드*...
13    [밥, 미니쌀국수, 삼겹살고추장구이, 스프링롤*타르타르D, 동초나물무침, 알타리김치...
14    [밥, 김치어묵탕, 수원왕갈비통닭, 두부양념조림, 연근깨소스무침, 포기김치, 양상추...
15    [밥, 유부장국, 해물누룽지탕, 김치전, 마약계란장조림, 포기김치, 양상추샐러드*딸기D]
16    [밥, 호박고추장찌개, 안동찜닭, 마카로니치즈범벅, 세발나물무침, 포기김치/요구르트...
17         [밥, 근대국, 등갈비김치찜, 감자채전*장, 치커리무침, 깍두기, 

In [26]:
train['lunch_bob'] = train['lunch_menu'].apply(lambda x: x[0])
train['lunch_soup'] = train['lunch_menu'].apply(lambda x: x[1])
train['lunch_main'] = train['lunch_menu'].apply(lambda x: x[2])
test['lunch_bob'] = test['lunch_menu'].apply(lambda x: x[0])
test['lunch_soup'] = test['lunch_menu'].apply(lambda x: x[1])
test['lunch_main'] = test['lunch_menu'].apply(lambda x: x[2])

In [None]:
train.drop(['lunch_menu'], axis=1, inplace=True)

In [None]:
train.drop(['조식메뉴'], axis=1, inplace=True)

In [None]:
train.drop(['lunch_dessert'], axis=1, inplace=True)

In [27]:
train['dinner_menu'] = train['석식메뉴'].apply(sep_lunch)
test['dinner_menu'] = test['석식메뉴'].apply(sep_lunch)

In [None]:
train['dinner_menu'] = train['석식메뉴'].apply(sep_lunch)

In [28]:
bobd=[]
soupd=[]
maind=[]

for word in  train['dinner_menu']:
    if len(word) == 0:
        bobd.append('None') #비어있으면 그 날짜자체가 없어질수있으므로 'None' 값으로 대체
        soupd.append('None')
        maind.append('None')
    elif '*' in word:
        bobd.append('None')
        soupd.append('None')
        maind.append('None')
    elif '가정의날' in word:
        bobd.append('None')
        soupd.append('None')
        maind.append('None')
    elif '가정의달' in word:
        bobd.append('None')
        soupd.append('None')
        maind.append('None')
    elif '자기계발의날' in word:
        bobd.append('None')
        soupd.append('None')
        maind.append('None')
    elif '*자기계발의날*' in word:
        bobd.append('None')
        soupd.append('None')
        maind.append('None')
    elif '자기개발의날' in word:
        bobd.append('None')
        soupd.append('None')
        maind.append('None')

    else:
        bobd.append(word[0])
        soupd.append(word[1])
        maind.append(word[2])



In [30]:
bobd=[]
soupd=[]
maind=[]

for word in  test['dinner_menu']:
    if len(word) == 0:
        bobd.append('None') #비어있으면 그 날짜자체가 없어질수있으므로 'None' 값으로 대체
        soupd.append('None')
        maind.append('None')
    elif '*' in word:
        bobd.append('None')
        soupd.append('None')
        maind.append('None')
    elif '가정의날' in word:
        bobd.append('None')
        soupd.append('None')
        maind.append('None')
    elif '가정의달' in word:
        bobd.append('None')
        soupd.append('None')
        maind.append('None')
    elif '자기계발의날' in word:
        bobd.append('None')
        soupd.append('None')
        maind.append('None')
    elif '*자기계발의날*' in word:
        bobd.append('None')
        soupd.append('None')
        maind.append('None')
    elif '자기개발의날' in word:
        bobd.append('None')
        soupd.append('None')
        maind.append('None')

    else:
        bobd.append(word[0])
        soupd.append(word[1])
        maind.append(word[2])



In [None]:
print(len(train['dinner_menu']))
print(len(bobd))
print(len(soupd))
print(len(maind))

In [29]:
train['dinner_bob'] = bobd
train['dinner_soup'] = soupd
train['dinner_main'] = maind

In [31]:
test['dinner_bob'] = bobd
test['dinner_soup'] = soupd
test['dinner_main'] = maind

In [None]:
train.drop(['dinner)_main'], axis=1, inplace=True)

In [None]:
train.drop(['중식메뉴', '석식메뉴', 'dinner'], axis=1, inplace=True)

In [None]:
train.drop('dinner_menu', axis=1, inplace=True)

In [None]:
pd.set_option('display.max_columns', None)

In [None]:
train['출근자수'] = train['출근자수'].astype(int)

In [None]:
train

In [None]:
plt.figure(figsize=(10,10))


sns.barplot(x="요일", y="중식계", data=train)
plt.title("요일별 점심 식사 수")
plt.show()

#월요일이 가장 많고 금요일로 갈수록 줄어든다.

In [None]:
plt.figure(figsize=(10,10))


sns.barplot(x="요일", y="석식계", data=train)
plt.title("요일별 석식 식사 수")
plt.show()

#월요일이 가장 많고 금요일로 갈수록 줄어든다.

In [None]:
#월별 점심떄 사람수
plt.figure(figsize=(20,10))
rot = sns.boxplot(x='month',y='중식계', data = train)

for item in rot.get_xticklabels():
    item.set_rotation(90)
#2017 년부 12월에 먹는 사람수가 좀 줄어든다.
#연말이라 회식이많은것같다. 그래서 밖에서 사먹고 와서 그런것같다.
#생각보다 코라나로인해 확 줄거나 그런것은 안보이는 것같다.

In [32]:
# encoding

train['요일'] =  train['요일'].astype('category')
train['요일'] = train.요일.cat.codes


train['lunch_bob'] =  train['lunch_bob'].astype('category')
train['lunch_bob'] = train.lunch_bob.cat.codes

train['lunch_soup'] =  train['lunch_soup'].astype('category')
train['lunch_soup'] = train.lunch_soup.cat.codes

train['lunch_main'] =  train['lunch_main'].astype('category')
train['lunch_main'] = train.lunch_main.cat.codes

train['dinner_bob'] =  train['dinner_bob'].astype('category')
train['dinner_bob'] = train.dinner_bob.cat.codes

train['dinner_soup'] =  train['dinner_soup'].astype('category')
train['dinner_soup'] = train.dinner_soup.cat.codes

train['dinner_main'] =  train['dinner_main'].astype('category')
train['dinner_main'] = train.dinner_main.cat.codes


In [112]:
train

Unnamed: 0,일자,요일,본사정원수,본사휴가자수,본사출장자수,본사시간외근무명령서승인건수,현본사소속재택근무자수,조식메뉴,중식메뉴,석식메뉴,...,출근자수,식사자수,lunch_menu,lunch_bob,lunch_soup,lunch_main,dinner_menu,dinner_bob,dinner_soup,dinner_main
0,2016-02-01,3,2601,50,150,238,0.0,모닝롤/찐빵 우유/두유/주스 계란후라이 호두죽/쌀밥 (쌀:국내산) 된장찌개 쥐...,"쌀밥/잡곡밥 (쌀,현미흑미:국내산) 오징어찌개 쇠불고기 (쇠고기:호주산) 계란찜 ...","쌀밥/잡곡밥 (쌀,현미흑미:국내산) 육개장 자반고등어구이 두부조림 건파래무침 ...",...,2401.0,2639.0,"[밥, 오징어찌개, 쇠불고기, 계란찜, 청포묵무침, 요구르트, 포기김치]",23,189,236,"[밥, 육개장, 자반고등어구이, 두부조림, 건파래무침, 포기김치]",39,224,341
1,2016-02-02,4,2601,50,173,319,0.0,모닝롤/단호박샌드 우유/두유/주스 계란후라이 팥죽/쌀밥 (쌀:국내산) 호박젓국찌...,"쌀밥/잡곡밥 (쌀,현미흑미:국내산) 김치찌개 가자미튀김 모둠소세지구이 마늘쫑무...","콩나물밥*양념장 (쌀,현미흑미:국내산) 어묵국 유산슬 (쇠고기:호주산) 아삭고추무...",...,2378.0,2697.0,"[밥, 김치찌개, 가자미튀김, 모둠소세지구이, 마늘쫑무침, 요구르트, 배추겉절이]",23,28,8,"[콩나물밥*양념장, 어묵국, 유산슬, 아삭고추무침, 바나나, 포기김치]",106,183,335
2,2016-02-03,2,2601,56,180,111,0.0,모닝롤/베이글 우유/두유/주스 계란후라이 표고버섯죽/쌀밥 (쌀:국내산) 콩나물국...,"카레덮밥 (쌀,현미흑미:국내산) 팽이장국 치킨핑거 (닭고기:국내산) 쫄면야채무침 ...","쌀밥/잡곡밥 (쌀,현미흑미:국내산) 청국장찌개 황태양념구이 (황태:러시아산) 고기...",...,2365.0,2476.0,"[카레덮밥, 팽이장국, 치킨핑거, 쫄면야채무침, 견과류조림, 요구르트, 포기김치]",50,249,334,"[밥, 청국장찌개, 황태양념구이, 고기전, 새송이버섯볶음, 포기김치]",39,244,430
3,2016-02-04,1,2601,104,220,355,0.0,"모닝롤/토마토샌드 우유/두유/주스 계란후라이 닭죽/쌀밥 (쌀,닭:국내산) 근대국...","쌀밥/잡곡밥 (쌀,현미흑미:국내산) 쇠고기무국 주꾸미볶음 부추전 시금치나물 ...","미니김밥*겨자장 (쌀,현미흑미:국내산) 우동 멕시칸샐러드 군고구마 무피클 포...",...,2277.0,2632.0,"[밥, 쇠고기무국, 주꾸미볶음, 부추전, 시금치나물, 요구르트, 포기김치]",23,149,309,"[미니김밥*겨자장, 우동, 멕시칸샐러드, 군고구마, 무피클, 포기김치]",35,213,172
4,2016-02-05,0,2601,278,181,34,0.0,모닝롤/와플 우유/두유/주스 계란후라이 쇠고기죽/쌀밥 (쌀:국내산) 재첩국 방...,"쌀밥/잡곡밥 (쌀,현미흑미:국내산) 떡국 돈육씨앗강정 (돼지고기:국내산) 우엉잡채...","쌀밥/잡곡밥 (쌀,현미흑미:국내산) 차돌박이찌개 (쇠고기:호주산) 닭갈비 (닭고기:...",...,2142.0,2176.0,"[밥, 떡국, 돈육씨앗강정, 우엉잡채, 청경채무침, 요구르트, 포기김치]",23,75,106,"[밥, 차돌박이찌개, 닭갈비, 감자소세지볶음, 콩나물무침, 포기김치]",39,240,92
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1200,2021-01-20,2,2983,75,198,4,391.0,모닝롤/페퍼로니피자 우유/주스 계란후라이/찐계란 크루통크림스프/흑미밥 아귀지리 마늘...,쌀밥/흑미밥/찰현미밥 아욱국 수제함박스테이크 견과류마카로니범벅 생깻잎지 단호박물김치...,김치볶음밥 미니쫄우동*맛살튀김 브로콜리깨소스무침 계란후라이 고들빼기무침 겉절이김치,...,2319.0,2323.0,"[밥, 아욱국, 수제함박스테이크, 견과류마카로니범벅, 생깻잎지, 단호박물김치, 양상...",23,167,248,"[김치볶음밥, 미니쫄우동*맛살튀김, 브로콜리깨소스무침, 계란후라이, 고들빼기무침, ...",13,115,213
1201,2021-01-21,1,2983,92,231,462,351.0,모닝롤/생크림단팥빵 우유/주스 계란후라이/찐계란 누룽지탕/흑미밥 떡국 해물땡굴소스볶...,쌀밥/수수밥/찰현미밥 냉이된장국 동파육 봄동전 청경채/버섯숙회*초장 무생채 양상추샐...,흑미밥 쇠고기무국 삼치양념구이 비엔나채소볶음 숙주나물당근무침 포기김치,...,2309.0,2771.0,"[밥, 냉이된장국, 동파육, 봄동전, 청경채/버섯숙회*초장, 무생채, 양상추샐러드*...",23,41,115,"[흑미밥, 쇠고기무국, 삼치양념구이, 비엔나채소볶음, 숙주나물당근무침, 포기김치]",120,159,228
1202,2021-01-22,0,2983,255,248,1,303.0,모닝롤/BLT샌드위치 우유/주스 계란후라이/찐계란 흑임자죽/흑미밥 바지락살국 두부조...,전주비빔밥*약고추장 계란파국 요거닭 올방개묵무침 파프리카해초무침 포기김치 양상추샐러...,흑미밥 수제비국 수제맛쵸킹탕수육 유부채소겨자냉채 참나물무침 갓김치/겉절이김치,...,2177.0,2178.0,"[전주비빔밥*약고추장, 계란파국, 요거닭, 올방개묵무침, 파프리카해초무침, 포기김치...",46,16,291,"[흑미밥, 수제비국, 수제맛쵸킹탕수육, 유부채소겨자냉채, 참나물무침, 갓김치/겉절이김치]",120,163,277
1203,2021-01-25,3,2983,107,153,616,327.0,모닝롤/호박고구마오븐구이 우유/주스 계란후라이/찐계란 누룽지탕/흑미밥 감자양파국 분...,쌀밥/흑미밥/찰현미밥 전주식콩나물해장국 돈육간장불고기 깐풍연근 연두부*달래양념장 봄...,흑미밥 열무된장국 장어강정*데리야끼소스 깻잎쌈*생강채 오이선 포기김치,...,2396.0,3012.0,"[밥, 전주식콩나물해장국, 돈육간장불고기, 깐풍연근, 연두부*달래양념장, 봄동겉절이...",23,215,94,"[흑미밥, 열무된장국, 장어강정*데리야끼소스, 깻잎쌈*생강채, 오이선, 포기김치]",120,194,345


In [33]:
# encoding

test['요일'] =  test['요일'].astype('category')
test['요일'] = test.요일.cat.codes


test['lunch_bob'] =  test['lunch_bob'].astype('category')
test['lunch_bob'] = test.lunch_bob.cat.codes

test['lunch_soup'] =  test['lunch_soup'].astype('category')
test['lunch_soup'] = test.lunch_soup.cat.codes

test['lunch_main'] =  test['lunch_main'].astype('category')
test['lunch_main'] = test.lunch_main.cat.codes

test['dinner_bob'] =  test['dinner_bob'].astype('category')
test['dinner_bob'] = test.dinner_bob.cat.codes

test['dinner_soup'] =  test['dinner_soup'].astype('category')
test['dinner_soup'] = test.dinner_soup.cat.codes

test['dinner_main'] =  test['dinner_main'].astype('category')
test['dinner_main'] = test.dinner_main.cat.codes


In [None]:
train

In [None]:
train.corr()

In [None]:
train.columns

In [None]:
test.columns

In [None]:
test['lunch_menu'] = test['중식메뉴'].apply(sep_lunch)

In [None]:
test

In [None]:
test['출근자수'] = test['본사정원수'] - test['본사휴가자수'] - test['본사출장자수'] - test['본사시간외근무명령서승인건수'] - test['현본사소속재택근무자수']

In [None]:
test['year'] = test['일자'].dt.year
test['month'] = test['일자'].dt.month
test['day']  = test['일자'].dt.day
test['dayofweek'] = test['일자'].dt.day_of_week

In [34]:
train.columns

Index(['일자', '요일', '본사정원수', '본사휴가자수', '본사출장자수', '본사시간외근무명령서승인건수',
       '현본사소속재택근무자수', '조식메뉴', '중식메뉴', '석식메뉴', '중식계', '석식계', 'year', 'month',
       'day', 'dayofweek', '출근자수', '식사자수', 'lunch_menu', 'lunch_bob',
       'lunch_soup', 'lunch_main', 'dinner_menu', 'dinner_bob', 'dinner_soup',
       'dinner_main'],
      dtype='object')

In [35]:
test.columns

Index(['일자', '요일', '본사정원수', '본사휴가자수', '본사출장자수', '본사시간외근무명령서승인건수',
       '현본사소속재택근무자수', '조식메뉴', '중식메뉴', '석식메뉴', 'year', 'month', 'day',
       'dayofweek', '출근자수', '식사자수', 'lunch_menu', 'lunch_bob', 'lunch_soup',
       'lunch_main', 'dinner_menu', 'dinner_bob', 'dinner_soup',
       'dinner_main'],
      dtype='object')

In [None]:
# Lunch Train 완성
#lunch_train
train = df[
    ['day','numbers','dayoff','work','outsidework','workfhome','lunch_t','Month','Date','bob','soup','main']
]

In [None]:
train = df[
    ['day','numbers','dayoff','work','outsidework','workfhome','dinner_t','Month','Date','bobd','soupd','maind']
]

In [None]:
feature_cols_1 = ['요일', '본사정원수', '본사휴가자수', '본사출장자수', '본사시간외근무명령서승인건수', '현본사소속재택근무자수', 'year', 'month', 'day', 'lunch_bob', 'lunch_soup', 'lunch_main', '출근자수']
feature_cols_2 = ['요일', '본사정원수', '본사휴가자수', '본사출장자수', '본사시간외근무명령서승인건수', '현본사소속재택근무자수', 'year', 'month', 'day', 'dinner_bob', 'dinner_soup', 'dinner_main', '출근자수']
feature_cols_3 = ['요일', '본사정원수', '본사휴가자수', '본사출장자수', '본사시간외근무명령서승인건수', '현본사소속재택근무자수', 'year', 'month', 'day', 'lunch_bob', 'lunch_soup', 'lunch_main', '식사자수']
feature_cols_4 = ['요일', '본사정원수', '본사휴가자수', '본사출장자수', '본사시간외근무명령서승인건수', '현본사소속재택근무자수', 'year', 'month', 'day', 'dinner_bob', 'dinner_soup', 'dinner_main', '식사자수']

X_train_1 = train[feature_cols_1]
y_train_1 = train['중식계']

X_train_2 = train[feature_cols_2]
y_train_2 = train['석식계']

X_test_1 = test[feature_cols_1]
X_test_2 = test[feature_cols_2]

X_train_3 = train[feature_cols_3]
y_train_3 = train['중식계']

X_train_4 = train[feature_cols_4]
y_train_4 = train['석식계']

X_test_3 = test[feature_cols_3]
X_test_4 = test[feature_cols_4]

In [37]:
feature_cols_1 = ['요일', '본사정원수', '본사휴가자수', '본사출장자수', '본사시간외근무명령서승인건수', '현본사소속재택근무자수', 'year', 'month', 'day', 'lunch_bob', 'lunch_soup', 'lunch_main', '출근자수']
feature_cols_2 = ['요일', '본사정원수', '본사휴가자수', '본사출장자수', '본사시간외근무명령서승인건수', '현본사소속재택근무자수', 'year', 'month', 'day', 'dinner_bob', 'dinner_soup', 'dinner_main', '출근자수']

In [38]:
feature_cols_3 = ['요일', '본사정원수', '본사휴가자수', '본사출장자수', '본사시간외근무명령서승인건수', '현본사소속재택근무자수', 'year', 'month', 'day', 'lunch_bob', 'lunch_soup', 'lunch_main', '식사자수']
feature_cols_4 = ['요일', '본사정원수', '본사휴가자수', '본사출장자수', '본사시간외근무명령서승인건수', '현본사소속재택근무자수', 'year', 'month', 'day', 'dinner_bob', 'dinner_soup', 'dinner_main', '식사자수']

In [None]:
feature_cols_1 = ['본사정원수','year', 'month', 'day', 'dayofweek', '출근자수', 'lunch_bob', 'lunch_soup', 'lunch_main']
feature_cols_2 = ['본사정원수','year', 'month', 'day', 'dayofweek', '출근자수', 'dinner_bob', 'dinner_soup', 'dinner_main']

In [None]:
test['lunch_bob'] = test['lunch_menu'].apply(lambda x: x[0])
test['lunch_soup'] = test['lunch_menu'].apply(lambda x: x[1])
test['lunch_main'] = test['lunch_menu'].apply(lambda x: x[2])

In [None]:
bobd=[]
soupd=[]
maind=[]

for word in  test['dinner_menu']:
    if len(word) == 0:
        bobd.append('None') #비어있으면 그 날짜자체가 없어질수있으므로 'None' 값으로 대체
        soupd.append('None')
        maind.append('None')
    elif '*' in word:
        bobd.append('None')
        soupd.append('None')
        maind.append('None')
    elif '가정의날' in word:
        bobd.append('None')
        soupd.append('None')
        maind.append('None')
    elif '가정의달' in word:
        bobd.append('None')
        soupd.append('None')
        maind.append('None')
    elif '자기계발의날' in word:
        bobd.append('None')
        soupd.append('None')
        maind.append('None')
    elif '*자기계발의날*' in word:
        bobd.append('None')
        soupd.append('None')
        maind.append('None')
    elif '자기개발의날' in word:
        bobd.append('None')
        soupd.append('None')
        maind.append('None')

    else:
        bobd.append(word[0])
        soupd.append(word[1])
        maind.append(word[2])



In [None]:
train.head(2)

In [None]:
test.head(2)

In [None]:
test['dinner_menu'] = test['석식메뉴'].apply(sep_lunch)

In [None]:
test['dinner_bob'] = bobd
test['dinner_soup'] = soupd
test['dinner_main'] = maind

In [None]:
# encoding

test['day'] =  test['day'].astype('category')
test['day'] = test.day.cat.codes


test['lunch_bob'] =  test['lunch_bob'].astype('category')
test['lunch_bob'] = test.lunch_bob.cat.codes

test['lunch_soup'] =  test['lunch_soup'].astype('category')
test['lunch_soup'] = test.lunch_soup.cat.codes

test['lunch_main'] =  test['lunch_main'].astype('category')
test['lunch_main'] = test.lunch_main.cat.codes

test['dinner_bob'] =  test['dinner_bob'].astype('category')
test['dinner_bob'] = test.dinner_bob.cat.codes

test['dinner_soup'] =  test['dinner_soup'].astype('category')
test['dinner_soup'] = test.dinner_soup.cat.codes

test['dinner_main'] =  test['dinner_main'].astype('category')
test['dinner_main'] = test.dinner_main.cat.codes


In [None]:
train.columns

In [39]:
X_train_1 = train[feature_cols_1]
y_train_1 = train['중식계']

X_train_2 = train[feature_cols_2]
y_train_2 = train['석식계']

X_test_1 = test[feature_cols_1]
X_test_2 = test[feature_cols_2]

In [40]:
X_train_3 = train[feature_cols_3]
y_train_3 = train['중식계']

X_train_4 = train[feature_cols_4]
y_train_4 = train['석식계']

X_test_3 = test[feature_cols_3]
X_test_4 = test[feature_cols_4]

In [41]:
# 모델링
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score

rf_reg_1 = RandomForestRegressor(n_jobs=-1, n_estimators=500, criterion='absolute_error')
import sklearn
print('The scikit-learn version is {}.'.format(sklearn.__version__))
# Deprecated since version 1.0: Criterion “mae” was deprecated in v1.0 and will be removed in version 1.2. 
# Use criterion="absolute_error" which is equivalent.

The scikit-learn version is 1.0.2.


In [None]:
list_max_features = [0.1, 0.5, 0.9]
list_max_depth = [1, 3, 5]
n_estimators = 300

list_hparam = []

for max_depth in list_max_depth:
    for max_features in list_max_features:
        rf = RandomForestRegressor(n_jobs=-1,criterion='absolute_error', 
                                   n_estimators = n_estimators, 
                                   max_depth = max_depth, 
                                   max_features = max_features)
        
        score = cross_val_score(rf, X_train_1, y_train_1, cv=5).mean()
        
        result = {'score': score, 'n_esti':n_estimators, 'max_depth':max_depth,
                 'max_feat':max_features}
        
        list_hparam.append(result)
        # print(max_depth, max_features, score, n_estimators, max_depth, max_features)
        print(f"score: {score} n_esti:{n_estimators} max_depth:{max_depth} max_feat:{max_features}")

In [42]:
list_max_features = [0.1, 0.5, 0.9]
list_max_depth = [1, 3, 5]
n_estimators = 500

list_hparam = []

num = 1
for max_depth in list_max_depth:
    for max_features in list_max_features:
        rf = RandomForestRegressor(n_jobs=-1,criterion='absolute_error', 
                                   n_estimators = n_estimators, 
                                   max_depth = max_depth, 
                                   max_features = max_features)
        
        score = cross_val_score(rf, X_train_1, y_train_1, cv=5).mean()
        
        result = {'score': score, 'n_esti':n_estimators, 'max_depth':max_depth,
                 'max_feat':max_features}
        
        list_hparam.append(result)
        # print(max_depth, max_features, score, n_estimators, max_depth, max_features)
        print(f"{num}번째\nscore: {score}\nn_esti:{n_estimators}\nmax_depth:{max_depth}\nmax_feat:{max_features}")
        print()
        num += 1

1번째
score: 0.13168560207842947
n_esti:500
max_depth:1
max_feat:0.1

2번째
score: 0.35406501870517426
n_esti:500
max_depth:1
max_feat:0.5

3번째
score: 0.3637784125144792
n_esti:500
max_depth:1
max_feat:0.9

4번째
score: 0.3373255124043546
n_esti:500
max_depth:3
max_feat:0.1

5번째
score: 0.6309967469670111
n_esti:500
max_depth:3
max_feat:0.5

6번째
score: 0.6440645858561723
n_esti:500
max_depth:3
max_feat:0.9

7번째
score: 0.47187267284817536
n_esti:500
max_depth:5
max_feat:0.1

8번째
score: 0.7126060364938961
n_esti:500
max_depth:5
max_feat:0.5

9번째
score: 0.7244679938668723
n_esti:500
max_depth:5
max_feat:0.9



In [43]:
list_max_features = [0.1, 0.5, 0.9]
list_max_depth = [1, 3, 5]
n_estimators = 500

list_hparam = []

num = 1
for max_depth in list_max_depth:
    for max_features in list_max_features:
        rf = RandomForestRegressor(n_jobs=-1,criterion='absolute_error', 
                                   n_estimators = n_estimators, 
                                   max_depth = max_depth, 
                                   max_features = max_features)
        
        score = cross_val_score(rf, X_train_3, y_train_3, cv=5).mean()
        
        result = {'score': score, 'n_esti':n_estimators, 'max_depth':max_depth,
                 'max_feat':max_features}
        
        list_hparam.append(result)
        # print(max_depth, max_features, score, n_estimators, max_depth, max_features)
        print(f"{num}번째\nscore: {score}\nn_esti:{n_estimators}\nmax_depth:{max_depth}\nmax_feat:{max_features}")
        print()
        num += 1

1번째
score: 0.1368684038263697
n_esti:500
max_depth:1
max_feat:0.1

2번째
score: 0.35220537717568046
n_esti:500
max_depth:1
max_feat:0.5

3번째
score: 0.3625436079303276
n_esti:500
max_depth:1
max_feat:0.9

4번째
score: 0.3661264268832237
n_esti:500
max_depth:3
max_feat:0.1

5번째
score: 0.6302136339430883
n_esti:500
max_depth:3
max_feat:0.5

6번째
score: 0.6407876609188113
n_esti:500
max_depth:3
max_feat:0.9

7번째
score: 0.49245959135663603
n_esti:500
max_depth:5
max_feat:0.1

8번째
score: 0.712045350252621
n_esti:500
max_depth:5
max_feat:0.5

9번째
score: 0.719664499984305
n_esti:500
max_depth:5
max_feat:0.9



In [44]:
from sklearn.model_selection import GridSearchCV

params = {
    'n_estimators' : [100,200,300],
    'max_depth' : [10,20,30,40,50,60,70,80,90],
    'max_features' : [0.1, 0.5, 0.9]    
}

rf_1 = RandomForestRegressor(n_jobs=-1, criterion='absolute_error')

grid_cv = GridSearchCV(rf_1, param_grid = params, cv=5, n_jobs=-1)

grid_cv.fit(X_train_1, y_train_1)

GridSearchCV(cv=5,
             estimator=RandomForestRegressor(criterion='absolute_error',
                                             n_jobs=-1),
             n_jobs=-1,
             param_grid={'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90],
                         'max_features': [0.1, 0.5, 0.9],
                         'n_estimators': [100, 200, 300]})

In [59]:
from sklearn.model_selection import GridSearchCV

params = {
    'n_estimators' : [100,200,300],
    'max_depth' : [10,20,30,40,50,60,70,80,90],
    'max_features' : [0.1, 0.5, 0.9]    
}

rf_1 = RandomForestRegressor(n_jobs=-1, criterion='absolute_error')

grid_cv_test = GridSearchCV(rf_1, param_grid = params, cv=5, n_jobs=-1)

grid_cv_test.fit(X_train_3, y_train_3)

KeyboardInterrupt: 

In [45]:
print(grid_cv.best_params_)
print(grid_cv.best_score_)

{'max_depth': 70, 'max_features': 0.5, 'n_estimators': 200}
0.7475282220506749


In [53]:
from sklearn.metrics import make_scorer, mean_absolute_error

In [50]:
grid_cv

GridSearchCV(cv=5,
             estimator=RandomForestRegressor(criterion='absolute_error',
                                             n_jobs=-1),
             n_jobs=-1,
             param_grid={'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90],
                         'max_features': [0.1, 0.5, 0.9],
                         'n_estimators': [100, 200, 300]})

In [58]:
print(grid_cv_test.best_params_)
print(grid_cv_test.best_score_)

AttributeError: 'GridSearchCV' object has no attribute 'best_params_'

In [None]:
print(score, n_estimators, max_depth, max_features)

In [46]:
pd.DataFrame(list_hparam).sort_values(by='score')

Unnamed: 0,score,n_esti,max_depth,max_feat
0,0.136868,500,1,0.1
1,0.352205,500,1,0.5
2,0.362544,500,1,0.9
3,0.366126,500,3,0.1
6,0.49246,500,5,0.1
4,0.630214,500,3,0.5
5,0.640788,500,3,0.9
7,0.712045,500,5,0.5
8,0.719664,500,5,0.9


In [None]:
rf_reg_1 = RandomForestRegressor(n_jobs=-1, n_estimators=10, criterion='absolute_error',
                                max_depth=1, max_features=0.1)
rf_reg_2 = RandomForestRegressor(n_jobs=-1, n_estimators=10, criterion='absolute_error',
                                max_depth=1, max_features=0.1)

In [None]:
rf_reg_1.fit(X_train_1, y_train_1)

ftr_importance = pd.Series(rf_reg_1.feature_importances_, 
                           index = X_train_1.columns).sort_values(ascending=False)
sns.barplot(x=ftr_importance, y=ftr_importance.index)

In [None]:
rf_reg_2.fit(X_train_2, y_train_2)

pred_1 = rf_reg_1.predict(X_test_1)
pred_1

In [None]:
pred_2 = rf_reg_2.predict(X_test_2)
pred_2

In [None]:
submission['중식계'] = pred_1
submission['석식계'] = pred_2

In [None]:
submission.to_csv("data_dacon/dacon_submission-my1.csv", index=False)

In [None]:
n_estimators = 300
num_epoch = 10
hyper_list_1 = []

for epoch in range(num_epoch):
    max_depth = np.random.randint(low=2, high=100)
    max_features = np.random.uniform(low=0.1, high=1.0)
    
    rf = RandomForestRegressor(n_jobs=-1, criterion='squared_error', 
                               n_estimators = n_estimators, 
                               max_depth = max_depth, 
                               max_features = max_features)
    
    score = cross_val_score(rf, X_train_1, y_train_1, cv=5).mean()
    
    h_params = {
        'epoch' : epoch,
        'score' : score,
        'n_estimators' : n_estimators,
        'max_depth' : max_depth,
        'max_features' : max_features
    }  
    
    hyper_list_1.append(h_params)

In [None]:
pd.DataFrame.from_dict(hyper_list_1).sort_values(by='score')

In [None]:
n_estimators = 300
num_epoch = 10
hyper_list_2 = []

for epoch in range(num_epoch):
    max_depth = np.random.randint(low=2, high=100)
    max_features = np.random.uniform(low=0.1, high=1.0)
    
    rf = RandomForestRegressor(n_jobs=-1, criterion='absolute_error', 
                               n_estimators = n_estimators, 
                               max_depth = max_depth, 
                               max_features = max_features)
    
    score = cross_val_score(rf, X_train_2, y_train_2, cv=5).mean()
    
    h_params = {
        'epoch' : epoch,
        'score' : score,
        'n_estimators' : n_estimators,
        'max_depth' : max_depth,
        'max_features' : max_features
    }  
    
    hyper_list_2.append(h_params)

In [None]:
pd.DataFrame.from_dict(hyper_list_2).sort_values(by='score').head(10)

In [None]:
rf_reg_1 = RandomForestRegressor(n_jobs=-1, n_estimators=300, criterion='absolute_error',
                                max_depth=3, max_features=0.407424)
rf_reg_2 = RandomForestRegressor(n_jobs=-1, n_estimators=300, criterion='absolute_error',
                                max_depth=3, max_features=0.407424)

In [None]:
rf_reg_1.fit(X_train_1, y_train_1)

ftr_importance = pd.Series(rf_reg_1.feature_importances_, 
                           index = X_train_1.columns).sort_values(ascending=False)
sns.barplot(x=ftr_importance, y=ftr_importance.index)

In [None]:
rf_reg_2.fit(X_train_2, y_train_2)

pred_1 = rf_reg_1.predict(X_test_1)
pred_1

In [None]:
pred_2 = rf_reg_2.predict(X_test_2)
pred_2

In [None]:
submission['중식계'] = pred_1
submission.head(5)

In [None]:
submission['석식계'] = pred_2
submission.head(5)

In [None]:
submission.to_csv("data_dacon/dacon_submission-my2.csv", index=False)

In [67]:
import lightgbm as lgbm

lgbm_1 = lgbm.LGBMRegressor(learning_rate=0.1, n_estimators=500)
lgbm_2 = lgbm.LGBMRegressor(learning_rate=0.1, n_estimators=500)

from sklearn.model_selection import KFold
kfold = KFold(n_splits=5, shuffle=True, random_state=777)



In [68]:
lgbms_1 = []

for train_idx, val_idx in kfold.split(X_train_1):
    X_1 = X_train_1.iloc[train_idx]
    y_1 = y_train_1.iloc[train_idx]
    X_1_val = X_train_1.iloc[val_idx]
    y_1_val = y_train_1.iloc[val_idx]
    
    lgbms_1.append(lgbm_1.fit(X_1, y_1, eval_set=(X_1_val, y_1_val) )) #, early_stopping_rounds=100, verbose=100))

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000395 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1476
[LightGBM] [Info] Number of data points in the train set: 964, number of used features: 13
[LightGBM] [Info] Start training from score 886.710581
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000366 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1477
[LightGBM] [Info] Number of data points in the train set: 964, number of used features: 13
[LightGBM] [Info] Start training from score 886.352697
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000120 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bi

In [69]:
lgbm_1.fit(X_train_1, y_train_1)
pred_1 = lgbm_1.predict(X_test_1)

lgbm_2.fit(X_train_2, y_train_2)
pred_2 = lgbm_2.predict(X_test_2)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001160 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1601
[LightGBM] [Info] Number of data points in the train set: 1205, number of used features: 13
[LightGBM] [Info] Start training from score 890.334440
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000134 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1641
[LightGBM] [Info] Number of data points in the train set: 1205, number of used features: 13
[LightGBM] [Info] Start training from score 461.772614


In [70]:
submission['중식계'] = pred_1
submission['석식계'] = pred_2
submission.head(5)

Unnamed: 0,일자,중식계,석식계
0,2021-01-27,995.0682,353.104487
1,2021-01-28,893.186239,420.447777
2,2021-01-29,528.724547,231.864798
3,2021-02-01,1216.637279,568.688154
4,2021-02-02,950.501678,412.618028


In [71]:
submission.to_csv("data_dacon/dacon_submission-my3.csv", index=False)

In [72]:
lgbms_2 = []

for train_idx, val_idx in kfold.split(X_train_3):
    X_3 = X_train_3.iloc[train_idx]
    y_3 = y_train_3.iloc[train_idx]
    X_3_val = X_train_3.iloc[val_idx]
    y_3_val = y_train_3.iloc[val_idx]
    
    lgbms_2.append(lgbm_2.fit(X_3, y_3, eval_set=(X_3_val, y_3_val) )) #, early_stopping_rounds=100, verbose=100))

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000207 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1476
[LightGBM] [Info] Number of data points in the train set: 964, number of used features: 13
[LightGBM] [Info] Start training from score 886.710581
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000622 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1477
[LightGBM] [Info] Number of data points in the train set: 964, number of used features: 13
[LightGBM] [Info] Start training from score 886.352697
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000119 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1482
[LightGBM] [Info] Number of data points in the train set

In [73]:
lgbm_2.fit(X_train_3, y_train_3)
pred_3 = lgbm_2.predict(X_test_3)

lgbm_2.fit(X_train_4, y_train_4)
pred_4 = lgbm_2.predict(X_test_4)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000765 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1601
[LightGBM] [Info] Number of data points in the train set: 1205, number of used features: 13
[LightGBM] [Info] Start training from score 890.334440
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000229 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1641
[LightGBM] [Info] Number of data points in the train set: 1205, number of used features: 13
[LightGBM] [Info] Start training from score 461.772614


In [74]:
submission['중식계'] = pred_3
submission['석식계'] = pred_4
submission.head(5)

Unnamed: 0,일자,중식계,석식계
0,2021-01-27,995.54269,360.071418
1,2021-01-28,952.453163,409.487559
2,2021-01-29,489.906745,205.867632
3,2021-02-01,1225.213404,558.510364
4,2021-02-02,962.503175,432.32196


In [75]:
submission.to_csv("data_dacon/dacon_submission-my4.csv", index=False)

In [77]:
!pip install xgboost

Collecting xgboost
  Downloading xgboost-2.1.0-py3-none-macosx_12_0_arm64.whl.metadata (2.1 kB)
Downloading xgboost-2.1.0-py3-none-macosx_12_0_arm64.whl (1.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: xgboost
Successfully installed xgboost-2.1.0


In [78]:
import numpy as np
from sklearn.model_selection import KFold
import warnings
warnings.filterwarnings('ignore')

import xgboost as xgb

xgb_clf = xgb.XGBClassifier()

In [80]:
from sklearn.model_selection import GridSearchCV
import xgboost as xgb
from xgboost import XGBRegressor

param = {
    'max_depth':[2,3,4],
    'n_estimators':range(300,600,100), #  'n_estimators':range(600,700,50) 여기에 cv 10 (이거와 별반차이가 없다.)
    'colsample_bytree':[0.5,0.7,1],
    'colsample_bylevel':[0.5,0.7,1],
}
model = xgb.XGBRegressor()
grid_search = GridSearchCV(estimator=model, param_grid=param, cv=10, 
                           scoring='neg_mean_squared_error',
                           n_jobs=-1)

grid_search.fit(X_train_1, y_train_1)
print(grid_search.best_params_)
{'colsample_bylevel': 0.5, 'colsample_bytree': 0.5, 'max_depth': 3, 'n_estimators': 600}
XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=0.5,
             colsample_bytree=0.7, gamma=0, learning_rate=0.1, max_delta_step=0,
             max_depth=3, min_child_weight=1, missing=None, n_estimators=600,
             n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
             silent=True, subsample=1)

{'colsample_bylevel': 1, 'colsample_bytree': 0.5, 'max_depth': 2, 'n_estimators': 300}


XGBRegressor(base_score=0.5, booster='gbtree', callbacks=None,
             colsample_bylevel=0.5, colsample_bynode=None, colsample_bytree=0.7,
             device=None, early_stopping_rounds=None, enable_categorical=False,
             eval_metric=None, feature_types=None, gamma=0, grow_policy=None,
             importance_type=None, interaction_constraints=None,
             learning_rate=0.1, max_bin=None, max_cat_threshold=None,
             max_cat_to_onehot=None, max_delta_step=0, max_depth=3,
             max_leaves=None, min_child_weight=1, missing=None,
             monotone_constraints=None, multi_strategy=None, n_estimators=600,
             n_jobs=1, nthread=None, num_parallel_tree=None, ...)

In [81]:
y_pred_1 = grid_search.predict(X_test_1)

In [82]:
from sklearn.model_selection import GridSearchCV
import xgboost as xgb
from xgboost import XGBRegressor

param = {
    'max_depth':[2,3,4],
    'n_estimators':range(300,600,100), #  'n_estimators':range(600,700,50) 여기에 cv 10 (이거와 별반차이가 없다.)
    'colsample_bytree':[0.5,0.7,1],
    'colsample_bylevel':[0.5,0.7,1],
}
model = xgb.XGBRegressor()
grid_search = GridSearchCV(estimator=model, param_grid=param, cv=10, 
                           scoring='neg_mean_squared_error',
                           n_jobs=-1)

grid_search.fit(X_train_2, y_train_2)
print(grid_search.best_params_)
{'colsample_bylevel': 0.5, 'colsample_bytree': 0.5, 'max_depth': 3, 'n_estimators': 600}
XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=0.5,
             colsample_bytree=0.7, gamma=0, learning_rate=0.1, max_delta_step=0,
             max_depth=3, min_child_weight=1, missing=None, n_estimators=600,
             n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
             silent=True, subsample=1)

{'colsample_bylevel': 0.7, 'colsample_bytree': 1, 'max_depth': 2, 'n_estimators': 300}


XGBRegressor(base_score=0.5, booster='gbtree', callbacks=None,
             colsample_bylevel=0.5, colsample_bynode=None, colsample_bytree=0.7,
             device=None, early_stopping_rounds=None, enable_categorical=False,
             eval_metric=None, feature_types=None, gamma=0, grow_policy=None,
             importance_type=None, interaction_constraints=None,
             learning_rate=0.1, max_bin=None, max_cat_threshold=None,
             max_cat_to_onehot=None, max_delta_step=0, max_depth=3,
             max_leaves=None, min_child_weight=1, missing=None,
             monotone_constraints=None, multi_strategy=None, n_estimators=600,
             n_jobs=1, nthread=None, num_parallel_tree=None, ...)

In [83]:
y_pred_2 = grid_search.predict(X_test_2)

In [94]:
submission['중식계'] = y_pred_1
submission['석식계'] = y_pred_2
submission.head(5)

Unnamed: 0,일자,중식계,석식계
0,2021-01-27,1090.09729,415.644623
1,2021-01-28,1025.37561,475.955597
2,2021-01-29,744.5849,276.476196
3,2021-02-01,1370.823608,591.086914
4,2021-02-02,1007.453674,508.029846


In [95]:
submission.to_csv("data_dacon/dacon_submission-gpu1.csv", index=False)

In [86]:
from sklearn.model_selection import GridSearchCV
import xgboost as xgb
from xgboost import XGBRegressor

param = {
    'max_depth':[2,3,4],
    'n_estimators':range(300,600,100), #  'n_estimators':range(600,700,50) 여기에 cv 10 (이거와 별반차이가 없다.)
    'colsample_bytree':[0.5,0.7,1],
    'colsample_bylevel':[0.5,0.7,1],
}
model = xgb.XGBRegressor()
grid_search = GridSearchCV(estimator=model, param_grid=param, cv=10, 
                           scoring='neg_mean_squared_error',
                           n_jobs=-1)

grid_search.fit(X_train_3, y_train_3)
print(grid_search.best_params_)
{'colsample_bylevel': 0.5, 'colsample_bytree': 0.5, 'max_depth': 3, 'n_estimators': 600}
XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=0.5,
             colsample_bytree=0.7, gamma=0, learning_rate=0.1, max_delta_step=0,
             max_depth=3, min_child_weight=1, missing=None, n_estimators=600,
             n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
             silent=True, subsample=1)

{'colsample_bylevel': 0.7, 'colsample_bytree': 0.5, 'max_depth': 2, 'n_estimators': 300}


XGBRegressor(base_score=0.5, booster='gbtree', callbacks=None,
             colsample_bylevel=0.5, colsample_bynode=None, colsample_bytree=0.7,
             device=None, early_stopping_rounds=None, enable_categorical=False,
             eval_metric=None, feature_types=None, gamma=0, grow_policy=None,
             importance_type=None, interaction_constraints=None,
             learning_rate=0.1, max_bin=None, max_cat_threshold=None,
             max_cat_to_onehot=None, max_delta_step=0, max_depth=3,
             max_leaves=None, min_child_weight=1, missing=None,
             monotone_constraints=None, multi_strategy=None, n_estimators=600,
             n_jobs=1, nthread=None, num_parallel_tree=None, ...)

In [87]:
y_pred_3 = grid_search.predict(X_test_3)

In [88]:
from sklearn.model_selection import GridSearchCV
import xgboost as xgb
from xgboost import XGBRegressor

param = {
    'max_depth':[2,3,4],
    'n_estimators':range(300,600,100), #  'n_estimators':range(600,700,50) 여기에 cv 10 (이거와 별반차이가 없다.)
    'colsample_bytree':[0.5,0.7,1],
    'colsample_bylevel':[0.5,0.7,1],
}
model = xgb.XGBRegressor()
grid_search = GridSearchCV(estimator=model, param_grid=param, cv=10, 
                           scoring='neg_mean_squared_error',
                           n_jobs=-1)

grid_search.fit(X_train_4, y_train_4)
print(grid_search.best_params_)
{'colsample_bylevel': 0.5, 'colsample_bytree': 0.5, 'max_depth': 3, 'n_estimators': 600}
XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=0.5,
             colsample_bytree=0.7, gamma=0, learning_rate=0.1, max_delta_step=0,
             max_depth=3, min_child_weight=1, missing=None, n_estimators=600,
             n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
             silent=True, subsample=1)

{'colsample_bylevel': 0.5, 'colsample_bytree': 0.7, 'max_depth': 2, 'n_estimators': 300}


XGBRegressor(base_score=0.5, booster='gbtree', callbacks=None,
             colsample_bylevel=0.5, colsample_bynode=None, colsample_bytree=0.7,
             device=None, early_stopping_rounds=None, enable_categorical=False,
             eval_metric=None, feature_types=None, gamma=0, grow_policy=None,
             importance_type=None, interaction_constraints=None,
             learning_rate=0.1, max_bin=None, max_cat_threshold=None,
             max_cat_to_onehot=None, max_delta_step=0, max_depth=3,
             max_leaves=None, min_child_weight=1, missing=None,
             monotone_constraints=None, multi_strategy=None, n_estimators=600,
             n_jobs=1, nthread=None, num_parallel_tree=None, ...)

In [89]:
y_pred_4 = grid_search.predict(X_test_4)

In [93]:
submission['중식계'] = y_pred_3
submission['석식계'] = y_pred_4
submission.to_csv("data_dacon/dacon_submission-gpu2.csv", index=False)

In [92]:
submission.head()

Unnamed: 0,일자,중식계,석식계
0,2021-01-27,1111.65332,382.472321
1,2021-01-28,1013.794434,408.206757
2,2021-01-29,683.064575,257.598511
3,2021-02-01,1333.469727,584.386475
4,2021-02-02,982.564148,539.794006


In [None]:
x_train = train[['day', 'numbers', 'dayoff', 'work', 'outsidework', 'workfhome','Month','Date','bob','soup','main']]
y_train = train['lunch_t'] 
x_test = test[['day', 'numbers', 'dayoff', 'work', 'outsidework', 'workfhome','Month','Date','bob','soup','main']]

In [106]:
feature_cols_test_1 = ['요일', '본사정원수', '본사휴가자수', '본사출장자수', '본사시간외근무명령서승인건수', '현본사소속재택근무자수', 'year', 'month', 'day', 'lunch_bob', 'lunch_soup', 'lunch_main']
feature_cols_test_2 = ['요일', '본사정원수', '본사휴가자수', '본사출장자수', '본사시간외근무명령서승인건수', '현본사소속재택근무자수', 'year', 'month', 'day', 'dinner_bob', 'dinner_soup', 'dinner_main']

X_train_test_1 = train[feature_cols_test_1]
y_train_test_1 = train['중식계']

X_train_test_2 = train[feature_cols_test_2]
y_train_test_2 = train['석식계']

X_test_test_1 = test[feature_cols_test_1]
X_test_test_2 = test[feature_cols_test_2]

In [107]:
from sklearn.model_selection import GridSearchCV
import xgboost as xgb
from xgboost import XGBRegressor

param = {
    'max_depth':[2,3,4],
    'n_estimators':range(300,600,100), #  'n_estimators':range(600,700,50) 여기에 cv 10 (이거와 별반차이가 없다.)
    'colsample_bytree':[0.5,0.7,1],
    'colsample_bylevel':[0.5,0.7,1],
}
model = xgb.XGBRegressor()
grid_search = GridSearchCV(estimator=model, param_grid=param, cv=10, 
                           scoring='neg_mean_squared_error',
                           n_jobs=-1)

grid_search.fit(X_train_test_1, y_train_test_1)
print(grid_search.best_params_)

{'colsample_bylevel': 1, 'colsample_bytree': 1, 'max_depth': 2, 'n_estimators': 300}


XGBRegressor(base_score=0.5, booster='gbtree', callbacks=None,
             colsample_bylevel=0.5, colsample_bynode=None, colsample_bytree=0.7,
             device=None, early_stopping_rounds=None, enable_categorical=False,
             eval_metric=None, feature_types=None, gamma=0, grow_policy=None,
             importance_type=None, interaction_constraints=None,
             learning_rate=0.1, max_bin=None, max_cat_threshold=None,
             max_cat_to_onehot=None, max_delta_step=0, max_depth=3,
             max_leaves=None, min_child_weight=1, missing=None,
             monotone_constraints=None, multi_strategy=None, n_estimators=600,
             n_jobs=1, nthread=None, num_parallel_tree=None, ...)

In [None]:
{'colsample_bylevel': 0.5, 'colsample_bytree': 0.5, 'max_depth': 3, 'n_estimators': 600}
XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=0.5,
             colsample_bytree=0.7, gamma=0, learning_rate=0.1, max_delta_step=0,
             max_depth=3, min_child_weight=1, missing=None, n_estimators=600,
             n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
             silent=True, subsample=1)

In [108]:
y_test_pred1 = grid_search.predict(X_test_test_1)

In [109]:
from sklearn.model_selection import GridSearchCV
import xgboost as xgb
from xgboost import XGBRegressor

param = {
    'max_depth':[2,3,4],
    'n_estimators':range(300,600,100), #  'n_estimators':range(600,700,50) 여기에 cv 10 (이거와 별반차이가 없다.)
    'colsample_bytree':[0.5,0.7,1],
    'colsample_bylevel':[0.5,0.7,1],
}
model = xgb.XGBRegressor()
grid_search = GridSearchCV(estimator=model, param_grid=param, cv=10, 
                           scoring='neg_mean_squared_error',
                           n_jobs=-1)

grid_search.fit(X_train_test_2, y_train_test_2)
print(grid_search.best_params_)
{'colsample_bylevel': 0.5, 'colsample_bytree': 0.5, 'max_depth': 3, 'n_estimators': 600}
XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=0.5,
             colsample_bytree=0.7, gamma=0, learning_rate=0.1, max_delta_step=0,
             max_depth=3, min_child_weight=1, missing=None, n_estimators=600,
             n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
             silent=True, subsample=1)

{'colsample_bylevel': 0.5, 'colsample_bytree': 0.7, 'max_depth': 2, 'n_estimators': 300}


XGBRegressor(base_score=0.5, booster='gbtree', callbacks=None,
             colsample_bylevel=0.5, colsample_bynode=None, colsample_bytree=0.7,
             device=None, early_stopping_rounds=None, enable_categorical=False,
             eval_metric=None, feature_types=None, gamma=0, grow_policy=None,
             importance_type=None, interaction_constraints=None,
             learning_rate=0.1, max_bin=None, max_cat_threshold=None,
             max_cat_to_onehot=None, max_delta_step=0, max_depth=3,
             max_leaves=None, min_child_weight=1, missing=None,
             monotone_constraints=None, multi_strategy=None, n_estimators=600,
             n_jobs=1, nthread=None, num_parallel_tree=None, ...)

In [110]:
y_test_pred2 = grid_search.predict(X_test_test_2)

In [111]:
submission['중식계'] = y_test_pred1
submission['석식계'] = y_test_pred2
submission.to_csv("data_dacon/dacon_submission-gpu-menu.csv", index=False)

In [113]:
from sklearn.model_selection import GridSearchCV
import xgboost as xgb
from xgboost import XGBRegressor

param = {
    'max_depth':[2,3,4],
    'n_estimators':range(300,600,100), #  'n_estimators':range(600,700,50) 여기에 cv 10 (이거와 별반차이가 없다.)
    'colsample_bytree':[0.5,0.7,1],
    'colsample_bylevel':[0.5,0.7,1],
}
model = xgb.XGBRegressor()
grid_search = GridSearchCV(estimator=model, param_grid=param, cv=10, 
                           scoring='neg_mean_absolute_error',
                           n_jobs=-1)

grid_search.fit(X_train_1, y_train_1)
print(grid_search.best_params_)
{'colsample_bylevel': 0.5, 'colsample_bytree': 0.5, 'max_depth': 3, 'n_estimators': 600}
XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=0.5,
             colsample_bytree=0.7, gamma=0, learning_rate=0.1, max_delta_step=0,
             max_depth=3, min_child_weight=1, missing=None, n_estimators=600,
             n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
             silent=True, subsample=1)

{'colsample_bylevel': 1, 'colsample_bytree': 1, 'max_depth': 2, 'n_estimators': 300}


XGBRegressor(base_score=0.5, booster='gbtree', callbacks=None,
             colsample_bylevel=0.5, colsample_bynode=None, colsample_bytree=0.7,
             device=None, early_stopping_rounds=None, enable_categorical=False,
             eval_metric=None, feature_types=None, gamma=0, grow_policy=None,
             importance_type=None, interaction_constraints=None,
             learning_rate=0.1, max_bin=None, max_cat_threshold=None,
             max_cat_to_onehot=None, max_delta_step=0, max_depth=3,
             max_leaves=None, min_child_weight=1, missing=None,
             monotone_constraints=None, multi_strategy=None, n_estimators=600,
             n_jobs=1, nthread=None, num_parallel_tree=None, ...)

In [115]:
y_pred1 = grid_search.predict(X_test_1)

In [116]:
from sklearn.model_selection import GridSearchCV
import xgboost as xgb
from xgboost import XGBRegressor

param = {
    'max_depth':[2,3,4],
    'n_estimators':range(300,600,100), #  'n_estimators':range(600,700,50) 여기에 cv 10 (이거와 별반차이가 없다.)
    'colsample_bytree':[0.5,0.7,1],
    'colsample_bylevel':[0.5,0.7,1],
}
model = xgb.XGBRegressor()
grid_search = GridSearchCV(estimator=model, param_grid=param, cv=10, 
                           scoring='neg_mean_absolute_error',
                           n_jobs=-1)

grid_search.fit(X_train_2, y_train_2)
print(grid_search.best_params_)
{'colsample_bylevel': 0.5, 'colsample_bytree': 0.5, 'max_depth': 3, 'n_estimators': 600}
XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=0.5,
             colsample_bytree=0.7, gamma=0, learning_rate=0.1, max_delta_step=0,
             max_depth=3, min_child_weight=1, missing=None, n_estimators=600,
             n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
             silent=True, subsample=1)

{'colsample_bylevel': 0.5, 'colsample_bytree': 0.7, 'max_depth': 2, 'n_estimators': 300}


XGBRegressor(base_score=0.5, booster='gbtree', callbacks=None,
             colsample_bylevel=0.5, colsample_bynode=None, colsample_bytree=0.7,
             device=None, early_stopping_rounds=None, enable_categorical=False,
             eval_metric=None, feature_types=None, gamma=0, grow_policy=None,
             importance_type=None, interaction_constraints=None,
             learning_rate=0.1, max_bin=None, max_cat_threshold=None,
             max_cat_to_onehot=None, max_delta_step=0, max_depth=3,
             max_leaves=None, min_child_weight=1, missing=None,
             monotone_constraints=None, multi_strategy=None, n_estimators=600,
             n_jobs=1, nthread=None, num_parallel_tree=None, ...)

In [117]:
y_pred2 = grid_search.predict(X_test_2)

In [118]:
submission['중식계'] = y_pred1
submission['석식계'] = y_pred2
submission.to_csv("data_dacon/dacon_submission-gpu-test.csv", index=False)