In [11]:
import pandas as pd
import numpy as np
import datetime as dt
from tqdm.notebook import tqdm

In [2]:
# 데이터 로드

train = pd.read_csv('데이콘/train.csv')
test = pd.read_csv('데이콘/test.csv')

train[['현본사소속재택근무자수', '중식계', '석식계']] = train[['현본사소속재택근무자수', '중식계', '석식계']].astype('int')
test['현본사소속재택근무자수'] = test['현본사소속재택근무자수'].astype('int')

train['일자'] = pd.to_datetime(train['일자'])
test['일자'] = pd.to_datetime(test['일자'])

# 추가 변수 생성

train['년'] = train['일자'].dt.year
train['월'] = train['일자'].dt.month
train['일'] = train['일자'].dt.day
train['주'] = train['일자'].dt.week
train['요일'] = train['일자'].dt.weekday
train['출근'] = train['본사정원수']-(train['본사휴가자수']+train['본사출장자수']+train['현본사소속재택근무자수'])
train['휴가비율'] = train['본사휴가자수']/train['본사정원수']
train['출장비율'] = train['본사출장자수']/train['본사정원수']
train['야근비율'] = train['본사시간외근무명령서승인건수']/train['출근']
train['재택비율'] = train['현본사소속재택근무자수']/train['본사정원수']

train['식사가능자수'] = train['본사정원수'] - train['본사휴가자수'] - train['현본사소속재택근무자수']
train['식사가능자수'] = train['식사가능자수'].astype('int')

test['년'] = test['일자'].dt.year
test['월'] = test['일자'].dt.month
test['일'] = test['일자'].dt.day
test['주'] = test['일자'].dt.week
test['요일'] = test['일자'].dt.weekday
test['출근'] = test['본사정원수']-(test['본사휴가자수']+test['본사출장자수']+test['현본사소속재택근무자수'])
test['휴가비율'] = test['본사휴가자수']/test['본사정원수']
test['출장비율'] = test['본사출장자수']/test['본사정원수']
test['야근비율'] = test['본사시간외근무명령서승인건수']/test['출근']
test['재택비율'] = test['현본사소속재택근무자수']/test['본사정원수']

test['식사가능자수'] = test['본사정원수'] - test['본사휴가자수'] - test['현본사소속재택근무자수']
test['식사가능자수'] = test['식사가능자수'].astype('int')

train['지표'] = train['석식메뉴'].apply(lambda x : len(x.split()))
train['메뉴x'] = 0
test['메뉴x'] = 0

train.loc[(train['지표'] < 5),'메뉴x']= 1
train = train.drop(['지표'], axis = 1)

lunch_day_mean = train.loc[train['메뉴x'] == 0].groupby(['요일'])[['중식계']].mean()
dinner_day_mean = train.loc[train['메뉴x'] == 0].groupby(['요일'])[['석식계']].mean()
lunch_mon_mean = train.loc[train['메뉴x'] == 0].groupby(['월'])[['중식계']].mean()
dinner_mon_mean = train.loc[train['메뉴x'] == 0].groupby(['월'])[['석식계']].mean()

train['요일중식평균'] = train['요일'].apply(lambda x : lunch_day_mean.loc[x, '중식계'])
train['요일석식평균'] = train['요일'].apply(lambda x : dinner_day_mean.loc[x, '석식계'])
train['월중식평균'] = train['월'].apply(lambda x : lunch_mon_mean.loc[x, '중식계'])
train['월석식평균'] = train['월'].apply(lambda x : dinner_mon_mean.loc[x, '석식계'])
test['요일중식평균'] = test['요일'].apply(lambda x : lunch_day_mean.loc[x, '중식계'])
test['요일석식평균'] = test['요일'].apply(lambda x : dinner_day_mean.loc[x, '석식계'])
test['월중식평균'] = test['월'].apply(lambda x : lunch_mon_mean.loc[x, '중식계'])
test['월석식평균'] = test['월'].apply(lambda x : dinner_mon_mean.loc[x, '석식계'])

train['연말'] = 0
train.loc[(train['월'] == 12) & (train['일']>=21), '연말'] = 1

print('done')

done




In [3]:
# 변수들의 시간적 효과 고려

df = pd.concat([train,test],axis = 0).reset_index(drop = True)
vacation_yesterday = []
vacation_tomarrow = []
vacation_ratio_yesterday = []
vacation_ratio_tomarrow = []
night_yesterday = []
night_tomarrow = []
night_ratio_yesterday = []
night_ratio_tomarrow = []
home_yesterday = []
home_tomarrow = []
home_ratio_yesterday = []
home_ratio_tomarrow = []
for i in range(len(df)):
  if df['요일'][i] == 0:
    vacation_yesterday.append(0)
    vacation_ratio_yesterday.append(0)
    night_yesterday.append(0)
    night_ratio_yesterday.append(0)
    home_yesterday.append(0)
    home_ratio_yesterday.append(0)  
    vacation_tomarrow.append(df['본사휴가자수'][i + 1])
    vacation_ratio_tomarrow.append(df['휴가비율'][i + 1])
    night_tomarrow.append(df['본사시간외근무명령서승인건수'][i + 1])
    night_ratio_tomarrow.append(df['야근비율'][i + 1])
    home_tomarrow.append(df['현본사소속재택근무자수'][i + 1])
    home_ratio_tomarrow.append(df['재택비율'][i + 1])
  elif df['요일'][i] == 4:
    vacation_tomarrow.append(0)
    vacation_ratio_tomarrow.append(0)
    night_tomarrow.append(0)
    night_ratio_tomarrow.append(0)
    home_tomarrow.append(0)
    home_ratio_tomarrow.append(0)  
    vacation_yesterday.append(df['본사휴가자수'][i - 1])
    vacation_ratio_yesterday.append(df['휴가비율'][i - 1])
    night_yesterday.append(df['본사시간외근무명령서승인건수'][i - 1])
    night_ratio_yesterday.append(df['야근비율'][i - 1])
    home_yesterday.append(df['현본사소속재택근무자수'][i - 1])
    home_ratio_yesterday.append(df['재택비율'][i - 1])
  else:
    vacation_tomarrow.append(df['본사휴가자수'][i + 1])
    vacation_ratio_tomarrow.append(df['휴가비율'][i + 1])
    night_tomarrow.append(df['본사시간외근무명령서승인건수'][i + 1])
    night_ratio_tomarrow.append(df['야근비율'][i + 1])
    home_tomarrow.append(df['현본사소속재택근무자수'][i + 1])
    home_ratio_tomarrow.append(df['재택비율'][i + 1])
    vacation_yesterday.append(df['본사휴가자수'][i - 1])
    vacation_ratio_yesterday.append(df['휴가비율'][i - 1])
    night_yesterday.append(df['본사시간외근무명령서승인건수'][i - 1])
    night_ratio_yesterday.append(df['야근비율'][i - 1])
    home_yesterday.append(df['현본사소속재택근무자수'][i - 1])
    home_ratio_yesterday.append(df['재택비율'][i - 1])
df['본사휴가자수-1'] = vacation_yesterday
df['본사휴가자수+1'] = vacation_tomarrow
df['휴가비율-1'] = vacation_ratio_yesterday
df['휴가비율+1'] = vacation_ratio_tomarrow
df['본사시간외근무명령서승인건수-1'] = night_yesterday
df['본사시간외근무명령서승인건수+1'] = night_tomarrow
df['야근비율-1'] = night_ratio_yesterday
df['야근비율+1'] = night_ratio_tomarrow
df['현본사소속재택근무자수-1'] = home_yesterday
df['현본사소속재택근무자수+1'] = home_tomarrow
df['재택비율-1'] = home_ratio_yesterday
df['재택비율+1'] = home_ratio_tomarrow
train = df.iloc[:1205]
test = df.iloc[1205:].reset_index(drop= True).drop(['중식계','석식계'],axis =1)

In [5]:
# 계절 추가
def season_define(x):
  if x in [3,4,5]:
    return 0
  elif x in [6,7,8]:
    return 1
  elif x in [9,10,11]:
    return 2
  else:
    return 3
train['계절'] = train['월'].apply(season_define)
test['계절'] = test['월'].apply(season_define)
seasondf = pd.get_dummies(pd.concat([train[['계절']], test[['계절']]], axis = 0),columns = ['계절']).iloc[:1205,:]
seasondftest = pd.get_dummies(pd.concat([train[['계절']], test[['계절']]], axis = 0),columns = ['계절']).iloc[1205:,:].reset_index().drop(['index'],axis = 1)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()


In [6]:
# 메뉴
!pip install xlrd
menu = pd.read_excel('데이콘/주메뉴.xlsx', engine = 'openpyxl')
menu = menu.drop(['Unnamed: 0'],axis = 1).dropna()
menu['메뉴'] = menu['메뉴'].apply(lambda x : x.split('*')[0] if '*' in x else (x.split('&')[0] if '&' in x else (x.split('/')[0] if '/' in x else x)))
train.replace({'석식메뉴': '카레라이스 (쌀:국내산,돈육:국내 맑은국  생선까스*탈탈소스  멕시칸샐러드 (스모크햄:외국산) 오복지무침  포기김치 (김치:국내산) '}, 
              {'석식메뉴': '카레라이스 (쌀:국내산,돈육:국내) 맑은국  생선까스*탈탈소스  멕시칸샐러드 (스모크햄:외국산) 오복지무침  포기김치 (김치:국내산) '}, inplace = True)
train.replace({'석식메뉴': '카레라이스 (쌀:국내산,돈육:국내 미소시루  감자만두*양념  애기새송이버섯볶음  골뱅이야채무침  포기김치 (김치:국내산) '}, 
              {'석식메뉴': '카레라이스 (쌀:국내산,돈육:국내) 미소시루  감자만두*양념  애기새송이버섯볶음  골뱅이야채무침  포기김치 (김치:국내산) '}, inplace = True)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)


In [7]:
import re
train2 = train.copy()
train1 = train.copy()
test2 = test.copy()
test1 = test.copy()
r = '\([^)]*\)'
train2['중식메뉴_split'] = train['중식메뉴'].apply(lambda x : (re.sub(r, ' ', x).replace('/',' ').replace('*', ' ').replace('&', ' ').replace('(new)', '').split()))
train2['석식메뉴_split'] = train['석식메뉴'].apply(lambda x : (re.sub(r, ' ', x).replace('/',' ').replace('*', ' ').replace('&', ' ').replace('(new)', '').split()))
test2['중식메뉴_split'] = test['중식메뉴'].apply(lambda x : (re.sub(r, ' ', x).replace('/',' ').replace('*', ' ').replace('&', ' ').replace('(new)', '').split()))
test2['석식메뉴_split'] = test['석식메뉴'].apply(lambda x : (re.sub(r, ' ', x).replace('/',' ').replace('*', ' ').replace('&', ' ').replace('(new)', '').split()))

def get_food_embedding(x):
    x_ = []
    x = x.split(' ')
    for i in x:
        if '(' in i and ':' in i and ')' in i:
            continue
        if '/' in i:
            x_.extend(i.split('/'))
        else:
            x_.append(i)
    x_ = list(set(x_))
    x_.remove('')
    return x_
train1['중식메뉴_split'] = train['중식메뉴'].apply(lambda x: get_food_embedding(x))
train1['석식메뉴_split'] = train['석식메뉴'].apply(lambda x: get_food_embedding(x))
test1['중식메뉴_split'] = test['중식메뉴'].apply(lambda x: get_food_embedding(x))
test1['석식메뉴_split'] = test['석식메뉴'].apply(lambda x: get_food_embedding(x))

In [12]:
#train1 메뉴 처리 

lunch_lst = []
lunch_count = []
pref = 0
for i in tqdm(range(1205)):
    try:
        if train1['중식계'][i] >= 880:
              pref = 1
        else:
              pref = 0

        lunch_lst.append(train1['중식메뉴_split'][i])
        lunch_count.append(pref)
    except:
        pass
# for i in range(len(lunch_lst)):
  # lunch_lst[i] = lunch_lst[i].replace(',', ' ')
print('done')

dinner_lst = []
dinner_count = []
pref_d = 0
for i in tqdm(range(1205)):
    try:
        if train1['석식계'][i] > 476:
            pref_d = 1
        else:
            pref_d = 0

        dinner_lst.append(train1['석식메뉴_split'][i])
        dinner_count.append(pref_d)
    except:
        pass
# for i in range(len(dinner_lst)):
#   dinner_lst[i] = dinner_lst[i].replace(',', ' ')
print('done')

lunch_lst_test = []
lunch_count_test = []
for i in tqdm(range(50)):
    try:
        lunch_lst_test.append(test1['중식메뉴_split'][i])
        lunch_count_test.append(0)
    except:
        pass
# for i in range(len(lunch_lst_test)):
#   lunch_lst_test[i] = lunch_lst_test[i].replace(',', ' ')
print('done')

dinner_lst_test = []
dinner_count_test = []
for i in tqdm(range(50)):
    try:
        dinner_lst_test.append(test1['석식메뉴_split'][i])
        dinner_count_test.append(0)
    except:
        pass
# for i in range(len(dinner_lst_test)):
#   dinner_lst_test[i] = dinner_lst_test[i].replace(',', ' ')
print('done')

lunch_df = pd.DataFrame({'중식메뉴':lunch_lst})
dinner_df = pd.DataFrame({'석식메뉴':dinner_lst})
lunch_df_t = pd.DataFrame({'중식메뉴':lunch_lst_test})
dinner_df_t = pd.DataFrame({'석식메뉴':dinner_lst_test})

food = list(menu['메뉴'])
result_lunch = []
result_dinner = []
for k in range(len(lunch_df)):
    trigger = np.zeros(34, dtype='int')
    for i in range(len(lunch_df.iloc[k][0])):
        if lunch_df.iloc[k][0][i] in food:
            trigger += np.array(menu[menu['메뉴'] == lunch_df.iloc[k][0][i]].iloc[0][1:-1], dtype='int')
    #trigger = trigger / trigger.sum()
    result_lunch.append(trigger)

for k in range(len(dinner_df)):
    trigger = np.zeros(34, dtype='int')
    for i in range(len(dinner_df.iloc[k][0])):
        if dinner_df.iloc[k][0][i] in food:
            trigger += np.array(menu[menu['메뉴'] == dinner_df.iloc[k][0][i]].iloc[0][1:-1], dtype='int')
    #trigger = trigger / trigger.sum()
    result_dinner.append(trigger)
    
result_lunch_t = []
result_dinner_t = []
for k in range(len(lunch_df_t)):
    trigger = np.zeros(34, dtype='int')
    for i in range(len(lunch_df_t.iloc[k][0])):
        if lunch_df_t.iloc[k][0][i] in food:
            trigger += np.array(menu[menu['메뉴'] == lunch_df_t.iloc[k][0][i]].iloc[0][1:-1], dtype='int')
    # trigger = trigger / trigger.sum()
    result_lunch_t.append(trigger)

for k in range(len(dinner_df_t)):
    trigger = np.zeros(34, dtype='int')
    for i in range(len(dinner_df_t.iloc[k][0])):
        if dinner_df_t.iloc[k][0][i] in food:
            trigger += np.array(menu[menu['메뉴'] == dinner_df_t.iloc[k][0][i]].iloc[0][1:-1], dtype='int')
    #trigger = trigger / trigger.sum()
    result_dinner_t.append(trigger)
    
column = list(menu.columns[1:-1])
lunch_sort1 = pd.DataFrame(result_lunch ,columns=column)
dinner_sort1 = pd.DataFrame(result_dinner, columns = column)
lunch_sort_t1 = pd.DataFrame(result_lunch_t, columns=column)
dinner_sort_t1 = pd.DataFrame(result_dinner_t, columns = column)

  0%|          | 0/1205 [00:00<?, ?it/s]

done


  0%|          | 0/1205 [00:00<?, ?it/s]

done


  0%|          | 0/50 [00:00<?, ?it/s]

done


  0%|          | 0/50 [00:00<?, ?it/s]

done


In [13]:
# train2 메뉴 처리

lunch_lst = []
lunch_count = []
pref = 0
for i in tqdm(range(1205)):
    try:
        if train2['중식계'][i] >= 880:
              pref = 1
        else:
              pref = 0

        lunch_lst.append(train2['중식메뉴_split'][i])
        lunch_count.append(pref)
    except:
        pass
# for i in range(len(lunch_lst)):
  # lunch_lst[i] = lunch_lst[i].replace(',', ' ')
print('done')

dinner_lst = []
dinner_count = []
pref_d = 0
for i in tqdm(range(1205)):
    try:
        if train2['석식계'][i] > 476:
            pref_d = 1
        else:
            pref_d = 0

        dinner_lst.append(train2['석식메뉴_split'][i])
        dinner_count.append(pref_d)
    except:
        pass
# for i in range(len(dinner_lst)):
#   dinner_lst[i] = dinner_lst[i].replace(',', ' ')
print('done')

lunch_lst_test = []
lunch_count_test = []
for i in tqdm(range(50)):
    try:
        lunch_lst_test.append(test1['중식메뉴_split'][i])
        lunch_count_test.append(0)
    except:
        pass
# for i in range(len(lunch_lst_test)):
#   lunch_lst_test[i] = lunch_lst_test[i].replace(',', ' ')
print('done')

dinner_lst_test = []
dinner_count_test = []
for i in tqdm(range(50)):
    try:
        dinner_lst_test.append(test1['석식메뉴_split'][i])
        dinner_count_test.append(0)
    except:
        pass
# for i in range(len(dinner_lst_test)):
#   dinner_lst_test[i] = dinner_lst_test[i].replace(',', ' ')
print('done')

lunch_df = pd.DataFrame({'중식메뉴':lunch_lst})
dinner_df = pd.DataFrame({'석식메뉴':dinner_lst})
lunch_df_t = pd.DataFrame({'중식메뉴':lunch_lst_test})
dinner_df_t = pd.DataFrame({'석식메뉴':dinner_lst_test})

food = list(menu['메뉴'])
result_lunch = []
result_dinner = []
for k in range(len(lunch_df)):
    trigger = np.zeros(34, dtype='int')
    for i in range(len(lunch_df.iloc[k][0])):
        if lunch_df.iloc[k][0][i] in food:
            trigger += np.array(menu[menu['메뉴'] == lunch_df.iloc[k][0][i]].iloc[0][1:-1], dtype='int')
    #trigger = trigger / trigger.sum()
    result_lunch.append(trigger)

for k in range(len(dinner_df)):
    trigger = np.zeros(34, dtype='int')
    for i in range(len(dinner_df.iloc[k][0])):
        if dinner_df.iloc[k][0][i] in food:
            trigger += np.array(menu[menu['메뉴'] == dinner_df.iloc[k][0][i]].iloc[0][1:-1], dtype='int')
    #trigger = trigger / trigger.sum()
    result_dinner.append(trigger)
    
result_lunch_t = []
result_dinner_t = []
for k in range(len(lunch_df_t)):
    trigger = np.zeros(34, dtype='int')
    for i in range(len(lunch_df_t.iloc[k][0])):
        if lunch_df_t.iloc[k][0][i] in food:
            trigger += np.array(menu[menu['메뉴'] == lunch_df_t.iloc[k][0][i]].iloc[0][1:-1], dtype='int')
    # trigger = trigger / trigger.sum()
    result_lunch_t.append(trigger)

for k in range(len(dinner_df_t)):
    trigger = np.zeros(34, dtype='int')
    for i in range(len(dinner_df_t.iloc[k][0])):
        if dinner_df_t.iloc[k][0][i] in food:
            trigger += np.array(menu[menu['메뉴'] == dinner_df_t.iloc[k][0][i]].iloc[0][1:-1], dtype='int')
    #trigger = trigger / trigger.sum()
    result_dinner_t.append(trigger)
    
column = list(menu.columns[1:-1])
lunch_sort2 = pd.DataFrame(result_lunch ,columns=column)
dinner_sort2 = pd.DataFrame(result_dinner, columns = column)
lunch_sort_t2 = pd.DataFrame(result_lunch_t, columns=column)
dinner_sort_t2 = pd.DataFrame(result_dinner_t, columns = column)

  0%|          | 0/1205 [00:00<?, ?it/s]

done


  0%|          | 0/1205 [00:00<?, ?it/s]

done


  0%|          | 0/50 [00:00<?, ?it/s]

done


  0%|          | 0/50 [00:00<?, ?it/s]

done


In [14]:
# train1  공휴일 변수 추가


train1['공휴일전후'] = 0
test1['공휴일전후'] = 0

train1['공휴일전후'][17] = 1
train1['공휴일전후'][3] = 1
train1['공휴일전후'][62] = 1
# train1['공휴일전후'][67] = 1
# train1['공휴일전후'][82] = 1
train1['공휴일전후'][131] = 1
# train1['공휴일전후'][130] = 1
train1['공휴일전후'][152] = 1
train1['공휴일전후'][226] = 1
train1['공휴일전후'][221] = 1
train1['공휴일전후'][224] = 1
# train1['공휴일전후'][244] = 1
train1['공휴일전후'][245] = 1
# train1['공휴일전후'][267] = 1
train1['공휴일전후'][310] = 2
train1['공휴일전후'][311] = 1
train1['공휴일전후'][309] = 1
train1['공휴일전후'][330] = 1
train1['공휴일전후'][379] = 1
train1['공휴일전후'][467] = 1
# train1['공휴일전후'][469] = 1
train1['공휴일전후'][470] = 1
train1['공휴일전후'][502] = 2
# train1['공휴일전후'][501] = 1
# train1['공휴일전후'][511] = 1
train1['공휴일전후'][565] = 1
train1['공휴일전후'][623] = 1
train1['공휴일전후'][651] = 1
# train1['공휴일전후'][650] = 1
train1['공휴일전후'][705] = 1
# train1['공휴일전후'][707] = 1
train1['공휴일전후'][709] = 1
# train1['공휴일전후'][733] = 1
# train1['공휴일전후'][748] = 1
# train1['공휴일전후'][792] = 1
train1['공휴일전후'][815] = 1
train1['공휴일전후'][864] = 1
# train1['공휴일전후'][863] = 1
train1['공휴일전후'][950] = 1
train1['공휴일전후'][951] = 1
train1['공휴일전후'][953] = 1
train1['공휴일전후'][954] = 1
train1['공휴일전후'][955] = 1
train1['공휴일전후'][971] = 2
# train1['공휴일전후'][970] = 1
# train1['공휴일전후'][1037] = 1
train1['공휴일전후'][1038] = 1
train1['공휴일전후'][1099] = 1
train1['공휴일전후'][1129] = 1
# train1['공휴일전후'][1128] = 1
train1['공휴일전후'][1187] = 1
# train1['공휴일전후'][1186] = 1

test1['공휴일전후'][10] =2
test1['공휴일전후'][20] = 1

train1 = pd.get_dummies(train1, columns=['공휴일전후'])
test1 = pd.get_dummies(test1, columns=['공휴일전후'])

test1['공휴일전후_0'][20] =1
test1['공휴일전후_1'][20] = 0

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if sys.path[0] == '':
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: ht

In [15]:
# train2  공휴일 변수 추가


train2['공휴일전후'] = 0
test2['공휴일전후'] = 0

train2['공휴일전후'][17] = 1
train2['공휴일전후'][3] = 1
train2['공휴일전후'][62] = 1
# train2['공휴일전후'][67] = 1
# train2['공휴일전후'][82] = 1
train2['공휴일전후'][131] = 1
# train2['공휴일전후'][130] = 1
train2['공휴일전후'][152] = 1
train2['공휴일전후'][226] = 1
train2['공휴일전후'][221] = 1
train2['공휴일전후'][224] = 1
# train2['공휴일전후'][244] = 1
train2['공휴일전후'][245] = 1
# train2['공휴일전후'][267] = 1
train2['공휴일전후'][310] = 2
train2['공휴일전후'][311] = 1
train2['공휴일전후'][309] = 1
train2['공휴일전후'][330] = 1
train2['공휴일전후'][379] = 1
train2['공휴일전후'][467] = 1
# train2['공휴일전후'][469] = 1
train2['공휴일전후'][470] = 1
train2['공휴일전후'][502] = 2
# train2['공휴일전후'][501] = 1
# train2['공휴일전후'][511] = 1
train2['공휴일전후'][565] = 1
train2['공휴일전후'][623] = 1
train2['공휴일전후'][651] = 1
# train2['공휴일전후'][650] = 1
train2['공휴일전후'][705] = 1
# train2['공휴일전후'][707] = 1
train2['공휴일전후'][709] = 1
# train2['공휴일전후'][733] = 1
# train2['공휴일전후'][748] = 1
# train2['공휴일전후'][792] = 1
train2['공휴일전후'][815] = 1
train2['공휴일전후'][864] = 1
# train2['공휴일전후'][863] = 1
train2['공휴일전후'][950] = 1
train2['공휴일전후'][951] = 1
train2['공휴일전후'][953] = 1
train2['공휴일전후'][954] = 1
train2['공휴일전후'][955] = 1
train2['공휴일전후'][971] = 2
# train2['공휴일전후'][970] = 1
# train2['공휴일전후'][1037] = 1
train2['공휴일전후'][1038] = 1
train2['공휴일전후'][1099] = 1
train2['공휴일전후'][1129] = 1
# train2['공휴일전후'][1128] = 1
train2['공휴일전후'][1187] = 1
# train2['공휴일전후'][1186] = 1

test2['공휴일전후'][10] =2
test2['공휴일전후'][20] = 1

train2 = pd.get_dummies(train2, columns=['공휴일전후'])
test2 = pd.get_dummies(test2, columns=['공휴일전후'])

test2['공휴일전후_0'][20] =1
test2['공휴일전후_1'][20] = 0

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if sys.path[0] == '':
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: ht

In [16]:
#코로나 확진자, 코로나 검색량, 날씨, 코스피지수, 소비자물가지수, 비트코인가격 추가

covid = pd.read_excel('데이콘/코로나감염자.xlsx', engine = 'openpyxl')
covidtrend = pd.read_excel('데이콘/datalab (1).xlsx', engine = 'openpyxl')
weather = pd.read_csv('기상진주.csv', encoding = 'cp949')
weathertest = pd.read_csv('기상진주test.csv', encoding = 'cp949')
kospi = pd.read_csv('코스피코스닥.csv', encoding= 'cp949')
bitcoin = pd.read_excel('데이콘/비트코인(네이버).xlsx', engine = 'openpyxl')
customprice = pd.read_excel('데이콘/소비자물가.xlsx', engine = 'openpyxl').dropna()
covid['날짜'] = covid['날짜'].astype('string')
covidtrend['날짜'] = covidtrend['날짜'].astype('string')

train1['일자'] = train1['일자'].astype('string')
test1['일자'] = test1['일자'].astype('string')

train2['일자'] = train2['일자'].astype('string')
test2['일자'] = test2['일자'].astype('string')

In [17]:
#train1
customprice['년'] = customprice['날짜'].apply(lambda x : int(x[:4]))
customprice['월'] = customprice['날짜'].apply(lambda x : int(x[5:]))
train1 = pd.merge(train1, customprice,how = 'left').drop(['날짜','소비자물가지수-1'], axis = 1)
test1 = pd.merge(test1, customprice,how = 'left').drop(['날짜','소비자물가지수-1'], axis = 1)

train1 = pd.merge(train1, kospi.fillna(method="ffill"), left_on = '일자', right_on = '날짜', how = 'left').drop(['날짜'], axis = 1)
train1 = pd.merge(train1, bitcoin, left_on = '일자', right_on = '날짜', how = 'left').drop(['날짜'], axis = 1)
train1['KOSPI지수'] = train1['KOSPI지수'].apply(lambda x: float(x.replace(',','')))

test1 = pd.merge(test1, kospi.fillna(method="ffill"), left_on = '일자', right_on = '날짜', how = 'left').drop(['날짜'], axis = 1)
test1 = pd.merge(test1, bitcoin, left_on = '일자', right_on = '날짜', how = 'left').drop(['날짜'], axis = 1)
test1['KOSPI지수'] = test1['KOSPI지수'].apply(lambda x: float(x.replace(',','')))

covid['날짜'] = covid['날짜'].astype('string')
covidtrend['날짜'] = covidtrend['날짜'].astype('string')
train1['일자'] = train1['일자'].astype('string')
test1['일자'] = test1['일자'].astype('string')

train1 = pd.merge(pd.merge(train1, covid, left_on = '일자', right_on = '날짜', how = 'left').drop(['날짜'], axis = 1), covidtrend, left_on = '일자', right_on = '날짜', how = 'left').drop(['날짜'], axis = 1).fillna(0)
test1 = pd.merge(pd.merge(test1, covid, left_on = '일자', right_on = '날짜', how = 'left').drop(['날짜'], axis = 1), covidtrend, left_on = '일자', right_on = '날짜', how = 'left').drop(['날짜'], axis = 1).fillna(0)

train1 = pd.merge(train1,weather[['일시', '최고기온(°C)', '일강수량(mm)', '평균 상대습도(%)', '합계 일조시간(hr)']].fillna(0), left_on = '일자', right_on = '일시').drop(['일시'],axis = 1)
test1 = pd.merge(test1,weathertest[['일시', '최고기온(°C)', '일강수량(mm)', '평균 상대습도(%)', '합계 일조시간(hr)']].fillna(0), left_on = '일자', right_on = '일시').drop(['일시'],axis = 1)

In [18]:
#train2
customprice['년'] = customprice['날짜'].apply(lambda x : int(x[:4]))
customprice['월'] = customprice['날짜'].apply(lambda x : int(x[5:]))
train2 = pd.merge(train2, customprice,how = 'left').drop(['날짜','소비자물가지수-1'], axis = 1)
test2 = pd.merge(test2, customprice,how = 'left').drop(['날짜','소비자물가지수-1'], axis = 1)

train2 = pd.merge(train2, kospi.fillna(method="ffill"), left_on = '일자', right_on = '날짜', how = 'left').drop(['날짜'], axis = 1)
train2 = pd.merge(train2, bitcoin, left_on = '일자', right_on = '날짜', how = 'left').drop(['날짜'], axis = 1)
train2['KOSPI지수'] = train2['KOSPI지수'].apply(lambda x: float(x.replace(',','')))

test2 = pd.merge(test2, kospi.fillna(method="ffill"), left_on = '일자', right_on = '날짜', how = 'left').drop(['날짜'], axis = 1)
test2 = pd.merge(test2, bitcoin, left_on = '일자', right_on = '날짜', how = 'left').drop(['날짜'], axis = 1)
test2['KOSPI지수'] = test2['KOSPI지수'].apply(lambda x: float(x.replace(',','')))

covid['날짜'] = covid['날짜'].astype('string')
covidtrend['날짜'] = covidtrend['날짜'].astype('string')
train2['일자'] = train2['일자'].astype('string')
test2['일자'] = test2['일자'].astype('string')

train2 = pd.merge(pd.merge(train2, covid, left_on = '일자', right_on = '날짜', how = 'left').drop(['날짜'], axis = 1), covidtrend, left_on = '일자', right_on = '날짜', how = 'left').drop(['날짜'], axis = 1).fillna(0)
test2 = pd.merge(pd.merge(test2, covid, left_on = '일자', right_on = '날짜', how = 'left').drop(['날짜'], axis = 1), covidtrend, left_on = '일자', right_on = '날짜', how = 'left').drop(['날짜'], axis = 1).fillna(0)

train2 = pd.merge(train2,weather[['일시', '최고기온(°C)', '일강수량(mm)', '평균 상대습도(%)', '합계 일조시간(hr)']].fillna(0), left_on = '일자', right_on = '일시').drop(['일시'],axis = 1)
test2 = pd.merge(test2,weathertest[['일시', '최고기온(°C)', '일강수량(mm)', '평균 상대습도(%)', '합계 일조시간(hr)']].fillna(0), left_on = '일자', right_on = '일시').drop(['일시'],axis = 1)