# ***대분류 3번 집중 모델링***

In [None]:
import random
import os
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
from sklearn.preprocessing import LabelEncoder

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

import plotly.express as px
import matplotlib.pyplot as plt 

from prophet import Prophet

# ***Read Data***

In [None]:
train_data = pd.read_csv(os.path.abspath("./data")+"/train.csv")
keyword = pd.read_csv(os.path.abspath("./data")+"/brand_keyword_cnt.csv")
product = pd.read_csv(os.path.abspath("./data")+"/product_info.csv")
sales = pd.read_csv(os.path.abspath("./data")+"/sales.csv")

In [None]:
submission = pd.read_csv(os.path.abspath("./submission")+"/20230811_first.csv")

***

In [None]:
data_3 = train_data[train_data['대분류'] == 'B002-C001-0003']

In [None]:
data_3['중분류'].value_counts()

## ***About Product***
***

### ***중분류 B002-C002-0008***
  
#### 유아식기 관련 용품

In [None]:
data_3_1 = data_3[data_3['중분류'] == 'B002-C002-0008']

In [None]:
lst_3_1 = data_3_1['제품'].value_counts().index.tolist()

In [None]:
product_3_1 = product[product['제품'].isin(lst_3_1)]

In [None]:
data_3_1['소분류'].value_counts()

##### 정리

    B002-C003-0044   |   유아식기
    B002-C003-0043   |   젖병
    B002-C003-0042   |   젖병건조용품

***
### ***중분류 B002-C002-0010***
  
#### 유아바디 관련 용품

In [None]:
data_3_2 = data_3[data_3['중분류'] == 'B002-C002-0010']

In [None]:
lst_3_2 = data_3_2['제품'].value_counts().index.tolist()

In [None]:
product_3_2 = product[product['제품'].isin(lst_3_2)]

In [None]:
data_3_2['소분류'].value_counts()

##### 정리

    B002-C003-0050   |   유아바디용품

***

## ***Data Pre_Processing***

In [None]:
# Modelling set: every row of top-level category B002-C001-0003, re-indexed from 0.
category_mask = train_data['대분류'] == 'B002-C001-0003'
train = train_data.loc[category_mask]
train.reset_index(drop=True, inplace=True)

***

Z-score로 이상치 처리.

In [None]:
df = train.iloc[:,6:]

In [None]:
df = df.T

In [None]:
df.columns = df.columns.astype(str)

In [None]:
for i in range(len(df.columns)):
    df_t_t = pd.DataFrame(df[str(i)])
    
    # Z-Score 계산
    z_scores = np.abs((df_t_t[str(i)] - df_t_t[str(i)].mean()) / df_t_t[str(i)].std())

    # Z-Score 임계값 설정 (일반적으로 2 ~ 3 사이의 값 사용)
    z_threshold = 3
    
    # 이상치 대체
    outliers = df[z_scores > z_threshold]
    replacement_value = df_t_t[str(i)].mean() + df_t_t[str(i)].std() * z_threshold
    
    for index, row in outliers.iterrows():
        df.at[index, str(i)] = round(replacement_value)

In [None]:
train.iloc[:,6:] = df.T.values

***
***
***데이터가 전부 0인값 및 거의 모든 데이터가 0인값 제거***

In [None]:
train_zero = train[train.iloc[:,6:].sum(axis=1)==0]

In [None]:
zero_idx = []

In [None]:
zero_idx.extend(train_zero['ID'])

In [None]:
train = train[~train['ID'].isin(train_zero['ID'])]
train.reset_index(drop=True, inplace=True)

*해당 submission 전부 1로 대체*

In [None]:
# Set every non-ID column to 1 for the IDs collected in zero_idx.
zero_rows = submission['ID'].isin(zero_idx)
value_columns = submission.columns != 'ID'
submission.loc[zero_rows, value_columns] = 1

***
***
***값이 0과 특정값 두개 밖에 없다면 max 값으로 대체***

In [None]:
# train_2023 데이터프레임에서 6번째 컬럼부터 끝까지 선택
subset = train.iloc[:, 6:]

# 각 행 별로 유일한 값의 개수를 세어서 결과를 Series로 저장
unique_counts = subset.nunique(axis=1)

# 유일한 값이 세 개밖에 없는 행 추출
rows_with_two_unique_values = subset[unique_counts == 2]

In [None]:
two_unique_lst = rows_with_two_unique_values.index

In [None]:
two_id_lst = train.loc[two_unique_lst]['ID'].tolist()

In [None]:
# two_unique_lst에 해당하는 행 추출
selected_rows = train.loc[two_unique_lst].iloc[:,6:]

# 0을 제외한 각 행의 가장 많이 나온 값 찾기
most_common_values = selected_rows.apply(lambda row: np.bincount(row[row != 0]).argmax(), axis=1)

In [None]:
result_df = pd.DataFrame({'ID': two_id_lst, 'Most_Common_Value': most_common_values})

In [None]:
for i, row in result_df.iterrows():
    id_value = row['ID']
    value = row['Most_Common_Value']
    submission.loc[submission['ID']==id_value, submission.columns != 'ID'] = value

In [None]:
train = train[~train['ID'].isin(two_id_lst)]
train.reset_index(drop=True, inplace=True)

***
***
***값이 0과 특정값 세개 밖에 없다면 max 값으로 대체  (but, 마지막 값이 없을경우)***

In [None]:
# train_2023 데이터프레임에서 6번째 컬럼부터 끝까지 선택
subset = train.iloc[:, 6:]

# 각 행 별로 유일한 값의 개수를 세어서 결과를 Series로 저장
unique_counts = subset.nunique(axis=1)

# 유일한 값이 세 개밖에 없는 행 추출
rows_with_three_unique_values = subset[unique_counts == 3]

In [None]:
three_unique_lst = rows_with_three_unique_values.index

In [None]:
three_id_lst = train.loc[three_unique_lst]['ID'].tolist()

f는 마지막 값이 없는경우. 이에 대해서는 최빈값 적용

In [None]:
f = rows_with_three_unique_values[rows_with_three_unique_values.iloc[:, -7:].eq(0).all(axis=1)]

f_lst = f.index.tolist()

In [None]:
f_id_lst = train.loc[f_lst]['ID'].tolist()

In [None]:
f_unique_lst = train[train['ID'].isin(f_id_lst)].index

In [None]:
# three_unique_lst에 해당하는 행 추출
selected_rows = train.loc[f_unique_lst].iloc[:,6:]

# 0을 제외한 각 행의 가장 많이 나온 값 찾기
most_common_values = selected_rows.apply(lambda row: np.bincount(row[row != 0]).argmax(), axis=1)

In [None]:
result_df = pd.DataFrame({'ID': f_id_lst, 'Most_Common_Value': most_common_values})

5보다 큰 경우엔 두번째 최빈값으로 설정

In [None]:
result_df[result_df['Most_Common_Value']>5]

In [None]:
train.iloc[151,6:].value_counts()

In [None]:
result_df.loc[result_df['ID'] == 5846, 'Most_Common_Value'] = 3

In [None]:
for i, row in result_df.iterrows():
    id_value = row['ID']
    value = row['Most_Common_Value']
    submission.loc[submission['ID']==id_value, submission.columns != 'ID'] = value

In [None]:
train = train[~train['ID'].isin(f_id_lst)]
train.reset_index(drop=True, inplace=True)

***
filtered_three_id_lst / 마지막 값이 있는 이런경우엔 마지막의 시계열성에 따라 값을 넣어주어야 할 수도 있음.

In [None]:
filtered_three_id_lst = [value for value in three_id_lst if value not in f_id_lst]

In [None]:
three_unique_lst = train[train['ID'].isin(filtered_three_id_lst)].index

In [None]:
train_three = train.loc[three_unique_lst]

In [None]:
data = train_three.set_index('ID').iloc[:,5:].T
ID_list = data.columns
datanew_columns = list(range(len(data.columns)))
data.columns = datanew_columns

In [None]:
# Fit one Prophet model per column of `data` and collect, for each, the last
# 21 predicted values (yhat) in forecast_list.
forecast_list = []

for i in range(0,len(data.columns)):
    train_1 = data[i]
    train_1 = pd.DataFrame(train_1)
    
    # Trim leading rows: drop the first row for as long as the third remaining
    # row is 0, i.e. stop once the third remaining row is non-zero.
    # NOTE(review): raises IndexError when fewer than three rows remain
    # (every value from the third row onward is 0).
    while (train_1.iloc[1:, :] == 0).iloc[1][i]:
        train_1 = train_1.iloc[1:, :]
        
    # Prophet expects the columns `ds` (timestamp) and `y` (value).
    train_1.reset_index(inplace=True)
    train_1.columns = ['ds','y']
    train_1['ds'] = pd.to_datetime(train_1['ds'])

    train_1['extra_weight'] = 1.0  # default value
    train_1.loc[train_1['ds'] >= train_1['ds'].max() - pd.Timedelta(days=7), 'extra_weight'] = 1.5  # higher value for rows within 7 days of the last date

    m = Prophet(
    changepoint_prior_scale=0.2,
    weekly_seasonality=4,
    daily_seasonality = True,
    seasonality_mode='additive'
    )
    # `extra_weight` enters the model as an additional regressor.
    m.add_regressor('extra_weight')
    # disabled: m.add_seasonality(name='monthly', period=30.5, fourier_order=3)
    m.fit(train_1)
    future = m.make_future_dataframe(periods=21)
    future['extra_weight'] = 1.0  # every row of `future` uses the default value
    forecast = m.predict(future)
    # Keep only the 21 rows appended by make_future_dataframe.
    forecast_list.append(forecast['yhat'][-21:])
    fig1 = m.plot(forecast)

In [None]:
t = pd.DataFrame()
for i in range(len(forecast_list)):
    a = pd.DataFrame(forecast_list[i])
    k = a.T
    k.reset_index(inplace=True)
    k['index'] = ID_list[i]
    k.columns = range(0,22)
    t = pd.concat([t,k],axis=0)

t[t<0] = 0

In [None]:
IDlist = t[0]
submission3 = submission.loc[IDlist]
t.columns = submission3.columns

In [None]:
t.reset_index(drop=True,inplace=True)
t = np.round(t).astype(int)

In [None]:
id_lst = t['ID'].tolist()

In [None]:
t = t.set_index('ID')

In [None]:
idx_lst = t.index.tolist()

In [None]:
t = t.replace(0, 1)

In [None]:
submission.iloc[idx_lst,1:] = t.loc[idx_lst]

In [None]:
train = train[~train['ID'].isin(three_id_lst)]
train.reset_index(drop=True, inplace=True)

***
***
***2022년에 판매량이 전부 0인 값들 따로 처리 위해 제거***

In [None]:
lst_2023 = train[train.iloc[:,6:-94].sum(axis=1)==0]['ID'].tolist()

In [None]:
train_2023 = train[train['ID'].isin(lst_2023)]
train_2023.reset_index(drop=True, inplace=True)

In [None]:
train = train[~train['ID'].isin(lst_2023)]
train.reset_index(drop=True, inplace=True)

*train_2023은 2023년 데이터만 가지고 모델링*

In [None]:
# Keep the first six columns and the last 94 columns only.
dropped_columns = train_2023.columns[6:-94]
train_2023 = train_2023.drop(columns=dropped_columns)

***2023-02-23 ~ 2023-03-28 사이값 때문에 이상치가 반영되는 경우가 많음.***

In [None]:
zero_2023 = train_2023[train_2023.iloc[:,-41:-7].sum(axis=1)==0]['ID'].tolist()

In [None]:
train_zero_2023 = train_2023[train_2023['ID'].isin(zero_2023)]
train_zero_2023.reset_index(drop=True, inplace=True)

### ***특수 case*** 
중간 결측치를 제외 하였는데도 마지막에 비슷한 양상을 보임.

***ID = 3232 / 6으로 대체***

In [None]:
train_zero_2023[train_zero_2023['ID'] == 3232].iloc[:,-18:]

In [None]:
train_zero_2023[train_zero_2023['ID'] == 3232].iloc[:,6:].T.value_counts()

In [None]:
submission.loc[submission['ID']==3232, submission.columns != 'ID'] = 6

***ID = 15631 / 10으로 대체***

In [None]:
train_zero_2023[train_zero_2023['ID'] == 15631].iloc[:,-18:]

In [None]:
train_zero_2023[train_zero_2023['ID'] == 15631].iloc[:,6:].T.value_counts()

In [None]:
submission.loc[submission['ID']==15631, submission.columns != 'ID'] = 10

***
***

In [None]:
# Drop the rows whose ID is 3232 or 15631.
is_special_case = train_zero_2023['ID'].isin([3232, 15631])
train_zero_2023 = train_zero_2023[~is_special_case]
train_zero_2023.reset_index(drop=True, inplace=True)

***마지막 값들이 전부 0인 애들은 따로 처리해야함.***

In [None]:
train_zero_2023_conti_0 = train_zero_2023[train_zero_2023.iloc[:,-7:].sum(axis=1)==0]

In [None]:
rows_with_high_zeros = train_zero_2023_conti_0

train_list = []
test_list = []

for i in range(len(rows_with_high_zeros)):
    data = rows_with_high_zeros.iloc[i, 6:].reset_index()  # 6번째 컬럼부터 끝까지 추출
    data.columns = ['date_time', 0]  # 컬럼 이름 변경
    train_list.append(data)

for i in range(len(rows_with_high_zeros)):
    train_list[i]['date_time'] = pd.to_datetime(train_list[i]['date_time'])
    # 그래프 그리기
    plt.figure(figsize=(15, 6))
    plt.plot(train_list[i]['date_time'], train_list[i][0], marker='o')
    plt.title('Sales Over Time')
    plt.xlabel('Date')
    plt.ylabel('Sales')
    plt.grid(True)
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

해당값들은 마지막날 거의 모든 값들이 0임을 확인 할 수 있음. 따라서 1로 대체

In [None]:
conti_0_lst = train_zero_2023_conti_0['ID'].tolist()

In [None]:
submission.loc[submission['ID'].isin(conti_0_lst), submission.columns != 'ID'] = 1

In [None]:
# 'ID' 값이 conti_0_lst인 행을 삭제합니다.
train_zero_2023 = train_zero_2023[~train_zero_2023['ID'].isin(conti_0_lst)]
train_zero_2023.reset_index(drop=True, inplace=True)

***결측치 기준 앞뒤에 비슷한 값이 계속되는 경우.***

In [None]:
fff = train_zero_2023.drop(train_zero_2023.columns[-41:-7], axis=1)

In [None]:
fff.iloc[:,-4:]

4,5 번 확인 필요

In [None]:
# Plot rows 4 and 5 of `fff` for a visual check.
# Keep every column here: the per-row slice below already skips the six
# leading non-date columns. Slicing `6:` at both places dropped the first six
# dates from the plots.
rows_with_high_zeros = fff.iloc[4:6]

train_list = []
test_list = []

for i in range(len(rows_with_high_zeros)):
    data = rows_with_high_zeros.iloc[i, 6:].reset_index()  # columns from position 6 to the end
    data.columns = ['date_time', 0]  # rename the columns
    data['date_time'] = pd.to_datetime(data['date_time'])
    train_list.append(data)

for series_frame in train_list:
    # Draw the line chart.
    plt.figure(figsize=(15, 6))
    plt.plot(series_frame['date_time'], series_frame[0], marker='o')
    plt.title('Sales Over Time')
    plt.xlabel('Date')
    plt.ylabel('Sales')
    plt.grid(True)
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

앞뒤로 똑같은 값이 계속해서 연속됨. 두 값 처리해주도록 하겠음. 16, 28

In [None]:
conti_lst = fff.iloc[4:6]['ID'].tolist()

In [None]:
train_zero_2023[train_zero_2023['ID']==conti_lst[0]]

In [None]:
submission.loc[submission['ID']==conti_lst[0], submission.columns != 'ID'] = 16

In [None]:
train_zero_2023[train_zero_2023['ID']==conti_lst[1]]

In [None]:
submission.loc[submission['ID']==conti_lst[1], submission.columns != 'ID'] = 28

In [None]:
# 'ID' 값이 conti_0_lst인 행을 삭제합니다.
train_zero_2023 = train_zero_2023[~train_zero_2023['ID'].isin(conti_lst)]
train_zero_2023.reset_index(drop=True, inplace=True)

***결측치 prophet으로 예측 후 이를 test 예측에 사용***

In [None]:
ttt = train_zero_2023.drop(train_zero_2023.columns[-41:-7], axis=1)

In [None]:
ttt = ttt.set_index('ID')

In [None]:
start_date = '2023-02-23'
end_date = '2023-03-28'
future_dates = pd.date_range(start=start_date, end=end_date)

In [None]:
data = ttt.iloc[:,5:].T
ID_list = data.columns
datanew_columns = list(range(len(data.columns)))
data.columns = datanew_columns

In [None]:
# Fit one Prophet model per column of `data` and collect its predictions
# (yhat) for the dates in future_dates.
forecast_list = []

for i in range(0,len(data.columns)):
    train_1 = data[i]
    train_1 = pd.DataFrame(train_1)
    
    # Trim leading rows: drop the first row for as long as the third remaining
    # row is 0, i.e. stop once the third remaining row is non-zero.
    # NOTE(review): raises IndexError when fewer than three rows remain
    # (every value from the third row onward is 0).
    while (train_1.iloc[1:, :] == 0).iloc[1][i]:
        train_1 = train_1.iloc[1:, :]
    
    # Prophet expects the columns `ds` (timestamp) and `y` (value).
    train_1.reset_index(inplace=True)
    train_1.columns = ['ds','y']
    train_1['ds'] = pd.to_datetime(train_1['ds'])

    train_1['extra_weight'] = 1.0  # default value
    # disabled: raise extra_weight to 1.5 for rows within 7 days of the last date

    m = Prophet(
        changepoint_prior_scale=0.3,
        weekly_seasonality=3,
        daily_seasonality = True,
        seasonality_mode='additive'
    )
    # `extra_weight` enters the model as an additional regressor.
    m.add_regressor('extra_weight')
    # disabled: m.add_seasonality(name='monthly', period=30.5, fourier_order=3)
    m.fit(train_1)
    future = pd.DataFrame({'ds': future_dates, 'extra_weight': 1.0})
    future['extra_weight'] = 1.0  # same value the constructor above already set
    forecast = m.predict(future)
    forecast_list.append(forecast['yhat'])
    fig1 = m.plot(forecast)

In [None]:
t = pd.DataFrame()
for i in range(len(forecast_list)):
    a = pd.DataFrame(forecast_list[i])
    k = a.T
    k.reset_index(inplace=True)
    k['index'] = ID_list[i]
    t = pd.concat([t,k],axis=0)

t[t<0] = 0

In [None]:
t = t.set_index('index')
t = np.round(t).astype(int)

In [None]:
t = t.reset_index(drop=True)

In [None]:
train_zero_2023.iloc[:,-41:-7] = t

#### ***앞에 연속된 0값들이 데이터의 시간성을 해쳐 이를 해결해 주어야함***

In [None]:
# One column per ID; the first five columns after ID are dropped.
data = train_zero_2023.set_index('ID').iloc[:, 5:].transpose()
ID_list = data.columns
# Relabel the columns 0..n-1 so the loop below can address them by position.
datanew_columns = [*range(data.shape[1])]
data.columns = datanew_columns

In [None]:
# Fit one Prophet model per column of `data` and collect, for each, the last
# 21 predicted values (yhat) in forecast_list.
forecast_list = []

for i in range(0,len(data.columns)):
    train_1 = data[i]
    train_1 = pd.DataFrame(train_1)
    
    # Trim leading rows: drop the first row for as long as the third remaining
    # row is 0, i.e. stop once the third remaining row is non-zero.
    # NOTE(review): raises IndexError when fewer than three rows remain
    # (every value from the third row onward is 0).
    while (train_1.iloc[1:, :] == 0).iloc[1][i]:
        train_1 = train_1.iloc[1:, :]
        
    # Prophet expects the columns `ds` (timestamp) and `y` (value).
    train_1.reset_index(inplace=True)
    train_1.columns = ['ds','y']
    train_1['ds'] = pd.to_datetime(train_1['ds'])

    train_1['extra_weight'] = 1.0  # default value
    train_1.loc[train_1['ds'] >= train_1['ds'].max() - pd.Timedelta(days=7), 'extra_weight'] = 1.5  # higher value for rows within 7 days of the last date

    m = Prophet(
    changepoint_prior_scale=0.2,
    weekly_seasonality=4,
    daily_seasonality = True,
    seasonality_mode='additive'
    )
    # `extra_weight` enters the model as an additional regressor.
    m.add_regressor('extra_weight')
    # disabled: m.add_seasonality(name='monthly', period=30.5, fourier_order=3)
    m.fit(train_1)
    future = m.make_future_dataframe(periods=21)
    future['extra_weight'] = 1.0  # every row of `future` uses the default value
    forecast = m.predict(future)
    # Keep only the 21 rows appended by make_future_dataframe.
    forecast_list.append(forecast['yhat'][-21:])
    fig1 = m.plot(forecast)

In [None]:
t = pd.DataFrame()
for i in range(len(forecast_list)):
    a = pd.DataFrame(forecast_list[i])
    k = a.T
    k.reset_index(inplace=True)
    k['index'] = ID_list[i]
    k.columns = range(0,22)
    t = pd.concat([t,k],axis=0)
ID_list
t[t<0] = 0

In [None]:
IDlist = t[0]
submission3 = submission.loc[IDlist]
t.columns = submission3.columns

In [None]:
t.reset_index(drop=True,inplace=True)
t = np.round(t).astype(int)

In [None]:
id_lst = t['ID'].tolist()

In [None]:
t = t.set_index('ID')

In [None]:
idx_lst = t.index.tolist()

In [None]:
t = t.replace(0, 1)

In [None]:
submission.iloc[idx_lst,1:] = t.loc[idx_lst]

***
2023년에 판매량이 전부 0인 값들 따로 처리 위해 제거

In [None]:
lst_non_2023 = train[train.iloc[:,-94:].sum(axis=1)==0]['ID'].tolist()

In [None]:
train_non_2023 = train[train['ID'].isin(lst_non_2023)]
train_non_2023.reset_index(drop=True, inplace=True)

In [None]:
train = train[~train['ID'].isin(lst_non_2023)]
train.reset_index(drop=True, inplace=True)

해당 값들은 최근 최소 4개월 이상 판매량 0이기 때문에 예측불가. 따라서 1로 치환해주겠음.

In [None]:
submission.loc[submission['ID'].isin(lst_non_2023), submission.columns != 'ID'] = 1

In [None]:
submission.loc[submission['ID'].isin(lst_non_2023)]

***

2023-02-23 ~ 2023-03-28 사이값이 null 이 아닌 것

ID 3792 제외 나머지는 모든 시계열 사용해서 진행.

In [None]:
train_conti = train[train.iloc[:,424:-7].sum(axis=1)!=0]

In [None]:
train_conti.iloc[5:6,430:450]

2023-03-08	2023-03-09	2023-03-10	2023-03-11 이상치 처리 요망

In [None]:
# 중복 제거 후 정렬
unique_sorted_data = np.unique(train_conti.iloc[5:6,6:].T.values.flatten())
unique_sorted_data = unique_sorted_data[unique_sorted_data != 0]  # 0 값 제외

# 두 번째로 큰 값을 찾기
second_largest_value = unique_sorted_data[-2]

In [None]:
train_conti.iloc[5, 6:] = train_conti.iloc[5, 6:].replace(61, second_largest_value)

In [None]:
# 중복 제거 후 정렬
unique_sorted_data = np.unique(train_conti.iloc[5:6,6:].T.values.flatten())
unique_sorted_data = unique_sorted_data[unique_sorted_data != 0]  # 0 값 제외

# 두 번째로 큰 값을 찾기
second_largest_value = unique_sorted_data[-2]

In [None]:
train_conti.iloc[5, 6:] = train_conti.iloc[5, 6:].replace(unique_sorted_data[-1], second_largest_value)

In [None]:
conti_lst = train_conti['ID'].tolist()

In [None]:
train = train[~train['ID'].isin(conti_lst)]

In [None]:
train_3792 = train_conti[train_conti['ID']==3792]

In [None]:
train_conti = train_conti[train_conti['ID']!=3792]

In [None]:
conti_lst.remove(3792)

In [None]:
data = train_conti.set_index('ID').iloc[:,5:].T
ID_list = data.columns
datanew_columns = list(range(len(data.columns)))
data.columns = datanew_columns

In [None]:
# Fit one Prophet model per column of `data` and collect, for each, the last
# 21 predicted values (yhat) in forecast_list.
forecast_list = []

for i in range(0,len(data.columns)):
    train_1 = data[i]
    train_1 = pd.DataFrame(train_1)
    
    # Trim leading rows: drop the first row for as long as the third remaining
    # row is 0, i.e. stop once the third remaining row is non-zero.
    # NOTE(review): raises IndexError when fewer than three rows remain
    # (every value from the third row onward is 0).
    while (train_1.iloc[1:, :] == 0).iloc[1][i]:
        train_1 = train_1.iloc[1:, :]
        
    # Prophet expects the columns `ds` (timestamp) and `y` (value).
    train_1.reset_index(inplace=True)
    train_1.columns = ['ds','y']
    train_1['ds'] = pd.to_datetime(train_1['ds'])

    train_1['extra_weight'] = 1.0  # default value
    train_1.loc[train_1['ds'] >= train_1['ds'].max() - pd.Timedelta(days=7), 'extra_weight'] = 1.4  # higher value for rows within 7 days of the last date

    m = Prophet(
    changepoint_prior_scale=0.15,
    weekly_seasonality=3,
    daily_seasonality = True,
    seasonality_mode='additive'
    )
    # `extra_weight` enters the model as an additional regressor.
    m.add_regressor('extra_weight')
    # Extra seasonal component with a 30.5-day period.
    m.add_seasonality(name='monthly', period=30.5, fourier_order=3)
    m.fit(train_1)
    future = m.make_future_dataframe(periods=21)
    future['extra_weight'] = 1.0  # every row of `future` uses the default value
    forecast = m.predict(future)
    # Keep only the 21 rows appended by make_future_dataframe.
    forecast_list.append(forecast['yhat'][-21:])
    fig1 = m.plot(forecast)

In [None]:
t = pd.DataFrame()
for i in range(len(forecast_list)):
    a = pd.DataFrame(forecast_list[i])
    k = a.T
    k.reset_index(inplace=True)
    k['index'] = ID_list[i]
    k.columns = range(0,22)
    t = pd.concat([t,k],axis=0)

t[t<0] = 0

In [None]:
IDlist = t[0]
submission3 = submission.loc[IDlist]
t.columns = submission3.columns

In [None]:
t.reset_index(drop=True,inplace=True)
t = np.round(t).astype(int)

In [None]:
id_lst = t['ID'].tolist()

In [None]:
t = t.set_index('ID')

In [None]:
idx_lst = t.index.tolist()

In [None]:
t = t.replace(0, 1)

In [None]:
submission.iloc[idx_lst,1:] = t.loc[idx_lst]

***
***3792는 해당 기간에 대해서 따로 전처리 후 모델 생성***

In [None]:
train_3792.iloc[:,-51:-31]

In [None]:
train_3792.iloc[:,-20:]

In [None]:
ttt = train_3792.drop(train_3792.columns[-41:-7], axis=1)

In [None]:
ttt = ttt.set_index('ID')

In [None]:
start_date = '2023-02-23'
end_date = '2023-03-28'
future_dates = pd.date_range(start=start_date, end=end_date)

In [None]:
data = ttt.iloc[:,5:].T
ID_list = data.columns
datanew_columns = list(range(len(data.columns)))
data.columns = datanew_columns

In [None]:
# Fit one Prophet model per column of `data` and collect its predictions
# (yhat) for the dates in future_dates. Unlike the other loops in this file,
# no leading rows are trimmed here.
forecast_list = []

for i in range(0,len(data.columns)):
    train_1 = data[i]
    train_1 = pd.DataFrame(train_1)
    # Prophet expects the columns `ds` (timestamp) and `y` (value).
    train_1.reset_index(inplace=True)
    train_1.columns = ['ds','y']
    train_1['ds'] = pd.to_datetime(train_1['ds'])

    train_1['extra_weight'] = 1.0  # default value
    # disabled: raise extra_weight to 1.5 for rows within 7 days of the last date

    m = Prophet(
        changepoint_prior_scale=0.3,
        weekly_seasonality=3,
        daily_seasonality = True,
        seasonality_mode='additive'
    )
    # `extra_weight` enters the model as an additional regressor.
    m.add_regressor('extra_weight')
    # disabled: m.add_seasonality(name='monthly', period=30.5, fourier_order=3)
    m.fit(train_1)
    future = pd.DataFrame({'ds': future_dates, 'extra_weight': 1.0})
    future['extra_weight'] = 1.0  # same value the constructor above already set
    forecast = m.predict(future)
    forecast_list.append(forecast['yhat'])
    fig1 = m.plot(forecast)

In [None]:
t = pd.DataFrame()
for i in range(len(forecast_list)):
    a = pd.DataFrame(forecast_list[i])
    k = a.T
    k.reset_index(inplace=True)
    k['index'] = ID_list[i]
    t = pd.concat([t,k],axis=0)

t[t<0] = 0

In [None]:
t = t.set_index('index')
t = np.round(t).astype(int)

In [None]:
t = t.reset_index(drop=True)

In [None]:
t.index = train_3792.index

In [None]:
train_3792.iloc[:,-41:-7] = t

In [None]:
data = train_3792.set_index('ID').iloc[:,5:].T
ID_list = data.columns
datanew_columns = list(range(len(data.columns)))
data.columns = datanew_columns

In [None]:
# Fit one Prophet model per column of `data` and collect, for each, the last
# 21 predicted values (yhat) in forecast_list.
forecast_list = []

for i in range(0,len(data.columns)):
    train_1 = data[i]
    train_1 = pd.DataFrame(train_1)
    
    # Trim leading rows: drop the first row for as long as the third remaining
    # row is 0, i.e. stop once the third remaining row is non-zero.
    # NOTE(review): raises IndexError when fewer than three rows remain
    # (every value from the third row onward is 0).
    while (train_1.iloc[1:, :] == 0).iloc[1][i]:
        train_1 = train_1.iloc[1:, :]
        
    # Prophet expects the columns `ds` (timestamp) and `y` (value).
    train_1.reset_index(inplace=True)
    train_1.columns = ['ds','y']
    train_1['ds'] = pd.to_datetime(train_1['ds'])

    train_1['extra_weight'] = 1.0  # default value
    train_1.loc[train_1['ds'] >= train_1['ds'].max() - pd.Timedelta(days=7), 'extra_weight'] = 1.4  # higher value for rows within 7 days of the last date

    m = Prophet(
    changepoint_prior_scale=0.15,
    weekly_seasonality=3,
    daily_seasonality = True,
    seasonality_mode='additive'
    )
    # `extra_weight` enters the model as an additional regressor.
    m.add_regressor('extra_weight')
    # Extra seasonal component with a 30.5-day period.
    m.add_seasonality(name='monthly', period=30.5, fourier_order=3)
    m.fit(train_1)
    future = m.make_future_dataframe(periods=21)
    future['extra_weight'] = 1.0  # every row of `future` uses the default value
    forecast = m.predict(future)
    # Keep only the 21 rows appended by make_future_dataframe.
    forecast_list.append(forecast['yhat'][-21:])
    fig1 = m.plot(forecast)

In [None]:
t = pd.DataFrame()
for i in range(len(forecast_list)):
    a = pd.DataFrame(forecast_list[i])
    k = a.T
    k.reset_index(inplace=True)
    k['index'] = ID_list[i]
    k.columns = range(0,22)
    t = pd.concat([t,k],axis=0)

t[t<0] = 0

In [None]:
IDlist = t[0]
submission3 = submission.loc[IDlist]
t.columns = submission3.columns

In [None]:
t.reset_index(drop=True,inplace=True)
t = np.round(t).astype(int)

In [None]:
id_lst = t['ID'].tolist()

In [None]:
t = t.set_index('ID')

In [None]:
idx_lst = t.index.tolist()

In [None]:
t = t.replace(0, 1)

In [None]:
submission.iloc[idx_lst,1:] = t.loc[idx_lst]

***
마지막 값이 0인것 기준으로 한달동안 제품이 팔리지 않았을 경우 따로 모델 생성

In [None]:
filtered_rows = train[train.iloc[:, -1] == 0]

In [None]:
filtered_rows_zero = filtered_rows[filtered_rows.iloc[:, -30:].eq(0).all(axis=1)]

In [None]:
recent_lst = filtered_rows_zero['ID'].tolist()

In [None]:
train = train[~train['ID'].isin(recent_lst)]

In [None]:
data = filtered_rows.set_index('ID').iloc[:,5:].T
ID_list = data.columns
datanew_columns = list(range(len(data.columns)))
data.columns = datanew_columns

In [None]:
# Fit one Prophet model per column of `data` and collect, for each, the last
# 21 predicted values (yhat) in forecast_list.
forecast_list = []

for i in range(0,len(data.columns)):
    train_1 = data[i]
    train_1 = pd.DataFrame(train_1)
    
    # Trim leading rows: drop the first row for as long as the third remaining
    # row is 0, i.e. stop once the third remaining row is non-zero.
    # NOTE(review): raises IndexError when fewer than three rows remain
    # (every value from the third row onward is 0).
    while (train_1.iloc[1:, :] == 0).iloc[1][i]:
        train_1 = train_1.iloc[1:, :]
        
    # Prophet expects the columns `ds` (timestamp) and `y` (value).
    train_1.reset_index(inplace=True)
    train_1.columns = ['ds','y']
    train_1['ds'] = pd.to_datetime(train_1['ds'])

    train_1['extra_weight'] = 1.0  # default value
    train_1.loc[train_1['ds'] >= train_1['ds'].max() - pd.Timedelta(days=30), 'extra_weight'] = 2.0  # higher value for rows within 30 days of the last date

    m = Prophet(
    changepoint_prior_scale=0.15,
    weekly_seasonality=3,
    daily_seasonality = True,
    seasonality_mode='additive'
    )
    # `extra_weight` enters the model as an additional regressor.
    m.add_regressor('extra_weight')
    # Extra seasonal component with a 30.5-day period.
    m.add_seasonality(name='monthly', period=30.5, fourier_order=3)
    m.fit(train_1)
    future = m.make_future_dataframe(periods=21)
    future['extra_weight'] = 1.0  # every row of `future` uses the default value
    forecast = m.predict(future)
    # Keep only the 21 rows appended by make_future_dataframe.
    forecast_list.append(forecast['yhat'][-21:])
    fig1 = m.plot(forecast)

In [None]:
t = pd.DataFrame()
for i in range(len(forecast_list)):
    a = pd.DataFrame(forecast_list[i])
    k = a.T
    k.reset_index(inplace=True)
    k['index'] = ID_list[i]
    k.columns = range(0,22)
    t = pd.concat([t,k],axis=0)

t[t<0] = 0

In [None]:
IDlist = t[0]
submission3 = submission.loc[IDlist]
t.columns = submission3.columns

In [None]:
t.reset_index(drop=True,inplace=True)
t = np.round(t).astype(int)

In [None]:
id_lst = t['ID'].tolist()

In [None]:
t = t.set_index('ID')

In [None]:
idx_lst = t.index.tolist()

In [None]:
t = t.replace(0, 1)

In [None]:
submission.iloc[idx_lst,1:] = t.loc[idx_lst]

***
나머지 train에 대해서는 2023-02-23 ~ 2023-03-28 을 prophet으로 보간 후 모델링 진행

In [None]:
ttt = train.drop(train.columns[-41:-7], axis=1)

In [None]:
ttt = ttt.set_index('ID')

In [None]:
start_date = '2023-02-23'
end_date = '2023-03-28'
future_dates = pd.date_range(start=start_date, end=end_date)

In [None]:
data = ttt.iloc[:,5:].T
ID_list = data.columns
datanew_columns = list(range(len(data.columns)))
data.columns = datanew_columns

In [None]:
# Fit one Prophet model per column of `data` and collect its predictions
# (yhat) for the dates in future_dates.
forecast_list = []

for i in range(0,len(data.columns)):
    train_1 = data[i]
    train_1 = pd.DataFrame(train_1)
    
    # Trim leading rows: drop the first row for as long as the third remaining
    # row is 0, i.e. stop once the third remaining row is non-zero.
    # NOTE(review): raises IndexError when fewer than three rows remain
    # (every value from the third row onward is 0).
    while (train_1.iloc[1:, :] == 0).iloc[1][i]:
        train_1 = train_1.iloc[1:, :]
    
    # Prophet expects the columns `ds` (timestamp) and `y` (value).
    train_1.reset_index(inplace=True)
    train_1.columns = ['ds','y']
    train_1['ds'] = pd.to_datetime(train_1['ds'])

    train_1['extra_weight'] = 1.0  # default value
    # disabled: raise extra_weight to 1.5 for rows within 7 days of the last date

    m = Prophet(
        changepoint_prior_scale=0.1,
        weekly_seasonality=3,
        daily_seasonality = True,
        seasonality_mode='additive'
    )
    # `extra_weight` enters the model as an additional regressor.
    m.add_regressor('extra_weight')
    # disabled: m.add_seasonality(name='monthly', period=30.5, fourier_order=3)
    m.fit(train_1)
    future = pd.DataFrame({'ds': future_dates, 'extra_weight': 1.0})
    future['extra_weight'] = 1.0  # same value the constructor above already set
    forecast = m.predict(future)
    forecast_list.append(forecast['yhat'])
    fig1 = m.plot(forecast)

In [None]:
t = pd.DataFrame()
for i in range(len(forecast_list)):
    a = pd.DataFrame(forecast_list[i])
    k = a.T
    k.reset_index(inplace=True)
    k['index'] = ID_list[i]
    t = pd.concat([t,k],axis=0)

t[t<0] = 0

In [None]:
t = t.set_index('index')
t = np.round(t).astype(int)

In [None]:
t = t.reset_index(drop=True)

In [None]:
t.index = train.index

In [None]:
train.iloc[:,-41:-7] = t

In [None]:
data = train.set_index('ID').iloc[:,5:].T
ID_list = data.columns
datanew_columns = list(range(len(data.columns)))
data.columns = datanew_columns

In [None]:
# Fit one Prophet model per column of `data` and collect, for each, the last
# 21 predicted values (yhat) in forecast_list.
forecast_list = []

for i in range(0,len(data.columns)):
    train_1 = data[i]
    train_1 = pd.DataFrame(train_1)
    
    # Trim leading rows: drop the first row for as long as the third remaining
    # row is 0, i.e. stop once the third remaining row is non-zero.
    # NOTE(review): raises IndexError when fewer than three rows remain
    # (every value from the third row onward is 0).
    while (train_1.iloc[1:, :] == 0).iloc[1][i]:
        train_1 = train_1.iloc[1:, :]
        
    # Prophet expects the columns `ds` (timestamp) and `y` (value).
    train_1.reset_index(inplace=True)
    train_1.columns = ['ds','y']
    train_1['ds'] = pd.to_datetime(train_1['ds'])

    train_1['extra_weight'] = 1.0  # default value
    train_1.loc[train_1['ds'] >= train_1['ds'].max() - pd.Timedelta(days=7), 'extra_weight'] = 1.4  # higher value for rows within 7 days of the last date

    m = Prophet(
    changepoint_prior_scale=0.2,
    weekly_seasonality=3,
    daily_seasonality = True,
    seasonality_mode='additive'
    )
    # `extra_weight` enters the model as an additional regressor.
    m.add_regressor('extra_weight')
    # Extra seasonal component with a 30.5-day period.
    m.add_seasonality(name='monthly', period=30.5, fourier_order=3)
    m.fit(train_1)
    future = m.make_future_dataframe(periods=21)
    future['extra_weight'] = 1.0  # every row of `future` uses the default value
    forecast = m.predict(future)
    # Keep only the 21 rows appended by make_future_dataframe.
    forecast_list.append(forecast['yhat'][-21:])
    fig1 = m.plot(forecast)

In [None]:
t = pd.DataFrame()
for i in range(len(forecast_list)):
    a = pd.DataFrame(forecast_list[i])
    k = a.T
    k.reset_index(inplace=True)
    k['index'] = ID_list[i]
    k.columns = range(0,22)
    t = pd.concat([t,k],axis=0)

t[t<0] = 0

In [None]:
IDlist = t[0]
submission3 = submission.loc[IDlist]
t.columns = submission3.columns

In [None]:
t.reset_index(drop=True,inplace=True)
t = np.round(t).astype(int)

In [None]:
id_lst = t['ID'].tolist()

In [None]:
t = t.set_index('ID')

In [None]:
idx_lst = t.index.tolist()

In [None]:
t = t.replace(0, 1)

In [None]:
submission.iloc[idx_lst,1:] = t.loc[idx_lst]

In [None]:
submission.to_csv('./submission/대분류3번_역작_Prophet.csv', index = False)