# Import

In [None]:
import pandas as pd
import numpy as np
import scipy
from scipy import stats
import pylab 
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
sns.set()
%matplotlib inline

In [1]:
train = pd.read_csv('../data/train.csv')
weather = pd.read_csv('../data/weather.csv')
key = pd.read_csv('../data/key.csv')
test = pd.read_csv('../data/test.csv')
sample = pd.read_csv('../data/sampleSubmission.csv')

# 함수

In [2]:
def M_transform(df, column): #moving average (recursive)
    tmp_sorted = df.sort_values(by=['station_nbr','date']).reset_index(drop=True)
    weather_new = pd.DataFrame(columns=['station_nbr', 'date', 'tmax', 'tmin', 'tavg', 'depart', 'dewpoint',
           'wetbulb', 'heat', 'cool', 'sunrise', 'sunset', 'codesum', 'snowfall',
           'preciptotal', 'stnpressure', 'sealevel', 'resultspeed', 'resultdir',
           'avgspeed'])
    window = 2
    for j in range(1, 21):
        tmp_station = []
        tmp_station = tmp_sorted[tmp_sorted['station_nbr'] == j].reset_index(drop=True)
        if j == 5:
            weather_new = pd.concat([weather_new, tmp_station])
        else:
            for i in range(len(tmp_station)):
                if tmp_station[column].at[i] == 'M':
                    tmp = 0.0
                    if i == 0:
                        result = 0.0
                    else:
                        for x in range(1, window + 1):
                            tmp += float(tmp_station[column].at[i - x])
                            result = float(round(tmp / window, 3))
                            tmp_station[column].set_value(i, result)
            weather_new = pd.concat([weather_new, tmp_station]).reset_index(drop=True)
    return weather_new

In [3]:
def TM_transform(series, T_replace, M_replace):  # Temporary solution
    """
    데이터내의 T, M을 원하는 값으로 바꿔주는 함수
    TM_transform(series, T_replace)
    """
    series = series.astype(str).map(lambda s: s.strip())
    series[series == 'T'] = T_replace
    series[series == 'M'] = M_replace
    return series.astype('float')

In [4]:
def T_transform(series, T_replace): 
    """
    데이터내의 T, M을 원하는 값으로 바꿔주는 함수
    TM_transform(series, T_replace)
    """
    series = series.astype(str).map(lambda s: s.strip())
    series[series == 'T'] = T_replace
    series[series == 'M'] = 'M'
    return series

In [5]:
def get_item_nbr(df) : # 모든 units이 0이 아닌 item_nbr을 구하는 함수, list형태로 return
    tmp = df.pivot_table(values = 'log_units', index = ['date'], columns = ['item_nbr'])
    tmp = tmp.loc[:, (tmp != 0).any(axis = 0)]
    tmp.loc['2012-12-25'] = 0 # 2012-12-25가 빠져있음 train data에서.. 그래서 log_units = 0으로 넣어줌.
    
    tmp.reset_index(inplace = True)
    tmp.sort_values(by = 'date', inplace = True)
    tmp.drop(['date'], axis = 1, inplace = True)
    
    result = list(tmp.columns)
    
    return result

In [6]:
def match_dateformat(df, year):
    """
    영문 월을 숫자 월로 바꾸어주고 나중에 사용하기 쉽도록 datetime.date 형태로 바꾸어주는 함수
    """
    months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
    for i in range(len(df)):
        dates = df.loc[i][0]
        dates = dates.split(" ")
        for j in range(len(months)):
            if dates[0] == months[j]:
                dates[0] = str(j + 1)
                dates_df = ["{} {} {}".format(year, dates[0], dates[1])]
                dates_df = pd.to_datetime(dates_df)
                df.loc[i][0] = dates_df.date[0]
    return df

In [7]:
def merge_holiday(holiday_df1, holiday_df2, holiday_df3):
    """
    각 연도별 공휴일 리스트 합치기
    """
    frame = [holiday_df1, holiday_df2, holiday_df3]
    holiday = pd.concat(frame).reset_index(drop=True)
    return holiday

def find_holiday(file, year):
    """
    수요에 영향을 미치는 주요 공휴일을 찾아내는 함수
    """
    holidays = ["New Year's Day", "Martin Luther King Jr. Day", "Valentine's Day",  "President's Day", "Easter Sunday", 
                      "Mother's Day", "Memorial Day", "Father's Day", "Independence Day", "Labor Day", "Columbus Day",
                      "Halloween", "Veterans Day", "Thanksgiving Day", "Black Friday", "Christmas Eve", "Christmas Day", "New Year's Eve"]
    
    holi = pd.read_excel(file, year, header=None)
    holi = match_dateformat(holi, year)
    holiday = pd.DataFrame(columns=[0,1,2,3,4])
    for _ in holidays:
        for i in range(len(holi[2])):
            if _ == holi[2][i]:
                holiday = holiday.append(holi.loc[i])
    return holiday

def cs_preprocessing(codesum):
    codesum_temp = []
    for _ in codesum:
        _ = _.replace('+', '')
        _ = _.replace('-', '')
        if len(_) > 2:
            _1 = _[:2]
            codesum_temp.append(_1)
            _2 = _[2:]
            codesum_temp.append(_2)
        else:
            codesum_temp.append(_)
    codesum = codesum_temp
    return codesum

def weather_flagger(weather):
    codesum_ls = ['FC', 'TS', 'GR', 'RA', 'DZ', 'SN', 'SG', 'GS', 'PL', 'IC', 'FG', 'BR', 'UP', 'HZ', 'FU', 'VA', 'DU', 'DS', 'PO', 'SA', 'SS', 'PY', 'SQ', 'DR', 'SH', 'FZ', 'MI', 'PR', 'BC', 'BL', 'VC']
    weather['date'] = pd.to_datetime(weather['date']) #weather는 글로벌변수
    for i in range(len(weather['codesum'])):
        codesum = weather['codesum'][i].split(" ")
        codesum = cs_preprocessing(codesum)
        for _ in codesum:
            flag = any(code in _ for code in codesum_ls)
            if flag == True:
                weather.set_value(i, '{}_flag'.format(_), 1)
            else:
                weather.set_value(i, 'normal_flag', 1)
    weather['snowfall'] = TM_transform(weather['snowfall'], 0.02, 0.0)
    weather['preciptotal'] = TM_transform(weather['preciptotal'], 0.02, 0.0)
    weather['snow_event'] = np.where(np.where(weather['SN_flag'] == 1, 1, 0) + np.where(weather['snowfall'] > 2, 1, 0) == 2, 1, 0)
    weather['rain_event'] = np.where(np.where(weather['RA_flag'] == 1, 1, 0) + np.where(weather['preciptotal'] > 1, 1, 0) == 2, 1, 0)
    weather['event'] = weather['snow_event'] + weather['rain_event']
    weather['event'] = np.where(weather['event'] >= 1, 1, 0)
    return weather

def preprocessing(df, holiday, weather):
    """
    train데이터를 가공하는 함수
    """
    df['log_units'] = np.log(df['units'] + 1) # logged units
    df['date'] = pd.to_datetime(df['date'])
    df['weekday'] = df.date.dt.weekday  # 월요일이 0 일요일이 6
    df['weekend'] = df.date.dt.weekday.isin([5, 6])  # 5: 토요일, 6: 일요일

    df['holiday'] = df.date.isin(holiday[0])
    df['weekday_holiday'] = df.holiday & (df.weekend == False)
    df['weekend_holiday'] = df.holiday & df.weekend
    
    df = pd.merge(df, key, on='store_nbr') #key는 글로벌 변수
    df = pd.merge(df, weather[['date', 'station_nbr', 'tmax', 'tmin', 'tavg', 'depart', 'dewpoint', 'wetbulb', 'heat', 'cool',
                               'sunrise', 'sunset', 'codesum', 'snowfall', 'preciptotal', 'stnpressure', 'sealevel', 'resultspeed',
                               'resultspeed', 'resultdir', 'avgspeed' ,'event']], on=['date', 'station_nbr'])
    return df

def test_preprocessing(df, holiday, weather):
    weather = weather_flagger(weather)
    df['date'] = pd.to_datetime(df['date'])
    df['weekday'] = df.date.dt.weekday  # 월요일이 0 일요일이 6
    df['weekend'] = df.date.dt.weekday.isin([5, 6])  # 5: 토요일, 6: 일요일

    df['holiday'] = df.date.isin(holiday[0])
    df['weekday_holiday'] = df.holiday & (df.weekend == False)
    df['weekend_holiday'] = df.holiday & df.weekend
    
    df = pd.merge(df, key, on='store_nbr') #key는 글로벌 변수
    df = pd.merge(df, weather[['date', 'station_nbr', 'tmax', 'tmin', 'tavg', 'depart', 'dewpoint', 'wetbulb', 'heat', 'cool',
                               'sunrise', 'sunset', 'codesum', 'snowfall', 'preciptotal', 'stnpressure', 'sealevel', 'resultspeed',
                               'resultspeed', 'resultdir', 'avgspeed' ,'event']], on=['date', 'station_nbr'])
    return df

## 데이터 전처리

In [8]:
train_df = train[train['date'] <= '2013-03-31'].reset_index(drop=True)

In [9]:
weather_df = weather[weather['date'] <= '2013-03-31'].reset_index(drop = True)

In [10]:
holiday12 = find_holiday('../data/holiday.xlsx', '2012')
holiday13 = find_holiday('../data/holiday.xlsx', '2013')
holiday14 = find_holiday('../data/holiday.xlsx', '2014')
holiday = merge_holiday(holiday12, holiday13, holiday14)
weather_df = weather_flagger(weather_df)
processed_train = preprocessing(train, holiday, weather_df)
processed_test = test_preprocessing(test, holiday, weather)

# 데이터 분포 알아보기

### EDA

OLS의 기본 가정인 종속 변수 y가 독립변수 x의 선형 조합으로 결정되는 기댓값과 고정된 분산 $\sigma^{2}$를 가지는 정규 분포인지 알아보자

In [None]:
train_norm_test = train_df.pivot_table(values='units', index=['store_nbr', 'date'], columns=['item_nbr'])
train_df['log_units'] = np.log(train_df['units'] + 1)
train_norm_test_log = train_df.pivot_table(values='log_units', index=['store_nbr', 'date'], columns=['item_nbr'])

In [None]:
# Let's draw per item_nbr per store_nbr * with units
for i in range(1,2):  #원래는 1~45
    temp = []
    temp2 = []
    t = []
    temp = train_norm_test.loc[i]
    temp2 = train_norm_test_log.loc[i]
    temp = temp.loc[:, (temp !=0).any(axis=0)]
    temp2 = temp2.loc[:, (temp2 !=0).any(axis=0)]
    t = list(temp.columns)
    for j in t:
        plt.figure(figsize = (30, 20))
        plt.subplot(4,2,1)
        sns.distplot(temp[j], kde=True, fit=scipy.stats.norm)
        plt.title('Units')
        plt.subplot(4,2,2)
        sns.distplot(temp2[j], kde=True, fit=scipy.stats.norm)
        plt.title('Log_unit')
        plt.subplot(4,2,3)
        scipy.stats.probplot(temp[j], dist="norm", plot=pylab)
        plt.subplot(4,2,4)
        scipy.stats.probplot(temp2[j], dist="norm", plot=pylab)
        pylab.show()
        result_ks = scipy.stats.kstest(temp[j], cdf='norm')
        result_ks_log = scipy.stats.kstest(temp2[j], cdf='norm')
        print('Unit - test statistic: {}, p-value: {}'.format(result_ks[0], result_ks[1]))
        print('Log Unit - test statistic: {}, p-value: {}'.format(result_ks_log[0], result_ks_log[1]))
        print("Unit - Skewness: %f" % temp[j].skew())
        print("Unit - Kurtosis: %f" % temp[j].kurt())
        print("Log Unit - Skewness: %f" % temp2[j].skew())
        print("Log Unit - Kurtosis: %f" % temp2[j].kurt())
    # It seems like improving the normality!

로그를 취하지 않은 종속변수 y값은 

로그를 취한 유닛으로 시간별 판매량을 플롯함으로써 추세, 계절성이 있는지 알아보자

여기에서 UCL은 평균값에 2 Sigma를 더한값이라 하고, 범위안에 속하지 못하는 데이터는 odd하다고 가정한다. (95.45%) 

In [None]:
for i in range(1, 2): #원래는 1~45
    tmp = []
    tmp = train_df[train_df['store_nbr'] == i]
    tmp_sold = tmp[tmp['units'] > 0]
    tmp = pd.concat([tmp[tmp['item_nbr'] == num] for num in tmp_sold['item_nbr'].unique()])
    for j in tmp['item_nbr'].unique():
        tmp_item = []
        tmp_item = tmp[tmp['item_nbr'] == j]
        tmp_item['index'] = [k for k in range(len(tmp_item))]
        mean = tmp_item['log_units'].mean()
        std = tmp_item['log_units'].std()
        sig = np.sqrt(std)
        UCL = (sig*2) + mean # 2sigma 95.45%
        tmp_item['UCL'] = UCL
        tmp_item_odd = tmp_item[tmp_item['log_units'] > UCL]
        ax = tmp_item.plot(x='date', y='log_units', kind='line', figsize=(20,2), title=('{} Store, {} Item'.format(i, j)))
        tmp_item.plot(x='date', y='UCL', kind='line', style=':', ax=ax)
        if len(tmp_item_odd) != 0:
            tmp_item_odd.plot(x='index', y='log_units', kind='scatter', color='r', ax=ax)
#          plt.title('{} Store, {} Item'.format(i, j))
        plt.show()

이 oddity는 어디서 나온 것일까? 요일별로 판매량이 다른지 알아보자

In [None]:
processed_train_f1 = processed_train.pivot_table(values='units', index=['weekday'], aggfunc=np.sum)
processed_train_f2 = processed_train[processed_train['units'] > 0].reset_index(drop=True)
processed_train_f3 = processed_train[processed_train['log_units'] > 0].reset_index(drop=True)


fig, axes = plt.subplots(nrows=1, ncols=3, figsize = (25, 7))
# unit vs weekday
processed_train_f1.plot(kind='line', style='r', ax=axes[0])
# axes[1].set_ylim([0,200])
plt.xticks(rotation=0)
axes[0].set_ylabel('Total Units')
axes[0].set_xlabel('Weekday')
processed_train_f2.boxplot("units", "weekday", ax=axes[1])
axes[1].set_ylim([0,200])
plt.xticks(rotation=0)
axes[1].set_ylabel('Units')
axes[1].set_xlabel('Weekday')
processed_train_f3.boxplot("log_units", "weekday", ax=axes[2])
axes[2].set_ylim([0,7.5])
plt.xticks(rotation=0)
axes[2].set_ylabel('Units')
axes[2].set_xlabel('Weekday')
plt.show()

유닛의 총량으로 보았을때는 차이가 분명히 나타나지만 boxplot으로 보았을때 커다란 차이점을 보기는 힘들다. (특히 유닛에 로그를 취했을 경우)

In [11]:
processed_train = preprocessing(train_df, holiday, weather_df)
processed_train['date'] = processed_train['date'].apply(lambda x:x.date().strftime('%Y-%m-%d'))

In [None]:
for i in range(1, 2):  #원래는 1~45
    tmp = []
    tmp_sold = []
    tmp = processed_train[processed_train['store_nbr'] == i]
    tmp_sold = tmp[tmp['log_units'] > 0]
    tmp = pd.concat([tmp[tmp['item_nbr'] == num] for num in tmp_sold['item_nbr'].unique()])
    for j in tmp['item_nbr'].unique():
        tmp_item = []
        tmp_item_odd = []
        tmp_item_mon = []
        tmp_item_tue = []
        tmp_item_wed = []
        tmp_item_thu = []
        tmp_item_fri = []
        tmp_item_sat = []
        tmp_item_sun = []
        tmp_item = tmp[tmp['item_nbr'] == j]
        tmp_item['index'] = [k for k in range(len(tmp_item))]
        mean = tmp_item['log_units'].mean()
        std = tmp_item['log_units'].std()
        sig = np.sqrt(std)
        UCL = (sig*2) + mean # 2sigma 95.45%
        tmp_item['UCL'] = UCL
        tmp_item_odd = tmp_item[tmp_item['log_units'] > UCL]
        tmp_item_mon = tmp_item_odd[tmp_item_odd['weekday'] == 0]
        tmp_item_tue = tmp_item_odd[tmp_item_odd['weekday'] == 1]
        tmp_item_wed = tmp_item_odd[tmp_item_odd['weekday'] == 2]
        tmp_item_thu = tmp_item_odd[tmp_item_odd['weekday'] == 3]
        tmp_item_fri = tmp_item_odd[tmp_item_odd['weekday'] == 4]
        tmp_item_sat = tmp_item_odd[tmp_item_odd['weekday'] == 5]
        tmp_item_sun = tmp_item_odd[tmp_item_odd['weekday'] == 6]
        ax = tmp_item.plot(x='date', y='log_units', kind='line', figsize=(20,4), title=('{} Store, {} Item'.format(i, j)))
        tmp_item.plot(x='date', y='UCL', kind='line', style=':', ax=ax)
        if len(tmp_item_odd) != 0:
            tmp_item_odd.plot(x='index', y='log_units', kind='scatter', color='r', alpha='0.0', ax=ax)
            if len(tmp_item_mon) != 0:
                tmp_item_mon.plot(x='index', y='log_units', kind='scatter', color='#FFA500', ax=ax) # orange
                print('Monday: {}'.format(len(tmp_item_mon)))
#                 continue
            if len(tmp_item_tue) != 0:
                tmp_item_tue.plot(x='index', y='log_units', kind='scatter', color='#FF69B4', ax=ax) # pink
                print('Tuesday: {}'.format(len(tmp_item_tue)))
#                 continue
            if len(tmp_item_wed) !=0:
                tmp_item_wed.plot(x='index', y='log_units', kind='scatter', color='y', ax=ax)
                print('Wednsday: {}'.format(len(tmp_item_wed)))
#                 continue
            if len(tmp_item_thu) != 0:
                tmp_item_thu.plot(x='index', y='log_units', kind='scatter', color='g', ax=ax)
                print('Thurday: {}'.format(len(tmp_item_thu)))
#                 continue
            if len(tmp_item_fri) != 0:
                tmp_item_fri.plot(x='index', y='log_units', kind='scatter', color='b', ax=ax)
                print('Friday: {}'.format(len(tmp_item_fri)))
#                 continue
            if len(tmp_item_sat) != 0:
                tmp_item_sat.plot(x='index', y='log_units', kind='scatter', color='#00008B', ax=ax) # darkblue
                print('Satday: {}'.format(len(tmp_item_sat)))
#                 continue
            if len(tmp_item_sun) != 0:    
                tmp_item_sun.plot(x='index', y='log_units', kind='scatter', color='m', ax=ax)            
                print('Sunday: {}'.format(len(tmp_item_sun)))
#                 continue                            
            plt.show()
        plt.show()

공휴일이 미치는 영향도 알아보자

In [None]:
processed_train_f4 = processed_train[processed_train['units'] > 0]
processed_train_f5 = processed_train[processed_train['log_units'] > 0]
fig, axes = plt.subplots(nrows=1, ncols=2, figsize = (10, 7))
processed_train_f4.boxplot("units", "holiday", ax=axes[0])
axes[0].set_ylim([0,150])
processed_train_f5.boxplot("log_units", "holiday", ax=axes[1])
axes[1].set_ylim([0,7])
plt.show()

그렇다면 이 odd한 날이 weather의 영향을 받은 날일까? 

문제에서 preciptotal이 1이상 snowfall이 2이상인날을 weather event, 즉 stormy weather한 날이라고 정의하였다. 

이 날짜에 맞춰 event가 발생하였다고 가정한다.

In [None]:
event_pivot_station_train = processed_train.pivot_table(values='event', index='date', columns='station_nbr')
event_pivot_station_test = processed_test.pivot_table(values='event', index='date', columns='station_nbr')
event_pivot_store_train = processed_train.pivot_table(values='event', index='date', columns='store_nbr')
event_pivot_store_test = processed_test.pivot_table(values='event', index='date', columns='store_nbr')
event_pivot_station_train = pd.DataFrame(event_pivot_station_train.to_records())
event_pivot_station_test = pd.DataFrame(event_pivot_station_test.to_records())
event_pivot_store_train = pd.DataFrame(event_pivot_store_train.to_records())
event_pivot_store_test = pd.DataFrame(event_pivot_store_test.to_records())

### Weather Event by Store

문제에서 정의한 event를 각 스토어 별로 플롯해본다.

In [None]:
# date_index_train = [str(event_pivot_store_train.at[num, 'date'])for num in range(len(event_pivot_store_train))]
# # date_index_test = [str(event_pivot_store_test.at[num, 'date'])for num in range(len(event_pivot_store_test))]
# date_index_test = event_pivot_store_test['date'].apply(lambda x:x.date().strftime('%Y-%m-%d'))

# f,  [ax1, ax2, ax3, ax4, ax5, ax6, ax7, ax8, ax9, ax10, ax11, ax12, ax13, ax14, ax15, ax16, ax17, ax18, ax19, ax20, ax21
#     , ax22, ax23, ax24, ax25, ax26, ax27, ax28, ax29, ax30, ax31, ax32, ax33, ax34, ax35, ax36, ax37, ax38, ax39, ax40
#     , ax41, ax42, ax43, ax44, ax45] = plt.subplots(45, sharex = True, figsize = (30, 30))
# ax_tp = [ax1, ax2, ax3, ax4, ax5, ax6, ax7, ax8, ax9, ax10, ax11, ax12, ax13, ax14, ax15, ax16, ax17, ax18, ax19, ax20, ax21
#     , ax22, ax23, ax24, ax25, ax26, ax27, ax28, ax29, ax30, ax31, ax32, ax33, ax34, ax35, ax36, ax37, ax38, ax39, ax40
#     , ax41, ax42, ax43, ax44, ax45]
# plt.ylim(0, 1)

# for i in range(1, len(ax_tp) + 1):
#     if i != 35:
#         ax_tp[i - 1].scatter(date_index_train, event_pivot_store_train[str(i)])
#         ax_tp[i - 1].scatter(date_index_test, event_pivot_store_test[str(i)])
#         ax_tp[i - 1].set_ylabel('{}'.format(i))   # Y 라벨 
#     else:
#         try:
#             ax_tp[i - 1].scatter(date_index_train, event_pivot_store_train[str(i)])
#             ax_tp[i - 1].scatter(date_index_test, event_pivot_store_test[str(i)])
#             ax_tp[i - 1].set_ylabel('{}'.format(i))   # Y 라벨 
#         except:
#             pass

# ax_tp[0].set_title("Event")
# plt.xlabel('date') # X 라벨
# f.subplots_adjust(hspace = 1.25) 
# # plt.setp([a.get_xticklabels() for a in f.axes[:-1]], visible = False)

위에서 보다시피 5번 스테이션은 데이터가 존재하지 않아 그에 해당하는 35번 스토어는 event를 알수 없다.

위에서 플롯한 추세, 계절성 플롯에서 odd한 값들이 event에 영향을 받은것인지 알아보기 위해 같은 플롯을 그려보자

event는 초록색 odd는 빨간색 둘이 겹치는 부분은 두색의 합으로 나타난다.

In [None]:
for i in range(1, 2): #원래는 1~45
    tmp = []
    tmp_item_flag = []
    tmp = processed_train[processed_train['store_nbr'] == i]
    tmp_sold = tmp[tmp['log_units'] > 0]
    tmp = pd.concat([tmp[tmp['item_nbr'] == num] for num in tmp_sold['item_nbr'].unique()])
    for j in tmp['item_nbr'].unique():
        tmp_item = tmp[tmp['item_nbr'] == j]
        tmp_item['index'] = [k for k in range(len(tmp_item))]
        mean = tmp_item['log_units'].mean()
        std = tmp_item['log_units'].std()
        sig = np.sqrt(std)
        UCL = (sig*2) + mean # 2sigma 95.45%
        tmp_item['UCL'] = UCL
        tmp_item_odd = tmp_item[tmp_item['log_units'] > UCL]
        tmp_item_event = tmp_item[tmp_item['event'] > 0]
        tmp_item_flag = tmp_item_event[tmp_item_event['log_units'] > UCL]
        ax = tmp_item.plot(x='date', y='log_units', kind='line', figsize=(20,4), title=('{} Store, {} Item'.format(i, j)))
        tmp_item.plot(x='date', y='UCL', kind='line', style=':', ax=ax)
        if len(tmp_item_odd) != 0:
            tmp_item_event.plot(x='index', y='log_units', kind='scatter', color='g', ax=ax)
            tmp_item_odd.plot(x='index', y='log_units', kind='scatter', color='r', ax=ax)
            if len(tmp_item_flag) !=0:
                tmp_item_flag.plot(x='index', y='log_units', kind='scatter', color='c', ax=ax)
                print('Warning! : {}, Match: {}'.format(tmp_item_flag['date'], len(tmp_item_flag)))
        else:
            tmp_item_event.plot(x='index', y='log_units', kind='scatter', color='g', ax=ax)
        plt.show()

event와 odd가 겹쳐질때 warning을 주도록 하였는데 겹치는 경우가 희소하다. 

그렇다면 다른 weather정보가 unit에 영향을 미치는지 알아보도록 하자

In [None]:
weather_tmp = weather[weather['date'] <= '2013-03-31'].reset_index(drop = True)

In [None]:
w_col_ls = list(weather_tmp.columns)
for col in w_col_ls:
    if col == 'date':
        weather_tmp[col] = weather_tmp[col]
    elif col == 'sunrise':
        weather_tmp[col] = weather_tmp[col]
    elif col == 'sunset':
        weather_tmp[col] = weather_tmp[col]
    elif col == 'codesum':
        weather_tmp[col] = weather_tmp[col]
    else:
        weather_tmp[col] = TM_transform(weather_tmp[col], 0.001, np.nan)

In [None]:
weather_tmp = weather_tmp.replace('M', np.nan)

In [None]:
# weather_tmp['date'] = weather_tmp['date'].apply(lambda x:x.date().strftime('%Y-%m-%d'))
# weather_tmp = weather_tmp.dropna(how='any', axis=0)
# weather_tmp = weather_tmp.astype('float')
weather_tmp = weather_tmp.reset_index(drop=True)

In [None]:
sns.pairplot(weather_tmp, x_vars=['tmax', 'tmin', 'tavg', 'dewpoint', 'wetbulb', 'heat', 'cool',
       'preciptotal', 'stnpressure', 'resultspeed', 'avgspeed'], y_vars=['tmax'], kind='reg')
plt.show()
sns.pairplot(weather_tmp, x_vars=['tmax', 'tmin', 'tavg', 'dewpoint', 'wetbulb', 'heat', 'cool',
       'preciptotal', 'stnpressure', 'resultspeed', 'avgspeed'], y_vars=['tmin'], kind='reg')
plt.show()
sns.pairplot(weather_tmp, x_vars=['tmax', 'tmin', 'tavg', 'dewpoint', 'wetbulb', 'heat', 'cool',
       'preciptotal', 'stnpressure', 'resultspeed', 'avgspeed'], y_vars=['tavg'], kind='reg')
plt.show()
sns.pairplot(weather_tmp, x_vars=['tmax', 'tmin', 'tavg', 'dewpoint', 'wetbulb', 'heat', 'cool',
       'preciptotal', 'stnpressure', 'resultspeed', 'avgspeed'], y_vars=['dewpoint'], kind='reg')
plt.show()
sns.pairplot(weather_tmp, x_vars=['tmax', 'tmin', 'tavg', 'dewpoint', 'wetbulb', 'heat', 'cool',
       'preciptotal', 'stnpressure', 'resultspeed', 'avgspeed'], y_vars=['wetbulb'], kind='reg')
plt.show()
sns.pairplot(weather_tmp, x_vars=['tmax', 'tmin', 'tavg', 'dewpoint', 'wetbulb', 'heat', 'cool',
       'preciptotal', 'stnpressure', 'resultspeed', 'avgspeed'], y_vars=['heat'], kind='reg')
plt.show()
sns.pairplot(weather_tmp, x_vars=['tmax', 'tmin', 'tavg', 'dewpoint', 'wetbulb', 'heat', 'cool',
       'preciptotal', 'stnpressure', 'resultspeed', 'avgspeed'], y_vars=['cool'], kind='reg')
plt.show()
sns.pairplot(weather_tmp, x_vars=['tmax', 'tmin', 'tavg', 'dewpoint', 'wetbulb', 'heat', 'cool',
       'preciptotal', 'stnpressure', 'resultspeed', 'avgspeed'], y_vars=['preciptotal'], kind='reg')
plt.show()
sns.pairplot(weather_tmp, x_vars=['tmax', 'tmin', 'tavg', 'dewpoint', 'wetbulb', 'heat', 'cool',
       'preciptotal', 'stnpressure', 'resultspeed', 'avgspeed'], y_vars=['stnpressure'], kind='reg')
plt.show()
sns.pairplot(weather_tmp, x_vars=['tmax', 'tmin', 'tavg', 'dewpoint', 'wetbulb', 'heat', 'cool',
       'preciptotal', 'stnpressure', 'resultspeed', 'avgspeed'], y_vars=['resultspeed'], kind='reg')
plt.show()
sns.pairplot(weather_tmp, x_vars=['tmax', 'tmin', 'tavg', 'dewpoint', 'wetbulb', 'heat', 'cool',
       'preciptotal', 'stnpressure', 'resultspeed', 'avgspeed'], y_vars=['avgspeed'], kind='reg')
plt.show()

temperature와 dewpoint wetbulb, cool과 heat외에는 상관관계가 없어보인다. 상관계수를 알아보자

In [None]:
train_df['log_units'] = np.log(train_df['units'] + 1)

In [None]:
tmp = pd.merge(train_df, key, on = 'store_nbr')

In [None]:
tmp = tmp[tmp['units'] != 0]

각 weather station별로 units이 0이 아닌 item_nbr들의 units와 weather column들 간의 correlation을 살펴보자

In [None]:
weather_tmp.drop(['RA_flag', 'FZ_flag', 'FG_flag', 'BR_flag', 'normal_flag',
       'UP_flag', 'MI_flag', 'SN_flag', 'HZ_flag', 'TS_flag', 'VC_flag',
       'DZ_flag', 'BL_flag', 'BC_flag', 'DU_flag', 'SQ_flag', 'PL_flag',
       'FU_flag', 'GR_flag', 'GS_flag', 'SG_flag', 'PR_flag', 'snow_event',
       'rain_event'], axis = 1, inplace = True)

In [None]:
for x in range(1, 2) : #station_nbr은 1~20까지....
    weather_cor = weather_tmp[weather_tmp['station_nbr'] == x]
    tmp_ = tmp[tmp['station_nbr'] == x]
    weather_cor1 = pd.merge(weather_cor, tmp_, on = 'station_nbr')
    item_nbr_list = list(weather_cor1['item_nbr'].unique())

    for num in item_nbr_list :
        weather_cor_ = weather_cor1[weather_cor1['item_nbr'] == num]
        plt.figure(figsize=(20,15))
        sns.heatmap(weather_cor_.corr(), annot = True, fmt = '.2f')

우리가 관심있는 units와 log_units는 weather와는 별로 연관성이 없어보인다. 어떻게 할까?

# Feature Selection

위에서 보았다시피 각 스토어, 각 아이템별로 나누어 모델링을 해야함은 분명하다

weather와 log_units(또는 units)의 큰 상관관계는 없는 것으로 보인다.

그나마 weekday와 holiday가 log_units(또는 units)에 영향을 미치는 것으로 보인다(item_nbr에 따라 다름).

## Modeling

In [12]:
contrast_weekday = np.eye(7)
result_test = [] #outlier 제거 전
result_test2 = [] #outlier 제거 후

In [13]:
processed_train.drop(['weekend', 'weekday_holiday', 'weekend_holiday',
       'station_nbr', 'tmax', 'tmin', 'tavg', 'depart', 'dewpoint', 'wetbulb',
       'heat', 'cool', 'sunrise', 'sunset', 'codesum', 'snowfall',
       'preciptotal', 'stnpressure', 'sealevel', 'resultspeed', 'resultspeed',
       'resultdir', 'avgspeed'], axis = 1, inplace = True)

In [14]:
processed_test.drop(['weekend', 'weekday_holiday', 'weekend_holiday', 'station_nbr', 'tmax', 'tmin',
       'tavg', 'depart', 'dewpoint', 'wetbulb', 'heat', 'cool', 'sunrise',
       'sunset', 'codesum', 'snowfall', 'preciptotal', 'stnpressure',
       'sealevel', 'resultspeed', 'resultspeed', 'resultdir', 'avgspeed'], axis = 1, inplace = True)

In [15]:
df_1 = processed_train[processed_train['store_nbr'] == 1].reset_index(drop = True)

model1 = sm.OLS.from_formula('log_units ~ C(item_nbr):C(weekday, contrast_weekday) + 0', data = df_1)
result_model1 = model1.fit()
print(result_model1.summary())

test_1 = processed_test[processed_test['store_nbr'] == 1]

test_1['log_units'] = result_model1.predict(test_1)
test_1['units'] = np.exp(test_1['log_units']) - 1
result_test.append(test_1)

                            OLS Regression Results                            
Dep. Variable:              log_units   R-squared:                       0.842
Model:                            OLS   Adj. R-squared:                  0.839
Method:                 Least Squares   F-statistic:                     340.5
Date:                Wed, 14 Mar 2018   Prob (F-statistic):               0.00
Time:                        10:39:46   Log-Likelihood:                 23410.
No. Observations:               50505   AIC:                        -4.527e+04
Df Residuals:                   49728   BIC:                        -3.840e+04
Df Model:                         776                                         
Covariance Type:            nonrobust                                         
                                                             coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # Remove the CWD from sys.path while we load stuff.


In [16]:
item_nbr_list_1 = get_item_nbr(df_1) #1번 store에서 팔린 item_nbr만 list로 저장

influence1 = result_model1.get_influence()

cooks_d2_1, pvals1 = influence1.cooks_distance # fox outlier recommendation으로 outlier 판별
fox_cr1 = 4 / (len(df_1) - 778)
idx1 = np.where(cooks_d2_1 > fox_cr1)[0]

print(len(idx1)) #outlier 갯수

for num in idx1 : # outlier들을 각 item_nbr별 평균치로 대체
    for item_nbr in item_nbr_list_1 :
        if item_nbr == df_1.loc[num].item_nbr :
            mean = df_1[df_1['item_nbr'] == item_nbr]['log_units'].mean()
            df_1.set_value(num, 'log_units', mean)
            df_1.set_value(num, 'units', np.exp(mean) - 1)

model1_new = sm.OLS.from_formula('log_units ~ C(item_nbr):C(weekday, contrast_weekday) + 0', data = df_1)
result_model1_new = model1_new.fit()
print(result_model1_new.summary())

test_1_new = processed_test[processed_test['store_nbr'] == 1]

test_1_new['log_units'] = result_model1_new.predict(test_1_new)
test_1_new['units'] = np.exp(test_1_new['log_units']) - 1
result_test2.append(test_1_new)

1506
                            OLS Regression Results                            
Dep. Variable:              log_units   R-squared:                       0.995
Model:                            OLS   Adj. R-squared:                  0.995
Method:                 Least Squares   F-statistic:                 1.219e+04
Date:                Wed, 14 Mar 2018   Prob (F-statistic):               0.00
Time:                        10:40:05   Log-Likelihood:             1.1342e+05
No. Observations:               50505   AIC:                        -2.253e+05
Df Residuals:                   49728   BIC:                        -2.184e+05
Df Model:                         776                                         
Covariance Type:            nonrobust                                         
                                                             coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [17]:
df_2 = processed_train[processed_train['store_nbr'] == 2].reset_index(drop = True)

model2 = sm.OLS.from_formula('log_units ~ C(item_nbr):C(weekday, contrast_weekday) + 0', data = df_2)
result_model2 = model2.fit()
print(result_model2.summary())

test_2 = processed_test[processed_test['store_nbr'] == 2]

test_2['log_units'] = result_model2.predict(test_2)
test_2['units'] = np.exp(test_2['log_units']) - 1
result_test.append(test_2)

                            OLS Regression Results                            
Dep. Variable:              log_units   R-squared:                       0.968
Model:                            OLS   Adj. R-squared:                  0.968
Method:                 Least Squares   F-statistic:                     1947.
Date:                Wed, 14 Mar 2018   Prob (F-statistic):               0.00
Time:                        10:40:24   Log-Likelihood:                 40354.
No. Observations:               50505   AIC:                        -7.915e+04
Df Residuals:                   49728   BIC:                        -7.229e+04
Df Model:                         776                                         
Covariance Type:            nonrobust                                         
                                                             coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # Remove the CWD from sys.path while we load stuff.


In [18]:
item_nbr_list_2 = get_item_nbr(df_2) #2번 store에서 팔린 item_nbr만 list로 저장

influence2 = result_model2.get_influence()

cooks_d2_2, pvals2 = influence2.cooks_distance # fox outlier recommendation으로 outlier 판별
fox_cr2 = 4 / (len(df_2) - 778)
idx2 = np.where(cooks_d2_2 > fox_cr2)[0]

print(len(idx2)) #outlier 갯수

for num in idx2 : # outlier들을 각 item_nbr별 평균치로 대체
    for item_nbr in item_nbr_list_2 :
        if item_nbr == df_2.loc[num].item_nbr :
            mean = df_2[df_2['item_nbr'] == item_nbr]['log_units'].mean()
            df_2.set_value(num, 'log_units', mean)
            df_2.set_value(num, 'units', np.exp(mean) - 1)

model2_new = sm.OLS.from_formula('log_units ~ C(item_nbr):C(weekday, contrast_weekday) + 0', data = df_2)
result_model2_new = model2_new.fit()
print(result_model2_new.summary())

test_2_new = processed_test[processed_test['store_nbr'] == 2]

test_2_new['log_units'] = result_model2_new.predict(test_2_new)
test_2_new['units'] = np.exp(test_2_new['log_units']) - 1
result_test2.append(test_2_new)

1378
                            OLS Regression Results                            
Dep. Variable:              log_units   R-squared:                       0.999
Model:                            OLS   Adj. R-squared:                  0.999
Method:                 Least Squares   F-statistic:                 7.944e+04
Date:                Wed, 14 Mar 2018   Prob (F-statistic):               0.00
Time:                        10:40:48   Log-Likelihood:             1.3421e+05
No. Observations:               50505   AIC:                        -2.669e+05
Df Residuals:                   49728   BIC:                        -2.600e+05
Df Model:                         776                                         
Covariance Type:            nonrobust                                         
                                                             coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [19]:
df_3 = processed_train[processed_train['store_nbr'] == 3].reset_index(drop = True)

model3 = sm.OLS.from_formula('log_units ~ C(item_nbr):C(weekday, contrast_weekday) + 0', data = df_3)
result_model3 = model3.fit()
print(result_model3.summary())

test_3 = processed_test[processed_test['store_nbr'] == 3]

test_3['log_units'] = result_model3.predict(test_3)
test_3['units'] = np.exp(test_3['log_units']) - 1
result_test.append(test_3)

                            OLS Regression Results                            
Dep. Variable:              log_units   R-squared:                       0.953
Model:                            OLS   Adj. R-squared:                  0.953
Method:                 Least Squares   F-statistic:                     1310.
Date:                Wed, 14 Mar 2018   Prob (F-statistic):               0.00
Time:                        10:41:42   Log-Likelihood:                 33270.
No. Observations:               50505   AIC:                        -6.499e+04
Df Residuals:                   49728   BIC:                        -5.813e+04
Df Model:                         776                                         
Covariance Type:            nonrobust                                         
                                                             coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # Remove the CWD from sys.path while we load stuff.


In [20]:
item_nbr_list_3 = get_item_nbr(df_3) #3번 store에서 팔린 item_nbr만 list로 저장

influence3 = result_model3.get_influence()

cooks_d2_3, pvals3 = influence3.cooks_distance # fox outlier recommendation으로 outlier 판별
fox_cr3 = 4 / (len(df_3) - 778)
idx3 = np.where(cooks_d2_3 > fox_cr3)[0]

print(len(idx3)) #outlier 갯수

for num in idx3 : # outlier들을 각 item_nbr별 평균치로 대체
    for item_nbr in item_nbr_list_3 :
        if item_nbr == df_3.loc[num].item_nbr :
            mean = df_3[df_3['item_nbr'] == item_nbr]['log_units'].mean()
            df_3.set_value(num, 'log_units', mean)
            df_3.set_value(num, 'units', np.exp(mean) - 1)

model3_new = sm.OLS.from_formula('log_units ~ C(item_nbr):C(weekday, contrast_weekday) + 0', data = df_3)
result_model3_new = model3_new.fit()
print(result_model3_new.summary())

test_3_new = processed_test[processed_test['store_nbr'] == 3]

test_3_new['log_units'] = result_model3_new.predict(test_3_new)
test_3_new['units'] = np.exp(test_3_new['log_units']) - 1
result_test2.append(test_3_new)

1254
                            OLS Regression Results                            
Dep. Variable:              log_units   R-squared:                       0.999
Model:                            OLS   Adj. R-squared:                  0.999
Method:                 Least Squares   F-statistic:                 6.360e+04
Date:                Wed, 14 Mar 2018   Prob (F-statistic):               0.00
Time:                        10:42:02   Log-Likelihood:             1.3137e+05
No. Observations:               50505   AIC:                        -2.612e+05
Df Residuals:                   49728   BIC:                        -2.543e+05
Df Model:                         776                                         
Covariance Type:            nonrobust                                         
                                                             coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [21]:
df_4 = processed_train[processed_train['store_nbr'] == 4].reset_index(drop = True)

model4 = sm.OLS.from_formula('log_units ~ C(item_nbr):C(weekday, contrast_weekday) + 0', data = df_4)
result_model4 = model4.fit()
print(result_model4.summary())

test_4 = processed_test[processed_test['store_nbr'] == 4]

test_4['log_units'] = result_model4.predict(test_4)
test_4['units'] = np.exp(test_4['log_units']) - 1
result_test.append(test_4)

                            OLS Regression Results                            
Dep. Variable:              log_units   R-squared:                       0.932
Model:                            OLS   Adj. R-squared:                  0.931
Method:                 Least Squares   F-statistic:                     873.5
Date:                Wed, 14 Mar 2018   Prob (F-statistic):               0.00
Time:                        10:42:33   Log-Likelihood:                 30956.
No. Observations:               50505   AIC:                        -6.036e+04
Df Residuals:                   49728   BIC:                        -5.350e+04
Df Model:                         776                                         
Covariance Type:            nonrobust                                         
                                                             coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # Remove the CWD from sys.path while we load stuff.


In [22]:
item_nbr_list_4 = get_item_nbr(df_4) #4번 store에서 팔린 item_nbr만 list로 저장

influence4 = result_model4.get_influence()

cooks_d2_4, pvals4 = influence4.cooks_distance # fox outlier recommendation으로 outlier 판별
fox_cr4 = 4 / (len(df_4) - 778)
idx4 = np.where(cooks_d2_4 > fox_cr4)[0]

print(len(idx4)) #outlier 갯수

for num in idx4 : # outlier들을 각 item_nbr별 평균치로 대체
    for item_nbr in item_nbr_list_4 :
        if item_nbr == df_4.loc[num].item_nbr :
            mean = df_4[df_4['item_nbr'] == item_nbr]['log_units'].mean()
            df_4.set_value(num, 'log_units', mean)
            df_4.set_value(num, 'units', np.exp(mean) - 1)

model4_new = sm.OLS.from_formula('log_units ~ C(item_nbr):C(weekday, contrast_weekday) + 0', data = df_4)
result_model4_new = model4_new.fit()
print(result_model4_new.summary())

test_4_new = processed_test[processed_test['store_nbr'] == 4]

test_4_new['log_units'] = result_model4_new.predict(test_4_new)
test_4_new['units'] = np.exp(test_4_new['log_units']) - 1
result_test2.append(test_4_new)

1425
                            OLS Regression Results                            
Dep. Variable:              log_units   R-squared:                       0.998
Model:                            OLS   Adj. R-squared:                  0.998
Method:                 Least Squares   F-statistic:                 3.880e+04
Date:                Wed, 14 Mar 2018   Prob (F-statistic):               0.00
Time:                        10:43:09   Log-Likelihood:             1.2641e+05
No. Observations:               50505   AIC:                        -2.513e+05
Df Residuals:                   49728   BIC:                        -2.444e+05
Df Model:                         776                                         
Covariance Type:            nonrobust                                         
                                                             coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [23]:
df_5 = processed_train[processed_train['store_nbr'] == 5].reset_index(drop = True)

model5 = sm.OLS.from_formula('log_units ~ C(item_nbr):C(weekday, contrast_weekday) + 0', data = df_5)
result_model5 = model5.fit()
print(result_model5.summary())

test_5 = processed_test[processed_test['store_nbr'] == 5]

test_5['log_units'] = result_model5.predict(test_5)
test_5['units'] = np.exp(test_5['log_units']) - 1
result_test.append(test_5)

                            OLS Regression Results                            
Dep. Variable:              log_units   R-squared:                       0.907
Model:                            OLS   Adj. R-squared:                  0.906
Method:                 Least Squares   F-statistic:                     624.8
Date:                Wed, 14 Mar 2018   Prob (F-statistic):               0.00
Time:                        10:43:29   Log-Likelihood:                 33353.
No. Observations:               50505   AIC:                        -6.515e+04
Df Residuals:                   49728   BIC:                        -5.829e+04
Df Model:                         776                                         
Covariance Type:            nonrobust                                         
                                                             coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # Remove the CWD from sys.path while we load stuff.


In [24]:
item_nbr_list_5 = get_item_nbr(df_5) #5번 store에서 팔린 item_nbr만 list로 저장

influence5 = result_model5.get_influence()

cooks_d2_5, pvals5 = influence5.cooks_distance # fox outlier recommendation으로 outlier 판별
fox_cr5 = 4 / (len(df_5) - 778)
idx5 = np.where(cooks_d2_5 > fox_cr5)[0]

print(len(idx5)) #outlier 갯수

for num in idx5 : # outlier들을 각 item_nbr별 평균치로 대체
    for item_nbr in item_nbr_list_5 :
        if item_nbr == df_5.loc[num].item_nbr :
            mean = df_5[df_5['item_nbr'] == item_nbr]['log_units'].mean()
            df_5.set_value(num, 'log_units', mean)
            df_5.set_value(num, 'units', np.exp(mean) - 1)

model5_new = sm.OLS.from_formula('log_units ~ C(item_nbr):C(weekday, contrast_weekday) + 0', data = df_5)
result_model5_new = model5_new.fit()
print(result_model5_new.summary())

test_5_new = processed_test[processed_test['store_nbr'] == 5]

test_5_new['log_units'] = result_model5_new.predict(test_5_new)
test_5_new['units'] = np.exp(test_5_new['log_units']) - 1
result_test2.append(test_5_new)

1566
                            OLS Regression Results                            
Dep. Variable:              log_units   R-squared:                       0.997
Model:                            OLS   Adj. R-squared:                  0.997
Method:                 Least Squares   F-statistic:                 2.343e+04
Date:                Wed, 14 Mar 2018   Prob (F-statistic):               0.00
Time:                        10:43:48   Log-Likelihood:             1.2471e+05
No. Observations:               50505   AIC:                        -2.479e+05
Df Residuals:                   49728   BIC:                        -2.410e+05
Df Model:                         776                                         
Covariance Type:            nonrobust                                         
                                                             coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [25]:
df_6 = processed_train[processed_train['store_nbr'] == 6].reset_index(drop = True)

model6 = sm.OLS.from_formula('log_units ~ C(item_nbr):C(weekday, contrast_weekday) + 0', data = df_6)
result_model6 = model6.fit()
print(result_model6.summary())

test_6 = processed_test[processed_test['store_nbr'] == 6]

test_6['log_units'] = result_model6.predict(test_6)
test_6['units'] = np.exp(test_6['log_units']) - 1
result_test.append(test_6)

                            OLS Regression Results                            
Dep. Variable:              log_units   R-squared:                       0.982
Model:                            OLS   Adj. R-squared:                  0.982
Method:                 Least Squares   F-statistic:                     3561.
Date:                Wed, 14 Mar 2018   Prob (F-statistic):               0.00
Time:                        10:44:23   Log-Likelihood:                 61644.
No. Observations:               50505   AIC:                        -1.217e+05
Df Residuals:                   49728   BIC:                        -1.149e+05
Df Model:                         776                                         
Covariance Type:            nonrobust                                         
                                                             coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # Remove the CWD from sys.path while we load stuff.


In [26]:
item_nbr_list_6 = get_item_nbr(df_6) #6번 store에서 팔린 item_nbr만 list로 저장

influence6 = result_model6.get_influence()

cooks_d2_6, pvals6 = influence6.cooks_distance # fox outlier recommendation으로 outlier 판별
fox_cr6 = 4 / (len(df_6) - 778)
idx6 = np.where(cooks_d2_6 > fox_cr6)[0]

print(len(idx6)) #outlier 갯수

for num in idx6 : # outlier들을 각 item_nbr별 평균치로 대체
    for item_nbr in item_nbr_list_6 :
        if item_nbr == df_6.loc[num].item_nbr :
            mean = df_6[df_6['item_nbr'] == item_nbr]['log_units'].mean()
            df_6.set_value(num, 'log_units', mean)
            df_6.set_value(num, 'units', np.exp(mean) - 1)

model6_new = sm.OLS.from_formula('log_units ~ C(item_nbr):C(weekday, contrast_weekday) + 0', data = df_6)
result_model6_new = model6_new.fit()
print(result_model6_new.summary())

test_6_new = processed_test[processed_test['store_nbr'] == 6]

test_6_new['log_units'] = result_model6_new.predict(test_6_new)
test_6_new['units'] = np.exp(test_6_new['log_units']) - 1
result_test2.append(test_6_new)

1031
                            OLS Regression Results                            
Dep. Variable:              log_units   R-squared:                       0.999
Model:                            OLS   Adj. R-squared:                  0.999
Method:                 Least Squares   F-statistic:                 1.027e+05
Date:                Wed, 14 Mar 2018   Prob (F-statistic):               0.00
Time:                        10:44:41   Log-Likelihood:             1.4613e+05
No. Observations:               50505   AIC:                        -2.907e+05
Df Residuals:                   49728   BIC:                        -2.838e+05
Df Model:                         776                                         
Covariance Type:            nonrobust                                         
                                                             coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [27]:
df_7 = processed_train[processed_train['store_nbr'] == 7].reset_index(drop = True)

model7 = sm.OLS.from_formula('log_units ~ C(item_nbr):C(weekday, contrast_weekday) + 0', data = df_7)
result_model7 = model7.fit()
print(result_model7.summary())

test_7 = processed_test[processed_test['store_nbr'] == 7]

test_7['log_units'] = result_model7.predict(test_7)
test_7['units'] = np.exp(test_7['log_units']) - 1
result_test.append(test_7)

MemoryError: 

In [None]:
item_nbr_list_7 = get_item_nbr(df_7) #7번 store에서 팔린 item_nbr만 list로 저장

influence7 = result_model7.get_influence()

cooks_d2_7, pvals7 = influence7.cooks_distance # fox outlier recommendation으로 outlier 판별
fox_cr7 = 4 / (len(df_7) - 778)
idx7 = np.where(cooks_d2_7 > fox_cr7)[0]

print(len(idx7)) #outlier 갯수

for num in idx7 : # outlier들을 각 item_nbr별 평균치로 대체
    for item_nbr in item_nbr_list_7 :
        if item_nbr == df_7.loc[num].item_nbr :
            mean = df_7[df_7['item_nbr'] == item_nbr]['log_units'].mean()
            df_7.set_value(num, 'log_units', mean)
            df_7.set_value(num, 'units', np.exp(mean) - 1)

model7_new = sm.OLS.from_formula('log_units ~ C(item_nbr):C(weekday, contrast_weekday) + 0', data = df_7)
result_model7_new = model7_new.fit()
print(result_model7_new.summary())

test_7_new = processed_test[processed_test['store_nbr'] == 7]

test_7_new['log_units'] = result_model7_new.predict(test_7_new)
test_7_new['units'] = np.exp(test_7_new['log_units']) - 1
result_test2.append(test_7_new)

In [None]:
df_8 = processed_train[processed_train['store_nbr'] == 8].reset_index(drop = True)

model8 = sm.OLS.from_formula('log_units ~ C(item_nbr):C(weekday, contrast_weekday) + 0', data = df_8)
result_model8 = model8.fit()
print(result_model8.summary())

test_8 = processed_test[processed_test['store_nbr'] == 8]

test_8['log_units'] = result_model8.predict(test_8)
test_8['units'] = np.exp(test_8['log_units']) - 1
result_test.append(test_8)

In [None]:
item_nbr_list_8 = get_item_nbr(df_8) #8번 store에서 팔린 item_nbr만 list로 저장

influence8 = result_model8.get_influence()

cooks_d2_8, pvals8 = influence8.cooks_distance # fox outlier recommendation으로 outlier 판별
fox_cr8 = 4 / (len(df_8) - 778)
idx8 = np.where(cooks_d2_8 > fox_cr8)[0]

print(len(idx8)) #outlier 갯수

for num in idx8 : # outlier들을 각 item_nbr별 평균치로 대체
    for item_nbr in item_nbr_list_8 :
        if item_nbr == df_8.loc[num].item_nbr :
            mean = df_8[df_8['item_nbr'] == item_nbr]['log_units'].mean()
            df_8.set_value(num, 'log_units', mean)
            df_8.set_value(num, 'units', np.exp(mean) - 1)

model8_new = sm.OLS.from_formula('log_units ~ C(item_nbr):C(weekday, contrast_weekday) + 0', data = df_8)
result_model8_new = model8_new.fit()
print(result_model8_new.summary())

test_8_new = processed_test[processed_test['store_nbr'] == 8]

test_8_new['log_units'] = result_model8_new.predict(test_8_new)
test_8_new['units'] = np.exp(test_8_new['log_units']) - 1
result_test2.append(test_8_new)

In [None]:
df_9 = processed_train[processed_train['store_nbr'] == 9].reset_index(drop = True)

model9 = sm.OLS.from_formula('log_units ~ C(item_nbr):C(weekday, contrast_weekday) + 0', data = df_9)
result_model9 = model9.fit()
print(result_model9.summary())

test_9 = processed_test[processed_test['store_nbr'] == 9]

test_9['log_units'] = result_model9.predict(test_9)
test_9['units'] = np.exp(test_9['log_units']) - 1
result_test.append(test_9)

In [None]:
item_nbr_list_9 = get_item_nbr(df_9) #9번 store에서 팔린 item_nbr만 list로 저장

influence9 = result_model9.get_influence()

cooks_d2_9, pvals9 = influence9.cooks_distance # fox outlier recommendation으로 outlier 판별
fox_cr9 = 4 / (len(df_9) - 778)
idx9 = np.where(cooks_d2_9 > fox_cr9)[0]

print(len(idx9)) #outlier 갯수

for num in idx9 : # outlier들을 각 item_nbr별 평균치로 대체
    for item_nbr in item_nbr_list_9 :
        if item_nbr == df_9.loc[num].item_nbr :
            mean = df_9[df_9['item_nbr'] == item_nbr]['log_units'].mean()
            df_9.set_value(num, 'log_units', mean)
            df_9.set_value(num, 'units', np.exp(mean) - 1)

model9_new = sm.OLS.from_formula('log_units ~ C(item_nbr):C(weekday, contrast_weekday) + 0', data = df_9)
result_model9_new = model9_new.fit()
print(result_model9_new.summary())

test_9_new = processed_test[processed_test['store_nbr'] == 9]

test_9_new['log_units'] = result_model9_new.predict(test_9_new)
test_9_new['units'] = np.exp(test_9_new['log_units']) - 1
result_test2.append(test_9_new)

In [None]:
df_10 = processed_train[processed_train['store_nbr'] == 10].reset_index(drop = True)

model10 = sm.OLS.from_formula('log_units ~ C(item_nbr):C(weekday, contrast_weekday) + 0', data = df_10)
result_model10 = model10.fit()
print(result_model10.summary())

test_10 = processed_test[processed_test['store_nbr'] == 10]

test_10['log_units'] = result_model10.predict(test_10)
test_10['units'] = np.exp(test_10['log_units']) - 1
result_test.append(test_10)

In [None]:
item_nbr_list_10 = get_item_nbr(df_10) #10번 store에서 팔린 item_nbr만 list로 저장

influence10 = result_model10.get_influence()

cooks_d2_10, pvals10 = influence10.cooks_distance # fox outlier recommendation으로 outlier 판별
fox_cr10 = 4 / (len(df_10) - 778)
idx10 = np.where(cooks_d2_10 > fox_cr10)[0]

print(len(idx10)) #outlier 갯수

for num in idx10 : # outlier들을 각 item_nbr별 평균치로 대체
    for item_nbr in item_nbr_list_10 :
        if item_nbr == df_10.loc[num].item_nbr :
            mean = df_10[df_10['item_nbr'] == item_nbr]['log_units'].mean()
            df_10.set_value(num, 'log_units', mean)
            df_10.set_value(num, 'units', np.exp(mean) - 1)

model10_new = sm.OLS.from_formula('log_units ~ C(item_nbr):C(weekday, contrast_weekday) + 0', data = df_10)
result_model10_new = model10_new.fit()
print(result_model10_new.summary())

test_10_new = processed_test[processed_test['store_nbr'] == 10]

test_10_new['log_units'] = result_model10_new.predict(test_10_new)
test_10_new['units'] = np.exp(test_10_new['log_units']) - 1
result_test2.append(test_10_new)

In [None]:
df_11 = processed_train[processed_train['store_nbr'] == 11].reset_index(drop = True)

model11 = sm.OLS.from_formula('log_units ~ C(item_nbr):C(weekday, contrast_weekday) + 0', data = df_11)
result_model11 = model11.fit()
print(result_model11.summary())

test_11 = processed_test[processed_test['store_nbr'] == 11]

test_11['log_units'] = result_model11.predict(test_11)
test_11['units'] = np.exp(test_11['log_units']) - 1
result_test.append(test_11)

In [None]:
item_nbr_list_11 = get_item_nbr(df_11) #11번 store에서 팔린 item_nbr만 list로 저장

influence11 = result_model11.get_influence()

cooks_d2_11, pvals11 = influence11.cooks_distance # fox outlier recommendation으로 outlier 판별
fox_cr11 = 4 / (len(df_11) - 778)
idx11 = np.where(cooks_d2_11 > fox_cr11)[0]

print(len(idx11)) #outlier 갯수

for num in idx11 : # outlier들을 각 item_nbr별 평균치로 대체
    for item_nbr in item_nbr_list_11 :
        if item_nbr == df_11.loc[num].item_nbr :
            mean = df_11[df_11['item_nbr'] == item_nbr]['log_units'].mean()
            df_11.set_value(num, 'log_units', mean)
            df_11.set_value(num, 'units', np.exp(mean) - 1)

model11_new = sm.OLS.from_formula('log_units ~ C(item_nbr):C(weekday, contrast_weekday) + 0', data = df_11)
result_model11_new = model11_new.fit()
print(result_model11_new.summary())

test_11_new = processed_test[processed_test['store_nbr'] == 11]

test_11_new['log_units'] = result_model11_new.predict(test_11_new)
test_11_new['units'] = np.exp(test_11_new['log_units']) - 1
result_test2.append(test_11_new)

In [None]:
df_12 = processed_train[processed_train['store_nbr'] == 12].reset_index(drop = True)

model12 = sm.OLS.from_formula('log_units ~ C(item_nbr):C(weekday, contrast_weekday) + 0', data = df_12)
result_model12 = model12.fit()
print(result_model12.summary())

test_12 = processed_test[processed_test['store_nbr'] == 12]

test_12['log_units'] = result_model12.predict(test_12)
test_12['units'] = np.exp(test_12['log_units']) - 1
result_test.append(test_12)

In [None]:
item_nbr_list_12 = get_item_nbr(df_12) #12번 store에서 팔린 item_nbr만 list로 저장

influence12 = result_model12.get_influence()

cooks_d2_12, pvals12 = influence12.cooks_distance # fox outlier recommendation으로 outlier 판별
fox_cr12 = 4 / (len(df_12) - 778)
idx12 = np.where(cooks_d2_12 > fox_cr12)[0]

print(len(idx12)) #outlier 갯수

for num in idx12 : # outlier들을 각 item_nbr별 평균치로 대체
    for item_nbr in item_nbr_list_12 :
        if item_nbr == df_12.loc[num].item_nbr :
            mean = df_12[df_12['item_nbr'] == item_nbr]['log_units'].mean()
            df_12.set_value(num, 'log_units', mean)
            df_12.set_value(num, 'units', np.exp(mean) - 1)

model12_new = sm.OLS.from_formula('log_units ~ C(item_nbr):C(weekday, contrast_weekday) + 0', data = df_12)
result_model12_new = model12_new.fit()
print(result_model12_new.summary())

test_12_new = processed_test[processed_test['store_nbr'] == 12]

test_12_new['log_units'] = result_model12_new.predict(test_12_new)
test_12_new['units'] = np.exp(test_12_new['log_units']) - 1
result_test2.append(test_12_new)

In [None]:
df_13 = processed_train[processed_train['store_nbr'] == 13].reset_index(drop = True)

model13 = sm.OLS.from_formula('log_units ~ C(item_nbr):C(weekday, contrast_weekday) + 0', data = df_13)
result_model13 = model13.fit()
print(result_model13.summary())

test_13 = processed_test[processed_test['store_nbr'] == 13]

test_13['log_units'] = result_model13.predict(test_13)
test_13['units'] = np.exp(test_13['log_units']) - 1
result_test.append(test_13)

In [None]:
item_nbr_list_13 = get_item_nbr(df_13) #13번 store에서 팔린 item_nbr만 list로 저장

influence13 = result_model13.get_influence()

cooks_d2_13, pvals13 = influence13.cooks_distance # fox outlier recommendation으로 outlier 판별
fox_cr13 = 4 / (len(df_13) - 778)
idx13 = np.where(cooks_d2_13 > fox_cr13)[0]

print(len(idx13)) #outlier 갯수

for num in idx13 : # outlier들을 각 item_nbr별 평균치로 대체
    for item_nbr in item_nbr_list_13 :
        if item_nbr == df_13.loc[num].item_nbr :
            mean = df_13[df_13['item_nbr'] == item_nbr]['log_units'].mean()
            df_13.set_value(num, 'log_units', mean)
            df_13.set_value(num, 'units', np.exp(mean) - 1)

model13_new = sm.OLS.from_formula('log_units ~ C(item_nbr):C(weekday, contrast_weekday) + 0', data = df_13)
result_model13_new = model13_new.fit()
print(result_model13_new.summary())

test_13_new = processed_test[processed_test['store_nbr'] == 13]

test_13_new['log_units'] = result_model13_new.predict(test_13_new)
test_13_new['units'] = np.exp(test_13_new['log_units']) - 1
result_test2.append(test_13_new)

In [None]:
df_14 = processed_train[processed_train['store_nbr'] == 14].reset_index(drop = True)

model14 = sm.OLS.from_formula('log_units ~ C(item_nbr):C(weekday, contrast_weekday) + 0', data = df_14)
result_model14 = model14.fit()
print(result_model14.summary())

test_14 = processed_test[processed_test['store_nbr'] == 14]

test_14['log_units'] = result_model14.predict(test_14)
test_14['units'] = np.exp(test_14['log_units']) - 1
result_test.append(test_14)

In [None]:
item_nbr_list_14 = get_item_nbr(df_14) #14번 store에서 팔린 item_nbr만 list로 저장

influence14 = result_model14.get_influence()

cooks_d2_14, pvals14 = influence14.cooks_distance # fox outlier recommendation으로 outlier 판별
fox_cr14 = 4 / (len(df_14) - 778)
idx14 = np.where(cooks_d2_14 > fox_cr14)[0]

print(len(idx14)) #outlier 갯수

for num in idx14 : # outlier들을 각 item_nbr별 평균치로 대체
    for item_nbr in item_nbr_list_14 :
        if item_nbr == df_14.loc[num].item_nbr :
            mean = df_14[df_14['item_nbr'] == item_nbr]['log_units'].mean()
            df_14.set_value(num, 'log_units', mean)
            df_14.set_value(num, 'units', np.exp(mean) - 1)

model14_new = sm.OLS.from_formula('log_units ~ C(item_nbr):C(weekday, contrast_weekday) + 0', data = df_14)
result_model14_new = model14_new.fit()
print(result_model14_new.summary())

test_14_new = processed_test[processed_test['store_nbr'] == 14]

test_14_new['log_units'] = result_model14_new.predict(test_14_new)
test_14_new['units'] = np.exp(test_14_new['log_units']) - 1
result_test2.append(test_14_new)

In [None]:
df_15 = processed_train[processed_train['store_nbr'] == 15].reset_index(drop = True)

model15 = sm.OLS.from_formula('log_units ~ C(item_nbr):C(weekday, contrast_weekday) + 0', data = df_15)
result_model15 = model15.fit()
print(result_model15.summary())

test_15 = processed_test[processed_test['store_nbr'] == 15]

test_15['log_units'] = result_model15.predict(test_15)
test_15['units'] = np.exp(test_15['log_units']) - 1
result_test.append(test_15)

In [None]:
item_nbr_list_15 = get_item_nbr(df_15) #15번 store에서 팔린 item_nbr만 list로 저장

influence15 = result_model15.get_influence()

cooks_d2_15, pvals15 = influence15.cooks_distance # fox outlier recommendation으로 outlier 판별
fox_cr15 = 4 / (len(df_15) - 778)
idx15 = np.where(cooks_d2_15 > fox_cr15)[0]

print(len(idx15)) #outlier 갯수

for num in idx15 : # outlier들을 각 item_nbr별 평균치로 대체
    for item_nbr in item_nbr_list_15 :
        if item_nbr == df_15.loc[num].item_nbr :
            mean = df_15[df_15['item_nbr'] == item_nbr]['log_units'].mean()
            df_15.set_value(num, 'log_units', mean)
            df_15.set_value(num, 'units', np.exp(mean) - 1)

model15_new = sm.OLS.from_formula('log_units ~ C(item_nbr):C(weekday, contrast_weekday) + 0', data = df_15)
result_model15_new = model15_new.fit()
print(result_model15_new.summary())

test_15_new = processed_test[processed_test['store_nbr'] == 15]

test_15_new['log_units'] = result_model15_new.predict(test_15_new)
test_15_new['units'] = np.exp(test_15_new['log_units']) - 1
result_test2.append(test_15_new)

In [None]:
df_16 = processed_train[processed_train['store_nbr'] == 16].reset_index(drop = True)

model16 = sm.OLS.from_formula('log_units ~ C(item_nbr):C(weekday, contrast_weekday) + 0', data = df_16)
result_model16 = model16.fit()
print(result_model16.summary())

test_16 = processed_test[processed_test['store_nbr'] == 16]

test_16['log_units'] = result_model16.predict(test_16)
test_16['units'] = np.exp(test_16['log_units']) - 1
result_test.append(test_16)

In [None]:
item_nbr_list_16 = get_item_nbr(df_16) #16번 store에서 팔린 item_nbr만 list로 저장

influence16 = result_model16.get_influence()

cooks_d2_16, pvals16 = influence16.cooks_distance # fox outlier recommendation으로 outlier 판별
fox_cr16 = 4 / (len(df_16) - 778)
idx16 = np.where(cooks_d2_16 > fox_cr16)[0]

print(len(idx16)) #outlier 갯수

for num in idx16 : # outlier들을 각 item_nbr별 평균치로 대체
    for item_nbr in item_nbr_list_16 :
        if item_nbr == df_16.loc[num].item_nbr :
            mean = df_16[df_16['item_nbr'] == item_nbr]['log_units'].mean()
            df_16.set_value(num, 'log_units', mean)
            df_16.set_value(num, 'units', np.exp(mean) - 1)

model16_new = sm.OLS.from_formula('log_units ~ C(item_nbr):C(weekday, contrast_weekday) + 0', data = df_16)
result_model16_new = model16_new.fit()
print(result_model16_new.summary())

test_16_new = processed_test[processed_test['store_nbr'] == 16]

test_16_new['log_units'] = result_model16_new.predict(test_16_new)
test_16_new['units'] = np.exp(test_16_new['log_units']) - 1
result_test2.append(test_16_new)

In [None]:
df_17 = processed_train[processed_train['store_nbr'] == 17].reset_index(drop = True)

model17 = sm.OLS.from_formula('log_units ~ C(item_nbr):C(weekday, contrast_weekday) + 0', data = df_17)
result_model17 = model17.fit()
print(result_model17.summary())

test_17 = processed_test[processed_test['store_nbr'] == 17]

test_17['log_units'] = result_model17.predict(test_17)
test_17['units'] = np.exp(test_17['log_units']) - 1
result_test.append(test_17)

In [None]:
item_nbr_list_17 = get_item_nbr(df_17) #17번 store에서 팔린 item_nbr만 list로 저장

influence17 = result_model17.get_influence()

cooks_d2_17, pvals17 = influence17.cooks_distance # fox outlier recommendation으로 outlier 판별
fox_cr17 = 4 / (len(df_17) - 778)
idx17 = np.where(cooks_d2_17 > fox_cr17)[0]

print(len(idx17)) #outlier 갯수

for num in idx17 : # outlier들을 각 item_nbr별 평균치로 대체
    for item_nbr in item_nbr_list_17 :
        if item_nbr == df_17.loc[num].item_nbr :
            mean = df_17[df_17['item_nbr'] == item_nbr]['log_units'].mean()
            df_17.set_value(num, 'log_units', mean)
            df_17.set_value(num, 'units', np.exp(mean) - 1)

model17_new = sm.OLS.from_formula('log_units ~ C(item_nbr):C(weekday, contrast_weekday) + 0', data = df_17)
result_model17_new = model17_new.fit()
print(result_model17_new.summary())

test_17_new = processed_test[processed_test['store_nbr'] == 17]

test_17_new['log_units'] = result_model17_new.predict(test_17_new)
test_17_new['units'] = np.exp(test_17_new['log_units']) - 1
result_test2.append(test_17_new)

In [None]:
df_18 = processed_train[processed_train['store_nbr'] == 18].reset_index(drop = True)

model18 = sm.OLS.from_formula('log_units ~ C(item_nbr):C(weekday, contrast_weekday) + 0', data = df_18)
result_model18 = model18.fit()
print(result_model18.summary())

test_18 = processed_test[processed_test['store_nbr'] == 18]

test_18['log_units'] = result_model18.predict(test_18)
test_18['units'] = np.exp(test_18['log_units']) - 1
result_test.append(test_18)

In [None]:
item_nbr_list_18 = get_item_nbr(df_18) #18번 store에서 팔린 item_nbr만 list로 저장

influence18 = result_model18.get_influence()

cooks_d2_18, pvals18 = influence18.cooks_distance # fox outlier recommendation으로 outlier 판별
fox_cr18 = 4 / (len(df_18) - 778)
idx18 = np.where(cooks_d2_18 > fox_cr18)[0]

print(len(idx18)) #outlier 갯수

for num in idx18 : # outlier들을 각 item_nbr별 평균치로 대체
    for item_nbr in item_nbr_list_18 :
        if item_nbr == df_18.loc[num].item_nbr :
            mean = df_18[df_18['item_nbr'] == item_nbr]['log_units'].mean()
            df_18.set_value(num, 'log_units', mean)
            df_18.set_value(num, 'units', np.exp(mean) - 1)

model18_new = sm.OLS.from_formula('log_units ~ C(item_nbr):C(weekday, contrast_weekday) + 0', data = df_18)
result_model18_new = model18_new.fit()
print(result_model18_new.summary())

test_18_new = processed_test[processed_test['store_nbr'] == 18]

test_18_new['log_units'] = result_model18_new.predict(test_18_new)
test_18_new['units'] = np.exp(test_18_new['log_units']) - 1
result_test2.append(test_18_new)

In [None]:
df_19 = processed_train[processed_train['store_nbr'] == 19].reset_index(drop = True)

model19 = sm.OLS.from_formula('log_units ~ C(item_nbr):C(weekday, contrast_weekday) + 0', data = df_19)
result_model19 = model19.fit()
print(result_model19.summary())

test_19 = processed_test[processed_test['store_nbr'] == 19]

test_19['log_units'] = result_model19.predict(test_19)
test_19['units'] = np.exp(test_19['log_units']) - 1
result_test.append(test_19)

In [None]:
item_nbr_list_19 = get_item_nbr(df_19) #19번 store에서 팔린 item_nbr만 list로 저장

influence19 = result_model19.get_influence()

cooks_d2_19, pvals19 = influence19.cooks_distance # fox outlier recommendation으로 outlier 판별
fox_cr19 = 4 / (len(df_19) - 778)
idx19 = np.where(cooks_d2_19 > fox_cr19)[0]

print(len(idx19)) #outlier 갯수

for num in idx19 : # outlier들을 각 item_nbr별 평균치로 대체
    for item_nbr in item_nbr_list_19 :
        if item_nbr == df_19.loc[num].item_nbr :
            mean = df_19[df_19['item_nbr'] == item_nbr]['log_units'].mean()
            df_19.set_value(num, 'log_units', mean)
            df_19.set_value(num, 'units', np.exp(mean) - 1)

model19_new = sm.OLS.from_formula('log_units ~ C(item_nbr):C(weekday, contrast_weekday) + 0', data = df_19)
result_model19_new = model19_new.fit()
print(result_model19_new.summary())

test_19_new = processed_test[processed_test['store_nbr'] == 19]

test_19_new['log_units'] = result_model19_new.predict(test_19_new)
test_19_new['units'] = np.exp(test_19_new['log_units']) - 1
result_test2.append(test_19_new)

In [None]:
df_20 = processed_train[processed_train['store_nbr'] == 20].reset_index(drop = True)

model20 = sm.OLS.from_formula('log_units ~ C(item_nbr):C(weekday, contrast_weekday) + 0', data = df_20)
result_model20 = model20.fit()
print(result_model20.summary())

test_20 = processed_test[processed_test['store_nbr'] == 20]

test_20['log_units'] = result_model20.predict(test_20)
test_20['units'] = np.exp(test_20['log_units']) - 1
result_test.append(test_20)

In [None]:
item_nbr_list_20 = get_item_nbr(df_20) #20번 store에서 팔린 item_nbr만 list로 저장

influence20 = result_model20.get_influence()

cooks_d2_20, pvals20 = influence20.cooks_distance # fox outlier recommendation으로 outlier 판별
fox_cr20 = 4 / (len(df_20) - 778)
idx20 = np.where(cooks_d2_20 > fox_cr20)[0]

print(len(idx20)) #outlier 갯수

for num in idx20 : # outlier들을 각 item_nbr별 평균치로 대체
    for item_nbr in item_nbr_list_20 :
        if item_nbr == df_20.loc[num].item_nbr :
            mean = df_20[df_20['item_nbr'] == item_nbr]['log_units'].mean()
            df_20.set_value(num, 'log_units', mean)
            df_20.set_value(num, 'units', np.exp(mean) - 1)

model20_new = sm.OLS.from_formula('log_units ~ C(item_nbr):C(weekday, contrast_weekday) + 0', data = df_20)
result_model20_new = model20_new.fit()
print(result_model20_new.summary())

test_20_new = processed_test[processed_test['store_nbr'] == 20]

test_20_new['log_units'] = result_model20_new.predict(test_20_new)
test_20_new['units'] = np.exp(test_20_new['log_units']) - 1
result_test2.append(test_20_new)

In [None]:
df_21 = processed_train[processed_train['store_nbr'] == 21].reset_index(drop = True)

model21 = sm.OLS.from_formula('log_units ~ C(item_nbr):C(weekday, contrast_weekday) + 0', data = df_21)
result_model21 = model21.fit()
print(result_model21.summary())

test_21 = processed_test[processed_test['store_nbr'] == 21]

test_21['log_units'] = result_model21.predict(test_21)
test_21['units'] = np.exp(test_21['log_units']) - 1
result_test.append(test_21)

In [None]:
item_nbr_list_21 = get_item_nbr(df_21) #21번 store에서 팔린 item_nbr만 list로 저장

influence21 = result_model21.get_influence()

cooks_d2_21, pvals21 = influence21.cooks_distance # fox outlier recommendation으로 outlier 판별
fox_cr21 = 4 / (len(df_21) - 778)
idx21 = np.where(cooks_d2_21 > fox_cr21)[0]

print(len(idx21)) #outlier 갯수

for num in idx21 : # outlier들을 각 item_nbr별 평균치로 대체
    for item_nbr in item_nbr_list_21 :
        if item_nbr == df_21.loc[num].item_nbr :
            mean = df_21[df_21['item_nbr'] == item_nbr]['log_units'].mean()
            df_21.set_value(num, 'log_units', mean)
            df_21.set_value(num, 'units', np.exp(mean) - 1)

model21_new = sm.OLS.from_formula('log_units ~ C(item_nbr):C(weekday, contrast_weekday) + 0', data = df_21)
result_model21_new = model21_new.fit()
print(result_model21_new.summary())

test_21_new = processed_test[processed_test['store_nbr'] == 21]

test_21_new['log_units'] = result_model21_new.predict(test_21_new)
test_21_new['units'] = np.exp(test_21_new['log_units']) - 1
result_test2.append(test_21_new)

In [None]:
df_22 = processed_train[processed_train['store_nbr'] == 22].reset_index(drop = True)

model22 = sm.OLS.from_formula('log_units ~ C(item_nbr):C(weekday, contrast_weekday) + 0', data = df_22)
result_model22 = model22.fit()
print(result_model22.summary())

test_22 = processed_test[processed_test['store_nbr'] == 22]

test_22['log_units'] = result_model22.predict(test_22)
test_22['units'] = np.exp(test_22['log_units']) - 1
result_test.append(test_22)

In [None]:
item_nbr_list_22 = get_item_nbr(df_22) #22번 store에서 팔린 item_nbr만 list로 저장

influence22 = result_model22.get_influence()

cooks_d2_22, pvals22 = influence22.cooks_distance # fox outlier recommendation으로 outlier 판별
fox_cr22 = 4 / (len(df_22) - 778)
idx22 = np.where(cooks_d2_22 > fox_cr22)[0]

print(len(idx22)) #outlier 갯수

for num in idx22 : # outlier들을 각 item_nbr별 평균치로 대체
    for item_nbr in item_nbr_list_22 :
        if item_nbr == df_22.loc[num].item_nbr :
            mean = df_22[df_22['item_nbr'] == item_nbr]['log_units'].mean()
            df_22.set_value(num, 'log_units', mean)
            df_22.set_value(num, 'units', np.exp(mean) - 1)

model22_new = sm.OLS.from_formula('log_units ~ C(item_nbr):C(weekday, contrast_weekday) + 0', data = df_22)
result_model22_new = model22_new.fit()
print(result_model22_new.summary())

test_22_new = processed_test[processed_test['store_nbr'] == 22]

test_22_new['log_units'] = result_model22_new.predict(test_22_new)
test_22_new['units'] = np.exp(test_22_new['log_units']) - 1
result_test2.append(test_22_new)

In [None]:
df_23 = processed_train[processed_train['store_nbr'] == 23].reset_index(drop = True)

model23 = sm.OLS.from_formula('log_units ~ C(item_nbr):C(weekday, contrast_weekday) + 0', data = df_23)
result_model23 = model23.fit()
print(result_model23.summary())

test_23 = processed_test[processed_test['store_nbr'] == 23]

test_23['log_units'] = result_model23.predict(test_23)
test_23['units'] = np.exp(test_23['log_units']) - 1
result_test.append(test_23)

In [None]:
item_nbr_list_23 = get_item_nbr(df_23) #23번 store에서 팔린 item_nbr만 list로 저장

influence23 = result_model23.get_influence()

cooks_d2_23, pvals23 = influence23.cooks_distance # fox outlier recommendation으로 outlier 판별
fox_cr23 = 4 / (len(df_23) - 778)
idx23 = np.where(cooks_d2_23 > fox_cr23)[0]

print(len(idx23)) #outlier 갯수

for num in idx23 : # outlier들을 각 item_nbr별 평균치로 대체
    for item_nbr in item_nbr_list_23 :
        if item_nbr == df_23.loc[num].item_nbr :
            mean = df_23[df_23['item_nbr'] == item_nbr]['log_units'].mean()
            df_23.set_value(num, 'log_units', mean)
            df_23.set_value(num, 'units', np.exp(mean) - 1)

model23_new = sm.OLS.from_formula('log_units ~ C(item_nbr):C(weekday, contrast_weekday) + 0', data = df_23)
result_model23_new = model23_new.fit()
print(result_model23_new.summary())

test_23_new = processed_test[processed_test['store_nbr'] == 23]

test_23_new['log_units'] = result_model23_new.predict(test_23_new)
test_23_new['units'] = np.exp(test_23_new['log_units']) - 1
result_test2.append(test_23_new)

In [None]:
df_24 = processed_train[processed_train['store_nbr'] == 24].reset_index(drop = True)

model24 = sm.OLS.from_formula('log_units ~ C(item_nbr):C(weekday, contrast_weekday) + 0', data = df_24)
result_model24 = model24.fit()
print(result_model24.summary())

test_24 = processed_test[processed_test['store_nbr'] == 24]

test_24['log_units'] = result_model24.predict(test_24)
test_24['units'] = np.exp(test_24['log_units']) - 1
result_test.append(test_24)

In [None]:
item_nbr_list_24 = get_item_nbr(df_24) #24번 store에서 팔린 item_nbr만 list로 저장

influence24 = result_model24.get_influence()

cooks_d2_24, pvals24 = influence24.cooks_distance # fox outlier recommendation으로 outlier 판별
fox_cr24 = 4 / (len(df_24) - 778)
idx24 = np.where(cooks_d2_24 > fox_cr24)[0]

print(len(idx24)) #outlier 갯수

for num in idx24 : # outlier들을 각 item_nbr별 평균치로 대체
    for item_nbr in item_nbr_list_24 :
        if item_nbr == df_24.loc[num].item_nbr :
            mean = df_24[df_24['item_nbr'] == item_nbr]['log_units'].mean()
            df_24.set_value(num, 'log_units', mean)
            df_24.set_value(num, 'units', np.exp(mean) - 1)

model24_new = sm.OLS.from_formula('log_units ~ C(item_nbr):C(weekday, contrast_weekday) + 0', data = df_24)
result_model24_new = model24_new.fit()
print(result_model24_new.summary())

test_24_new = processed_test[processed_test['store_nbr'] == 24]

test_24_new['log_units'] = result_model24_new.predict(test_24_new)
test_24_new['units'] = np.exp(test_24_new['log_units']) - 1
result_test2.append(test_24_new)

In [None]:
df_25 = processed_train[processed_train['store_nbr'] == 25].reset_index(drop = True)

model25 = sm.OLS.from_formula('log_units ~ C(item_nbr):C(weekday, contrast_weekday) + 0', data = df_25)
result_model25 = model25.fit()
print(result_model25.summary())

test_25 = processed_test[processed_test['store_nbr'] == 25]

test_25['log_units'] = result_model25.predict(test_25)
test_25['units'] = np.exp(test_25['log_units']) - 1
result_test.append(test_25)

In [None]:
item_nbr_list_25 = get_item_nbr(df_25) #25번 store에서 팔린 item_nbr만 list로 저장

influence25 = result_model25.get_influence()

cooks_d2_25, pvals25 = influence25.cooks_distance # fox outlier recommendation으로 outlier 판별
fox_cr25 = 4 / (len(df_25) - 778)
idx25 = np.where(cooks_d2_25 > fox_cr25)[0]

print(len(idx25)) #outlier 갯수

for num in idx25 : # outlier들을 각 item_nbr별 평균치로 대체
    for item_nbr in item_nbr_list_25 :
        if item_nbr == df_25.loc[num].item_nbr :
            mean = df_25[df_25['item_nbr'] == item_nbr]['log_units'].mean()
            df_25.set_value(num, 'log_units', mean)
            df_25.set_value(num, 'units', np.exp(mean) - 1)

model25_new = sm.OLS.from_formula('log_units ~ C(item_nbr):C(weekday, contrast_weekday) + 0', data = df_25)
result_model25_new = model25_new.fit()
print(result_model25_new.summary())

test_25_new = processed_test[processed_test['store_nbr'] == 25]

test_25_new['log_units'] = result_model25_new.predict(test_25_new)
test_25_new['units'] = np.exp(test_25_new['log_units']) - 1
result_test2.append(test_25_new)

In [None]:
df_26 = processed_train[processed_train['store_nbr'] == 26].reset_index(drop = True)

model26 = sm.OLS.from_formula('log_units ~ C(item_nbr):C(weekday, contrast_weekday) + 0', data = df_26)
result_model26 = model26.fit()
print(result_model26.summary())

test_26 = processed_test[processed_test['store_nbr'] == 26]

test_26['log_units'] = result_model26.predict(test_26)
test_26['units'] = np.exp(test_26['log_units']) - 1
result_test.append(test_26)

In [None]:
item_nbr_list_26 = get_item_nbr(df_26) #26번 store에서 팔린 item_nbr만 list로 저장

influence26 = result_model26.get_influence()

cooks_d2_26, pvals26 = influence26.cooks_distance # fox outlier recommendation으로 outlier 판별
fox_cr26 = 4 / (len(df_26) - 778)
idx26 = np.where(cooks_d2_26 > fox_cr26)[0]

print(len(idx26)) #outlier 갯수

for num in idx26 : # outlier들을 각 item_nbr별 평균치로 대체
    for item_nbr in item_nbr_list_26 :
        if item_nbr == df_26.loc[num].item_nbr :
            mean = df_26[df_26['item_nbr'] == item_nbr]['log_units'].mean()
            df_26.set_value(num, 'log_units', mean)
            df_26.set_value(num, 'units', np.exp(mean) - 1)

model26_new = sm.OLS.from_formula('log_units ~ C(item_nbr):C(weekday, contrast_weekday) + 0', data = df_26)
result_model26_new = model26_new.fit()
print(result_model26_new.summary())

test_26_new = processed_test[processed_test['store_nbr'] == 26]

test_26_new['log_units'] = result_model26_new.predict(test_26_new)
test_26_new['units'] = np.exp(test_26_new['log_units']) - 1
result_test2.append(test_26_new)

In [None]:
df_27 = processed_train[processed_train['store_nbr'] == 27].reset_index(drop = True)

model27 = sm.OLS.from_formula('log_units ~ C(item_nbr):C(weekday, contrast_weekday) + 0', data = df_27)
result_model27 = model27.fit()
print(result_model27.summary())

test_27 = processed_test[processed_test['store_nbr'] == 27]

test_27['log_units'] = result_model27.predict(test_27)
test_27['units'] = np.exp(test_27['log_units']) - 1
result_test.append(test_27)

In [None]:
item_nbr_list_27 = get_item_nbr(df_27) #27번 store에서 팔린 item_nbr만 list로 저장

influence27 = result_model27.get_influence()

cooks_d2_27, pvals27 = influence27.cooks_distance # fox outlier recommendation으로 outlier 판별
fox_cr27 = 4 / (len(df_27) - 778)
idx27 = np.where(cooks_d2_27 > fox_cr27)[0]

print(len(idx27)) #outlier 갯수

for num in idx27 : # outlier들을 각 item_nbr별 평균치로 대체
    for item_nbr in item_nbr_list_27 :
        if item_nbr == df_27.loc[num].item_nbr :
            mean = df_27[df_27['item_nbr'] == item_nbr]['log_units'].mean()
            df_27.set_value(num, 'log_units', mean)
            df_27.set_value(num, 'units', np.exp(mean) - 1)

model27_new = sm.OLS.from_formula('log_units ~ C(item_nbr):C(weekday, contrast_weekday) + 0', data = df_27)
result_model27_new = model27_new.fit()
print(result_model27_new.summary())

test_27_new = processed_test[processed_test['store_nbr'] == 27]

test_27_new['log_units'] = result_model27_new.predict(test_27_new)
test_27_new['units'] = np.exp(test_27_new['log_units']) - 1
result_test2.append(test_27_new)

In [None]:
df_28 = processed_train[processed_train['store_nbr'] == 28].reset_index(drop = True)

model28 = sm.OLS.from_formula('log_units ~ C(item_nbr):C(weekday, contrast_weekday) + 0', data = df_28)
result_model28 = model28.fit()
print(result_model28.summary())

test_28 = processed_test[processed_test['store_nbr'] == 28]

test_28['log_units'] = result_model28.predict(test_28)
test_28['units'] = np.exp(test_28['log_units']) - 1
result_test.append(test_28)

In [None]:
item_nbr_list_28 = get_item_nbr(df_28) #28번 store에서 팔린 item_nbr만 list로 저장

influence28 = result_model28.get_influence()

cooks_d2_28, pvals28 = influence28.cooks_distance # fox outlier recommendation으로 outlier 판별
fox_cr28 = 4 / (len(df_28) - 778)
idx28 = np.where(cooks_d2_28 > fox_cr28)[0]

print(len(idx28)) #outlier 갯수

for num in idx28 : # outlier들을 각 item_nbr별 평균치로 대체
    for item_nbr in item_nbr_list_28 :
        if item_nbr == df_28.loc[num].item_nbr :
            mean = df_28[df_28['item_nbr'] == item_nbr]['log_units'].mean()
            df_28.set_value(num, 'log_units', mean)
            df_28.set_value(num, 'units', np.exp(mean) - 1)

model28_new = sm.OLS.from_formula('log_units ~ C(item_nbr):C(weekday, contrast_weekday) + 0', data = df_28)
result_model28_new = model28_new.fit()
print(result_model28_new.summary())

test_28_new = processed_test[processed_test['store_nbr'] == 28]

test_28_new['log_units'] = result_model28_new.predict(test_28_new)
test_28_new['units'] = np.exp(test_28_new['log_units']) - 1
result_test2.append(test_28_new)

In [None]:
df_29 = processed_train[processed_train['store_nbr'] == 29].reset_index(drop = True)

model29 = sm.OLS.from_formula('log_units ~ C(item_nbr):C(weekday, contrast_weekday) + 0', data = df_29)
result_model29 = model29.fit()
print(result_model29.summary())

test_29 = processed_test[processed_test['store_nbr'] == 29]

test_29['log_units'] = result_model29.predict(test_29)
test_29['units'] = np.exp(test_29['log_units']) - 1
result_test.append(test_29)

In [None]:
item_nbr_list_29 = get_item_nbr(df_29) #29번 store에서 팔린 item_nbr만 list로 저장

influence29 = result_model29.get_influence()

cooks_d2_29, pvals29 = influence29.cooks_distance # fox outlier recommendation으로 outlier 판별
fox_cr29 = 4 / (len(df_29) - 778)
idx29 = np.where(cooks_d2_29 > fox_cr29)[0]

print(len(idx29)) #outlier 갯수

for num in idx29 : # outlier들을 각 item_nbr별 평균치로 대체
    for item_nbr in item_nbr_list_29 :
        if item_nbr == df_29.loc[num].item_nbr :
            mean = df_29[df_29['item_nbr'] == item_nbr]['log_units'].mean()
            df_29.set_value(num, 'log_units', mean)
            df_29.set_value(num, 'units', np.exp(mean) - 1)

model29_new = sm.OLS.from_formula('log_units ~ C(item_nbr):C(weekday, contrast_weekday) + 0', data = df_29)
result_model29_new = model29_new.fit()
print(result_model29_new.summary())

test_29_new = processed_test[processed_test['store_nbr'] == 29]

test_29_new['log_units'] = result_model29_new.predict(test_29_new)
test_29_new['units'] = np.exp(test_29_new['log_units']) - 1
result_test2.append(test_29_new)

In [None]:
df_30 = processed_train[processed_train['store_nbr'] == 30].reset_index(drop = True)

model30 = sm.OLS.from_formula('log_units ~ C(item_nbr):C(weekday, contrast_weekday) + 0', data = df_30)
result_model30 = model30.fit()
print(result_model30.summary())

test_30 = processed_test[processed_test['store_nbr'] == 30]

test_30['log_units'] = result_model30.predict(test_30)
test_30['units'] = np.exp(test_30['log_units']) - 1
result_test.append(test_30)

In [None]:
item_nbr_list_30 = get_item_nbr(df_30) #30번 store에서 팔린 item_nbr만 list로 저장

influence30 = result_model30.get_influence()

cooks_d2_30, pvals30 = influence30.cooks_distance # fox outlier recommendation으로 outlier 판별
fox_cr30 = 4 / (len(df_30) - 778)
idx30 = np.where(cooks_d2_30 > fox_cr30)[0]

print(len(idx30)) #outlier 갯수

for num in idx30 : # outlier들을 각 item_nbr별 평균치로 대체
    for item_nbr in item_nbr_list_30 :
        if item_nbr == df_30.loc[num].item_nbr :
            mean = df_30[df_30['item_nbr'] == item_nbr]['log_units'].mean()
            df_30.set_value(num, 'log_units', mean)
            df_30.set_value(num, 'units', np.exp(mean) - 1)

model30_new = sm.OLS.from_formula('log_units ~ C(item_nbr):C(weekday, contrast_weekday) + 0', data = df_30)
result_model30_new = model30_new.fit()
print(result_model30_new.summary())

test_30_new = processed_test[processed_test['store_nbr'] == 30]

test_30_new['log_units'] = result_model30_new.predict(test_30_new)
test_30_new['units'] = np.exp(test_30_new['log_units']) - 1
result_test2.append(test_30_new)

In [None]:
df_31 = processed_train[processed_train['store_nbr'] == 31].reset_index(drop = True)

model31 = sm.OLS.from_formula('log_units ~ C(item_nbr):C(weekday, contrast_weekday) + 0', data = df_31)
result_model31 = model31.fit()
print(result_model31.summary())

test_31 = processed_test[processed_test['store_nbr'] == 31]

test_31['log_units'] = result_model31.predict(test_31)
test_31['units'] = np.exp(test_31['log_units']) - 1
result_test.append(test_31)

In [None]:
item_nbr_list_31 = get_item_nbr(df_31) #31번 store에서 팔린 item_nbr만 list로 저장

influence31 = result_model31.get_influence()

cooks_d2_31, pvals31 = influence31.cooks_distance # fox outlier recommendation으로 outlier 판별
fox_cr31 = 4 / (len(df_31) - 778)
idx31 = np.where(cooks_d2_31 > fox_cr31)[0]

print(len(idx31)) #outlier 갯수

for num in idx31 : # outlier들을 각 item_nbr별 평균치로 대체
    for item_nbr in item_nbr_list_31 :
        if item_nbr == df_31.loc[num].item_nbr :
            mean = df_31[df_31['item_nbr'] == item_nbr]['log_units'].mean()
            df_31.set_value(num, 'log_units', mean)
            df_31.set_value(num, 'units', np.exp(mean) - 1)

model31_new = sm.OLS.from_formula('log_units ~ C(item_nbr):C(weekday, contrast_weekday) + 0', data = df_31)
result_model31_new = model31_new.fit()
print(result_model31_new.summary())

test_31_new = processed_test[processed_test['store_nbr'] == 31]

test_31_new['log_units'] = result_model31_new.predict(test_31_new)
test_31_new['units'] = np.exp(test_31_new['log_units']) - 1
result_test2.append(test_31_new)

In [None]:
df_32 = processed_train[processed_train['store_nbr'] == 32].reset_index(drop = True)

model32 = sm.OLS.from_formula('log_units ~ C(item_nbr):C(weekday, contrast_weekday) + 0', data = df_32)
result_model32 = model32.fit()
print(result_model32.summary())

test_32 = processed_test[processed_test['store_nbr'] == 32]

test_32['log_units'] = result_model32.predict(test_32)
test_32['units'] = np.exp(test_32['log_units']) - 1
result_test.append(test_32)

In [None]:
item_nbr_list_32 = get_item_nbr(df_32) #32번 store에서 팔린 item_nbr만 list로 저장

influence32 = result_model32.get_influence()

cooks_d2_32, pvals32 = influence32.cooks_distance # fox outlier recommendation으로 outlier 판별
fox_cr32 = 4 / (len(df_32) - 778)
idx32 = np.where(cooks_d2_32 > fox_cr32)[0]

print(len(idx32)) #outlier 갯수

for num in idx32 : # outlier들을 각 item_nbr별 평균치로 대체
    for item_nbr in item_nbr_list_32 :
        if item_nbr == df_32.loc[num].item_nbr :
            mean = df_32[df_32['item_nbr'] == item_nbr]['log_units'].mean()
            df_32.set_value(num, 'log_units', mean)
            df_32.set_value(num, 'units', np.exp(mean) - 1)

model32_new = sm.OLS.from_formula('log_units ~ C(item_nbr):C(weekday, contrast_weekday) + 0', data = df_32)
result_model32_new = model32_new.fit()
print(result_model32_new.summary())

test_32_new = processed_test[processed_test['store_nbr'] == 32]

test_32_new['log_units'] = result_model32_new.predict(test_32_new)
test_32_new['units'] = np.exp(test_32_new['log_units']) - 1
result_test2.append(test_32_new)

In [None]:
df_33 = processed_train[processed_train['store_nbr'] == 33].reset_index(drop = True)

model33 = sm.OLS.from_formula('log_units ~ C(item_nbr):C(weekday, contrast_weekday) + 0', data = df_33)
result_model33 = model33.fit()
print(result_model33.summary())

test_33 = processed_test[processed_test['store_nbr'] == 33]

test_33['log_units'] = result_model33.predict(test_33)
test_33['units'] = np.exp(test_33['log_units']) - 1
result_test.append(test_33)

In [None]:
item_nbr_list_33 = get_item_nbr(df_33) #33번 store에서 팔린 item_nbr만 list로 저장

influence33 = result_model33.get_influence()

cooks_d2_33, pvals33 = influence33.cooks_distance # fox outlier recommendation으로 outlier 판별
fox_cr33 = 4 / (len(df_33) - 778)
idx33 = np.where(cooks_d2_33 > fox_cr33)[0]

print(len(idx33)) #outlier 갯수

for num in idx33 : # outlier들을 각 item_nbr별 평균치로 대체
    for item_nbr in item_nbr_list_33 :
        if item_nbr == df_33.loc[num].item_nbr :
            mean = df_33[df_33['item_nbr'] == item_nbr]['log_units'].mean()
            df_33.set_value(num, 'log_units', mean)
            df_33.set_value(num, 'units', np.exp(mean) - 1)

model33_new = sm.OLS.from_formula('log_units ~ C(item_nbr):C(weekday, contrast_weekday) + 0', data = df_33)
result_model33_new = model33_new.fit()
print(result_model33_new.summary())

test_33_new = processed_test[processed_test['store_nbr'] == 33]

test_33_new['log_units'] = result_model33_new.predict(test_33_new)
test_33_new['units'] = np.exp(test_33_new['log_units']) - 1
result_test2.append(test_33_new)

In [None]:
df_34 = processed_train[processed_train['store_nbr'] == 34].reset_index(drop = True)

model34 = sm.OLS.from_formula('log_units ~ C(item_nbr):C(weekday, contrast_weekday) + 0', data = df_34)
result_model34 = model34.fit()
print(result_model34.summary())

test_34 = processed_test[processed_test['store_nbr'] == 34]

test_34['log_units'] = result_model34.predict(test_34)
test_34['units'] = np.exp(test_34['log_units']) - 1
result_test.append(test_34)

In [None]:
item_nbr_list_34 = get_item_nbr(df_34) #34번 store에서 팔린 item_nbr만 list로 저장

influence34 = result_model34.get_influence()

cooks_d2_34, pvals34 = influence34.cooks_distance # fox outlier recommendation으로 outlier 판별
fox_cr34 = 4 / (len(df_34) - 778)
idx34 = np.where(cooks_d2_34 > fox_cr34)[0]

print(len(idx34)) #outlier 갯수

for num in idx34 : # outlier들을 각 item_nbr별 평균치로 대체
    for item_nbr in item_nbr_list_34 :
        if item_nbr == df_34.loc[num].item_nbr :
            mean = df_34[df_34['item_nbr'] == item_nbr]['log_units'].mean()
            df_34.set_value(num, 'log_units', mean)
            df_34.set_value(num, 'units', np.exp(mean) - 1)

model34_new = sm.OLS.from_formula('log_units ~ C(item_nbr):C(weekday, contrast_weekday) + 0', data = df_34)
result_model34_new = model34_new.fit()
print(result_model34_new.summary())

test_34_new = processed_test[processed_test['store_nbr'] == 34]

test_34_new['log_units'] = result_model34_new.predict(test_34_new)
test_34_new['units'] = np.exp(test_34_new['log_units']) - 1
result_test2.append(test_34_new)

In [None]:
df_35 = processed_train[processed_train['store_nbr'] == 35].reset_index(drop = True)

model35 = sm.OLS.from_formula('log_units ~ C(item_nbr):C(weekday, contrast_weekday) + 0', data = df_35)
result_model35 = model35.fit()
print(result_model35.summary())

test_35 = processed_test[processed_test['store_nbr'] == 35]

test_35['log_units'] = result_model35.predict(test_35)
test_35['units'] = np.exp(test_35['log_units']) - 1
result_test.append(test_35)

In [None]:
item_nbr_list_35 = get_item_nbr(df_35) #35번 store에서 팔린 item_nbr만 list로 저장

influence35 = result_model35.get_influence()

cooks_d2_35, pvals35 = influence35.cooks_distance # fox outlier recommendation으로 outlier 판별
fox_cr35 = 4 / (len(df_35) - 778)
idx35 = np.where(cooks_d2_35 > fox_cr35)[0]

print(len(idx35)) #outlier 갯수

for num in idx35 : # outlier들을 각 item_nbr별 평균치로 대체
    for item_nbr in item_nbr_list_35 :
        if item_nbr == df_35.loc[num].item_nbr :
            mean = df_35[df_35['item_nbr'] == item_nbr]['log_units'].mean()
            df_35.set_value(num, 'log_units', mean)
            df_35.set_value(num, 'units', np.exp(mean) - 1)

model35_new = sm.OLS.from_formula('log_units ~ C(item_nbr):C(weekday, contrast_weekday) + 0', data = df_35)
result_model35_new = model35_new.fit()
print(result_model35_new.summary())

test_35_new = processed_test[processed_test['store_nbr'] == 35]

test_35_new['log_units'] = result_model35_new.predict(test_35_new)
test_35_new['units'] = np.exp(test_35_new['log_units']) - 1
result_test2.append(test_35_new)

In [None]:
df_36 = processed_train[processed_train['store_nbr'] == 36].reset_index(drop = True)

model36 = sm.OLS.from_formula('log_units ~ C(item_nbr):C(weekday, contrast_weekday) + 0', data = df_36)
result_model36 = model36.fit()
print(result_model36.summary())

test_36 = processed_test[processed_test['store_nbr'] == 36]

test_36['log_units'] = result_model36.predict(test_36)
test_36['units'] = np.exp(test_36['log_units']) - 1
result_test.append(test_36)

In [None]:
item_nbr_list_36 = get_item_nbr(df_36) #36번 store에서 팔린 item_nbr만 list로 저장

influence36 = result_model36.get_influence()

cooks_d2_36, pvals36 = influence36.cooks_distance # fox outlier recommendation으로 outlier 판별
fox_cr36 = 4 / (len(df_36) - 778)
idx36 = np.where(cooks_d2_36 > fox_cr36)[0]

print(len(idx36)) #outlier 갯수

for num in idx36 : # outlier들을 각 item_nbr별 평균치로 대체
    for item_nbr in item_nbr_list_36 :
        if item_nbr == df_36.loc[num].item_nbr :
            mean = df_36[df_36['item_nbr'] == item_nbr]['log_units'].mean()
            df_36.set_value(num, 'log_units', mean)
            df_36.set_value(num, 'units', np.exp(mean) - 1)

model36_new = sm.OLS.from_formula('log_units ~ C(item_nbr):C(weekday, contrast_weekday) + 0', data = df_36)
result_model36_new = model36_new.fit()
print(result_model36_new.summary())

test_36_new = processed_test[processed_test['store_nbr'] == 36]

test_36_new['log_units'] = result_model36_new.predict(test_36_new)
test_36_new['units'] = np.exp(test_36_new['log_units']) - 1
result_test2.append(test_36_new)

In [None]:
df_37 = processed_train[processed_train['store_nbr'] == 37].reset_index(drop = True)

model37 = sm.OLS.from_formula('log_units ~ C(item_nbr):C(weekday, contrast_weekday) + 0', data = df_37)
result_model37 = model37.fit()
print(result_model37.summary())

test_37 = processed_test[processed_test['store_nbr'] == 37]

test_37['log_units'] = result_model37.predict(test_37)
test_37['units'] = np.exp(test_37['log_units']) - 1
result_test.append(test_37)

In [None]:
item_nbr_list_37 = get_item_nbr(df_37) #37번 store에서 팔린 item_nbr만 list로 저장

influence37 = result_model37.get_influence()

cooks_d2_37, pvals37 = influence37.cooks_distance # fox outlier recommendation으로 outlier 판별
fox_cr37 = 4 / (len(df_37) - 778)
idx37 = np.where(cooks_d2_37 > fox_cr37)[0]

print(len(idx37)) #outlier 갯수

for num in idx37 : # outlier들을 각 item_nbr별 평균치로 대체
    for item_nbr in item_nbr_list_37 :
        if item_nbr == df_37.loc[num].item_nbr :
            mean = df_37[df_37['item_nbr'] == item_nbr]['log_units'].mean()
            df_37.set_value(num, 'log_units', mean)
            df_37.set_value(num, 'units', np.exp(mean) - 1)

model37_new = sm.OLS.from_formula('log_units ~ C(item_nbr):C(weekday, contrast_weekday) + 0', data = df_37)
result_model37_new = model37_new.fit()
print(result_model37_new.summary())

test_37_new = processed_test[processed_test['store_nbr'] == 37]

test_37_new['log_units'] = result_model37_new.predict(test_37_new)
test_37_new['units'] = np.exp(test_37_new['log_units']) - 1
result_test2.append(test_37_new)

In [None]:
df_38 = processed_train[processed_train['store_nbr'] == 38].reset_index(drop = True)

model38 = sm.OLS.from_formula('log_units ~ C(item_nbr):C(weekday, contrast_weekday) + 0', data = df_38)
result_model38 = model38.fit()
print(result_model38.summary())

test_38 = processed_test[processed_test['store_nbr'] == 38]

test_38['log_units'] = result_model38.predict(test_38)
test_38['units'] = np.exp(test_38['log_units']) - 1
result_test.append(test_38)

In [None]:
item_nbr_list_38 = get_item_nbr(df_38) #38번 store에서 팔린 item_nbr만 list로 저장

influence38 = result_model38.get_influence()

cooks_d2_38, pvals38 = influence38.cooks_distance # fox outlier recommendation으로 outlier 판별
fox_cr38 = 4 / (len(df_38) - 778)
idx38 = np.where(cooks_d2_38 > fox_cr38)[0]

print(len(idx38)) #outlier 갯수

for num in idx38 : # outlier들을 각 item_nbr별 평균치로 대체
    for item_nbr in item_nbr_list_38 :
        if item_nbr == df_38.loc[num].item_nbr :
            mean = df_38[df_38['item_nbr'] == item_nbr]['log_units'].mean()
            df_38.set_value(num, 'log_units', mean)
            df_38.set_value(num, 'units', np.exp(mean) - 1)

model38_new = sm.OLS.from_formula('log_units ~ C(item_nbr):C(weekday, contrast_weekday) + 0', data = df_38)
result_model38_new = model38_new.fit()
print(result_model38_new.summary())

test_38_new = processed_test[processed_test['store_nbr'] == 38]

test_38_new['log_units'] = result_model38_new.predict(test_38_new)
test_38_new['units'] = np.exp(test_38_new['log_units']) - 1
result_test2.append(test_38_new)

In [None]:
df_39 = processed_train[processed_train['store_nbr'] == 39].reset_index(drop = True)

model39 = sm.OLS.from_formula('log_units ~ C(item_nbr):C(weekday, contrast_weekday) + 0', data = df_39)
result_model39 = model39.fit()
print(result_model39.summary())

test_39 = processed_test[processed_test['store_nbr'] == 39]

test_39['log_units'] = result_model39.predict(test_39)
test_39['units'] = np.exp(test_39['log_units']) - 1
result_test.append(test_39)

In [None]:
item_nbr_list_39 = get_item_nbr(df_39) #39번 store에서 팔린 item_nbr만 list로 저장

influence39 = result_model39.get_influence()

cooks_d2_39, pvals39 = influence39.cooks_distance # fox outlier recommendation으로 outlier 판별
fox_cr39 = 4 / (len(df_39) - 778)
idx39 = np.where(cooks_d2_39 > fox_cr39)[0]

print(len(idx39)) #outlier 갯수

for num in idx39 : # outlier들을 각 item_nbr별 평균치로 대체
    for item_nbr in item_nbr_list_39 :
        if item_nbr == df_39.loc[num].item_nbr :
            mean = df_39[df_39['item_nbr'] == item_nbr]['log_units'].mean()
            df_39.set_value(num, 'log_units', mean)
            df_39.set_value(num, 'units', np.exp(mean) - 1)

model39_new = sm.OLS.from_formula('log_units ~ C(item_nbr):C(weekday, contrast_weekday) + 0', data = df_39)
result_model39_new = model39_new.fit()
print(result_model39_new.summary())

test_39_new = processed_test[processed_test['store_nbr'] == 39]

test_39_new['log_units'] = result_model39_new.predict(test_39_new)
test_39_new['units'] = np.exp(test_39_new['log_units']) - 1
result_test2.append(test_39_new)

In [None]:
df_40 = processed_train[processed_train['store_nbr'] == 40].reset_index(drop = True)

model40 = sm.OLS.from_formula('log_units ~ C(item_nbr):C(weekday, contrast_weekday) + 0', data = df_40)
result_model40 = model40.fit()
print(result_model40.summary())

test_40 = processed_test[processed_test['store_nbr'] == 40]

test_40['log_units'] = result_model40.predict(test_40)
test_40['units'] = np.exp(test_40['log_units']) - 1
result_test.append(test_40)

In [None]:
item_nbr_list_40 = get_item_nbr(df_40) #40번 store에서 팔린 item_nbr만 list로 저장

influence40 = result_model40.get_influence()

cooks_d2_40, pvals40 = influence40.cooks_distance # fox outlier recommendation으로 outlier 판별
fox_cr40 = 4 / (len(df_40) - 778)
idx40 = np.where(cooks_d2_40 > fox_cr40)[0]

print(len(idx40)) #outlier 갯수

for num in idx40 : # outlier들을 각 item_nbr별 평균치로 대체
    for item_nbr in item_nbr_list_40 :
        if item_nbr == df_40.loc[num].item_nbr :
            mean = df_40[df_40['item_nbr'] == item_nbr]['log_units'].mean()
            df_40.set_value(num, 'log_units', mean)
            df_40.set_value(num, 'units', np.exp(mean) - 1)

model40_new = sm.OLS.from_formula('log_units ~ C(item_nbr):C(weekday, contrast_weekday) + 0', data = df_40)
result_model40_new = model40_new.fit()
print(result_model40_new.summary())

test_40_new = processed_test[processed_test['store_nbr'] == 40]

test_40_new['log_units'] = result_model40_new.predict(test_40_new)
test_40_new['units'] = np.exp(test_40_new['log_units']) - 1
result_test2.append(test_40_new)

In [None]:
df_41 = processed_train[processed_train['store_nbr'] == 41].reset_index(drop = True)

model41 = sm.OLS.from_formula('log_units ~ C(item_nbr):C(weekday, contrast_weekday) + 0', data = df_41)
result_model41 = model41.fit()
print(result_model41.summary())

test_41 = processed_test[processed_test['store_nbr'] == 41]

test_41['log_units'] = result_model41.predict(test_41)
test_41['units'] = np.exp(test_41['log_units']) - 1
result_test.append(test_41)

In [None]:
item_nbr_list_41 = get_item_nbr(df_41) #41번 store에서 팔린 item_nbr만 list로 저장

influence41 = result_model41.get_influence()

cooks_d2_41, pvals41 = influence41.cooks_distance # fox outlier recommendation으로 outlier 판별
fox_cr41 = 4 / (len(df_41) - 778)
idx41 = np.where(cooks_d2_41 > fox_cr41)[0]

print(len(idx41)) #outlier 갯수

for num in idx41 : # outlier들을 각 item_nbr별 평균치로 대체
    for item_nbr in item_nbr_list_41 :
        if item_nbr == df_41.loc[num].item_nbr :
            mean = df_41[df_41['item_nbr'] == item_nbr]['log_units'].mean()
            df_41.set_value(num, 'log_units', mean)
            df_41.set_value(num, 'units', np.exp(mean) - 1)

model41_new = sm.OLS.from_formula('log_units ~ C(item_nbr):C(weekday, contrast_weekday) + 0', data = df_41)
result_model41_new = model41_new.fit()
print(result_model41_new.summary())

test_41_new = processed_test[processed_test['store_nbr'] == 41]

test_41_new['log_units'] = result_model41_new.predict(test_41_new)
test_41_new['units'] = np.exp(test_41_new['log_units']) - 1
result_test2.append(test_41_new)

In [None]:
df_42 = processed_train[processed_train['store_nbr'] == 42].reset_index(drop = True)

model42 = sm.OLS.from_formula('log_units ~ C(item_nbr):C(weekday, contrast_weekday) + 0', data = df_42)
result_model42 = model42.fit()
print(result_model42.summary())

test_42 = processed_test[processed_test['store_nbr'] == 42]

test_42['log_units'] = result_model42.predict(test_42)
test_42['units'] = np.exp(test_42['log_units']) - 1
result_test.append(test_42)

In [None]:
item_nbr_list_42 = get_item_nbr(df_42) #42번 store에서 팔린 item_nbr만 list로 저장

influence42 = result_model42.get_influence()

cooks_d2_42, pvals42 = influence42.cooks_distance # fox outlier recommendation으로 outlier 판별
fox_cr42 = 4 / (len(df_42) - 778)
idx42 = np.where(cooks_d2_42 > fox_cr42)[0]

print(len(idx42)) #outlier 갯수

for num in idx42 : # outlier들을 각 item_nbr별 평균치로 대체
    for item_nbr in item_nbr_list_42 :
        if item_nbr == df_42.loc[num].item_nbr :
            mean = df_42[df_42['item_nbr'] == item_nbr]['log_units'].mean()
            df_42.set_value(num, 'log_units', mean)
            df_42.set_value(num, 'units', np.exp(mean) - 1)

model42_new = sm.OLS.from_formula('log_units ~ C(item_nbr):C(weekday, contrast_weekday) + 0', data = df_42)
result_model42_new = model42_new.fit()
print(result_model42_new.summary())

test_42_new = processed_test[processed_test['store_nbr'] == 42]

test_42_new['log_units'] = result_model42_new.predict(test_42_new)
test_42_new['units'] = np.exp(test_42_new['log_units']) - 1
result_test2.append(test_42_new)

In [None]:
df_43 = processed_train[processed_train['store_nbr'] == 43].reset_index(drop = True)

model43 = sm.OLS.from_formula('log_units ~ C(item_nbr):C(weekday, contrast_weekday) + 0', data = df_43)
result_model43 = model43.fit()
print(result_model43.summary())

test_43 = processed_test[processed_test['store_nbr'] == 43]

test_43['log_units'] = result_model43.predict(test_43)
test_43['units'] = np.exp(test_43['log_units']) - 1
result_test.append(test_43)

In [None]:
item_nbr_list_43 = get_item_nbr(df_43) #43번 store에서 팔린 item_nbr만 list로 저장

influence43 = result_model43.get_influence()

cooks_d2_43, pvals43 = influence43.cooks_distance # fox outlier recommendation으로 outlier 판별
fox_cr43 = 4 / (len(df_43) - 778)
idx43 = np.where(cooks_d2_43 > fox_cr43)[0]

print(len(idx43)) #outlier 갯수

for num in idx43 : # outlier들을 각 item_nbr별 평균치로 대체
    for item_nbr in item_nbr_list_43 :
        if item_nbr == df_43.loc[num].item_nbr :
            mean = df_43[df_43['item_nbr'] == item_nbr]['log_units'].mean()
            df_43.set_value(num, 'log_units', mean)
            df_43.set_value(num, 'units', np.exp(mean) - 1)

model43_new = sm.OLS.from_formula('log_units ~ C(item_nbr):C(weekday, contrast_weekday) + 0', data = df_43)
result_model43_new = model43_new.fit()
print(result_model43_new.summary())

test_43_new = processed_test[processed_test['store_nbr'] == 43]

test_43_new['log_units'] = result_model43_new.predict(test_43_new)
test_43_new['units'] = np.exp(test_43_new['log_units']) - 1
result_test2.append(test_43_new)

In [None]:
df_44 = processed_train[processed_train['store_nbr'] == 44].reset_index(drop = True)

model44 = sm.OLS.from_formula('log_units ~ C(item_nbr):C(weekday, contrast_weekday) + 0', data = df_44)
result_model44 = model44.fit()
print(result_model44.summary())

test_44 = processed_test[processed_test['store_nbr'] == 44]

test_44['log_units'] = result_model44.predict(test_44)
test_44['units'] = np.exp(test_44['log_units']) - 1
result_test.append(test_44)

In [None]:
item_nbr_list_44 = get_item_nbr(df_44) #44번 store에서 팔린 item_nbr만 list로 저장

influence44 = result_model44.get_influence()

cooks_d2_44, pvals44 = influence44.cooks_distance # fox outlier recommendation으로 outlier 판별
fox_cr44 = 4 / (len(df_44) - 778)
idx44 = np.where(cooks_d2_44 > fox_cr44)[0]

print(len(idx44)) #outlier 갯수

for num in idx44 : # outlier들을 각 item_nbr별 평균치로 대체
    for item_nbr in item_nbr_list_44 :
        if item_nbr == df_44.loc[num].item_nbr :
            mean = df_44[df_44['item_nbr'] == item_nbr]['log_units'].mean()
            df_44.set_value(num, 'log_units', mean)
            df_44.set_value(num, 'units', np.exp(mean) - 1)

model44_new = sm.OLS.from_formula('log_units ~ C(item_nbr):C(weekday, contrast_weekday) + 0', data = df_44)
result_model44_new = model44_new.fit()
print(result_model44_new.summary())

test_44_new = processed_test[processed_test['store_nbr'] == 44]

test_44_new['log_units'] = result_model44_new.predict(test_44_new)
test_44_new['units'] = np.exp(test_44_new['log_units']) - 1
result_test2.append(test_44_new)

In [None]:
df_45 = processed_train[processed_train['store_nbr'] == 45].reset_index(drop = True)

model45 = sm.OLS.from_formula('log_units ~ C(item_nbr):C(weekday, contrast_weekday) + 0', data = df_45)
result_model45 = model45.fit()
print(result_model45.summary())

test_45 = processed_test[processed_test['store_nbr'] == 45]

test_45['log_units'] = result_model45.predict(test_45)
test_45['units'] = np.exp(test_45['log_units']) - 1
result_test.append(test_45)

In [None]:
item_nbr_list_45 = get_item_nbr(df_45) #45번 store에서 팔린 item_nbr만 list로 저장

influence45 = result_model45.get_influence()

cooks_d2_45, pvals45 = influence45.cooks_distance # fox outlier recommendation으로 outlier 판별
fox_cr45 = 4 / (len(df_45) - 778)
idx45 = np.where(cooks_d2_45 > fox_cr45)[0]

print(len(idx45)) #outlier 갯수

for num in idx45 : # outlier들을 각 item_nbr별 평균치로 대체
    for item_nbr in item_nbr_list_45 :
        if item_nbr == df_45.loc[num].item_nbr :
            mean = df_45[df_45['item_nbr'] == item_nbr]['log_units'].mean()
            df_45.set_value(num, 'log_units', mean)
            df_45.set_value(num, 'units', np.exp(mean) - 1)

model45_new = sm.OLS.from_formula('log_units ~ C(item_nbr):C(weekday, contrast_weekday) + 0', data = df_45)
result_model45_new = model45_new.fit()
print(result_model45_new.summary())

test_45_new = processed_test[processed_test['store_nbr'] == 45]

test_45_new['log_units'] = result_model45_new.predict(test_45_new)
test_45_new['units'] = np.exp(test_45_new['log_units']) - 1
result_test2.append(test_45_new)

## Make Submission

In [None]:
sub = pd.concat(result_test)

In [None]:
len(sub), len(test)

In [None]:
sub.drop(['weekday', 'holiday', 'event'], axis = 1, inplace = True)

In [None]:
sub.sort_values(by = ['date', 'store_nbr', 'item_nbr'], inplace = True)
sub.reset_index(drop = True, inplace = True)

In [None]:
units = sub['units']

In [None]:
sub['store_nbr'] = sub['store_nbr'].astype('str')
sub['item_nbr'] = sub['item_nbr'].astype('str')
sub['date'] = sub['date'].astype('str')

In [None]:
sub['id'] = sub['store_nbr'] + '_' + sub['item_nbr'] + '_' + sub['date']

In [None]:
sub.drop(['date', 'store_nbr', 'item_nbr', 'log_units', 'units'], axis = 1, inplace = True)

In [None]:
sub['units'] = units

In [None]:
sub.tail()

In [None]:
sub.to_csv('sub.csv', index = False)

## Result

<img src = '../sub_test11.png'>

<img src = '../sub_test11_rank.png'>

### sub_test 2 formular "log_units ~ C(store_nbr):C(item_nbr) + C(weekday) + C(holiday) + snowfall + preciptotal + 0"

![image.png](attachment:image.png)

### sub_test 3 formular "log_units ~ C(store_nbr):C(item_nbr) + C(weekday) + C(holiday) + C(event) + 0"

![image.png](attachment:image.png)!

### sub_test4 formular "log_units ~ C(store_nbr):C(item_nbr) + C(weekday) + C(holiday) + snowfall + preciptotal + 0" (MA)

![image.png](attachment:image.png)