# Walmart Recruiting II: Sales in Stormy Weather

- 타겟 데이터
    - `units` : 주어진 날짜에 판매 된 수량
    - 주요 기상 이변시기에 walmart의 지점 45곳에서 판매된 날씨에 민감한 111가지 제품의 양을 예측
    - 예측기간은 `2013-04-01 ~ 2014-10-26`
- 특징 데이터
    - `date` : 2012-01-01 ~ 2013-03-31 사이의 판매일
    - `store_nbr` : 45개의 지점 중 하나를 나타내는 ID
    - `station_nbr` : 20개의 기상 관측소 중 하나를 나타내는 ID
    - `item_nbr` : 111가지 제품 중 하나를 나타내는 ID
    - `tmax` : 최고기온
    - `tmin` : 최저기온
    - `tavg` : 평균기온
    - `depart` : 평년 기온 대비 편차
    - `dewpoint` : 평균 이슬점 온도
    - `wetbulb` : 평균 습구 온도
    - `heat` : 난방도일 (Heating Degree Days, 시즌은 7월에 시작)
    - `cool` : 냉방도일 (Cooling Degree Days, 시즌은 1월에 시작)
    - `sunrise` : 일출 시각 (관측값이 아닌 계산값)
    - `sunset` : 일몰 시각 (관측값이 아닌 계산값)
    - `codesum` : 중요한 기상 유형 (기상현상 코드화)
    - `snowfall` : 강설량 (T = 미량(Trace), M = 데이터 누락)
    - `preciptotal` : 강수량 (T = 미량(Trace), M = 데이터 누락)
    - `stnpressure` : 평균 관측소 기압
    - `sealevel` : 평균 해면기압
    - `resultspeed` : 합성풍속
    - `resultdir` : 합성풍향
    - `avgspeed` : 평균풍속

In [1]:
def get_holidays(fpath):
    """Read a holiday calendar file and return its dates as a DataFrame.

    Each non-blank line is stripped, split on single spaces, and its first
    three fields are re-joined with spaces and parsed by ``pd.to_datetime``.
    Any fields after the third are ignored.

    Args:
        fpath: Path of the text file to read.

    Returns:
        A DataFrame with a single column ``date2`` holding the parsed dates
        in file order.

    Raises:
        IndexError: If a non-blank line has fewer than three fields.
    """
    # Use a context manager so the handle is closed even if reading fails.
    with open(fpath) as f:
        # Strip the newline and skip blank lines so a trailing empty line
        # does not fail the three-field formatting below.
        rows = [line.strip().split(" ")[:3] for line in f if line.strip()]

    dates = pd.to_datetime(["{} {} {}".format(*row) for row in rows])
    return pd.DataFrame({"date2": dates})

In [2]:
def get_holiday_names(fpath):
    """Read a named-holiday file and return dates with their names.

    Each non-blank line is stripped and split on single spaces. The first
    three fields are re-joined with spaces and parsed by ``pd.to_datetime``;
    the fourth field is taken as the holiday name. Later fields are ignored.
    Per the original author's note, the file lists holidays plus the days
    around Black Friday.

    Args:
        fpath: Path of the text file to read.

    Returns:
        A DataFrame with columns ``date2`` (parsed dates) and
        ``holiday_name`` (fourth field of each line), in file order.

    Raises:
        IndexError: If a non-blank line has fewer than four fields.
    """
    # Use a context manager so the handle is closed even if reading fails.
    with open(fpath) as f:
        # Skip blank lines so a trailing empty line does not raise below.
        rows = [line.strip().split(" ")[:4] for line in f if line.strip()]

    dates = pd.to_datetime(
        ["{} {} {}".format(row[0], row[1], row[2]) for row in rows]
    )
    names = [row[3] for row in rows]
    return pd.DataFrame({"date2": dates, "holiday_name": names})

In [3]:
def preprocess(_df, is_train):
    """Return a copy of ``_df`` extended with calendar and holiday features.

    Reads the module-level frames ``holidays`` (column ``date2``) and
    ``holiday_names`` (columns ``date2``, ``holiday_name``).

    Args:
        _df: Input frame with a ``date`` column parseable by
            ``pd.to_datetime``. It is copied, not modified.
        is_train: Accepted for the caller's convenience; not used here.

    Returns:
        A new DataFrame with the added columns ``date2``, ``weekday``,
        ``is_weekend``, ``is_holiday``, ``is_holiday_weekday``,
        ``is_holiday_weekend`` (the four flags as 0/1), ``day``, ``month``,
        ``year``, ``holiday_name`` (empty string when the date has no name)
        and ``around_BlackFriday`` (the holiday name for the days around
        Black Friday, ``"Else"`` otherwise).
    """
    df = _df.copy()

    # Parsed timestamp: source of the calendar features and the merge key.
    df['date2'] = pd.to_datetime(df['date'])

    # Boolean masks are kept local; the frame only stores their 0/1 form.
    weekend_mask = df['date2'].dt.weekday.isin([5, 6])
    holiday_mask = df['date2'].isin(holidays.date2)

    df['weekday'] = df['date2'].dt.weekday
    df['is_weekend'] = np.where(weekend_mask, 1, 0)
    df['is_holiday'] = np.where(holiday_mask, 1, 0)
    df['is_holiday_weekday'] = np.where(holiday_mask & ~weekend_mask, 1, 0)
    df['is_holiday_weekend'] = np.where(holiday_mask & weekend_mask, 1, 0)

    # Calendar components, added in day / month / year order.
    for part in ('day', 'month', 'year'):
        df[part] = getattr(df['date2'].dt, part)

    # Attach holiday names; dates without a name get an empty string.
    df = pd.merge(df, holiday_names, on='date2', how='left')
    unnamed = df['holiday_name'].isnull()
    df.loc[unnamed, "holiday_name"] = ""

    # Keep the name only for the days around Black Friday.
    around_BlackFriday = [
        "BlackFridayM3", "BlackFridayM2", "ThanksgivingDay", "BlackFriday",
        "BlackFriday1", "BlackFriday2", "BlackFriday3",
    ]
    in_window = df['holiday_name'].isin(around_BlackFriday)
    df["around_BlackFriday"] = np.where(in_window, df['holiday_name'], "Else")

    return df

In [4]:
# Load the input tables used by the rest of the notebook.
# NOTE(review): key.csv is presumably the store-to-station mapping and
# weather.csv the per-station observations — confirm against the files.
key = pd.read_csv("key.csv")
wtr = pd.read_csv("weather.csv")
# preprocess() reads these two frames as module-level globals.
holidays = get_holidays("walmart_holidays.txt")
holiday_names = get_holiday_names("walmart_holiday_names.txt")

In [5]:
# Display the parsed holiday-name table (columns: date2, holiday_name).
holiday_names

Unnamed: 0,date2,holiday_name
0,2012-01-01,NewYearsDay
1,2012-01-16,MartinLutherKingDay
2,2012-02-14,ValentinesDay
3,2012-02-20,PresidentsDay
4,2012-04-08,EasterSunday
5,2012-05-13,MothersDay
6,2012-05-28,MemorialDay
7,2012-06-17,FathersDay
8,2012-07-04,IndependenceDay
9,2012-09-03,LaborDay


In [24]:
# Load the training rows and attach the calendar/holiday features in one step.
df_train = preprocess(pd.read_csv("train.csv"), True)
df_train

Unnamed: 0,date,store_nbr,item_nbr,units,date2,weekday,is_weekend,is_holiday,is_holiday_weekday,is_holiday_weekend,day,month,year,holiday_name,around_BlackFriday
0,2012-01-01,1,1,0,2012-01-01,6,1,1,0,1,1,1,2012,NewYearsDay,Else
1,2012-01-01,1,2,0,2012-01-01,6,1,1,0,1,1,1,2012,NewYearsDay,Else
2,2012-01-01,1,3,0,2012-01-01,6,1,1,0,1,1,1,2012,NewYearsDay,Else
3,2012-01-01,1,4,0,2012-01-01,6,1,1,0,1,1,1,2012,NewYearsDay,Else
4,2012-01-01,1,5,0,2012-01-01,6,1,1,0,1,1,1,2012,NewYearsDay,Else
5,2012-01-01,1,6,0,2012-01-01,6,1,1,0,1,1,1,2012,NewYearsDay,Else
6,2012-01-01,1,7,0,2012-01-01,6,1,1,0,1,1,1,2012,NewYearsDay,Else
7,2012-01-01,1,8,0,2012-01-01,6,1,1,0,1,1,1,2012,NewYearsDay,Else
8,2012-01-01,1,9,29,2012-01-01,6,1,1,0,1,1,1,2012,NewYearsDay,Else
9,2012-01-01,1,10,0,2012-01-01,6,1,1,0,1,1,1,2012,NewYearsDay,Else


In [23]:
# Rows that fall on a weekend and are also flagged as a holiday.
df_weekendAndholiday = df_train.loc[
    df_train["is_weekend"].eq(1) & df_train["is_holiday"].eq(1)
]
df_weekendAndholiday

Unnamed: 0,date,store_nbr,item_nbr,units,date2,weekday,is_weekend,is_holiday,is_holiday_weekday,is_holiday_weekend,day,month,year,holiday_name,around_BlackFriday
0,2012-01-01,1,1,0,2012-01-01,6,1,1,0,1,1,1,2012,NewYearsDay,Else
1,2012-01-01,1,2,0,2012-01-01,6,1,1,0,1,1,1,2012,NewYearsDay,Else
2,2012-01-01,1,3,0,2012-01-01,6,1,1,0,1,1,1,2012,NewYearsDay,Else
3,2012-01-01,1,4,0,2012-01-01,6,1,1,0,1,1,1,2012,NewYearsDay,Else
4,2012-01-01,1,5,0,2012-01-01,6,1,1,0,1,1,1,2012,NewYearsDay,Else
5,2012-01-01,1,6,0,2012-01-01,6,1,1,0,1,1,1,2012,NewYearsDay,Else
6,2012-01-01,1,7,0,2012-01-01,6,1,1,0,1,1,1,2012,NewYearsDay,Else
7,2012-01-01,1,8,0,2012-01-01,6,1,1,0,1,1,1,2012,NewYearsDay,Else
8,2012-01-01,1,9,29,2012-01-01,6,1,1,0,1,1,1,2012,NewYearsDay,Else
9,2012-01-01,1,10,0,2012-01-01,6,1,1,0,1,1,1,2012,NewYearsDay,Else


In [25]:
# Rows that fall on a weekend and are not flagged as a holiday.
df_weekendAndNotholiday = df_train.loc[
    df_train["is_weekend"].eq(1) & df_train["is_holiday"].eq(0)
]
df_weekendAndNotholiday

Unnamed: 0,date,store_nbr,item_nbr,units,date2,weekday,is_weekend,is_holiday,is_holiday_weekday,is_holiday_weekend,day,month,year,holiday_name,around_BlackFriday
29304,2012-01-07,1,1,0,2012-01-07,5,1,0,0,0,7,1,2012,,Else
29305,2012-01-07,1,2,0,2012-01-07,5,1,0,0,0,7,1,2012,,Else
29306,2012-01-07,1,3,0,2012-01-07,5,1,0,0,0,7,1,2012,,Else
29307,2012-01-07,1,4,0,2012-01-07,5,1,0,0,0,7,1,2012,,Else
29308,2012-01-07,1,5,0,2012-01-07,5,1,0,0,0,7,1,2012,,Else
29309,2012-01-07,1,6,0,2012-01-07,5,1,0,0,0,7,1,2012,,Else
29310,2012-01-07,1,7,0,2012-01-07,5,1,0,0,0,7,1,2012,,Else
29311,2012-01-07,1,8,0,2012-01-07,5,1,0,0,0,7,1,2012,,Else
29312,2012-01-07,1,9,2,2012-01-07,5,1,0,0,0,7,1,2012,,Else
29313,2012-01-07,1,10,0,2012-01-07,5,1,0,0,0,7,1,2012,,Else


In [26]:
# Rows that fall on a weekday and are flagged as a holiday.
df_NotweekendAndholiday = df_train.loc[
    df_train["is_weekend"].eq(0) & df_train["is_holiday"].eq(1)
]
df_NotweekendAndholiday

Unnamed: 0,date,store_nbr,item_nbr,units,date2,weekday,is_weekend,is_holiday,is_holiday_weekday,is_holiday_weekend,day,month,year,holiday_name,around_BlackFriday
4884,2012-01-02,1,1,0,2012-01-02,0,0,1,1,0,2,1,2012,,Else
4885,2012-01-02,1,2,0,2012-01-02,0,0,1,1,0,2,1,2012,,Else
4886,2012-01-02,1,3,0,2012-01-02,0,0,1,1,0,2,1,2012,,Else
4887,2012-01-02,1,4,0,2012-01-02,0,0,1,1,0,2,1,2012,,Else
4888,2012-01-02,1,5,0,2012-01-02,0,0,1,1,0,2,1,2012,,Else
4889,2012-01-02,1,6,0,2012-01-02,0,0,1,1,0,2,1,2012,,Else
4890,2012-01-02,1,7,0,2012-01-02,0,0,1,1,0,2,1,2012,,Else
4891,2012-01-02,1,8,0,2012-01-02,0,0,1,1,0,2,1,2012,,Else
4892,2012-01-02,1,9,60,2012-01-02,0,0,1,1,0,2,1,2012,,Else
4893,2012-01-02,1,10,0,2012-01-02,0,0,1,1,0,2,1,2012,,Else
