In [1]:
import pandas as pd
import numpy as np

In [2]:
items = pd.read_csv("data/items.csv")
items_categories = pd.read_csv("data/item_categories.csv")
sales_train = pd.read_csv("data/sales_train.csv")
shops = pd.read_csv("data/shops.csv")

In [3]:
items.head()

Unnamed: 0,item_name,item_id,item_category_id
0,! ВО ВЛАСТИ НАВАЖДЕНИЯ (ПЛАСТ.) D,0,40
1,!ABBYY FineReader 12 Professional Edition Full...,1,76
2,***В ЛУЧАХ СЛАВЫ (UNV) D,2,40
3,***ГОЛУБАЯ ВОЛНА (Univ) D,3,40
4,***КОРОБКА (СТЕКЛО) D,4,40


In [4]:
items_categories.head()

Unnamed: 0,item_category_name,item_category_id
0,PC - Гарнитуры/Наушники,0
1,Аксессуары - PS2,1
2,Аксессуары - PS3,2
3,Аксессуары - PS4,3
4,Аксессуары - PSP,4


In [5]:
sales_train.head()

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day
0,02.01.2013,0,59,22154,999.0,1.0
1,03.01.2013,0,25,2552,899.0,1.0
2,05.01.2013,0,25,2552,899.0,-1.0
3,06.01.2013,0,25,2554,1709.05,1.0
4,15.01.2013,0,25,2555,1099.0,1.0


In [6]:
shops.head()

Unnamed: 0,shop_name,shop_id
0,"!Якутск Орджоникидзе, 56 фран",0
1,"!Якутск ТЦ ""Центральный"" фран",1
2,"Адыгея ТЦ ""Мега""",2
3,"Балашиха ТРК ""Октябрь-Киномир""",3
4,"Волжский ТЦ ""Волга Молл""",4


In [7]:
df = pd.merge(items, items_categories, on="item_category_id", how="inner")
df = pd.merge(df, sales_train, on="item_id", how="inner")
df.head()

Unnamed: 0,item_name,item_id,item_category_id,item_category_name,date,date_block_num,shop_id,item_price,item_cnt_day
0,! ВО ВЛАСТИ НАВАЖДЕНИЯ (ПЛАСТ.) D,0,40,Кино - DVD,01.09.2014,20,54,58.0,1.0
1,***В ЛУЧАХ СЛАВЫ (UNV) D,2,40,Кино - DVD,24.08.2014,19,54,58.0,1.0
2,***В ЛУЧАХ СЛАВЫ (UNV) D,2,40,Кино - DVD,12.11.2014,22,54,58.0,1.0
3,***ГОЛУБАЯ ВОЛНА (Univ) D,3,40,Кино - DVD,05.07.2014,18,54,100.0,1.0
4,***ГОЛУБАЯ ВОЛНА (Univ) D,3,40,Кино - DVD,26.08.2014,19,54,58.0,1.0


In [10]:
def label_df(df, month_to_label):
    to_label = df[df["date_block_num"] == month_to_label]
    labeled = to_label.groupby(["shop_id", "item_id"]).agg({"item_cnt_day":"sum"}).reset_index()
    return labeled

## DEFINE FEATURES PARAMETERS

In [8]:
month_to_predict = 32
max_train_month = 31

In [9]:
df = df[df["date_block_num"] < month_to_predict]
assert(df["date_block_num"].max() == max_train_month)

## FEATURES

In [10]:
df.head()

Unnamed: 0,item_name,item_id,item_category_id,item_category_name,date,date_block_num,shop_id,item_price,item_cnt_day
0,! ВО ВЛАСТИ НАВАЖДЕНИЯ (ПЛАСТ.) D,0,40,Кино - DVD,01.09.2014,20,54,58.0,1.0
1,***В ЛУЧАХ СЛАВЫ (UNV) D,2,40,Кино - DVD,24.08.2014,19,54,58.0,1.0
2,***В ЛУЧАХ СЛАВЫ (UNV) D,2,40,Кино - DVD,12.11.2014,22,54,58.0,1.0
3,***ГОЛУБАЯ ВОЛНА (Univ) D,3,40,Кино - DVD,05.07.2014,18,54,100.0,1.0
4,***ГОЛУБАЯ ВОЛНА (Univ) D,3,40,Кино - DVD,26.08.2014,19,54,58.0,1.0


In [11]:
df.date_block_num.value_counts().index.max()

31

### Of categories

#### Items sold from the same categorie in the same shop last month

In [15]:
items_same_cat_and_shop = df[df.date_block_num == max_train_month].groupby(["shop_id", "item_category_id"]).agg({"item_cnt_day":"sum"}).reset_index().rename(columns={"item_cnt_day":"sold_of_category_last_month"})
items_same_cat_and_shop.head()

Unnamed: 0,shop_id,item_category_id,sold_of_category_last_month
0,2,3,13.0
1,2,5,2.0
2,2,6,7.0
3,2,7,4.0
4,2,12,1.0


In [16]:
print(len(df))
df = pd.merge(df, items_same_cat_and_shop, on=["shop_id", "item_category_id"], how="left")
print(len(df))
df.sample(10)

2831747
2831747


Unnamed: 0,item_name,item_id,item_category_id,item_category_name,date,date_block_num,shop_id,item_price,item_cnt_day,sold_of_category_last_month
1306477,"Sims 3 (обновлённое издание) [PC, русская версия]",6466,30,Игры PC - Стандартные издания,22.10.2013,9,15,699.0,1.0,193.0
2489957,"Мягкая игрушка Minecraft Creeper 7""",15284,63,Подарки - Мягкие игрушки,29.08.2014,19,27,798.5,1.0,
1115911,"Battlefield Bad Company 2 [PC, Jewel, русская ...",1871,30,Игры PC - Стандартные издания,16.03.2013,2,59,299.0,1.0,221.0
2196009,"Total War: Rome 2. Классическое издание [PC, р...",7098,28,Игры PC - Дополнительные издания,19.09.2013,8,44,999.0,1.0,13.0
2544696,"Настольная игра Манчкин Цветная версия, арт. 1031",16167,64,Подарки - Настольные игры,27.12.2013,11,38,659.0,1.0,29.0
705056,V/A Dubstep Bass & Movement 3 2CD (digipack),7341,55,Музыка - CD локального производства,05.09.2013,8,50,299.0,1.0,92.0
820570,НОГГАНО Тёплый,15780,55,Музыка - CD локального производства,15.11.2014,22,14,199.0,1.0,125.0
505983,ХОББИТ: ПУСТОШЬ СМАУГА (регион),21386,40,Кино - DVD,29.01.2015,24,54,149.0,1.0,
1361662,"Wargame: AirLand Battle [PC, Jewel, русские су...",7739,30,Игры PC - Стандартные издания,09.06.2013,5,46,499.0,1.0,173.0
2621772,X360: Комплект зарядный для черного геймпада -...,7895,6,Аксессуары - XBOX 360,22.03.2014,14,52,749.0,1.0,7.0


In [17]:
df["sold_of_category_last_month"] = df["sold_of_category_last_month"].fillna(0)
df.head()

Unnamed: 0,item_name,item_id,item_category_id,item_category_name,date,date_block_num,shop_id,item_price,item_cnt_day,sold_of_category_last_month
0,! ВО ВЛАСТИ НАВАЖДЕНИЯ (ПЛАСТ.) D,0,40,Кино - DVD,01.09.2014,20,54,58.0,1.0,0.0
1,***В ЛУЧАХ СЛАВЫ (UNV) D,2,40,Кино - DVD,24.08.2014,19,54,58.0,1.0,0.0
2,***В ЛУЧАХ СЛАВЫ (UNV) D,2,40,Кино - DVD,12.11.2014,22,54,58.0,1.0,0.0
3,***ГОЛУБАЯ ВОЛНА (Univ) D,3,40,Кино - DVD,05.07.2014,18,54,100.0,1.0,0.0
4,***ГОЛУБАЯ ВОЛНА (Univ) D,3,40,Кино - DVD,26.08.2014,19,54,58.0,1.0,0.0


#### Total shops that sell items with that category

In [18]:
shops_of_category = df.groupby("item_category_id").agg({"shop_id":"nunique"}).reset_index().rename(columns={"shop_id":"total_shops_of_category"})
print(len(items_categories))
print(len(shops_of_category))
shops_of_category.sample(15)

84
84


Unnamed: 0,item_category_id,total_shops_of_category
24,24,51
79,79,50
42,42,43
83,83,48
49,49,55
59,59,48
14,14,55
35,35,57
29,29,57
13,13,46


In [19]:
df = pd.merge(df, shops_of_category, on="item_category_id", how="inner")
df.sample(10)

Unnamed: 0,item_name,item_id,item_category_id,item_category_name,date,date_block_num,shop_id,item_price,item_cnt_day,sold_of_category_last_month,total_shops_of_category
1804045,Круг Ирина и Михаил Семейный Альбом (mp3-CD) ...,13607,57,Музыка - MP3,19.05.2013,4,37,299.0,1.0,0.0,50
1308472,"Sims 3 (обновлённое издание) [PC, русская версия]",6466,30,Игры PC - Стандартные издания,31.05.2014,16,21,699.0,1.0,133.0,57
1821025,Третьяков Виктор Ваш Третьяков Официальный б...,20006,57,Музыка - MP3,10.05.2013,4,27,299.0,1.0,0.0,50
2824456,Прием денежных средств для 1С-Онлайн,17717,79,Служебные,13.07.2015,30,37,399.0,1.0,4.0,50
173363,ИГРА В ПРАВДУ,11915,40,Кино - DVD,03.01.2014,12,28,299.0,1.0,659.0,57
1402866,"Машины Сказки. Выпуск 2 [PC, Jewel, русская ве...",15020,30,Игры PC - Стандартные издания,20.04.2013,3,50,149.0,1.0,102.0,57
2632028,PlayStation Plus 3-месячная подписка: Карта оп...,5821,35,Карты оплаты - PSN,27.04.2014,15,3,649.0,2.0,17.0,57
2732144,Фирменный пакет майка 1С Интерес белый (34*42)...,20949,71,"Подарки - Сумки, Альбомы, Коврики д/мыши",23.06.2014,17,31,5.0,9.0,436.0,53
85610,ВОЙНА МИРОВ Z,9983,40,Кино - DVD,04.11.2013,10,28,399.0,4.0,659.0,57
332996,ОТТЕПЕЛЬ (4DVD),16700,40,Кино - DVD,14.06.2014,17,29,599.0,1.0,0.0,57


### Of shops 

#### Total categories by shop

In [24]:
t_cat_by_shop = df.groupby("shop_id").agg({"item_category_id":"nunique"}).reset_index().rename(columns={"item_category_id":"total_categories_of_shop"})
t_cat_by_shop.head()

print(len(df))
df = pd.merge(df, t_cat_by_shop, on="shop_id", how="inner")
print(len(df))
df.head()

2831747
2831747


Unnamed: 0,item_name,item_id,item_category_id,item_category_name,date,date_block_num,shop_id,item_price,item_cnt_day,sold_of_category_last_month,total_shops_of_category,total_categories_of_shop
0,! ВО ВЛАСТИ НАВАЖДЕНИЯ (ПЛАСТ.) D,0,40,Кино - DVD,01.09.2014,20,54,58.0,1.0,0.0,57,59
1,***В ЛУЧАХ СЛАВЫ (UNV) D,2,40,Кино - DVD,24.08.2014,19,54,58.0,1.0,0.0,57,59
2,***В ЛУЧАХ СЛАВЫ (UNV) D,2,40,Кино - DVD,12.11.2014,22,54,58.0,1.0,0.0,57,59
3,***ГОЛУБАЯ ВОЛНА (Univ) D,3,40,Кино - DVD,05.07.2014,18,54,100.0,1.0,0.0,57,59
4,***ГОЛУБАЯ ВОЛНА (Univ) D,3,40,Кино - DVD,26.08.2014,19,54,58.0,1.0,0.0,57,59


#### Number of items sold by shop in the last month

In [25]:
items_sold_last_month = df[df["date_block_num"] == max_train_month].groupby("shop_id").agg({"item_cnt_day":"sum"}).reset_index().rename(columns={"item_cnt_day":"items_sold_last_month"})
items_sold_last_month.head()

Unnamed: 0,shop_id,items_sold_last_month
0,2,942.0
1,3,666.0
2,4,947.0
3,5,1294.0
4,6,1575.0


In [26]:
df = pd.merge(df, items_sold_last_month, on="shop_id", how="left")
df["items_sold_last_month"] = df["items_sold_last_month"].fillna(0) 
df.head()

Unnamed: 0,item_name,item_id,item_category_id,item_category_name,date,date_block_num,shop_id,item_price,item_cnt_day,sold_of_category_last_month,total_shops_of_category,total_categories_of_shop,items_sold_last_month
0,! ВО ВЛАСТИ НАВАЖДЕНИЯ (ПЛАСТ.) D,0,40,Кино - DVD,01.09.2014,20,54,58.0,1.0,0.0,57,59,0.0
1,***В ЛУЧАХ СЛАВЫ (UNV) D,2,40,Кино - DVD,24.08.2014,19,54,58.0,1.0,0.0,57,59,0.0
2,***В ЛУЧАХ СЛАВЫ (UNV) D,2,40,Кино - DVD,12.11.2014,22,54,58.0,1.0,0.0,57,59,0.0
3,***ГОЛУБАЯ ВОЛНА (Univ) D,3,40,Кино - DVD,05.07.2014,18,54,100.0,1.0,0.0,57,59,0.0
4,***ГОЛУБАЯ ВОЛНА (Univ) D,3,40,Кино - DVD,26.08.2014,19,54,58.0,1.0,0.0,57,59,0.0


#### Item sold in shop last month

In [27]:
item_purchases_by_shop_last_month = df[df["date_block_num"] == max_train_month].groupby(["shop_id", "item_id"]).agg({"item_cnt_day":"sum"}).reset_index().rename(columns={"item_cnt_day":"purchases_item_in_shop_last_month"})
item_purchases_by_shop_last_month.head()

Unnamed: 0,shop_id,item_id,purchases_item_in_shop_last_month
0,2,32,1.0
1,2,70,1.0
2,2,482,1.0
3,2,792,2.0
4,2,806,1.0


In [28]:
df = pd.merge(df, item_purchases_by_shop_last_month, on=["shop_id", "item_id"], how="left")
df["purchases_item_in_shop_last_month"] = df["purchases_item_in_shop_last_month"].fillna(0)
df.sample(10)

Unnamed: 0,item_name,item_id,item_category_id,item_category_name,date,date_block_num,shop_id,item_price,item_cnt_day,sold_of_category_last_month,total_shops_of_category,total_categories_of_shop,items_sold_last_month,purchases_item_in_shop_last_month
2391257,СУМЕРКИ. САГА. РАССВЕТ: ЧАСТЬ 2 (2DVD),19196,40,Кино - DVD,11.04.2013,3,28,398.5,1.0,659.0,57,60,3749.0,0.0
412565,Playstation Store пополнение бумажника: Карта ...,5822,35,Карты оплаты - PSN,23.06.2015,29,25,1149.0,1.0,33.0,57,65,4675.0,15.0
221366,LINKIN PARK Road To Revolution Live At Milto...,4430,55,Музыка - CD локального производства,09.03.2013,2,44,399.0,1.0,87.0,55,62,769.0,0.0
230156,"Mass Effect 3 [PC, русские субтитры]",4779,30,Игры PC - Стандартные издания,27.06.2013,5,44,299.0,1.0,86.0,57,62,769.0,0.0
1138418,Комплект «Sony PS3 Super Slim Red (500 Gb) (CE...,13496,11,Игровые консоли - PS3,15.04.2013,3,18,12490.0,1.0,1.0,54,62,1184.0,0.0
2284211,СБ. Танцевальный рай 32,18475,55,Музыка - CD локального производства,04.01.2014,12,30,199.0,1.0,0.0,55,59,0.0,0.0
1929111,ПАТРУЛЬ ВРЕМЕНИ (BD),16923,37,Кино - Blu-Ray,27.02.2015,25,46,599.0,1.0,99.0,57,60,1670.0,0.0
312503,Naruto Shippuden: Ultimate Ninja Storm 3 Day 1...,5208,19,Игры - PS3,24.08.2013,7,25,1223.99,1.0,153.0,57,65,4675.0,0.0
324270,"Resident Evil 6 [PC, Jewel, русские субтитры]",6121,30,Игры PC - Стандартные издания,02.11.2014,22,25,299.0,1.0,264.0,57,65,4675.0,0.0
755851,Фирменный пакет майка 1С Интерес белый (34*42)...,20949,71,"Подарки - Сумки, Альбомы, Коврики д/мыши",16.05.2013,4,52,5.0,1.0,10.0,53,60,932.0,10.0


#### Percentage of shop corresponding to specfic item

In [29]:
df["item_share"] = df["purchases_item_in_shop_last_month"] / df["items_sold_last_month"]
df["item_share"] = df["item_share"].fillna(0)
df.sample(7)

Unnamed: 0,item_name,item_id,item_category_id,item_category_name,date,date_block_num,shop_id,item_price,item_cnt_day,sold_of_category_last_month,total_shops_of_category,total_categories_of_shop,items_sold_last_month,purchases_item_in_shop_last_month,item_share
2084561,"Sims 4 [PC, русская версия]",6503,30,Игры PC - Стандартные издания,04.01.2015,24,38,1799.0,1.0,197.0,57,63,1781.0,9.0,0.005053
1763715,Управление заказами в системе программ 1С:Пред...,20185,49,Книги - Методические материалы 1С,11.01.2013,0,12,240.0,1.0,122.0,55,63,1471.0,0.0,0.0
243236,Мягкая игрушка Plants vs Zombies ГорохоСтрел 3...,15297,63,Подарки - Мягкие игрушки,05.05.2015,28,44,959.0,1.0,38.0,58,62,769.0,0.0,0.0
2242589,"Call of Duty: Black Ops II [Xbox 360, русская ...",2309,23,Игры - XBOX 360,16.03.2013,2,26,2799.0,1.0,43.0,57,61,1189.0,0.0,0.0
274751,ПУАРО. Сезон 11,17509,40,Кино - DVD,14.11.2014,22,25,499.0,1.0,665.0,57,65,4675.0,0.0,0.0
2497102,МТГ(РУС): Лабиринт Дракона (Dragon`s Maze): Бу...,14841,65,Подарки - Настольные игры (компактные),25.07.2013,6,28,199.0,3.0,103.0,58,60,3749.0,0.0,0.0
2193099,"Мяч-лизун с прорисовкой ""Сердитые птички""",15527,70,Подарки - Сувениры (в навеску),05.10.2013,9,27,299.0,1.0,0.0,58,60,0.0,0.0,0.0


#### Historical average of sales for shop in specific item

In [30]:
sales_of_item_in_shop_by_month = df.groupby(["item_id", "shop_id", "date_block_num"]).agg({"item_cnt_day":"sum"}).reset_index().rename(columns={"item_cnt_day":"sales_in_month"})
sales_of_item_in_shop_by_month.sample(5)

Unnamed: 0,item_id,shop_id,date_block_num,sales_in_month
153817,2545,42,6,1.0
40874,1104,45,13,1.0
1182864,16220,51,16,1.0
3855,53,31,12,1.0
1432012,20019,31,17,1.0


In [31]:
# careful: only considering months that have al least one purchase.
avg_sales_of_item_in_shop = sales_of_item_in_shop_by_month.groupby(["item_id", "shop_id"]).agg({"sales_in_month": "mean"}).reset_index().rename(columns={"sales_in_month":"average_sales_for_item_and_shop_by_month"})
avg_sales_of_item_in_shop.sample(10)

Unnamed: 0,item_id,shop_id,average_sales_for_item_and_shop_by_month
162855,9468,43,1.6
47918,3181,57,1.2
82342,5045,7,1.166667
139747,8132,54,1.0
315392,17043,58,2.0
351539,18795,56,2.0
294462,16025,38,1.0
376656,20031,19,1.0
95626,5832,43,2.5
252517,14021,46,1.0


In [32]:
df = pd.merge(df, avg_sales_of_item_in_shop, on=["item_id", "shop_id"], how="inner")
df.sample(7)

Unnamed: 0,item_name,item_id,item_category_id,item_category_name,date,date_block_num,shop_id,item_price,item_cnt_day,sold_of_category_last_month,total_shops_of_category,total_categories_of_shop,items_sold_last_month,purchases_item_in_shop_last_month,item_share,average_sales_for_item_and_shop_by_month
2101304,Набор для плетения браслетов оригинальный Rain...,15857,70,Подарки - Сувениры (в навеску),01.05.2015,28,38,1499.0,1.0,59.0,58,63,1781.0,1.0,0.000561,3.6
710909,"Комплект Праздник Спорта 2 [PS3, русская верси...",13517,2,Аксессуары - PS3,27.02.2013,1,50,2699.0,1.0,7.0,54,59,1081.0,0.0,0.0,2.666667
2770343,КУЗЬМИН ВЛАДИМИР Ангелы - мечты,13024,55,Музыка - CD локального производства,12.10.2014,21,33,199.0,1.0,0.0,55,47,0.0,0.0,0.0,1.0
2508870,БОЛЬШАЯ СВАДЬБА,8974,40,Кино - DVD,01.09.2013,8,57,399.0,1.0,809.0,57,61,2780.0,0.0,0.0,5.0
1520343,"World of Warcraft + Burning Crusade [PC, Jewel...",7850,30,Игры PC - Стандартные издания,27.06.2013,5,4,199.0,1.0,139.0,57,61,947.0,0.0,0.0,3.875
942802,"Sims 3: Райские острова (дополнение) [PC, русс...",6498,28,Игры PC - Дополнительные издания,18.04.2014,15,47,699.0,1.0,47.0,57,60,2003.0,0.0,0.0,7.47619
2477589,Assassin's Creed IV. Черный флаг. Black Chest ...,1498,29,Игры PC - Коллекционные издания,22.11.2013,10,28,4299.0,1.0,0.0,57,60,3749.0,0.0,0.0,1.5


### Of items

#### Time since last purchase of item in shop

In [33]:
with_last_purchase_month = df.groupby(["item_id", "shop_id"]).agg({"date_block_num":"max"}).reset_index().rename(columns={"date_block_num":"month_last_purchase"})
with_last_purchase_month.sample(10)

Unnamed: 0,item_id,shop_id,month_last_purchase
140553,8179,25,13
389246,20717,47,31
20290,1708,57,21
122027,7245,28,18
257315,14258,47,23
9229,1061,46,23
306079,16556,48,26
149573,8666,27,17
69105,4337,10,23
393026,21285,9,21


In [34]:
df = pd.merge(df, with_last_purchase_month, on=["shop_id", "item_id"], how="inner")
df.sample(10)

Unnamed: 0,item_name,item_id,item_category_id,item_category_name,date,date_block_num,shop_id,item_price,item_cnt_day,sold_of_category_last_month,total_shops_of_category,total_categories_of_shop,items_sold_last_month,purchases_item_in_shop_last_month,item_share,average_sales_for_item_and_shop_by_month,month_last_purchase
85951,ТЕРМИНАТОР (BD),19650,37,Кино - Blu-Ray,21.09.2013,8,54,499.0,1.0,0.0,57,59,0.0,0.0,0.0,1.0,21
1197869,PlayStation Plus 3-месячная подписка: Карта оп...,5821,35,Карты оплаты - PSN,03.07.2015,30,19,1099.0,1.0,20.0,57,60,1533.0,8.0,0.005219,5.15625,31
2644287,БАРБИ И ЕЕ СЕСТРЫ В СКАЗКЕ О ПОНИ,8624,40,Кино - DVD,14.08.2015,31,58,149.0,1.0,433.0,57,58,1738.0,5.0,0.002877,2.882353,31
1779974,Доставка до постомата (PickPoint),11372,9,Доставка товара,05.06.2015,29,12,260.666667,6.0,276.0,1,63,1471.0,33.0,0.022434,44.555556,31
2286498,Medieval Moves Боевые кости (Essentials) (толь...,4814,19,Игры - PS3,06.01.2013,0,30,899.0,1.0,0.0,57,59,0.0,0.0,0.0,1.4,14
290334,GREEN DAY Dos,3620,55,Музыка - CD локального производства,09.07.2013,6,25,299.0,1.0,421.0,55,65,4675.0,0.0,0.0,1.555556,28
41534,OFFSPRING Days Go By,5306,55,Музыка - CD локального производства,09.08.2014,19,54,299.0,1.0,0.0,55,59,0.0,0.0,0.0,1.818182,22
2674445,"Metal Gear Solid V: Ground Zeroes [PS3, русски...",4835,19,Игры - PS3,03.04.2014,15,58,1699.0,1.0,77.0,57,58,1738.0,0.0,0.0,3.0,23
2503864,Футболка WOT ИС-4-4 мужская черная L (100026),21274,61,Подарки - Атрибутика,12.09.2014,20,28,449.0,1.0,68.0,54,60,3749.0,0.0,0.0,2.25,23
711486,PS4: Контроллер игровой беспроводной черный (D...,5672,3,Аксессуары - PS4,12.04.2014,15,50,3190.0,1.0,18.0,56,59,1081.0,0.0,0.0,6.238095,30


#### Max, mean and min item price

In [35]:
with_price_features = df.groupby("item_id").agg({"item_price":["max", "mean", "min"]}).reset_index()
with_price_features.columns = ['_'.join(col).strip() for col in with_price_features.columns.values]
with_price_features.head()

Unnamed: 0,item_id_,item_price_max,item_price_mean,item_price_min
0,0,58.0,58.0,58.0
1,1,4490.0,4490.0,4490.0
2,2,58.0,58.0,58.0
3,3,100.0,79.0,58.0
4,4,58.0,58.0,58.0


In [36]:
with_price_features["diff_price_max_min"] = with_price_features["item_price_max"] - with_price_features["item_price_min"]
with_price_features.head()

Unnamed: 0,item_id_,item_price_max,item_price_mean,item_price_min,diff_price_max_min
0,0,58.0,58.0,58.0,0.0
1,1,4490.0,4490.0,4490.0,0.0
2,2,58.0,58.0,58.0,0.0
3,3,100.0,79.0,58.0,42.0
4,4,58.0,58.0,58.0,0.0


In [37]:
df = pd.merge(df, with_price_features, left_on="item_id", right_on="item_id_", how="inner")
df.sample(7)

Unnamed: 0,item_name,item_id,item_category_id,item_category_name,date,date_block_num,shop_id,item_price,item_cnt_day,sold_of_category_last_month,...,items_sold_last_month,purchases_item_in_shop_last_month,item_share,average_sales_for_item_and_shop_by_month,month_last_purchase,item_id_,item_price_max,item_price_mean,item_price_min,diff_price_max_min
812088,СБ. Лучшие песни шансона за 20 лет,18364,55,Музыка - CD локального производства,02.10.2013,9,52,199.0,1.0,110.0,...,932.0,0.0,0.0,1.25,20,18364,199.0,190.124084,110.0,89.0
1643871,ПРОЕКТ Х: ДОРВАЛИСЬ (BD),17464,37,Кино - Blu-Ray,20.12.2013,11,21,999.0,1.0,105.0,...,1622.0,0.0,0.0,1.0,11,17464,999.0,567.092,159.84,839.16
1291647,"Syndicate [PC, русские субтитры]",6777,30,Игры PC - Стандартные издания,30.12.2013,11,50,98.0,1.0,102.0,...,1081.0,0.0,0.0,1.4,13,6777,499.0,185.853854,59.0,440.0
733814,ЗЕМФИРА 3CD (фирм.),11707,55,Музыка - CD локального производства,02.02.2013,1,50,699.0,1.0,92.0,...,1081.0,0.0,0.0,1.333333,11,11707,699.0,696.911189,558.6,140.4
1712154,ЗФОК МЮЗИКЛЫ Ч.1,11800,41,Кино - Коллекционное,11.10.2014,21,42,199.0,1.0,26.0,...,3832.0,0.0,0.0,1.0,27,11800,199.0,177.644444,135.0,64.0
874411,"Call of Duty. Ghosts [PS3, русская версия]",2267,19,Игры - PS3,11.09.2014,20,58,1999.0,1.0,77.0,...,1738.0,1.0,0.000575,1.947368,31,2267,2599.0,2310.020818,1260.0,1339.0
2392908,Мягкая игрушка СОЮЗМУЛЬТФИЛЬМ Пятачок музыкальный,15455,63,Подарки - Мягкие игрушки,13.04.2015,27,16,849.0,1.0,62.0,...,1231.0,0.0,0.0,1.0,29,15455,849.0,555.124959,300.0,549.0


In [38]:
features = df.groupby(["item_id", "shop_id"]).agg({
    "item_category_id": "max",
    "total_categories_of_shop": "max",
    "sold_of_category_last_month": "max",
    "total_shops_of_category": "max",
    "items_sold_last_month": "max",
    "purchases_item_in_shop_last_month": "max",
    "item_share": "max",
    "average_sales_for_item_and_shop_by_month": "max",
    "month_last_purchase": "max",
    "item_price_max": "max",
    "item_price_mean": "mean",
    "item_price_min": "min",
    "diff_price_max_min": "max"
}).reset_index()

In [40]:
features.to_csv("generated/features_train.csv", index=False)