In [1]:
import pandas as pd
import joblib as jb

from math import modf

In [2]:
def downgrade_or_upgrade_if_necessary(x):
    m = modf(x)
    if m[0] <= 0.5:
        return m[1]
    if m[0] > 0.5:
        return m[1] + 1
    return x

def round_submission_dataframe(sub):
    sub["item_cnt_month"] = sub["item_cnt_month"].map(lambda x: downgrade_or_upgrade_if_necessary(x))
    return sub.sort_values(by="ID")

In [3]:
items = pd.read_csv("data/items.csv")

In [4]:
df = pd.read_csv("data/test.csv")
df.head()

Unnamed: 0,ID,shop_id,item_id
0,0,5,5037
1,1,5,5320
2,2,5,5233
3,3,5,5232
4,4,5,5268


In [5]:
df = pd.merge(df, items, on="item_id", how="inner").drop(columns=["item_name"])
df.sample(10)

Unnamed: 0,ID,shop_id,item_id,item_category_id
71600,164904,36,2883,25
133977,202089,44,15870,64
205435,71191,24,15595,40
56069,210434,45,1389,20
204657,173172,37,17448,40
48582,154156,55,8816,64
30036,31315,10,13701,69
117644,13001,6,19900,40
26444,133229,48,3916,55
182957,29856,7,6439,27


In [6]:
features_folder = "generated/features_prediction/"

features_dict = {
    features_folder + "features_shop_and_categories.csv":["shop_id", "item_category_id"],
    features_folder + "features_categories.csv":"item_category_id",
    features_folder + "features_shops.csv":"shop_id",
    features_folder + "features_items_and_shop.csv": ["item_id", "shop_id"],
    features_folder + "features_items.csv":"item_id"    
}

In [7]:
print(len(df))
for features_file in features_dict:
    df_features = pd.read_csv(features_file)
    joining_cols = features_dict[features_file]
    df = pd.merge(df, df_features, on=joining_cols, how="left")
print(len(df))
df.sample(10)

214200
214200


Unnamed: 0,ID,shop_id,item_id,item_category_id,hist_sales_sum_cat_by_shop,hist_sales_mean_cat_by_shop,hist_sales_std_cat_by_shop,hist_sales_min_cat_by_shop,hist_sales_max_cat_by_shop,sold_cat_last_month_by_shop,...,diff_price_max_min,total_shops_item_is_sell,sales_item_historically_mean,sales_item_mean_ten,months_item_has_sales,total_sales_with_1_units,total_sales_with_2_units,total_sales_with_3_units,total_sales_with_4_units,total_sales_with_5_units
120242,196662,41,10405,38,1378.0,40.529412,19.248353,13.0,119.0,16.0,...,901.0,24.0,2.375,1.444444,24.0,57.0,,,,
4963,35818,12,9355,70,520.0,16.774194,22.467324,1.0,106.0,15.0,...,199.0,51.0,295.642857,320.0,14.0,1706.0,538.0,221.0,101.0,58.0
159703,100702,19,3335,23,2440.0,71.764706,26.356597,20.0,158.0,38.0,...,1900.0,50.0,128.884615,34.181818,26.0,2218.0,324.0,93.0,34.0,14.0
180380,167494,36,10215,31,,,,,,,...,0.0,1.0,13.333333,13.333333,6.0,43.0,7.0,2.0,3.0,1.0
58275,108487,50,5065,55,8990.0,264.411765,129.318755,86.0,501.0,111.0,...,129.0,44.0,9.205882,6.363636,34.0,311.0,1.0,,,
209254,55982,26,2664,58,107.0,3.821429,2.776269,1.0,10.0,2.0,...,200.0,4.0,2.0,2.0,2.0,4.0,,,,
209291,30483,7,454,45,4.0,1.333333,0.57735,1.0,2.0,,...,,,,,,,,,,
47261,57225,25,5581,5,468.0,13.764706,9.065615,2.0,39.0,5.0,...,612.06,52.0,18.058824,14.0,34.0,588.0,13.0,,,
103418,73862,21,4956,75,512.0,15.058824,6.527059,3.0,26.0,11.0,...,230.0,18.0,3.0,3.0,11.0,33.0,,,,
202233,20115,3,1541,29,167.0,5.21875,5.116289,1.0,26.0,1.0,...,1000.0,48.0,45.4,45.4,10.0,297.0,35.0,16.0,6.0,3.0


In [8]:
month_to_predict = 34

df["hist_sales_sum_cat_by_shop"] = df["hist_sales_sum_cat_by_shop"].fillna(0)
df["hist_sales_mean_cat_by_shop"] = df["hist_sales_mean_cat_by_shop"].fillna(0)
df["hist_sales_max_cat_by_shop"] = df["hist_sales_max_cat_by_shop"].fillna(0)
df["hist_sales_min_cat_by_shop"] = df["hist_sales_min_cat_by_shop"].fillna(0)
df["sold_cat_last_month_by_shop"] = df["sold_cat_last_month_by_shop"].fillna(0)
df["tot_cat_by_shops"] = df["tot_cat_by_shops"].fillna(df["tot_cat_by_shops"].mean())
df["items_sold_last_month"] = df["items_sold_last_month"].fillna(0)
df["hist_sales_mean_item_by_shop"] = df["hist_sales_mean_item_by_shop"].fillna(0)
df["hist_sales_sum_item_by_shop"] = df["hist_sales_sum_item_by_shop"].fillna(0)
df["hist_sales_max_item_by_shop"] = df["hist_sales_max_item_by_shop"].fillna(0)
df["hist_sales_min_item_by_shop"] = df["hist_sales_min_item_by_shop"].fillna(0)
df["purchases_item_in_shop_last_month"] = df["purchases_item_in_shop_last_month"].fillna(0)
for x in range(5):
    df["purchases_item_in_shop_month_" + str(x+2)].fillna(0, inplace=True)
df["total_months_with_sells_by_item_and_shop"] = df["total_months_with_sells_by_item_and_shop"].fillna(0)    
for x in range(2, 7):
    df["items_sold_by_shop_last_" + str(x) + "_months"] = df["items_sold_by_shop_last_" + str(x) + "_months"].fillna(0) 
for x in range(1, 6):
    df["total_sales_with_" + str(x) +"_units"] = df["total_sales_with_" + str(x) +"_units"].fillna(0)
df.sample(10)

Unnamed: 0,ID,shop_id,item_id,item_category_id,hist_sales_sum_cat_by_shop,hist_sales_mean_cat_by_shop,hist_sales_std_cat_by_shop,hist_sales_min_cat_by_shop,hist_sales_max_cat_by_shop,sold_cat_last_month_by_shop,...,diff_price_max_min,total_shops_item_is_sell,sales_item_historically_mean,sales_item_mean_ten,months_item_has_sales,total_sales_with_1_units,total_sales_with_2_units,total_sales_with_3_units,total_sales_with_4_units,total_sales_with_5_units
171712,85688,16,11309,61,131.0,6.238095,5.252664,1.0,17.0,6.0,...,0.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0
125054,104977,42,7741,30,12199.0,358.794118,110.161804,160.0,609.0,160.0,...,251.0,49.0,26.333333,18.181818,18.0,428.0,18.0,2.0,1.0,0.0
146743,192193,46,6238,55,8048.0,236.705882,83.769355,108.0,403.0,108.0,...,126.0,49.0,14.411765,15.363636,34.0,488.0,1.0,0.0,0.0,0.0
164382,187513,34,1090,55,56.0,3.733333,2.987275,1.0,12.0,1.0,...,134.0,26.0,11.2,11.2,5.0,54.0,1.0,0.0,0.0,0.0
161238,3839,5,16885,37,2022.0,61.272727,18.850187,32.0,104.0,39.0,...,950.0,35.0,4.142857,1.0,21.0,87.0,0.0,0.0,0.0,0.0
75459,139496,57,15459,63,1325.0,41.40625,35.72553,19.0,159.0,28.0,...,1006.63,45.0,11.852941,8.363636,34.0,384.0,8.0,1.0,0.0,0.0
211974,5047,5,9012,55,4892.0,148.242424,35.075838,84.0,213.0,106.0,...,125.0,41.0,7.947368,4.090909,19.0,146.0,1.0,1.0,0.0,0.0
9655,188929,46,7462,55,8048.0,236.705882,83.769355,108.0,403.0,108.0,...,0.0,3.0,4.0,4.0,1.0,4.0,0.0,0.0,0.0,0.0
4723,97012,19,4054,19,6226.0,183.117647,80.322745,63.0,388.0,80.0,...,0.0,6.0,10.0,10.0,1.0,8.0,1.0,0.0,0.0,0.0
161285,29340,7,16953,40,9678.0,284.647059,115.85751,103.0,519.0,103.0,...,103.18,48.0,8.205882,6.090909,34.0,279.0,0.0,0.0,0.0,0.0


In [9]:
df.isna().sum()

ID                            0
shop_id                       0
item_id                       0
item_category_id              0
hist_sales_sum_cat_by_shop    0
                             ..
total_sales_with_1_units      0
total_sales_with_2_units      0
total_sales_with_3_units      0
total_sales_with_4_units      0
total_sales_with_5_units      0
Length: 70, dtype: int64

### Merge with mean encodings

In [10]:
category_encoding = jb.load('encodings/category_id_me.pkl')
df.loc[:,'item_category_id_m_e'] = df['item_category_id'].map(category_encoding)

item_encoding = jb.load('encodings/item_id_me.pkl')
df.loc[:,'item_id_m_e'] = df['item_id'].map(item_encoding)

shop_encoding = jb.load('encodings/shop_id_me.pkl')
df.loc[:,'shop_id_m_e'] = df['shop_id'].map(shop_encoding)

df[['item_category_id', 'item_category_id_m_e', 'item_id', 'item_id_m_e', 'shop_id', 'shop_id_m_e']].sample(5)

Unnamed: 0,item_category_id,item_category_id_m_e,item_id,item_id_m_e,shop_id,shop_id_m_e
141151,64,1.366438,16179,1.550729,56,1.528745
26947,19,1.924056,3868,1.749698,47,1.85745
155508,72,1.471186,20669,1.758552,52,1.461682
17390,24,1.567871,2297,1.565676,6,1.58092
66151,24,1.567871,5042,1.576043,4,1.433148


In [11]:
df["item+shop"] = df["item_id"].apply(str) + "_" + df["shop_id"].apply(str)
df["item_cat+shop"] = df["item_category_id"].apply(str) + "_" + df["shop_id"].apply(str)

item_cat_shop_encoding = jb.load("encodings/item_cat_shop_encoding_me.pkl")

df.loc[:, 'item_cat+shop_m_e'] = df['item_cat+shop'].map(item_cat_shop_encoding)

df.head()

Unnamed: 0,ID,shop_id,item_id,item_category_id,hist_sales_sum_cat_by_shop,hist_sales_mean_cat_by_shop,hist_sales_std_cat_by_shop,hist_sales_min_cat_by_shop,hist_sales_max_cat_by_shop,sold_cat_last_month_by_shop,...,total_sales_with_2_units,total_sales_with_3_units,total_sales_with_4_units,total_sales_with_5_units,item_category_id_m_e,item_id_m_e,shop_id_m_e,item+shop,item_cat+shop,item_cat+shop_m_e
0,0,5,5037,19,3141.0,95.181818,34.12885,48.0,206.0,76.0,...,95.0,10.0,9.0,5.0,1.924056,2.355717,1.567931,5037_5,19_5,1.912984
1,5100,4,5037,19,3112.0,91.529412,49.3431,16.0,215.0,34.0,...,95.0,10.0,9.0,5.0,1.924056,2.355717,1.433148,5037_4,19_4,1.622122
2,10200,6,5037,19,5946.0,174.882353,100.191563,45.0,447.0,54.0,...,95.0,10.0,9.0,5.0,1.924056,2.355717,1.58092,5037_6,19_6,1.652957
3,15300,3,5037,19,2718.0,79.941176,25.399763,40.0,134.0,45.0,...,95.0,10.0,9.0,5.0,1.924056,2.355717,1.544825,5037_3,19_3,1.580529
4,20400,2,5037,19,4434.0,130.411765,55.938341,43.0,283.0,70.0,...,95.0,10.0,9.0,5.0,1.924056,2.355717,1.633023,5037_2,19_2,2.043604


In [12]:
bag_of_words = pd.read_csv("generated/generals/features_items_bow.csv")
bag_of_words.head()

Unnamed: 0,1с аудиокниги,1с образовательная,1с познавательная,1с школа,2cd digipack,360 английская,360 русская,3d bd,adventure time,assassin creed,...,мягкая игрушка,настольная игра,образовательная коллекция,познавательная коллекция,русская версия,русская документация,русские субтитры,только для,цифровая версия,item_id
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,1
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,3
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,4


In [13]:
df = pd.merge(df, bag_of_words, on="item_id",how="inner")

In [14]:
features = df

In [15]:
features.drop(columns=['shop_id', 'item_id', 'item_category_id', 'item_cat+shop', 'item+shop'], inplace=True)

# MODELS THAT ACCEPT NANS AS INPUT

## XGBOOST

In [16]:
model = jb.load("models/35-more_features.pkl")

In [17]:
ids_cols = ["ID"]
preds = [x for x in features.columns if x not in ids_cols]

predictors = jb.load("model_parameters/predictors_xgboost.pkl")

sorted(preds) == sorted(predictors)

True

In [18]:
predictors

['hist_sales_sum_cat_by_shop',
 'hist_sales_mean_cat_by_shop',
 'hist_sales_std_cat_by_shop',
 'hist_sales_min_cat_by_shop',
 'hist_sales_max_cat_by_shop',
 'sold_cat_last_month_by_shop',
 'total_shops_of_category',
 'total_items_of_category',
 'mean_sales_of_category_by_month',
 'tot_cat_by_shops',
 'items_sold_last_month',
 'items_sold_by_shop_last_2_months',
 'items_sold_by_shop_last_3_months',
 'items_sold_by_shop_last_4_months',
 'items_sold_by_shop_last_5_months',
 'items_sold_by_shop_last_6_months',
 'items_sold_by_shop_last_7_months',
 'items_sold_by_shop_last_8_months',
 'items_sold_by_shop_last_9_months',
 'items_sold_by_shop_last_10_months',
 'items_sold_by_shop_last_11_months',
 'items_sold_by_shop_last_12_months',
 'items_sold_by_shop_last_13_months',
 'ten_month_mean_item_by_shop',
 'ten_month_sales_sum_item_by_shop',
 'ten_month_std_item_by_shop',
 'ten_month_max_item_by_shop',
 'ten_month_min_item_by_shop',
 'hist_sales_mean_item_by_shop',
 'hist_sales_sum_item_by_shop'

In [19]:
[x for x in preds if x not in predictors]

[]

In [20]:
X = df[predictors]
ids = df[ids_cols]
X.head()

Unnamed: 0,hist_sales_sum_cat_by_shop,hist_sales_mean_cat_by_shop,hist_sales_std_cat_by_shop,hist_sales_min_cat_by_shop,hist_sales_max_cat_by_shop,sold_cat_last_month_by_shop,total_shops_of_category,total_items_of_category,mean_sales_of_category_by_month,tot_cat_by_shops,...,издание pc,мягкая игрушка,настольная игра,образовательная коллекция,познавательная коллекция,русская версия,русская документация,русские субтитры,только для,цифровая версия
0,3141.0,95.181818,34.12885,48.0,206.0,76.0,58,625,1.142927,60,...,0,0,0,0,0,0,0,1,0,0
1,3112.0,91.529412,49.3431,16.0,215.0,34.0,58,625,1.142927,61,...,0,0,0,0,0,0,0,1,0,0
2,5946.0,174.882353,100.191563,45.0,447.0,54.0,58,625,1.142927,63,...,0,0,0,0,0,0,0,1,0,0
3,2718.0,79.941176,25.399763,40.0,134.0,45.0,58,625,1.142927,58,...,0,0,0,0,0,0,0,1,0,0
4,4434.0,130.411765,55.938341,43.0,283.0,70.0,58,625,1.142927,59,...,0,0,0,0,0,0,0,1,0,0


In [21]:
X["predictions"] = model.predict(X)
ids["item_cnt_month"] = X["predictions"]
ids.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,ID,item_cnt_month
0,0,1.896
1,5100,1.57177
2,10200,1.746669
3,15300,1.659123
4,20400,1.935698


In [22]:
sub_1 = ids
sub_1.head()

Unnamed: 0,ID,item_cnt_month
0,0,1.896
1,5100,1.57177
2,10200,1.746669
3,15300,1.659123
4,20400,1.935698


In [23]:
rounded = round_submission_dataframe(sub_1)
rounded.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.


Unnamed: 0,ID,item_cnt_month
0,0,2.0
42,1,1.0
84,2,2.0
126,3,1.0
168,4,1.0


In [24]:
rounded["item_cnt_month"].mean()

1.1459243697478991

In [25]:
rounded.to_csv("submissions/39.csv", index=False)

# MODELS THAT DOESN'T ACCEPT NA AS INPUTS OR JUST DONT USE NA'S

## Analyze and clean NA values

In [13]:
df.isna().sum()

ID                            0
shop_id                       0
item_id                       0
item_category_id              0
hist_sales_sum_cat_by_shop    0
                             ..
русская версия                0
русская документация          0
русские субтитры              0
только для                    0
цифровая версия               0
Length: 106, dtype: int64

In [14]:
# For item_price features, take main of category
grouped = df.groupby("item_category_id").agg({"item_price_max":"mean", "item_price_mean":"mean", "item_price_min":"mean", "item_price_std":"mean", "diff_price_max_min":"mean"}).reset_index()

df["id_help"] = [x for x in range(len(df))]

df_item_price_na = df[df.item_price_max.isna()]
df_item_price_na.drop(columns=["item_price_max", "item_price_mean", "item_price_std", "item_price_min", "diff_price_max_min"], inplace=True)
print(len(df_item_price_na))

19278


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [15]:
df_item_price_na = pd.merge(df_item_price_na, grouped, on="item_category_id", how="inner")
print(len(df_item_price_na))

df_item_price_na = df_item_price_na[df.columns]
print(len(df_item_price_na))
df_item_price_na.columns == df.columns

19278
19278


array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True])

In [16]:
print(len(df))
df = df_item_price_na.append(df).drop_duplicates(subset=["id_help"], keep="first").drop(columns=["id_help"])
print(len(df))
df.head()

214200
214200


Unnamed: 0,ID,shop_id,item_id,item_category_id,hist_sales_sum_cat_by_shop,hist_sales_mean_cat_by_shop,hist_sales_std_cat_by_shop,hist_sales_min_cat_by_shop,hist_sales_max_cat_by_shop,sold_cat_last_month_by_shop,...,издание pc,мягкая игрушка,настольная игра,образовательная коллекция,познавательная коллекция,русская версия,русская документация,русские субтитры,только для,цифровая версия
0,1,5,5320,55,4892.0,148.242424,35.075838,84.0,213.0,106.0,...,0,0,0,0,0,0,0,0,0,0
1,5101,4,5320,55,4952.0,145.647059,42.17775,28.0,229.0,106.0,...,0,0,0,0,0,0,0,0,0,0
2,10201,6,5320,55,12104.0,356.0,103.282197,157.0,623.0,221.0,...,0,0,0,0,0,0,0,0,0,0
3,15301,3,5320,55,2885.0,84.852941,24.640607,37.0,126.0,38.0,...,0,0,0,0,0,0,0,0,0,0
4,20401,2,5320,55,363.0,11.709677,22.767218,1.0,99.0,2.0,...,0,0,0,0,0,0,0,0,0,0


In [17]:
df["hist_sales_std_cat_by_shop"] = df["hist_sales_std_cat_by_shop"].fillna(0)
df["hist_sales_std_item_by_shop"] = df["hist_sales_std_item_by_shop"].fillna(0)
df["month_last_purchase_of_item_in_shop"] = df["month_last_purchase_of_item_in_shop"].fillna(-1)
df["month_first_purchase_of_item_in_shop"] = df["month_first_purchase_of_item_in_shop"].fillna(-1)
df["item_price_std"] = df["item_price_std"].fillna(0)
df["total_shops_item_is_sell"] = df["total_shops_item_is_sell"].fillna(0)
df["total_shops_of_category"] = df["total_shops_of_category"].fillna(0)
df["total_items_of_category"] = df["total_items_of_category"].fillna(0)
df["total_months_with_sells_by_item_and_shop"] = df["total_months_with_sells_by_item_and_shop"].fillna(0)

df["mean_sales_of_category_by_month"] = df["mean_sales_of_category_by_month"].fillna(0)
df["ten_month_mean_item_by_shop"] = df["ten_month_mean_item_by_shop"].fillna(0)
df["ten_month_sales_sum_item_by_shop"] = df["ten_month_sales_sum_item_by_shop"].fillna(0)
df["ten_month_std_item_by_shop"] = df["ten_month_std_item_by_shop"].fillna(0)
df["ten_month_max_item_by_shop"] = df["ten_month_max_item_by_shop"].fillna(0)
df["ten_month_min_item_by_shop"] = df["ten_month_min_item_by_shop"].fillna(0)
df["sales_item_historically_mean"] = df["sales_item_historically_mean"].fillna(0)
df["sales_item_mean_ten"] = df["sales_item_mean_ten"].fillna(0)

In [18]:
df.isna().sum()

ID                            0
shop_id                       0
item_id                       0
item_category_id              0
hist_sales_sum_cat_by_shop    0
                             ..
русская версия                0
русская документация          0
русские субтитры              0
только для                    0
цифровая версия               0
Length: 106, dtype: int64

In [19]:
df["item_price_max"] = df["item_price_max"].fillna(df["item_price_max"].mean()) 
df["item_price_mean"] = df["item_price_mean"].fillna(df["item_price_mean"].mean()) 
df["item_price_min"] = df["item_price_min"].fillna(df["item_price_min"].mean()) 
df["diff_price_max_min"] = df["diff_price_max_min"].fillna(df["diff_price_max_min"].mean()) 
df["months_item_has_sales"] = df["months_item_has_sales"].fillna(0) 
df["item_id_m_e"] = df["item_id_m_e"].fillna(0)
df["item_category_id_m_e"] = df["item_category_id_m_e"].fillna(0)
df.isna().sum()

ID                            0
shop_id                       0
item_id                       0
item_category_id              0
hist_sales_sum_cat_by_shop    0
                             ..
русская версия                0
русская документация          0
русские субтитры              0
только для                    0
цифровая версия               0
Length: 106, dtype: int64

In [22]:
features = df
features.drop(columns=['shop_id', 'item_id', 'item_category_id'], inplace=True)

## RANDOM FOREST MODEL

In [23]:
ids_cols = ["ID"]
preds = [x for x in features.columns if x not in ids_cols]

predictors = jb.load("model_parameters/predictors_rf.pkl")

sorted(preds) == sorted(predictors)

True

In [24]:
model_rf = jb.load("models/22-rf_bow_2ngrams.pkl")

In [25]:
X = df[predictors]
ids = df[ids_cols]

X["predictions"] = model_rf.predict(X)
ids["item_cnt_month"] = X["predictions"]

sub_2 = ids.sort_values(by="ID")
sub_2.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


Unnamed: 0,ID,item_cnt_month
0,0,1.959943
0,1,1.132248
84,2,1.988836
126,3,1.45191
1722,4,1.134166


In [None]:
sub_2.to_csv("submissions/12-rf.csv", index=False)

## XGBOOST

In [24]:
model = jb.load("models/17-xgb_with_non_nans.pkl")

In [25]:
ids_cols = ["ID"]
preds = [x for x in features.columns if x not in ids_cols]

predictors = jb.load("model_parameters/predictors_xgboost.pkl")

sorted(preds) == sorted(predictors)

True

In [26]:
X = df[predictors]
ids = df[ids_cols]
X.head()

Unnamed: 0,shop_id,item_id,item_category_id,hist_sales_sum_cat_by_shop,hist_sales_mean_cat_by_shop,hist_sales_std_cat_by_shop,hist_sales_min_cat_by_shop,hist_sales_max_cat_by_shop,sold_cat_last_month_by_shop,total_shops_of_category,...,sales_item_mean_ten,months_item_has_sales,total_sales_with_1_units,total_sales_with_2_units,total_sales_with_3_units,total_sales_with_4_units,total_sales_with_5_units,item_category_id_m_e,item_id_m_e,shop_id_m_e
0,5,5320,55,4892.0,148.242424,35.075838,84.0,213.0,106.0,56,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.298209,0.0,1.566739
1,4,5320,55,4952.0,145.647059,42.17775,28.0,229.0,106.0,56,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.298209,0.0,1.430398
2,6,5320,55,12104.0,356.0,103.282197,157.0,623.0,221.0,56,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.298209,0.0,1.580137
3,3,5320,55,2885.0,84.852941,24.640607,37.0,126.0,38.0,56,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.298209,0.0,1.54249
4,2,5320,55,363.0,11.709677,22.767218,1.0,99.0,2.0,56,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.298209,0.0,1.632068


In [27]:
X["predictions"] = model.predict(X)
ids["item_cnt_month"] = X["predictions"]
ids.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,ID,item_cnt_month
0,1,1.047386
1,5101,1.042222
2,10201,1.12736
3,15301,0.958129
4,20401,0.980122


In [28]:
sub_1 = ids
sub_1.head()

Unnamed: 0,ID,item_cnt_month
0,1,1.047386
1,5101,1.042222
2,10201,1.12736
3,15301,0.958129
4,20401,0.980122


## MERGE TWO PREDICTIONS FOR DIFFERENT MODELS

In [47]:
def merge_predictions_for_two_models(df_1, df_2):
    df = pd.merge(df_1, df_2, on="ID", how="inner")
    df["item_cnt_month"] = df["item_cnt_month_x"] * 0.5 + df["item_cnt_month_y"] * 0.5
    df.drop(columns=["item_cnt_month_x", "item_cnt_month_y"], inplace=True)
    df.sort_values(by="ID", inplace=True)
    return df

In [48]:
sub = merge_predictions_for_two_models(sub_1, sub_2)

In [49]:
sub = round_submission_dataframe(sub)
sub.head()

Unnamed: 0,ID,item_cnt_month
0,0,2.0
42,1,1.0
84,2,2.0
126,3,1.0
168,4,1.0


In [50]:
sub.to_csv("submissions/29-XGB_RF_2ngrams.csv", index=False)