In [1]:
import pandas as pd
import joblib as jb

from math import modf

In [2]:
def downgrade_or_upgrade_if_necessary(x):
    m = modf(x)
    if m[0] <= 0.65:
        return m[1]
    if m[0] > 0.35:
        return m[1] + 1
    return x

def round_submission_dataframe(sub):
    sub["item_cnt_month"] = sub["item_cnt_month"].map(lambda x: downgrade_or_upgrade_if_necessary(x))
    return sub.sort_values(by="ID")

In [3]:
items = pd.read_csv("data/items.csv")

In [4]:
df = pd.read_csv("data/test.csv")
df.head()

Unnamed: 0,ID,shop_id,item_id
0,0,5,5037
1,1,5,5320
2,2,5,5233
3,3,5,5232
4,4,5,5268


In [5]:
df = pd.merge(df, items, on="item_id", how="inner").drop(columns=["item_name"])
df.sample(10)

Unnamed: 0,ID,shop_id,item_id,item_category_id
100261,38087,12,8063,55
187595,121766,53,14199,31
187728,157469,55,15056,31
18286,82035,16,3243,30
111714,186259,34,18115,37
19800,92271,14,21967,67
136205,212342,45,12840,40
58588,205394,39,8178,55
190483,70835,24,563,78
133308,3174,5,15898,69


In [6]:
features_folder = "generated/features_prediction/"

features_dict = {
    features_folder + "features_shop_and_categories.csv":["shop_id", "item_category_id"],
    features_folder + "features_categories.csv":"item_category_id",
    features_folder + "features_shops.csv":"shop_id",
    features_folder + "features_items_and_shop.csv": ["item_id", "shop_id"],
    features_folder + "features_items.csv":"item_id"    
}

In [7]:
print(len(df))
for features_file in features_dict:
    df_features = pd.read_csv(features_file)
    joining_cols = features_dict[features_file]
    df = pd.merge(df, df_features, on=joining_cols, how="left")
print(len(df))
df.sample(10)

214200
214200


Unnamed: 0,ID,shop_id,item_id,item_category_id,hist_sales_sum_cat_by_shop,hist_sales_mean_cat_by_shop,hist_sales_std_cat_by_shop,hist_sales_min_cat_by_shop,hist_sales_max_cat_by_shop,sold_cat_last_month_by_shop,...,diff_price_max_min,total_shops_item_is_sell,sales_item_historically_mean,sales_item_mean_ten,months_item_has_sales,total_sales_with_1_units,total_sales_with_2_units,total_sales_with_3_units,total_sales_with_4_units,total_sales_with_5_units
57567,139070,57,19219,55,12194.0,381.0625,127.966462,204.0,615.0,274.0,...,,,,,,,,,,
77653,190548,46,15052,61,117.0,5.571429,5.455011,1.0,19.0,12.0,...,4814.0,40.0,9.8125,5.727273,16.0,130.0,5.0,4.0,,1.0
109905,170916,37,6733,24,366.0,26.142857,15.471349,7.0,68.0,22.0,...,,,,,,,,,,
213771,173389,37,16007,64,310.0,9.117647,5.150844,1.0,23.0,4.0,...,0.0,1.0,1.0,1.0,1.0,1.0,,,,
4684,112311,49,4053,23,1315.0,57.173913,20.089916,23.0,93.0,42.0,...,0.0,16.0,22.0,22.0,1.0,22.0,,,,
77610,185447,34,15027,49,54.0,3.857143,4.865216,1.0,19.0,,...,0.0,6.0,12.333333,12.333333,3.0,29.0,4.0,,,
168818,106019,42,2354,30,12199.0,358.794118,110.161804,160.0,609.0,160.0,...,801.0,49.0,19.571429,4.727273,21.0,286.0,37.0,10.0,4.0,1.0
84241,160105,56,19094,37,3995.0,117.5,49.672412,13.0,221.0,56.0,...,1005.0,42.0,9.272727,5.875,22.0,204.0,,,,
4244,10301,6,3756,20,2770.0,115.416667,51.263104,21.0,251.0,155.0,...,1520.0,35.0,93.0,93.0,1.0,51.0,17.0,1.0,,1.0
4484,163306,36,3898,55,49.0,49.0,,49.0,49.0,49.0,...,109.0,15.0,23.0,23.0,1.0,21.0,1.0,,,


In [8]:
month_to_predict = 34

df["hist_sales_sum_cat_by_shop"] = df["hist_sales_sum_cat_by_shop"].fillna(0)
df["hist_sales_mean_cat_by_shop"] = df["hist_sales_mean_cat_by_shop"].fillna(0)
df["hist_sales_max_cat_by_shop"] = df["hist_sales_max_cat_by_shop"].fillna(0)
df["hist_sales_min_cat_by_shop"] = df["hist_sales_min_cat_by_shop"].fillna(0)
df["sold_cat_last_month_by_shop"] = df["sold_cat_last_month_by_shop"].fillna(0)
df["tot_cat_by_shops"] = df["tot_cat_by_shops"].fillna(df["tot_cat_by_shops"].mean())
df["items_sold_last_month"] = df["items_sold_last_month"].fillna(0)
df["hist_sales_mean_item_by_shop"] = df["hist_sales_mean_item_by_shop"].fillna(0)
df["hist_sales_sum_item_by_shop"] = df["hist_sales_sum_item_by_shop"].fillna(0)
df["hist_sales_max_item_by_shop"] = df["hist_sales_max_item_by_shop"].fillna(0)
df["hist_sales_min_item_by_shop"] = df["hist_sales_min_item_by_shop"].fillna(0)
df["purchases_item_in_shop_last_month"] = df["purchases_item_in_shop_last_month"].fillna(0)
for x in range(5):
    df["purchases_item_in_shop_month_" + str(x+2)].fillna(0, inplace=True)
df["total_months_with_sells_by_item_and_shop"] = df["total_months_with_sells_by_item_and_shop"].fillna(0)    
for x in range(2, 7):
    df["items_sold_by_shop_last_" + str(x) + "_months"] = df["items_sold_by_shop_last_" + str(x) + "_months"].fillna(0) 
for x in range(1, 6):
    df["total_sales_with_" + str(x) +"_units"] = df["total_sales_with_" + str(x) +"_units"].fillna(0)
df.sample(10)

Unnamed: 0,ID,shop_id,item_id,item_category_id,hist_sales_sum_cat_by_shop,hist_sales_mean_cat_by_shop,hist_sales_std_cat_by_shop,hist_sales_min_cat_by_shop,hist_sales_max_cat_by_shop,sold_cat_last_month_by_shop,...,diff_price_max_min,total_shops_item_is_sell,sales_item_historically_mean,sales_item_mean_ten,months_item_has_sales,total_sales_with_1_units,total_sales_with_2_units,total_sales_with_3_units,total_sales_with_4_units,total_sales_with_5_units
73966,22161,2,10676,67,228.0,6.909091,5.735654,1.0,28.0,3.0,...,299.0,25.0,2.148148,2.909091,27.0,58.0,0.0,0.0,0.0,0.0
36886,51878,26,17387,40,11867.0,349.029412,153.493359,120.0,717.0,185.0,...,159.0,43.0,79.142857,79.142857,7.0,427.0,47.0,7.0,3.0,0.0
174684,34759,10,21080,61,34.0,3.4,2.503331,1.0,8.0,2.0,...,494.486667,10.0,2.923077,2.2,13.0,32.0,0.0,2.0,0.0,0.0
59271,47311,31,3866,28,10274.0,302.176471,121.669814,86.0,595.0,121.0,...,404.85,30.0,20.8,20.8,5.0,90.0,7.0,0.0,0.0,0.0
27231,77148,15,4416,56,101.0,3.607143,2.643501,1.0,12.0,4.0,...,700.0,25.0,17.4,17.4,5.0,57.0,4.0,0.0,3.0,2.0
151818,156614,55,8471,43,0.0,0.0,,0.0,0.0,0.0,...,107.0,15.0,6.0,4.454545,14.0,78.0,3.0,0.0,0.0,0.0
172721,90812,18,19690,41,537.0,15.794118,9.26636,5.0,45.0,5.0,...,1000.0,37.0,11.2,1.5,10.0,100.0,3.0,2.0,0.0,0.0
138798,156304,55,14224,57,0.0,0.0,,0.0,0.0,0.0,...,132.0,21.0,4.65625,2.222222,32.0,149.0,0.0,0.0,0.0,0.0
186321,50336,31,19426,78,0.0,0.0,,0.0,0.0,0.0,...,0.0,1.0,4.235294,4.0,17.0,60.0,6.0,0.0,0.0,0.0
59927,179926,38,12706,40,4470.0,131.470588,53.346912,55.0,236.0,61.0,...,0.0,8.0,8.0,8.0,1.0,8.0,0.0,0.0,0.0,0.0


In [9]:
df.isna().sum()

ID                                               0
shop_id                                          0
item_id                                          0
item_category_id                                 0
hist_sales_sum_cat_by_shop                       0
hist_sales_mean_cat_by_shop                      0
hist_sales_std_cat_by_shop                   29050
hist_sales_min_cat_by_shop                       0
hist_sales_max_cat_by_shop                       0
sold_cat_last_month_by_shop                      0
total_shops_of_category                          0
total_items_of_category                          0
mean_sales_of_category_by_month                  0
tot_cat_by_shops                                 0
items_sold_last_month                            0
items_sold_by_shop_last_2_months                 0
items_sold_by_shop_last_3_months                 0
items_sold_by_shop_last_4_months                 0
items_sold_by_shop_last_5_months                 0
items_sold_by_shop_last_6_month

### Merge with mean encodings

In [10]:
category_encoding = jb.load('encodings/category_id_me.pkl')
df.loc[:,'item_category_id_m_e'] = df['item_category_id'].map(category_encoding)

item_encoding = jb.load('encodings/item_id_me.pkl')
df.loc[:,'item_id_m_e'] = df['item_id'].map(item_encoding)

shop_encoding = jb.load('encodings/shop_id_me.pkl')
df.loc[:,'shop_id_m_e'] = df['shop_id'].map(shop_encoding)

df[['item_category_id', 'item_category_id_m_e', 'item_id', 'item_id_m_e', 'shop_id', 'shop_id_m_e']].sample(5)

Unnamed: 0,item_category_id,item_category_id_m_e,item_id,item_id_m_e,shop_id,shop_id_m_e
106016,75,2.10416,4138,1.806074,28,1.922177
74912,55,1.298209,12469,,48,1.577592
41418,12,1.74391,6674,1.903037,10,1.429144
176783,70,1.831133,18039,1.579471,7,1.734186
200840,40,1.560805,11099,1.564171,41,1.514349


In [11]:
features = df

# MODELS THAT ACCEPT NANS AS INPUT

## XGBOOST

In [12]:
model = jb.load("models/16-xgb_mean_encodings_smoothed-30.pkl")

In [13]:
ids_cols = ["ID"]
preds = [x for x in features.columns if x not in ids_cols]

predictors = jb.load("model_parameters/predictors_xgboost.pkl")

sorted(preds) == sorted(predictors)

True

In [14]:
X = df[predictors]
ids = df[ids_cols]
X.head()

Unnamed: 0,shop_id,item_id,item_category_id,hist_sales_sum_cat_by_shop,hist_sales_mean_cat_by_shop,hist_sales_std_cat_by_shop,hist_sales_min_cat_by_shop,hist_sales_max_cat_by_shop,sold_cat_last_month_by_shop,total_shops_of_category,...,sales_item_mean_ten,months_item_has_sales,total_sales_with_1_units,total_sales_with_2_units,total_sales_with_3_units,total_sales_with_4_units,total_sales_with_5_units,item_category_id_m_e,item_id_m_e,shop_id_m_e
0,5,5037,19,3141.0,95.181818,34.12885,48.0,206.0,76.0,58,...,83.454545,14.0,958.0,95.0,10.0,9.0,5.0,1.924626,2.44726,1.566739
1,4,5037,19,3112.0,91.529412,49.3431,16.0,215.0,34.0,58,...,83.454545,14.0,958.0,95.0,10.0,9.0,5.0,1.924626,2.44726,1.430398
2,6,5037,19,5946.0,174.882353,100.191563,45.0,447.0,54.0,58,...,83.454545,14.0,958.0,95.0,10.0,9.0,5.0,1.924626,2.44726,1.580137
3,3,5037,19,2718.0,79.941176,25.399763,40.0,134.0,45.0,58,...,83.454545,14.0,958.0,95.0,10.0,9.0,5.0,1.924626,2.44726,1.54249
4,2,5037,19,4434.0,130.411765,55.938341,43.0,283.0,70.0,58,...,83.454545,14.0,958.0,95.0,10.0,9.0,5.0,1.924626,2.44726,1.632068


In [15]:
X["predictions"] = model.predict(X)
ids["item_cnt_month"] = X["predictions"]
ids.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,ID,item_cnt_month
0,0,1.454102
1,5100,1.488011
2,10200,1.575399
3,15300,1.638917
4,20400,1.379768


In [16]:
sub_1 = ids
sub_1.head()

Unnamed: 0,ID,item_cnt_month
0,0,1.454102
1,5100,1.488011
2,10200,1.575399
3,15300,1.638917
4,20400,1.379768


In [38]:
rounded = round_submission_dataframe(sub_1)
rounded.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.


Unnamed: 0,ID,item_cnt_month
0,0,1.0
42,1,1.0
84,2,2.0
126,3,1.0
168,4,1.0


In [39]:
rounded.to_csv("submissions/20-xgb_mean.csv", index=False)

# MODELS THAT DOESN'T ACCEPT NA AS INPUTS OR JUST DONT USE NA'S

## Analyze and clean NA values

In [17]:
df.isna().sum()

ID                                               0
shop_id                                          0
item_id                                          0
item_category_id                                 0
hist_sales_sum_cat_by_shop                       0
hist_sales_mean_cat_by_shop                      0
hist_sales_std_cat_by_shop                   29050
hist_sales_min_cat_by_shop                       0
hist_sales_max_cat_by_shop                       0
sold_cat_last_month_by_shop                      0
total_shops_of_category                          0
total_items_of_category                          0
mean_sales_of_category_by_month                  0
tot_cat_by_shops                                 0
items_sold_last_month                            0
items_sold_by_shop_last_2_months                 0
items_sold_by_shop_last_3_months                 0
items_sold_by_shop_last_4_months                 0
items_sold_by_shop_last_5_months                 0
items_sold_by_shop_last_6_month

In [18]:
# For item_price features, take main of category
grouped = df.groupby("item_category_id").agg({"item_price_max":"mean", "item_price_mean":"mean", "item_price_min":"mean", "item_price_std":"mean", "diff_price_max_min":"mean"}).reset_index()

df["id_help"] = [x for x in range(len(df))]

df_item_price_na = df[df.item_price_max.isna()]
df_item_price_na.drop(columns=["item_price_max", "item_price_mean", "item_price_std", "item_price_min", "diff_price_max_min"], inplace=True)
print(len(df_item_price_na))

19278


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [19]:
df_item_price_na = pd.merge(df_item_price_na, grouped, on="item_category_id", how="inner")
print(len(df_item_price_na))

df_item_price_na = df_item_price_na[df.columns]
print(len(df_item_price_na))
df_item_price_na.columns == df.columns

19278
19278


array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True])

In [20]:
print(len(df))
df = df_item_price_na.append(df).drop_duplicates(subset=["id_help"], keep="first").drop(columns=["id_help"])
print(len(df))
df.head()

214200
214200


Unnamed: 0,ID,shop_id,item_id,item_category_id,hist_sales_sum_cat_by_shop,hist_sales_mean_cat_by_shop,hist_sales_std_cat_by_shop,hist_sales_min_cat_by_shop,hist_sales_max_cat_by_shop,sold_cat_last_month_by_shop,...,sales_item_mean_ten,months_item_has_sales,total_sales_with_1_units,total_sales_with_2_units,total_sales_with_3_units,total_sales_with_4_units,total_sales_with_5_units,item_category_id_m_e,item_id_m_e,shop_id_m_e
0,1,5,5320,55,4892.0,148.242424,35.075838,84.0,213.0,106.0,...,,,0.0,0.0,0.0,0.0,0.0,1.298209,,1.566739
1,5101,4,5320,55,4952.0,145.647059,42.17775,28.0,229.0,106.0,...,,,0.0,0.0,0.0,0.0,0.0,1.298209,,1.430398
2,10201,6,5320,55,12104.0,356.0,103.282197,157.0,623.0,221.0,...,,,0.0,0.0,0.0,0.0,0.0,1.298209,,1.580137
3,15301,3,5320,55,2885.0,84.852941,24.640607,37.0,126.0,38.0,...,,,0.0,0.0,0.0,0.0,0.0,1.298209,,1.54249
4,20401,2,5320,55,363.0,11.709677,22.767218,1.0,99.0,2.0,...,,,0.0,0.0,0.0,0.0,0.0,1.298209,,1.632068


In [21]:
df["hist_sales_std_cat_by_shop"] = df["hist_sales_std_cat_by_shop"].fillna(0)
df["hist_sales_std_item_by_shop"] = df["hist_sales_std_item_by_shop"].fillna(0)
df["month_last_purchase_of_item_in_shop"] = df["month_last_purchase_of_item_in_shop"].fillna(-1)
df["month_first_purchase_of_item_in_shop"] = df["month_first_purchase_of_item_in_shop"].fillna(-1)
df["item_price_std"] = df["item_price_std"].fillna(0)
df["total_shops_item_is_sell"] = df["total_shops_item_is_sell"].fillna(0)
df["total_shops_of_category"] = df["total_shops_of_category"].fillna(0)
df["total_items_of_category"] = df["total_items_of_category"].fillna(0)
df["total_months_with_sells_by_item_and_shop"] = df["total_months_with_sells_by_item_and_shop"].fillna(0)

df["mean_sales_of_category_by_month"] = df["mean_sales_of_category_by_month"].fillna(0)
df["ten_month_mean_item_by_shop"] = df["ten_month_mean_item_by_shop"].fillna(0)
df["ten_month_sales_sum_item_by_shop"] = df["ten_month_sales_sum_item_by_shop"].fillna(0)
df["ten_month_std_item_by_shop"] = df["ten_month_std_item_by_shop"].fillna(0)
df["ten_month_max_item_by_shop"] = df["ten_month_max_item_by_shop"].fillna(0)
df["ten_month_min_item_by_shop"] = df["ten_month_min_item_by_shop"].fillna(0)
df["sales_item_historically_mean"] = df["sales_item_historically_mean"].fillna(0)
df["sales_item_mean_ten"] = df["sales_item_mean_ten"].fillna(0)

In [22]:
df.isna().sum()

ID                                              0
shop_id                                         0
item_id                                         0
item_category_id                                0
hist_sales_sum_cat_by_shop                      0
hist_sales_mean_cat_by_shop                     0
hist_sales_std_cat_by_shop                      0
hist_sales_min_cat_by_shop                      0
hist_sales_max_cat_by_shop                      0
sold_cat_last_month_by_shop                     0
total_shops_of_category                         0
total_items_of_category                         0
mean_sales_of_category_by_month                 0
tot_cat_by_shops                                0
items_sold_last_month                           0
items_sold_by_shop_last_2_months                0
items_sold_by_shop_last_3_months                0
items_sold_by_shop_last_4_months                0
items_sold_by_shop_last_5_months                0
items_sold_by_shop_last_6_months                0


In [23]:
df["item_price_max"] = df["item_price_max"].fillna(df["item_price_max"].mean()) 
df["item_price_mean"] = df["item_price_mean"].fillna(df["item_price_mean"].mean()) 
df["item_price_min"] = df["item_price_min"].fillna(df["item_price_min"].mean()) 
df["diff_price_max_min"] = df["diff_price_max_min"].fillna(df["diff_price_max_min"].mean()) 
df["months_item_has_sales"] = df["months_item_has_sales"].fillna(0) 
df["item_id_m_e"] = df["item_id_m_e"].fillna(0)
df["item_category_id_m_e"] = df["item_category_id_m_e"].fillna(0)
df.isna().sum()

ID                                          0
shop_id                                     0
item_id                                     0
item_category_id                            0
hist_sales_sum_cat_by_shop                  0
hist_sales_mean_cat_by_shop                 0
hist_sales_std_cat_by_shop                  0
hist_sales_min_cat_by_shop                  0
hist_sales_max_cat_by_shop                  0
sold_cat_last_month_by_shop                 0
total_shops_of_category                     0
total_items_of_category                     0
mean_sales_of_category_by_month             0
tot_cat_by_shops                            0
items_sold_last_month                       0
items_sold_by_shop_last_2_months            0
items_sold_by_shop_last_3_months            0
items_sold_by_shop_last_4_months            0
items_sold_by_shop_last_5_months            0
items_sold_by_shop_last_6_months            0
ten_month_mean_item_by_shop                 0
ten_month_sales_sum_item_by_shop  

In [24]:
features = df

## RANDOM FOREST MODEL

In [25]:
ids_cols = ["ID"]
preds = [x for x in features.columns if x not in ids_cols]

predictors = jb.load("model_parameters/predictors_rf.pkl")

sorted(preds) == sorted(predictors)

True

In [26]:
model_rf = jb.load("models/18-rf_with_encodings.pkl")

In [27]:
X = df[predictors]
ids = df[ids_cols]

X["predictions"] = model_rf.predict(X)
ids["item_cnt_month"] = X["predictions"]

sub_2 = ids.sort_values(by="ID")
sub_2.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


Unnamed: 0,ID,item_cnt_month
0,0,1.932241
0,1,1.11888
84,2,1.943332
126,3,1.484962
1722,4,1.121837


In [None]:
sub_2.to_csv("submissions/12-rf.csv", index=False)

## XGBOOST

In [24]:
model = jb.load("models/17-xgb_with_non_nans.pkl")

In [25]:
ids_cols = ["ID"]
preds = [x for x in features.columns if x not in ids_cols]

predictors = jb.load("model_parameters/predictors_xgboost.pkl")

sorted(preds) == sorted(predictors)

True

In [26]:
X = df[predictors]
ids = df[ids_cols]
X.head()

Unnamed: 0,shop_id,item_id,item_category_id,hist_sales_sum_cat_by_shop,hist_sales_mean_cat_by_shop,hist_sales_std_cat_by_shop,hist_sales_min_cat_by_shop,hist_sales_max_cat_by_shop,sold_cat_last_month_by_shop,total_shops_of_category,...,sales_item_mean_ten,months_item_has_sales,total_sales_with_1_units,total_sales_with_2_units,total_sales_with_3_units,total_sales_with_4_units,total_sales_with_5_units,item_category_id_m_e,item_id_m_e,shop_id_m_e
0,5,5320,55,4892.0,148.242424,35.075838,84.0,213.0,106.0,56,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.298209,0.0,1.566739
1,4,5320,55,4952.0,145.647059,42.17775,28.0,229.0,106.0,56,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.298209,0.0,1.430398
2,6,5320,55,12104.0,356.0,103.282197,157.0,623.0,221.0,56,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.298209,0.0,1.580137
3,3,5320,55,2885.0,84.852941,24.640607,37.0,126.0,38.0,56,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.298209,0.0,1.54249
4,2,5320,55,363.0,11.709677,22.767218,1.0,99.0,2.0,56,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.298209,0.0,1.632068


In [27]:
X["predictions"] = model.predict(X)
ids["item_cnt_month"] = X["predictions"]
ids.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,ID,item_cnt_month
0,1,1.047386
1,5101,1.042222
2,10201,1.12736
3,15301,0.958129
4,20401,0.980122


In [28]:
sub_1 = ids
sub_1.head()

Unnamed: 0,ID,item_cnt_month
0,1,1.047386
1,5101,1.042222
2,10201,1.12736
3,15301,0.958129
4,20401,0.980122


## MERGE TWO PREDICTIONS FOR DIFFERENT MODELS

In [28]:
def merge_predictions_for_two_models(df_1, df_2):
    df = pd.merge(df_1, df_2, on="ID", how="inner")
    df["item_cnt_month"] = df["item_cnt_month_x"] * 0.5 + df["item_cnt_month_y"] * 0.5
    df.drop(columns=["item_cnt_month_x", "item_cnt_month_y"], inplace=True)
    df.sort_values(by="ID", inplace=True)
    return df

In [29]:
sub = merge_predictions_for_two_models(sub_1, sub_2)

In [30]:
sub = round_submission_dataframe(sub)
sub.head()

Unnamed: 0,ID,item_cnt_month
0,0,2.0
42,1,1.0
84,2,2.0
126,3,1.0
168,4,1.0


In [31]:
sub.to_csv("submissions/25-xgb_nans_tf_smoothed-30.csv", index=False)