In [2]:
import pandas as pd
import numpy as np

In [3]:
train = pd.read_csv("C:/Users/Admin/Desktop/20212/AppliedStatisticsProject20212/dataset/train.csv")
test = pd.read_csv("C:/Users/Admin/Desktop/20212/AppliedStatisticsProject20212/dataset/test.csv")

In [4]:
train_set = train.drop(['order_id', 'product_id'], axis = 1)
train_set.days_since_ratio.fillna(0, inplace = True)
train_set.head()

Unnamed: 0,user_total_orders,user_total_items,total_distinct_items,user_average_days_between_orders,user_average_basket,order_hour_of_day,days_since_prior_order,days_since_ratio,aisle_id,department_id,product_orders,product_reorders,product_reorder_rate,UP_orders,UP_orders_ratio,UP_average_pos_in_cart,UP_reorder_rate,UP_orders_since_last,UP_delta_hour_vs_last,reordered
0,11,59,18,19.0,5.363637,8,14.0,0.736842,24,4,13880,9377.0,0.675576,1,0.090909,6.0,0.090909,6,7,0
1,11,59,18,19.0,5.363637,8,14.0,0.736842,77,7,35791,27791.0,0.77648,10,0.909091,1.4,0.909091,1,0,1
2,11,59,18,19.0,5.363637,8,14.0,0.736842,54,17,1214,536.0,0.441516,2,0.181818,5.0,0.181818,7,1,1
3,11,59,18,19.0,5.363637,8,14.0,0.736842,77,7,8558,6953.0,0.812456,3,0.272727,3.0,0.272727,1,0,1
4,11,59,18,19.0,5.363637,8,14.0,0.736842,91,16,15935,12923.0,0.810982,1,0.090909,2.0,0.090909,10,0,0


In [5]:
def reduce_mem_usage(df):
    """ iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.        
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    
    for col in df.columns:
        col_type = df[col].dtype
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    
    return df

In [6]:
train_set = reduce_mem_usage(train_set)

Memory usage of dataframe is 1293.13 MB
Memory usage after optimization is: 299.04 MB
Decreased by 76.9%


In [7]:
test = reduce_mem_usage(test)

Memory usage of dataframe is 774.38 MB
Memory usage after optimization is: 202.81 MB
Decreased by 73.8%


In [8]:
from sklearn.model_selection import train_test_split
X = train_set.drop('reordered', axis=1)
y = train_set.reordered
X_train, X_val, y_train, y_val = train_test_split(X, y, stratify = y,
                                                 test_size = 0.3, random_state = 42)

In [9]:
#Using SMOTE to handle imbalanced classes
from imblearn.over_sampling import SMOTE

sm = SMOTE(sampling_strategy='minority', random_state=42)

# Fit the model to generate the data.
oversampled_trainX, oversampled_trainY = sm.fit_resample(X_train, y_train)
oversampled_train = pd.concat([pd.DataFrame(oversampled_trainY), pd.DataFrame(oversampled_trainX)], axis=1)
oversampled_train.head()

Unnamed: 0,reordered,user_total_orders,user_total_items,total_distinct_items,user_average_days_between_orders,user_average_basket,order_hour_of_day,days_since_prior_order,days_since_ratio,aisle_id,department_id,product_orders,product_reorders,product_reorder_rate,UP_orders,UP_orders_ratio,UP_average_pos_in_cart,UP_reorder_rate,UP_orders_since_last,UP_delta_hour_vs_last
0,0,27,423,150,7.039062,15.664062,14,3.0,0.42627,83,4,867,367.0,0.42334,3,0.111084,14.664062,0.111084,2,6
1,0,34,350,164,9.148438,10.296875,10,10.0,1.092773,53,16,57,33.0,0.579102,5,0.147095,3.800781,0.147095,15,1
2,0,8,175,109,10.859375,21.875,10,29.0,2.671875,69,15,2420,812.0,0.335449,1,0.125,7.0,0.125,1,10
3,0,5,53,35,8.25,10.601562,12,8.0,0.969727,24,4,79769,57818.0,0.724609,3,0.600098,6.667969,0.600098,1,6
4,0,29,253,163,9.140625,8.726562,13,14.0,1.53125,116,1,2961,1061.0,0.358398,1,0.034485,9.0,0.034485,17,5


In [12]:
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(n_estimators = 50, 
                               max_depth = 50,
                               min_samples_split= 3,
                               min_samples_leaf = 2,
                               max_features = 5,
                               n_jobs=-1, verbose = 2, random_state = 42)
model.fit(oversampled_train.drop('reordered', axis=1), oversampled_train.reordered)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.


building tree 2 of 50building tree 1 of 50building tree 3 of 50
building tree 4 of 50
building tree 5 of 50

building tree 6 of 50building tree 7 of 50
building tree 8 of 50


building tree 9 of 50
building tree 10 of 50
building tree 11 of 50
building tree 12 of 50
building tree 13 of 50
building tree 14 of 50
building tree 15 of 50
building tree 16 of 50
building tree 17 of 50
building tree 18 of 50
building tree 19 of 50
building tree 20 of 50
building tree 21 of 50
building tree 22 of 50
building tree 23 of 50
building tree 24 of 50
building tree 25 of 50
building tree 26 of 50
building tree 27 of 50
building tree 28 of 50
building tree 29 of 50
building tree 30 of 50
building tree 31 of 50
building tree 32 of 50
building tree 33 of 50


[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed: 17.4min


building tree 34 of 50
building tree 35 of 50
building tree 36 of 50
building tree 37 of 50
building tree 38 of 50
building tree 39 of 50
building tree 40 of 50
building tree 41 of 50
building tree 42 of 50
building tree 43 of 50
building tree 44 of 50
building tree 45 of 50
building tree 46 of 50
building tree 47 of 50
building tree 48 of 50
building tree 49 of 50
building tree 50 of 50


[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed: 28.4min finished


In [14]:
from sklearn.metrics import classification_report
predictions = model.predict(X_val)
print("Test Dataset")
print(classification_report(y_val, predictions))


predictions_train = model.predict(X_train)
print("Train Dataset")
print(classification_report(y_train, predictions_train))

[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  33 tasks      | elapsed:  1.4min
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed: 23.4min finished


Test Dataset
              precision    recall  f1-score   support

           0       0.93      0.96      0.94   2293752
           1       0.46      0.34      0.39    248647

    accuracy                           0.90   2542399
   macro avg       0.70      0.65      0.67   2542399
weighted avg       0.88      0.90      0.89   2542399



[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  33 tasks      | elapsed:  2.8min
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed: 17.5min finished


Train Dataset
              precision    recall  f1-score   support

           0       0.99      1.00      0.99   5352085
           1       0.97      0.87      0.92    580177

    accuracy                           0.99   5932262
   macro avg       0.98      0.93      0.96   5932262
weighted avg       0.98      0.99      0.98   5932262



In [None]:
from sklearn.metrics import plot_confusion_matrix
import matplotlib.pyplot as plt
# Plot normalized confusion matrix
class_names = [0,1]
plt.figure(figsize=(12,12))
disp = plot_confusion_matrix(model, X_val, y_val,
                             display_labels=class_names,
                             cmap=plt.cm.Blues,
                             normalize="true")
disp.ax_.set_title("Normalized confusion matrix")

print("Normalized confusion matrix")
print(disp.confusion_matrix)
plt.grid(b=None)
fig = plt.gcf()
fig.set_size_inches(5,5)
plt.show()

[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  33 tasks      | elapsed:  3.3min


In [18]:
preds = model.predict(test.drop(['order_id', 'product_id'], axis = 1))
test['preds'] = preds

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  25 tasks      | elapsed:  1.5min
[Parallel(n_jobs=8)]: Done  50 out of  50 | elapsed:  2.7min finished


In [19]:
test.head()

Unnamed: 0,order_id,product_id,user_total_orders,user_total_items,total_distinct_items,user_average_days_between_orders,user_average_basket,order_hour_of_day,days_since_prior_order,days_since_ratio,...,product_orders,product_reorders,product_reorder_rate,UP_orders,UP_orders_ratio,UP_average_pos_in_cart,UP_reorder_rate,UP_orders_since_last,UP_delta_hour_vs_last,preds
0,2774568,17668,13,88,33,12.0,6.769531,15,11.0,0.916504,...,2110,1220.0,0.578125,5,0.384521,3.599609,0.384521,2,3,0
1,2774568,44683,13,88,33,12.0,6.769531,15,11.0,0.916504,...,22275,11981.0,0.538086,2,0.153809,9.5,0.153809,7,1,0
2,2774568,48523,13,88,33,12.0,6.769531,15,11.0,0.916504,...,5129,2376.0,0.463135,2,0.153809,6.5,0.153809,4,1,0
3,2774568,21903,13,88,33,12.0,6.769531,15,11.0,0.916504,...,241921,186884.0,0.772461,8,0.615234,4.25,0.615234,1,0,1
4,2774568,14992,13,88,33,12.0,6.769531,15,11.0,0.916504,...,29069,16942.0,0.583008,2,0.153809,7.0,0.153809,6,0,0


In [15]:
def convert(df):
    order_id_list = df.order_id.unique()
    map_order_id_products = {}
    for i, row in df.iterrows():
        if not (row.order_id in map_order_id_products):
                map_order_id_products[row.order_id] = ""
        if row.preds == 1:
            map_order_id_products[row.order_id] += " " + str(int(row.product_id))
            map_order_id_products[row.order_id] = map_order_id_products[row.order_id].lstrip()
    return map_order_id_products

In [22]:
map_order_id_products_test = convert(test)

In [23]:
tmp = pd.DataFrame(map_order_id_products_test.items())

In [24]:
tmp.columns = ['order_id', 'target']
tmp.target.replace("", "None", inplace = True)

In [17]:
sample_submission = pd.read_csv("C:/Users/Admin/Desktop/20212/AppliedStatisticsProject20212/dataset/sample_submission.csv")

In [29]:
sub = sample_submission.merge(tmp, on = ['order_id'])
sub.drop(['products'], axis = 1, inplace = True)
sub.columns = ['order_id', 'products']

In [30]:
sub[['order_id', 'products']].to_csv('sub1.csv', index = False)

In [32]:
import xgboost as xgb
from sklearn.metrics import f1_score

xg_cl = xgb.XGBClassifier(silent=False,
                      scale_pos_weight=8,
                      learning_rate=0.01,  
                      colsample_bytree = 1,
                      subsample = 0.8,
                      objective='binary:logistic', 
                      n_estimators=50, 
                      reg_alpha = 0.1,
#                       reg_lambda = 0.5,
                      max_depth=4, 
                      min_child_weight = 1,
                      gamma=2,
                      n_jobs=4,               
                      seed=42)
eval_set = [(X_train, y_train), (X_val, y_val)]

# Fit the classifier to the training set
xg_cl.fit(X_train,y_train, eval_metric=["auc", "error"],eval_set=eval_set, verbose=True)

Parameters: { "silent" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


[0]	validation_0-auc:0.81043	validation_0-error:0.21694	validation_1-auc:0.81036	validation_1-error:0.21679
[1]	validation_0-auc:0.81188	validation_0-error:0.21694	validation_1-auc:0.81171	validation_1-error:0.21679
[2]	validation_0-auc:0.81223	validation_0-error:0.22148	validation_1-auc:0.81202	validation_1-error:0.22141
[3]	validation_0-auc:0.81234	validation_0-error:0.22148	validation_1-auc:0.81215	validation_1-error:0.22141
[4]	validation_0-auc:0.81299	validation_0-error:0.22148	validation_1-auc:0.81282	validation_1-error:0.22141
[5]	validation_0-auc:0.81302	validation_0-error:0.21974	validation_1-auc:0.81283	validation_1-error:0.21965
[6]	validation_0-auc:0.81348	validation

In [40]:
test_copy = test.drop(['preds'], axis= 1)

In [41]:
preds = xg_cl.predict(test_copy.drop(['order_id', 'product_id'], axis = 1))
test_copy['preds'] = preds

In [42]:
map_order_id_products_test = convert(test_copy)

In [43]:
tmp1 = pd.DataFrame(map_order_id_products_test.items())
tmp1.columns = ['order_id', 'target']
tmp1.target.replace("", "None", inplace = True)

In [44]:
sub_xgb = sample_submission.merge(tmp1, on = ['order_id'])
sub_xgb.drop(['products'], axis = 1, inplace = True)
sub_xgb.columns = ['order_id', 'products']

In [45]:
sub_xgb[['order_id', 'products']].to_csv('sub_xgb_0.csv', index = False)