In [1]:
#mount Google Drive into the Colab runtime at /content/drive so the files stored there can be read below
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [1]:
cd 'drive/My Drive/feature file'

/content/drive/My Drive/feature file


In [95]:
#importing necessary libraries
import pandas as pd
import numpy as np
import tensorflow as tf
import warnings
warnings.filterwarnings('ignore')

In [3]:
#https://www.kaggle.com/gemartin/load-data-reduce-memory-usage
def memory_decrease_by_column(df):
  '''Downcast the known columns of a dataframe to compact dtypes to reduce memory usage.

  Every column whose name appears in the dtype table below is cast with
  ``astype``; columns without an entry (for example the text column
  'eval_set') are left untouched instead of raising a KeyError.

  The dataframe is modified in place and the same object is returned.
  No range check is performed before casting, so the caller is responsible
  for making sure the values fit the target dtype.
  '''
  col_type_dict={
      #raw order / product columns
      'order_id':np.uint32,'user_id':np.uint32,'order_number':np.uint8,
      'order_dow':np.uint8,'order_hour_of_day':np.uint8,'days_since_prior_order':np.float16,
      'product_id':np.uint16,'add_to_cart_order':np.uint8,'reordered':np.uint8,
      'aisle_id':np.uint8,'department_id':np.uint8,'reordered_new':np.uint8,
      #user level features
      'user_max_ono':np.uint8,'user_sum_reord':np.uint16,'user_reord_prop':np.float16,
      'user_prod_reord_prop':np.float16,'user_uniqpr':np.uint16,'user_uniqpr_prop':np.float16,
      'user_order_reord_prop':np.float16,'user_dsp_mean':np.float16,'user_min_order_size':np.uint8,
      'user_max_order_size':np.uint8,'user_mean_order_size':np.float16,
      #product level features
      'product_ratios_users_oneshot':np.float16,'product_cart_mean':np.float16,
      'product_reord_count':np.uint32,'product_reord_prop':np.float16,
      'prod_uniq_us':np.uint32,'prod_uniq_us_prop':np.float16,
      'prod_us_reord_prop':np.float16,
      #user x product features
      'user_days_since_product':np.float16,'user_product_hod_mean':np.float16,
      'user_product_dow_mean':np.float16,'user_product_prop':np.float16,
      'user_product_cnt':np.uint8,'user_product_atc_mode_min':np.uint8,
      'user_product_atc_mode_max':np.uint8,'user_product_atc_min':np.uint8,
      'user_product_atc_max':np.uint8,'user_product_atc_mean':np.float16,
      #aisle / department features
      'aisle_reordered':np.uint32,'aisle_reordered_prop':np.float16,
      'dep_reordered':np.uint32,'dep_reordered_prop':np.float16,
      #order time features
      'order_dow_reordered':np.uint32,'order_dow_reordered_prop':np.float16,
      'order_hod_reordered':np.uint32,'order_hod_reordered_prop':np.float16,
      'order_dow_hod_reord_count':np.uint32,'ono_dsp_reord':np.uint32,
      'order_dow_hod_reord_prop':np.float16,'ono_dsp_reord_prop':np.float16,
      #add-to-cart-order features and derived columns
      'atc_reordered':np.uint32,
      'atc_reordered_prop':np.float16,'product_ordered_today':np.uint8,
      'user_days_since_product_corrected':np.float16}

  for i in df.columns:
    #only cast columns with a registered dtype; anything else is kept as loaded
    if i in col_type_dict:
      df[i]=df[i].astype(col_type_dict[i])
  return df


# Preparing data required for deployment

In [4]:
#read the user features from disk and downcast them in a single step
user_features=memory_decrease_by_column(pd.read_csv('user_features.csv'))

In [5]:
#read user_products_prior, downcast it, and attach it to the user features on user_id
user_products_prior=memory_decrease_by_column(pd.read_csv('user_products_prior.csv'))
user_all_info=user_features.merge(user_products_prior,on='user_id',how='left')
user_all_info.head(3)

Unnamed: 0,user_id,user_max_ono,user_sum_reord,user_reord_prop,user_prod_reord_prop,user_uniqpr,user_uniqpr_prop,user_order_reord_prop,user_dsp_mean,user_min_order_size,user_max_order_size,user_mean_order_size,product_id
0,1,10,41,0.694824,0.555664,18,0.305176,0.899902,20.265625,4,9,5.898438,196
1,1,10,41,0.694824,0.555664,18,0.305176,0.899902,20.265625,4,9,5.898438,14084
2,1,10,41,0.694824,0.555664,18,0.305176,0.899902,20.265625,4,9,5.898438,12427


In [6]:
#read the user_product features, downcast them, and join them on the (user_id, product_id) pair
user_product_features=memory_decrease_by_column(pd.read_csv('user_product_features.csv'))
user_all_info=user_all_info.merge(user_product_features,on=['user_id','product_id'],how='left')
user_all_info.head(3)

Unnamed: 0,user_id,user_max_ono,user_sum_reord,user_reord_prop,user_prod_reord_prop,user_uniqpr,user_uniqpr_prop,user_order_reord_prop,user_dsp_mean,user_min_order_size,user_max_order_size,user_mean_order_size,product_id,user_days_since_product,user_product_hod_mean,user_product_dow_mean,user_product_prop,user_product_cnt,user_product_atc_mode_min,user_product_atc_mode_max,user_product_atc_min,user_product_atc_max,user_product_atc_mean
0,1,10,41,0.694824,0.555664,18,0.305176,0.899902,20.265625,4,9,5.898438,196,0.0,10.296875,2.5,0.169434,10,1,1,1,4,1.400391
1,1,10,41,0.694824,0.555664,18,0.305176,0.899902,20.265625,4,9,5.898438,14084,176.0,8.0,2.0,0.016953,1,2,2,2,2,2.0
2,1,10,41,0.694824,0.555664,18,0.305176,0.899902,20.265625,4,9,5.898438,12427,0.0,10.296875,2.5,0.169434,10,2,2,1,9,3.300781


In [7]:
#read every product related table; the feature tables are downcast as they are loaded
product_features=memory_decrease_by_column(pd.read_csv('product_features.csv'))
products=pd.read_csv('products.csv',usecols=['product_id','aisle_id','department_id'])
aisle_features=memory_decrease_by_column(pd.read_csv('aisle_features.csv'))
dep_features=memory_decrease_by_column(pd.read_csv('dep_features.csv'))
#chain the lookups: product -> (aisle_id, department_id) -> aisle features -> department features
product_features=(product_features
                  .merge(products,on='product_id',how='left')
                  .merge(aisle_features,on='aisle_id',how='left')
                  .merge(dep_features,on='department_id',how='left'))
product_features.head(5)

Unnamed: 0,product_id,product_ratios_users_oneshot,product_cart_mean,product_reord_count,product_reord_prop,prod_uniq_us,prod_uniq_us_prop,prod_us_reord_prop,aisle_id,department_id,aisle_reordered,aisle_reordered_prop,dep_reordered,dep_reordered_prop
0,1,0.614746,5.800781,1136,0.613281,716,0.386719,0.385498,61,19,128431,0.548828,1657973,0.574219
1,2,0.897461,9.890625,12,0.133301,78,0.866699,0.102539,104,13,32321,0.152344,650301,0.34668
2,3,0.513672,6.414062,203,0.73291,74,0.26709,0.486572,94,7,131556,0.527832,1757892,0.65332
3,4,0.648438,9.507812,147,0.446777,182,0.553223,0.351562,38,1,217262,0.556641,1211890,0.541992
4,5,0.333252,6.464844,9,0.600098,6,0.399902,0.666504,5,13,17542,0.280518,650301,0.34668


In [9]:
#attach the product/aisle/department features to every (user, product) row
user_all_info=user_all_info.merge(product_features,on='product_id',how='left')
user_all_info.head(5)

Unnamed: 0,user_id,user_max_ono,user_sum_reord,user_reord_prop,user_prod_reord_prop,user_uniqpr,user_uniqpr_prop,user_order_reord_prop,user_dsp_mean,user_min_order_size,user_max_order_size,user_mean_order_size,product_id,user_days_since_product,user_product_hod_mean,user_product_dow_mean,user_product_prop,user_product_cnt,user_product_atc_mode_min,user_product_atc_mode_max,user_product_atc_min,user_product_atc_max,user_product_atc_mean,product_ratios_users_oneshot,product_cart_mean,product_reord_count,product_reord_prop,prod_uniq_us,prod_uniq_us_prop,prod_us_reord_prop,aisle_id,department_id,aisle_reordered,aisle_reordered_prop,dep_reordered,dep_reordered_prop
0,1,10,41,0.694824,0.555664,18,0.305176,0.899902,20.265625,4,9,5.898438,196,0.0,10.296875,2.5,0.169434,10,1,1,1,4,1.400391,0.41748,3.722656,27791,0.776367,8000,0.223511,0.58252,77,7,228406,0.638672,1757892,0.65332
1,1,10,41,0.694824,0.555664,18,0.305176,0.899902,20.265625,4,9,5.898438,14084,176.0,8.0,2.0,0.016953,1,2,2,2,2,2.0,0.37085,5.792969,12923,0.811035,3012,0.188965,0.628906,91,16,442023,0.692383,3627221,0.669922
2,1,10,41,0.694824,0.555664,18,0.305176,0.899902,20.265625,4,9,5.898438,12427,0.0,10.296875,2.5,0.169434,10,2,2,1,9,3.300781,0.470459,4.761719,4797,0.740723,1679,0.259277,0.529297,23,19,96804,0.591797,1657973,0.574219
3,1,10,41,0.694824,0.555664,18,0.305176,0.899902,20.265625,4,9,5.898438,26088,161.0,7.5,2.5,0.033905,2,4,5,4,5,4.5,0.595215,6.496094,1360,0.539062,1163,0.460938,0.405029,23,19,96804,0.591797,1657973,0.574219
4,1,10,41,0.694824,0.555664,18,0.305176,0.899902,20.265625,4,9,5.898438,26405,111.0,7.5,3.0,0.033905,2,5,5,5,5,5.0,0.637207,3.117188,536,0.441406,678,0.558594,0.362793,54,17,128303,0.527832,297075,0.4021


In [11]:
#read the add_to_cart_order features and join them onto the existing features
atc_features=pd.read_csv('atc_features.csv')
#columns are renamed positionally so that the first one matches the join key used below
atc_features.columns=['user_product_atc_mode_min','atc_reordered','atc_reordered_prop']
atc_features=memory_decrease_by_column(atc_features)
user_all_info=user_all_info.merge(atc_features,on='user_product_atc_mode_min',how='left')
user_all_info.head(5)

Unnamed: 0,user_id,user_max_ono,user_sum_reord,user_reord_prop,user_prod_reord_prop,user_uniqpr,user_uniqpr_prop,user_order_reord_prop,user_dsp_mean,user_min_order_size,user_max_order_size,user_mean_order_size,product_id,user_days_since_product,user_product_hod_mean,user_product_dow_mean,user_product_prop,user_product_cnt,user_product_atc_mode_min,user_product_atc_mode_max,user_product_atc_min,user_product_atc_max,user_product_atc_mean,product_ratios_users_oneshot,product_cart_mean,product_reord_count,product_reord_prop,prod_uniq_us,prod_uniq_us_prop,prod_us_reord_prop,aisle_id,department_id,aisle_reordered,aisle_reordered_prop,dep_reordered,dep_reordered_prop,atc_reordered,atc_reordered_prop
0,1,10,41,0.694824,0.555664,18,0.305176,0.899902,20.265625,4,9,5.898438,196,0.0,10.296875,2.5,0.169434,10,1,1,1,4,1.400391,0.41748,3.722656,27791,0.776367,8000,0.223511,0.58252,77,7,228406,0.638672,1757892,0.65332,2178183,0.677734
1,1,10,41,0.694824,0.555664,18,0.305176,0.899902,20.265625,4,9,5.898438,14084,176.0,8.0,2.0,0.016953,1,2,2,2,2,2.0,0.37085,5.792969,12923,0.811035,3012,0.188965,0.628906,91,16,442023,0.692383,3627221,0.669922,2068060,0.67627
2,1,10,41,0.694824,0.555664,18,0.305176,0.899902,20.265625,4,9,5.898438,12427,0.0,10.296875,2.5,0.169434,10,2,2,1,9,3.300781,0.470459,4.761719,4797,0.740723,1679,0.259277,0.529297,23,19,96804,0.591797,1657973,0.574219,2068060,0.67627
3,1,10,41,0.694824,0.555664,18,0.305176,0.899902,20.265625,4,9,5.898438,26088,161.0,7.5,2.5,0.033905,2,4,5,4,5,4.5,0.595215,6.496094,1360,0.539062,1163,0.460938,0.405029,23,19,96804,0.591797,1657973,0.574219,1696923,0.636719
4,1,10,41,0.694824,0.555664,18,0.305176,0.899902,20.265625,4,9,5.898438,26405,111.0,7.5,3.0,0.033905,2,5,5,5,5,5.0,0.637207,3.117188,536,0.441406,678,0.558594,0.362793,54,17,128303,0.527832,297075,0.4021,1507665,0.617188


In [12]:
#saving all features that can be generated using user id to disk
#index=False keeps the row index out of the file, so it is not read back as an extra column
user_all_info.to_csv('user_all_info.csv',index=False)

# Imitating deployment scenario

In [13]:
%%time
#loading all files required for feature generation, to be executed once when starting the server
user_all_info=memory_decrease_by_column(pd.read_csv('user_all_info.csv'))
#order level lookup tables that final_fun_1/final_fun_2 join onto each incoming order
order_dow_features=memory_decrease_by_column(pd.read_csv('order_dow_features.csv'))
order_hod_features=memory_decrease_by_column(pd.read_csv('order_hod_features.csv'))
order_dow_hod_features=memory_decrease_by_column(pd.read_csv('order_dow_hod_features.csv'))
ono_dsp_features=memory_decrease_by_column(pd.read_csv('ono_dsp_features.csv'))
products=pd.read_csv('products.csv',usecols=['product_id','product_name'])
#names of the columns fed to the model, in the order stored in train_columns.csv
train_columns=pd.read_csv('train_columns.csv').train_columns.values
#training statistics used for min-max scaling, re-indexed by column name so they align with X_test[train_columns]
#NOTE(review): assumes the rows of X_train_statistics.csv are in the same order as train_columns.csv - confirm
X_train_statistics=pd.read_csv('X_train_statistics.csv')
X_train_min=X_train_statistics.X_train_min
X_train_min.index=train_columns
X_train_max=X_train_statistics.X_train_max
X_train_max.index=train_columns
X_train_max_min=X_train_statistics.X_train_max_min
X_train_max_min.index=train_columns

CPU times: user 41 s, sys: 7.18 s, total: 48.2 s
Wall time: 49.6 s


In [None]:
#https://www.kaggle.com/gemartin/load-data-reduce-memory-usage
def memory_decrease_by_column(df):
  '''Downcast the known columns of a dataframe to compact dtypes to reduce memory usage.

  Every column whose name appears in the dtype table below is cast with
  ``astype``; columns without an entry (for example the text column
  'eval_set') are left untouched instead of raising a KeyError.

  The dataframe is modified in place and the same object is returned.
  No range check is performed before casting, so the caller is responsible
  for making sure the values fit the target dtype.
  '''
  col_type_dict={
      #raw order / product columns
      'order_id':np.uint32,'user_id':np.uint32,'order_number':np.uint8,
      'order_dow':np.uint8,'order_hour_of_day':np.uint8,'days_since_prior_order':np.float16,
      'product_id':np.uint16,'add_to_cart_order':np.uint8,'reordered':np.uint8,
      'aisle_id':np.uint8,'department_id':np.uint8,'reordered_new':np.uint8,
      #user level features
      'user_max_ono':np.uint8,'user_sum_reord':np.uint16,'user_reord_prop':np.float16,
      'user_prod_reord_prop':np.float16,'user_uniqpr':np.uint16,'user_uniqpr_prop':np.float16,
      'user_order_reord_prop':np.float16,'user_dsp_mean':np.float16,'user_min_order_size':np.uint8,
      'user_max_order_size':np.uint8,'user_mean_order_size':np.float16,
      #product level features
      'product_ratios_users_oneshot':np.float16,'product_cart_mean':np.float16,
      'product_reord_count':np.uint32,'product_reord_prop':np.float16,
      'prod_uniq_us':np.uint32,'prod_uniq_us_prop':np.float16,
      'prod_us_reord_prop':np.float16,
      #user x product features
      'user_days_since_product':np.float16,'user_product_hod_mean':np.float16,
      'user_product_dow_mean':np.float16,'user_product_prop':np.float16,
      'user_product_cnt':np.uint8,'user_product_atc_mode_min':np.uint8,
      'user_product_atc_mode_max':np.uint8,'user_product_atc_min':np.uint8,
      'user_product_atc_max':np.uint8,'user_product_atc_mean':np.float16,
      #aisle / department features
      'aisle_reordered':np.uint32,'aisle_reordered_prop':np.float16,
      'dep_reordered':np.uint32,'dep_reordered_prop':np.float16,
      #order time features
      'order_dow_reordered':np.uint32,'order_dow_reordered_prop':np.float16,
      'order_hod_reordered':np.uint32,'order_hod_reordered_prop':np.float16,
      'order_dow_hod_reord_count':np.uint32,'ono_dsp_reord':np.uint32,
      'order_dow_hod_reord_prop':np.float16,'ono_dsp_reord_prop':np.float16,
      #add-to-cart-order features and derived columns
      'atc_reordered':np.uint32,
      'atc_reordered_prop':np.float16,'product_ordered_today':np.uint8,
      'user_days_since_product_corrected':np.float16}

  for i in df.columns:
    #only cast columns with a registered dtype; anything else is kept as loaded
    if i in col_type_dict:
      df[i]=df[i].astype(col_type_dict[i])
  return df

In [21]:
def prepare_data(x):
  '''Join the values of x into one space separated string, the product list format expected in the competition'''
  as_text=x.astype(str)
  return ' '.join(as_text)

In [85]:
def mean_f1score(X):
  '''Return the mean of the per-order F1 scores between predicted and true product lists.

  X must expose ``true_labels`` and ``products``; each holds one string of
  whitespace separated product tokens per order, and the two are compared
  row by row. Tokens are compared as a set, so duplicates inside one string
  count once.

  An order whose true or predicted string contains no token scores 0.0 and
  the remaining orders are still evaluated.
  '''
  f1_scores=[]
  #pair the rows positionally so that a non-default index cannot break or misalign the lookup
  for true_str,pred_str in zip(X.true_labels,X.products):
    true_products=set(true_str.split())
    pred_products=set(pred_str.split())
    if not true_products or not pred_products:
      #nothing to match against: score this order 0 and move on to the next one
      f1_scores.append(0.0)
      continue
    hits=len(pred_products.intersection(true_products))
    pr=hits/len(pred_products)
    re=hits/len(true_products)
    if(pr+re==0):
      f1_scores.append(0.0)
    else:
      f1_scores.append((2*pr*re)/(pr+re))
  return np.mean(f1_scores)

In [110]:
def final_fun_1(X,model=None,threshold=0.2,model_path='nn_models/conv_model_f10.3654403235929674'):
  '''Return, for each order in X, the products the user might reorder.

  X          : dataframe of orders; the code below reads order_id, user_id, order_dow,
               order_hour_of_day, order_number and days_since_prior_order from it.
  model      : optional, already loaded model exposing ``predict``. When None the model is
               loaded from ``model_path`` on every call, so a server should load it once
               and pass it in.
  threshold  : a product is predicted as reordered when the model output is >= threshold.
  model_path : location of the saved model, used only when ``model`` is None.

  Returns an array with one string per row of X (same order): the predicted product ids
  separated by spaces, 'None' when nothing is predicted, and with ' None' appended when
  only one or two products are predicted.

  Relies on the lookup tables and scaling statistics loaded at server start
  (user_all_info, order_*_features, ono_dsp_features, train_columns, X_train_min,
  X_train_max_min).
  '''
  #preparing features
  known_users=user_all_info[user_all_info.user_id.isin(X.user_id)]
  all_info=pd.merge(X,known_users,on='user_id',how='left')
  all_info=pd.merge(all_info,order_dow_features,on='order_dow',how='left')
  all_info=pd.merge(all_info,order_hod_features,on='order_hour_of_day',how='left')
  all_info=pd.merge(all_info,order_dow_hod_features,on=['order_dow','order_hour_of_day'],how='left')
  all_info=pd.merge(all_info,ono_dsp_features,on=['order_number','days_since_prior_order'],how='left')
  all_info['user_days_since_product_corrected']=all_info['user_days_since_product']+all_info['days_since_prior_order']
  #vectorised flag; missing values compare unequal to 0 and therefore give 0
  all_info['product_ordered_today']=(all_info['user_days_since_product_corrected']==0).astype('int64')
  #data cleaning
  all_info=all_info.fillna(0)
  #data scaling with the statistics of the training set
  X_test=(all_info[train_columns]-X_train_min)/(X_train_max_min)
  #model evaluation
  if model is None:
    model=tf.keras.models.load_model(model_path)
  #flatten so that a (n,1) model output becomes one value per row
  all_info['pred_reordered']=(model.predict(X_test,batch_size=1000)>=threshold).ravel()
  #output preparation
  submission=all_info.loc[all_info.pred_reordered==1,['order_id','product_id']]
  submission.columns=['order_id','products']
  submission=submission.groupby('order_id')['products'].agg(prepare_data).reset_index()
  #left join on the input orders keeps one row per input order, in the input order
  submission=pd.merge(X[['order_id']],submission,how='left',on='order_id')
  submission=submission.fillna('None')
  product_count=submission.products.apply(lambda x: 0 if x=='None' else len(x.split(' ')))
  #orders with only one or two predicted products additionally get the 'None' token
  few_products=product_count.isin([1,2])
  submission.loc[few_products,'products']=submission.loc[few_products,'products']+' None'
  return submission.products.values

In [89]:
def final_fun_2(X,Y):
  '''Return the mean F1 score of the predictions for the orders in X against the labels in Y.

  X : dataframe of orders, passed unchanged to final_fun_1.
  Y : dataframe with an 'order_id' column and a 'true_labels' column holding the
      space separated true products of each order.

  The prediction pipeline is not repeated here: final_fun_1 produces the predicted
  product strings and this function only joins them with Y and scores them.
  '''
  #final_fun_1 returns one products string per row of X, in the same row order
  submission=pd.DataFrame({'order_id':X.order_id.values,'products':final_fun_1(X)})
  submission=pd.merge(submission,Y,on='order_id',how='left')
  score=mean_f1score(submission)
  return score

**Testing**

In [90]:
#read the files needed to try out the functions above; orders is downcast as it is loaded
orders=memory_decrease_by_column(pd.read_csv('orders.csv'))
cnn_submission=pd.read_csv('cnn_submission.csv')

In [113]:
#sample inputs: the first three orders of the test set
sample_X=orders[orders.eval_set=='test'].head(3)
#sample "true" outputs: the cnn_submission rows of those same three orders
#NOTE(review): cnn_submission looks like a saved prediction file rather than ground truth - confirm before trusting the score
sample_Y=cnn_submission[cnn_submission.order_id.isin(sample_X.order_id)]
sample_Y.columns=['order_id','true_labels']

In [116]:
%%time
#testing final_fun_1: returns one string of predicted products per sample order
final_fun_1(sample_X)

CPU times: user 1.15 s, sys: 176 ms, total: 1.33 s
Wall time: 1.44 s


array(['17668 39190 47766 21903 24810 23650 43961 18599', 'None',
       '38293 21903 None'], dtype=object)

In [99]:
%%time
#testing final_fun_2: mean f1 score of the predictions for sample_X against the labels in sample_Y
final_fun_2(sample_X,sample_Y)

CPU times: user 924 ms, sys: 78 ms, total: 1 s
Wall time: 1.12 s


1.0

**Summary**

We have successfully defined functions that return the predicted products and compute the mean F1 score, in compliance with the instructions.

Note:
The requirement for final_fun_1 was to return predicted labels (1 or 0). Since the task here is to predict the products that a user might reorder, final_fun_1 was modified to return the predicted products for each order instead.