In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from scipy import stats
from sklearn.neural_network import *
from sklearn.decomposition import *
from sklearn.model_selection import KFold
import matplotlib.pyplot as plt
from sklearn import linear_model
import statsmodels.formula.api as sm 
import lightgbm as lgb
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score

In [2]:
IDIR = 'C://Kaggle/Customer/input/'


print('loading prior')
priors = pd.read_csv(IDIR + 'order_products__prior.csv', dtype={
            'order_id': np.int32,
            'product_id': np.uint16,
            'add_to_cart_order': np.int16,
            'reordered': np.int8})

print('loading train')
train = pd.read_csv(IDIR + 'order_products__train.csv', dtype={
            'order_id': np.int32,
            'product_id': np.uint16,
            'add_to_cart_order': np.int16,
            'reordered': np.int8})

print('loading orders')
orders = pd.read_csv(IDIR + 'orders.csv', dtype={
        'order_id': np.int32,
        'user_id': np.int64,
        'eval_set': 'category',
        'order_number': np.int16,
        'order_dow': np.int8,
        'order_hour_of_day': np.int8,
        'days_since_prior_order': np.float32})

print('loading products')
products = pd.read_csv(IDIR + 'products.csv', dtype={
        'product_id': np.uint64,
        'order_id': np.int32,
        'aisle_id': np.uint8,
        'department_id': np.uint8},
        usecols=['product_id', 'aisle_id', 'department_id'])

print('priors {}: {}'.format(priors.shape, ', '.join(priors.columns)))
print('orders {}: {}'.format(orders.shape, ', '.join(orders.columns)))
print('train {}: {}'.format(train.shape, ', '.join(train.columns)))

loading prior
loading train
loading orders
loading products
priors (32434489, 4): order_id, product_id, add_to_cart_order, reordered
orders (3421083, 7): order_id, user_id, eval_set, order_number, order_dow, order_hour_of_day, days_since_prior_order
train (1384617, 4): order_id, product_id, add_to_cart_order, reordered


In [3]:
###
print('computing product f')
prods = pd.DataFrame()
prods['orders'] = priors.groupby(priors.product_id).size().astype(np.int32)
prods['reorders'] = priors['reordered'].groupby(priors.product_id).sum().astype(np.float32)
prods['reorder_rate'] = (prods.reorders / prods.orders).astype(np.float32)
products = products.join(prods, on='product_id')
products.set_index('product_id', drop=False, inplace=True)
del prods

computing product f


In [4]:
print('add order info to priors')
orders.set_index('order_id', inplace=True, drop=False)
priors = priors.join(orders, on='order_id', rsuffix='_')
priors.drop('order_id_', inplace=True, axis=1)

add order info to priors


In [5]:
prods = pd.DataFrame()
priors['user_buy_product_times'] = priors.groupby(['user_id', 'product_id']).cumcount() + 1
prods['avg_order_number'] = priors['order_number'].groupby(priors.product_id).mean().astype(np.float32)
prods['prod_avg_order_hour'] = priors['order_hour_of_day'].groupby(priors.product_id).mean().astype(np.float32)
prods['prod_avg_order_dow'] = priors['order_dow'].groupby(priors.product_id).mean().astype(np.float32)
prods['prod_days_since_prior'] = priors['days_since_prior_order'].groupby(priors.product_id).mean().astype(np.float32)
prods['prod_preferred_order_hour'] = priors['order_hour_of_day'].groupby(priors.product_id).agg(lambda x: stats.mode(x)[0]).astype(np.float32)
prods['prod_preferred_dow'] = priors['order_dow'].groupby(priors.product_id).agg(lambda x: stats.mode(x)[0]).astype(np.float32)
prods['prod_first_time_total_count'] = priors.groupby(priors.product_id).apply(lambda x: \
                                        sum(priors.ix[x.index,'user_buy_product_times']==1))
prods['prod_second_time_total_count'] = priors.groupby(priors.product_id).apply(lambda x: \
                                        sum(priors.ix[x.index,'user_buy_product_times']==2))   
prods['prod_reorder_prob'] = prods.prod_second_time_total_count / prods.prod_first_time_total_count
products = products.join(prods, on='product_id')
products.set_index('product_id', drop=False, inplace=True)
products['prod_reorder_times'] = 1 + products.reorders / products.prod_first_time_total_count

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate_ix
  if __name__ == '__main__':


In [6]:
### user features
print('computing user f')
usr = pd.DataFrame()
usr['average_days_between_orders'] = orders.groupby('user_id')['days_since_prior_order'].mean().astype(np.float32)
usr['sum_days_between_orders'] = orders.groupby('user_id')['days_since_prior_order'].sum().astype(np.float32)
usr['nb_orders'] = orders.groupby('user_id').size().astype(np.int16)

users = pd.DataFrame()
users['total_items'] = priors.groupby('user_id').size().astype(np.int16)
users['all_products'] = priors.groupby('user_id')['product_id'].apply(set)
users['total_distinct_items'] = (users.all_products.map(len)).astype(np.int16)
users['user_avg_order_hour'] = priors['order_hour_of_day'].groupby(priors.user_id).mean().astype(np.float32)
users['user_std_order_hour'] = priors['order_hour_of_day'].groupby(priors.user_id).std().astype(np.float32)
users['user_avg_order_dow'] = priors['order_dow'].groupby(priors.user_id).mean().astype(np.float32)
users['user_std_order_dow'] = priors['order_dow'].groupby(priors.user_id).mean().astype(np.float32)
users['user_preferred_order_hour'] = priors['order_hour_of_day'].groupby(priors.user_id).agg(lambda x: stats.mode(x)[0]).astype(np.float32)
users['user_preferred_dow'] = priors['order_dow'].groupby(priors.user_id).agg(lambda x: stats.mode(x)[0]).astype(np.float32)
user_order_size = pd.DataFrame(priors.groupby(['user_id','order_id']).size())
users['user_avg_order_size'] = user_order_size.reset_index().groupby('user_id')[0].mean()
users['user_max_order_size'] = user_order_size.reset_index().groupby('user_id')[0].max()
users['user_min_order_size'] = user_order_size.reset_index().groupby('user_id')[0].min()
users['user_reorder_ratio'] = priors.groupby(priors.user_id).apply(lambda x: sum(priors.ix[x.index,'reordered']==1)/
                         sum(priors.ix[x.index,'order_number'] > 1))

users = users.join(usr)
del usr
users['average_basket'] = (users.total_items / users.nb_orders).astype(np.float32)
print('user f', users.shape)

computing user f
user f (206209, 17)


In [7]:
### userXproduct features

print('compute userXproduct f - this is long...')
priors['user_product'] = priors.product_id + priors.user_id * 100000

# This was to slow !!
#def last_order(order_group):
#    ix = order_group.order_number.idxmax
#    return order_group.shape[0], order_group.order_id[ix],  order_group.add_to_cart_order.mean()
#userXproduct = pd.DataFrame()
#userXproduct['tmp'] = df.groupby('user_product').apply(last_order)

d= dict()
for row in priors.itertuples():
    z = row.user_product
    if z not in d:
        d[z] = (1,
                (row.order_number, row.order_id),
                row.add_to_cart_order,
                (row.order_number, row.order_id))
    else:
        d[z] = (d[z][0] + 1,
                max(d[z][1], (row.order_number, row.order_id)),
                d[z][2] + row.add_to_cart_order,
                min(d[z][1], (row.order_number, row.order_id)))

print('to dataframe (less memory)')
userXproduct = pd.DataFrame.from_dict(d, orient='index')
del d
userXproduct.columns = ['nb_orders', 'last_order_id', 'sum_pos_in_cart', 'first_order_id']
userXproduct.nb_orders = userXproduct.nb_orders.astype(np.int16)
userXproduct.last_order_id = userXproduct.last_order_id.map(lambda x: x[1]).astype(np.int32)
userXproduct.first_order_id = userXproduct.first_order_id.map(lambda x: x[1]).astype(np.int32)
userXproduct.sum_pos_in_cart = userXproduct.sum_pos_in_cart.astype(np.int16)
print('user X product f', len(userXproduct))

#del priors

compute userXproduct f - this is long...
to dataframe (less memory)
user X product f 13307953


In [8]:
### train / test orders ###
print('split orders : train, test')
test_orders = orders[orders.eval_set == 'test']
train_orders = orders[orders.eval_set == 'train']

train.set_index(['order_id', 'product_id'], inplace=True, drop=False)

split orders : train, test


In [9]:
### build list of candidate products to reorder, with features ###

def features(selected_orders, labels_given=False):
    print('build candidate list')
    order_list = []
    product_list = []
    labels = []
    i=0
    train_index_lookup = dict().fromkeys(train.index.values)
    for row in selected_orders.itertuples():
        i+=1
        if i%100000 == 0: print('order row',i)
        order_id = row.order_id
        user_id = row.user_id
        user_products = users.all_products[user_id]
        product_list += user_products
        order_list += [order_id] * len(user_products)
        if labels_given:
            labels += [(order_id, product) in train_index_lookup for product in user_products]
        
    df = pd.DataFrame({'order_id':order_list, 'product_id':product_list}, dtype=np.int32)
    labels = np.array(labels, dtype=np.int8)
    del order_list
    del product_list
    
    print('user related features')
    df['user_id'] = df.order_id.map(orders.user_id)
    df['user_total_orders'] = df.user_id.map(users.nb_orders)
    df['user_total_items'] = df.user_id.map(users.total_items)
    df['total_distinct_items'] = df.user_id.map(users.total_distinct_items)
    df['user_average_days_between_orders'] = df.user_id.map(users.average_days_between_orders)
    df['user_sum_days_between_orders'] = df.user_id.map(users.sum_days_between_orders)
    df['user_average_basket'] =  df.user_id.map(users.average_basket)
    df['user_avg_order_hour'] =  df.user_id.map(users.user_avg_order_hour)
    df['user_avg_order_dow'] =  df.user_id.map(users.user_avg_order_dow)
    df['user_std_order_hour'] =  df.user_id.map(users.user_std_order_hour)
    df['user_std_order_dow'] =  df.user_id.map(users.user_std_order_dow)
    df['user_preferred_order_hour'] =  df.user_id.map(users.user_preferred_order_hour)
    df['user_preferred_dow'] =  df.user_id.map(users.user_preferred_dow)
    df['user_avg_order_size'] =  df.user_id.map(users.user_avg_order_size)
    df['user_max_order_size'] =  df.user_id.map(users.user_max_order_size)
    df['user_min_order_size'] =  df.user_id.map(users.user_min_order_size)
    df['user_distinct_ratio'] = df['total_distinct_items'] / (df['user_total_items'])
    df['user_buynew_ratio'] = (df['user_total_items']-df['total_distinct_items']) / df['user_total_orders']
    df['user_reorder_ratio'] = df.user_id.map(users.user_reorder_ratio)
    
    print('order related features')
    df['dow'] = df.order_id.map(orders.order_dow)
    df['order_hour_of_day'] = df.order_id.map(orders.order_hour_of_day)
    df['days_since_prior_order'] = df.order_id.map(orders.days_since_prior_order)
    df['days_since_ratio'] = df.days_since_prior_order / df.user_average_days_between_orders
    
    print('product related features')
    df['aisle_id'] = df.product_id.map(products.aisle_id)
    df['department_id'] = df.product_id.map(products.department_id)
    df['product_orders'] = df.product_id.map(products.orders).astype(np.int32)
    df['product_reorders'] = df.product_id.map(products.reorders)
    df['product_reorder_rate'] = df.product_id.map(products.reorder_rate)
    df['avg_order_number'] = df.product_id.map(products.avg_order_number)
    df['prod_avg_order_hour'] = df.product_id.map(products.prod_avg_order_hour)
    df['prod_avg_order_dow'] = df.product_id.map(products.prod_avg_order_dow)
    df['prod_days_since_prior'] = df.product_id.map(products.prod_days_since_prior)
    df['prod_preferred_order_hour'] = df.product_id.map(products.prod_preferred_order_hour)
    df['prod_preferred_dow'] = df.product_id.map(products.prod_preferred_dow)
    df['prod_first_time_total_count'] = df.product_id.map(products.prod_first_time_total_count)
    df['prod_second_time_total_count'] = df.product_id.map(products.prod_second_time_total_count)
    df['prod_reorder_prob'] = df.product_id.map(products.prod_reorder_prob)
    df['prod_reorder_times'] = df.product_id.map(products.prod_reorder_times)

    print('user_X_product related features')
    df['z'] = df.user_id * 100000 + df.product_id
    #df.drop(['user_id'], axis=1, inplace=True)
    df['UP_orders'] = df.z.map(userXproduct.nb_orders)
    df['UP_orders_ratio'] = (df.UP_orders / df.user_total_orders).astype(np.float32)
    df['UP_last_order_id'] = df.z.map(userXproduct.last_order_id)
    df['UP_first_order_id'] = df.z.map(userXproduct.first_order_id)
    df['UP_average_pos_in_cart'] = (df.z.map(userXproduct.sum_pos_in_cart) / df.UP_orders).astype(np.float32)
    df['UP_orders_since_last'] = df.user_total_orders - df.UP_last_order_id.map(orders.order_number)
    df['UP_delta_hour_vs_last'] = abs(df.order_hour_of_day - df.UP_last_order_id.map(orders.order_hour_of_day)).map(lambda x: min(x, 24-x)).astype(np.int8)
    df['UP_same_dow_as_last_order'] = df.UP_last_order_id.map(orders.order_dow) == df.order_id.map(orders.order_dow)
    df['UP_order_rate_since_first_order'] = df.UP_orders / (df.user_total_orders - \
                                                            df.UP_first_order_id.map(orders.order_number))
    df['UP_order_rate_bettwen_first_last'] = df.UP_last_order_id.map(orders.order_number) -  df.UP_first_order_id.map(orders.order_number)
    
    df.drop(['UP_last_order_id', 'z'], axis=1, inplace=True)
    print(df.dtypes)
    print(df.memory_usage())
    return (df, labels)

In [10]:
df_train, labels = features(train_orders, labels_given=True)
df_train['days_since_ratio'] = df_train['days_since_ratio'].fillna(0)
df_train['days_since_prior_order'] = df_train['days_since_prior_order'].fillna(0)

build candidate list
order row 100000
user related features
order related features
product related features
user_X_product related features
order_id                              int32
product_id                            int32
user_id                               int64
user_total_orders                     int16
user_total_items                      int16
total_distinct_items                  int16
user_average_days_between_orders    float32
user_sum_days_between_orders        float32
user_average_basket                 float32
user_avg_order_hour                 float32
user_avg_order_dow                  float32
user_std_order_hour                 float32
user_std_order_dow                  float32
user_preferred_order_hour           float32
user_preferred_dow                  float32
user_avg_order_size                 float64
user_max_order_size                   int64
user_min_order_size                   int64
user_distinct_ratio                 float64
user_buynew_ratio       

In [11]:
df_test, _ = features(test_orders)
df_test['days_since_ratio'] = df_test['days_since_ratio'].fillna(0)
df_test['days_since_prior_order'] = df_test['days_since_prior_order'].fillna(0)

from scipy.sparse import csr_matrix
from sklearn.decomposition import TruncatedSVD
df_all = df_train.append(df_test)
sparse_matrix = csr_matrix((df_all.UP_orders_ratio.values, (df_all.user_id.values, df_all.product_id.values)), 
           shape = (df_all.user_id.max()+1, df_all.product_id.max()+1))

model = TruncatedSVD(n_components=3)
model.fit(sparse_matrix)
u_features = pd.DataFrame(model.transform(sparse_matrix), columns = ['u_hidden1','u_hidden2','u_hidden3'])
model.fit(sparse_matrix.T)
p_features = pd.DataFrame(model.transform(sparse_matrix.T), columns = ['p_hidden1','p_hidden2','p_hidden3'])

df_train = pd.merge(df_train, u_features, how = 'left', left_on = 'user_id', right_index = True)
df_train = pd.merge(df_train, p_features, how = 'left', left_on = 'product_id', right_index = True)
df_test = pd.merge(df_test, u_features, how = 'left', left_on = 'user_id', right_index = True)
df_test = pd.merge(df_test, p_features, how = 'left', left_on = 'product_id', right_index = True)

build candidate list
user related features
order related features
product related features
user_X_product related features
order_id                              int32
product_id                            int32
user_id                               int64
user_total_orders                     int16
user_total_items                      int16
total_distinct_items                  int16
user_average_days_between_orders    float32
user_sum_days_between_orders        float32
user_average_basket                 float32
user_avg_order_hour                 float32
user_avg_order_dow                  float32
user_std_order_hour                 float32
user_std_order_dow                  float32
user_preferred_order_hour           float32
user_preferred_dow                  float32
user_avg_order_size                 float64
user_max_order_size                   int64
user_min_order_size                   int64
user_distinct_ratio                 float64
user_buynew_ratio                   float

In [12]:
del priors
del orders
del df_all

In [13]:
product_embeddings = pd.read_pickle('product_embeddings.pkl')
del product_embeddings['product_name']
del product_embeddings['aisle_id']
del product_embeddings['department_id']

In [14]:
df_train = pd.merge(df_train, product_embeddings, how = 'left', left_on = 'product_id', right_on = 'product_id')
df_test = pd.merge(df_test, product_embeddings, how = 'left', left_on = 'product_id', right_on = 'product_id')

In [15]:
df_train.head()

Unnamed: 0,order_id,product_id,user_id,user_total_orders,user_total_items,total_distinct_items,user_average_days_between_orders,user_sum_days_between_orders,user_average_basket,user_avg_order_hour,...,22,23,24,25,26,27,28,29,30,31
0,1187899,17122,1,11,59,18,19.0,190.0,5.363636,10.542373,...,-0.529931,-0.010399,-0.04525,0.131338,1.018782,-0.32321,0.170395,-0.526232,-0.541561,0.423974
1,1187899,196,1,11,59,18,19.0,190.0,5.363636,10.542373,...,0.374344,0.393529,1.774428,1.283232,1.470487,-0.183853,-1.903148,-0.190825,-0.205328,-0.843427
2,1187899,26405,1,11,59,18,19.0,190.0,5.363636,10.542373,...,0.330722,-0.28408,0.989011,1.174724,1.197524,-0.702007,-0.591502,0.884327,-1.137047,0.257529
3,1187899,46149,1,11,59,18,19.0,190.0,5.363636,10.542373,...,-0.970413,-0.516754,0.445781,-0.308037,2.349851,-0.53887,1.224704,0.48502,-0.559203,0.921078
4,1187899,14084,1,11,59,18,19.0,190.0,5.363636,10.542373,...,0.696389,-1.318216,-0.471808,-0.31455,0.574339,-0.799533,-0.47194,-0.30778,-1.208707,-0.844484


In [16]:
#df_train = pd.read_csv('df_train.csv')

In [17]:
#df_test = pd.read_csv('df_test.csv')
#labels = pd.read_csv('labels.csv')

In [18]:
def compare_results(df_gt, df_preds):
    
    df_gt_cut = df_gt.loc[df_preds.index]
    
    f1 = []
    precision_list = []
    recall_list = []
    for gt, pred in zip(df_gt_cut.sort_index().products, df_preds.sort_index().products):
        lgt = gt.replace("None", "-1").split(' ')
        lpred = pred.replace("None", "-1").split(' ')

        rr = (np.intersect1d(lgt, lpred))
        precision = np.float(len(rr)) / len(lpred)
        recall = np.float(len(rr)) / len(lgt)
        
        precision_list.append(precision)
        recall_list.append(recall)

        denom = precision + recall
        f1.append(((2 * precision * recall) / denom) if denom > 0 else 0)

    #print(np.mean(f1))
    return np.mean(f1), np.mean(precision_list), np.mean(recall_list)

In [19]:
def transform_label(df_data, label_prob, order_id):
    df_data['label_prob'] = label_prob
    TRESHOLD = 0.21
    certain_level = 0.37
    d = dict()
    prev_order_id = -1
    max_prob = 0
    for row in df_data.itertuples():
        if row.order_id!=prev_order_id:
            if (max_prob<certain_level) and (max_prob>TRESHOLD):
                d[prev_order_id] += ' None'
            prev_order_id = row.order_id                
            max_prob = 0
            
        if row.label_prob > TRESHOLD:
            if row.label_prob > max_prob:
                max_prob = row.label_prob
            try:
                d[row.order_id] += ' ' + str(row.product_id)
            except:
                d[row.order_id] = str(row.product_id)

    for order in order_id:
        if order not in d:
            d[order] = 'None'
    return pd.DataFrame.from_dict(d, orient='index').reset_index()   

In [20]:
df_train['labels'] = labels
df_train_gt = transform_label(df_train, df_train['labels'], train_orders.order_id)
df_train.index = df_train['order_id']
df_train_gt.columns = ['order_id', 'products']

In [21]:
"""
@author: Faron, cpmp
"""
import numpy as np
import pandas as pd
import matplotlib.pylab as plt
from datetime import datetime
from numba import jit

'''
This kernel implements the O(n²) F1-Score expectation maximization algorithm presented in
"Ye, N., Chai, K., Lee, W., and Chieu, H.  Optimizing F-measures: A Tale of Two Approaches. In ICML, 2012."

It solves argmax_(0 <= k <= n,[[None]]) E[F1(P,k,[[None]])]
with [[None]] being the indicator for predicting label "None"
given posteriors P = [p_1, p_2, ... , p_n], where p_1 > p_2 > ... > p_n
under label independence assumption by means of dynamic programming in O(n²).
'''


class F1Optimizer():
    def __init__(self):
        pass

    @staticmethod
    @jit
    def get_expectations(P, pNone=None):
        expectations = []
        P = np.sort(P)[::-1]

        n = np.array(P).shape[0]
        DP_C = np.zeros((n + 2, n + 1))
        if pNone is None:
            pNone = (1.0 - P).prod()

        DP_C[0][0] = 1.0
        for j in range(1, n):
            DP_C[0][j] = (1.0 - P[j - 1]) * DP_C[0, j - 1]

        for i in range(1, n + 1):
            DP_C[i, i] = DP_C[i - 1, i - 1] * P[i - 1]
            for j in range(i + 1, n + 1):
                DP_C[i, j] = P[j - 1] * DP_C[i - 1, j - 1] + (1.0 - P[j - 1]) * DP_C[i, j - 1]

        DP_S = np.zeros((2 * n + 1,))
        DP_SNone = np.zeros((2 * n + 1,))
        for i in range(1, 2 * n + 1):
            DP_S[i] = 1. / (1. * i)
            DP_SNone[i] = 1. / (1. * i + 1)
        for k in range(n + 1)[::-1]:
            f1 = 0
            f1None = 0
            for k1 in range(n + 1):
                f1 += 2 * k1 * DP_C[k1][k] * DP_S[k + k1]
                f1None += 2 * k1 * DP_C[k1][k] * DP_SNone[k + k1]
            for i in range(1, 2 * k - 1):
                DP_S[i] = (1 - P[k - 1]) * DP_S[i] + P[k - 1] * DP_S[i + 1]
                DP_SNone[i] = (1 - P[k - 1]) * DP_SNone[i] + P[k - 1] * DP_SNone[i + 1]
            expectations.append([f1None + 2 * pNone / (2 + k), f1])

        return np.array(expectations[::-1]).T

    @staticmethod
    @jit
    def maximize_expectation(P, pNone=None):
        expectations = F1Optimizer.get_expectations(P, pNone)

        ix_max = np.unravel_index(expectations.argmax(), expectations.shape)
        max_f1 = expectations[ix_max]

        predNone = True if ix_max[0] == 0 else False
        best_k = ix_max[1]

        return best_k, predNone, max_f1

    @staticmethod
    def _F1(tp, fp, fn):
        return 2 * tp / (2 * tp + fp + fn)

    @staticmethod
    def _Fbeta(tp, fp, fn, beta=1.0):
        beta_squared = beta ** 2
        return (1.0 + beta_squared) * tp / ((1.0 + beta_squared) * tp + fp + beta_squared * fn)


def print_best_prediction(P, pNone=None):
    print("Maximize F1-Expectation")
    print("=" * 23)
    P = np.sort(P)[::-1]
    n = P.shape[0]
    L = ['L{}'.format(i + 1) for i in range(n)]

    if pNone is None:
        print("Estimate p(None|x) as (1-p_1)*(1-p_2)*...*(1-p_n)")
        pNone = (1.0 - P).prod()

    PL = ['p({}|x)={}'.format(l, p) for l, p in zip(L, P)]
    print("Posteriors: {} (n={})".format(PL, n))
    print("p(None|x)={}".format(pNone))

    opt = F1Optimizer.maximize_expectation(P, pNone)
    best_prediction = ['None'] if opt[1] else []
    best_prediction += (L[:opt[0]])
    f1_max = opt[2]

    print("Prediction {} yields best E[F1] of {}\n".format(best_prediction, f1_max))


def save_plot(P, filename='expected_f1.png'):
    E_F1 = pd.DataFrame(F1Optimizer.get_expectations(P).T, columns=["/w None", "/wo None"])
    best_k, _, max_f1 = F1Optimizer.maximize_expectation(P)

    plt.style.use('ggplot')
    plt.figure()
    E_F1.plot()
    plt.title('Expected F1-Score for \n {}'.format("P = [{}]".format(",".join(map(str, P)))), fontsize=12)
    plt.xlabel('k')
    plt.xticks(np.arange(0, len(P) + 1, 1.0))
    plt.ylabel('E[F1(P,k)]')
    plt.plot([best_k], [max_f1], 'o', color='#000000', markersize=4)
    plt.annotate('max E[F1(P,k)] = E[F1(P,{})] = {:.5f}'.format(best_k, max_f1), xy=(best_k, max_f1),
                 xytext=(best_k, max_f1 * 0.8), arrowprops=dict(facecolor='black', shrink=0.05, width=1, headwidth=7),
                 horizontalalignment='center', verticalalignment='top')
    plt.gcf().savefig(filename)



def timeit(P):
    s = datetime.now()
    F1Optimizer.maximize_expectation(P)
    e = datetime.now()
    return (e-s).microseconds / 1E6


def benchmark(n=100, filename='runtimes.png'):
    results = pd.DataFrame(index=np.arange(1,n+1))
    results['runtimes'] = 0

    for i in range(1,n+1):
        runtimes = []
        for j in range(5):
            runtimes.append(timeit(np.sort(np.random.rand(i))[::-1]))
        results.iloc[i-1] = np.mean(runtimes)

    x = results.index
    y = results.runtimes
    results['quadratic fit'] = np.poly1d(np.polyfit(x, y, deg=2))(x)

    plt.style.use('ggplot')
    plt.figure()
    results.plot()
    plt.title('Expectation Maximization Runtimes', fontsize=12)
    plt.xlabel('n = |P|')
    plt.ylabel('time in seconds')
    plt.gcf().savefig(filename)


#if __name__ == '__main__':
    #print_best_prediction([0.3, 0.2])
    #print_best_prediction([0.3, 0.2], 0.57)
    #print_best_prediction([0.9, 0.6])
    #print_best_prediction([0.5, 0.4, 0.3, 0.35, 0.33, 0.31, 0.29, 0.27, 0.25, 0.20, 0.15, 0.10])
    #print_best_prediction([0.5, 0.4, 0.3, 0.35, 0.33, 0.31, 0.29, 0.27, 0.25, 0.20, 0.15, 0.10], 0.2)

    #save_plot([0.45, 0.35, 0.31, 0.29, 0.27, 0.25, 0.22, 0.20, 0.17, 0.15, 0.10, 0.05, 0.02])
    #benchmark()

In [22]:
def transform_label_optimize_F1_score(df_data, label_prob, order_id, order_pred):
    df_data['label_prob'] = label_prob
    d = dict()

    for row in df_data.itertuples():            
        if row.label_prob >= row.threhold:
            try:
                d[row.order_id] += ' ' + str(row.product_id)
            except:
                d[row.order_id] = str(row.product_id)

    for order in order_id:
        if order_pred.loc[order]['None']:
            try:
                d[order] += ' None'
            except:
                d[order] = 'None'
    
    return pd.DataFrame.from_dict(d, orient='index').reset_index()   

In [23]:
#F1Optimizer.maximize_expectation(np.sort(sorted_list)[::-1], None)

In [24]:
order_pred.to_csv('train_maximize_F1_score.csv')
order_pred_test.to_csv('test_maximize_F1_score.csv')

NameError: name 'order_pred' is not defined

In [None]:
df_train.head()['days_since_prior_order']

In [31]:
f_to_use = ['user_total_orders', 'user_total_items', 'total_distinct_items', 'user_sum_days_between_orders',
       'user_average_days_between_orders', 'user_average_basket', 'user_avg_order_hour', 'user_avg_order_dow',
       'user_std_order_hour', 'user_std_order_dow', 'user_distinct_ratio', 'user_buynew_ratio',
       'user_preferred_order_hour', 'user_preferred_dow', 'user_avg_order_size', 'user_max_order_size',
       'user_min_order_size','user_reorder_ratio', 'user_reorder_ratio','order_hour_of_day', 'days_since_prior_order', 
       'days_since_ratio','aisle_id', 'department_id', 'product_orders', 'product_reorders',
       'product_reorder_rate', 'avg_order_number', 'prod_avg_order_hour', 'prod_avg_order_dow',
       'prod_days_since_prior','prod_preferred_order_hour', 'prod_preferred_dow', 'prod_first_time_total_count',
       'prod_second_time_total_count', 'prod_reorder_prob',  'prod_reorder_times', 'UP_order_rate_bettwen_first_last',
       'UP_orders', 'UP_orders_ratio','UP_average_pos_in_cart', 'UP_orders_since_last', 
       'UP_delta_hour_vs_last', 'UP_order_rate_since_first_order', 'dow', 'UP_same_dow_as_last_order', 
       'u_hidden1', 'u_hidden2','u_hidden3', 'p_hidden1', 'p_hidden2', 'p_hidden3', 'user_id', 'product_id',
           0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31] #'user_id'
categorical_f = ['aisle_id', 'department_id', 'user_preferred_dow', 'user_preferred_order_hour',
                'prod_preferred_order_hour', 'prod_preferred_dow'] #'user_id'

In [32]:
df_train.head()

Unnamed: 0_level_0,order_id,product_id,user_id,user_total_orders,user_total_items,total_distinct_items,user_average_days_between_orders,user_sum_days_between_orders,user_average_basket,user_avg_order_hour,...,24,25,26,27,28,29,30,31,labels,label_prob
order_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1187899,1187899,17122,1,11,59,18,19.0,190.0,5.363636,10.542373,...,-0.04525,0.131338,1.018782,-0.32321,0.170395,-0.526232,-0.541561,0.423974,0,0
1187899,1187899,196,1,11,59,18,19.0,190.0,5.363636,10.542373,...,1.774428,1.283232,1.470487,-0.183853,-1.903148,-0.190825,-0.205328,-0.843427,1,1
1187899,1187899,26405,1,11,59,18,19.0,190.0,5.363636,10.542373,...,0.989011,1.174724,1.197524,-0.702007,-0.591502,0.884327,-1.137047,0.257529,1,1
1187899,1187899,46149,1,11,59,18,19.0,190.0,5.363636,10.542373,...,0.445781,-0.308037,2.349851,-0.53887,1.224704,0.48502,-0.559203,0.921078,1,1
1187899,1187899,14084,1,11,59,18,19.0,190.0,5.363636,10.542373,...,-0.471808,-0.31455,0.574339,-0.799533,-0.47194,-0.30778,-1.208707,-0.844484,0,0


In [None]:
print('formating for lgb')
params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': {'binary_logloss'},
    'num_leaves': 96,
    'max_depth': 10,
    'feature_fraction': 0.85,
    'bagging_fraction': 0.85,
    'bagging_freq': 8,
    
}
ROUNDS = 130
threhold = 0.21


n_fold = 10
kf = KFold(n_splits=n_fold)
f1_score_avg = 0

for train_index, valid_index in kf.split(df_train_gt):
    train_order_ids, valid_order_ids = df_train_gt['order_id'].loc[train_index], df_train_gt['order_id'].loc[valid_index] 
    train_data, valid_data = df_train[f_to_use].loc[train_order_ids].values, df_train[f_to_use].loc[valid_order_ids].values
    train_labels, valid_labels = df_train['labels'].loc[train_order_ids].values, df_train['labels'].loc[valid_order_ids].values
    df_valid = df_train.loc[valid_order_ids]
    d_train = lgb.Dataset(train_data,
                          label=train_labels,
                          categorical_feature=categorical_f)  # , 'order_hour_of_day', 'dow'
    print('light GBM train :-)')
    bst = lgb.train(params, d_train, ROUNDS)
    valid_prob = bst.predict(valid_data)   
    
    df_valid = df_train.loc[valid_order_ids]
    df_valid['label_prob'] = valid_prob
    order_pred_valid = df_valid.groupby('order_id')['label_prob'].apply(list)
    order_pred_valid = pd.DataFrame(order_pred_valid)
    order_pred_valid['opt_k'] = 0
    order_pred_valid['None'] = 0
    for i in range(order_pred_valid.shape[0]):
        opt = F1Optimizer.maximize_expectation(np.sort(order_pred_valid.iloc[i, 0])[::-1], None)
        order_pred_valid.iloc[i, 1] = opt[0]
        order_pred_valid.iloc[i, 2] = opt[1]
        if i%1000==0: print(i)
    threhold_list = []
    for i in range(order_pred_valid.shape[0]):
        if order_pred_valid.iloc[i,1]==0:
            threhold_list.append(1.1)
        else:  
            threhold_list.append(np.sort(order_pred_valid.iloc[i,0])[::-1][order_pred_valid.iloc[i,1]-1])
    order_pred_valid['threhold'] = threhold_list
    threhold_mapping = order_pred_valid[['threhold','None']]
    df_valid = df_valid.drop(['threhold', 'None'], axis=1, errors='ignore')
    df_valid = pd.merge(df_valid, threhold_mapping, how = 'left', left_on = 'order_id', right_index = True)
            
    valid_prediction = transform_label_optimize_F1_score(df_valid, valid_prob, np.unique(valid_order_ids), order_pred_valid)
    valid_prediction.columns = ['order_id', 'products']
    
    print(compare_results(df_train_gt.loc[valid_index].set_index('order_id'), valid_prediction.set_index('order_id')))
    #test_predict += bst.predict(df_test[f_to_use]) / n_fold

formating for lgb


In [None]:
print('formating for lgb')
params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': {'binary_logloss'},
    'num_leaves': 96,
    'max_depth': 12,
    'feature_fraction': 0.55,
    'bagging_fraction': 0.5,
    'bagging_freq': 5,
    'early_stopping_round': 5,
    'lambda_l2': 0.01
}
ROUNDS = 300
threhold = 0.21

train_data = df_train[f_to_use].values
train_labels = labels
d_train = lgb.Dataset(train_data,
                      label=train_labels,
                      categorical_feature=categorical_f)  # , 'order_hour_of_day', 'dow'
print('light GBM train full set')
bst = lgb.train(params, d_train, ROUNDS)
test_preds = bst.predict(df_test[f_to_use])

In [None]:
train_prob = bst.predict(train_data) 
df_train['label_prob'] = train_prob
df_test['label_prob'] = test_preds

In [None]:
order_pred = df_train.groupby('order_id')['label_prob'].apply(list)
order_pred = pd.DataFrame(order_pred)
order_pred['opt_k'] = 0
order_pred['None'] = 0
for i in range(order_pred.shape[0]):
    opt = F1Optimizer.maximize_expectation(np.sort(order_pred.iloc[i, 0])[::-1], None)
    order_pred.iloc[i, 1] = opt[0]
    order_pred.iloc[i, 2] = opt[1]
    if i%1000==0: print(i)

threhold_list = []
for i in range(order_pred.shape[0]):
    if order_pred.iloc[i,1]==0:
        threhold_list.append(1.1)
    else:    
        threhold_list.append(np.sort(order_pred.iloc[i,0])[::-1][order_pred.iloc[i,1]-1])
order_pred['threhold'] = threhold_list
threhold_mapping = order_pred[['threhold','None']]
df_train = df_train.drop(['threhold', 'None'], errors = 'ignore')
df_train = pd.merge(df_train, threhold_mapping, how = 'left', left_on = 'order_id', right_index = True)

In [61]:
train_prediction = transform_label(df_train, train_prob, np.unique(df_train.order_id))
train_prediction.columns = ['order_id', 'products']
compare_results(df_train_gt.set_index('order_id'), train_prediction.set_index('order_id'))

(0.39263419782111292, 0.36637887029615857, 0.50200248424347549)

In [62]:
train_prediction = transform_label_optimize_F1_score(df_train, train_prob, np.unique(df_train.order_id), order_pred)
train_prediction.columns = ['order_id', 'products']
compare_results(df_train_gt.set_index('order_id'), train_prediction.set_index('order_id'))

AttributeError: 'Pandas' object has no attribute 'threhold'

In [None]:
df_test.head()

In [None]:
## optimize F1-score
order_pred_test = df_test.groupby('order_id')['label_prob'].apply(list)
order_pred_test = pd.DataFrame(order_pred_test)

order_pred_test['opt_k'] = 0
order_pred_test['None'] = 0
for i in range(order_pred_test.shape[0]):
    opt = F1Optimizer.maximize_expectation(np.sort(order_pred_test.iloc[i, 0])[::-1], None)
    order_pred_test.iloc[i, 1] = opt[0]
    order_pred_test.iloc[i, 2] = opt[1]
    if i%1000==0: print(i)
        
threhold_list = []
for i in range(order_pred_test.shape[0]):
    if order_pred_test.iloc[i,1]==0:
        threhold_list.append(1.1)
    else:    
        threhold_list.append(np.sort(order_pred_test.iloc[i,0])[::-1][order_pred_test.iloc[i,1]-1])
order_pred_test['threhold'] = threhold_list
threhold_mapping = order_pred_test[['threhold','None']]
df_test = df_test.drop(['threhold', 'None'], errors = 'ignore')
df_test = pd.merge(df_test, threhold_mapping, how = 'left', left_on = 'order_id', right_index = True)

In [None]:
sub = transform_label_optimize_F1_score(df_test, test_preds, np.unique(df_test['order_id']), order_pred_test)

sub.reset_index(inplace=True)
del sub['level_0']
sub.columns = ['order_id', 'products']
sub.to_csv('sub.csv', index=False)

In [None]:
sub.shape

In [94]:
np.unique(test_orders.order_id).shape

(75000,)

In [188]:
f_train_none = ['user_total_orders', 'user_total_items', 'total_distinct_items',
       'user_average_days_between_orders', 'user_average_basket', 'user_avg_order_hour',
       'user_avg_order_dow', 'user_preferred_order_hour', 'user_preferred_dow', 'user_avg_order_size',        
       'order_hour_of_day', 'days_since_prior_order', 'days_since_ratio',
       'aisle_id', 'department_id', 'product_orders', 'product_reorders',
       'product_reorder_rate', 'avg_order_number', 'prod_avg_order_hour', 'prod_avg_order_dow',
       'prod_days_since_prior','prod_preferred_order_hour', 'prod_preferred_dow',
       'UP_orders', 'UP_orders_ratio','UP_average_pos_in_cart', 'UP_orders_since_last',
       'UP_delta_hour_vs_last', 'dow', 'UP_same_dow_as_last_order', 'u_hidden1', 'u_hidden2',
       'u_hidden3', 'u_hidden4', 'u_hidden5', 'p_hidden1', 'p_hidden2', 'p_hidden3', 'p_hidden4',
       'p_hidden5'] #'user_id'
categorical_f = ['aisle_id', 'department_id', 'user_preferred_dow', 'user_preferred_order_hour',
                'prod_preferred_order_hour', 'prod_preferred_dow'] #'user_id'

In [44]:
df_test_order = df_test.groupby('order_id').mean()

In [195]:
df_train_order.head()['reo']

Unnamed: 0_level_0,product_id,user_id,user_total_orders,user_total_items,total_distinct_items,user_average_days_between_orders,user_average_basket,user_avg_order_hour,user_avg_order_dow,user_preferred_order_hour,...,u_hidden3,u_hidden4,u_hidden5,p_hidden1,p_hidden2,p_hidden3,p_hidden4,p_hidden5,labels,label_prob
order_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,27960.416667,112108,4.0,21.0,12.0,10.333333,5.25,14.142858,1.238095,10.0,...,-0.056825,-0.078663,-0.032743,9.619261,5.734143,1.743224,1.08608,-0.729583,0.333333,0.342076
36,26897.267327,79431,23.0,187.0,101.0,15.681818,8.130435,12.727273,2.26738,10.0,...,0.107277,-0.050765,-0.118251,5.110632,-0.328294,-1.164539,0.504634,-0.230287,0.059406,0.055077
38,26675.967742,42756,6.0,39.0,31.0,22.4,6.5,15.871795,2.717949,15.0,...,0.118195,0.056651,0.079091,3.502176,-0.186552,-1.634829,-0.770435,0.782683,0.032258,0.109278
96,28969.636364,17227,7.0,43.0,33.0,21.0,6.142857,17.534883,2.674419,18.0,...,0.158335,0.073668,0.087426,5.637034,-0.995838,-1.759526,-0.539966,0.188862,0.151515,0.088996
98,24294.207729,56463,41.0,1234.0,207.0,9.05,30.097561,12.894651,3.205024,12.0,...,0.03281,-0.09211,-0.104333,2.305305,-0.905306,-0.392222,0.185142,-0.01617,0.217391,0.146522


In [149]:
df_train

Unnamed: 0_level_0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
order_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2539329,2539329,1,prior,1,2,8,
2398795,2398795,1,prior,2,3,7,15.0
473747,473747,1,prior,3,3,12,21.0
2254736,2254736,1,prior,4,4,7,29.0
431534,431534,1,prior,5,4,15,28.0
3367565,3367565,1,prior,6,2,7,19.0
550135,550135,1,prior,7,1,9,20.0
3108588,3108588,1,prior,8,1,14,14.0
2295261,2295261,1,prior,9,1,16,0.0
2550362,2550362,1,prior,10,4,8,30.0


In [62]:
train_prob = pd.read_csv('train_prob.csv')
test_prob = pd.read_csv('test_prob.csv')

In [96]:
order_id_list = np.union1d(np.unique(train_prob['order_id']), np.unique(test_prob['order_id']))
order_id_list = pd.DataFrame([order_id_list, range(order_id_list.shape[0])]).T

In [109]:
product_id_list = np.union1d(np.unique(train_prob['product_id']), np.unique(test_prob['product_id']))
product_id_list = pd.DataFrame([product_id_list, range(product_id_list.shape[0])]).T

In [99]:
order_id_list.columns = ['order_id', 'order_id_index']
train_prob = pd.merge(train_prob, order_id_list, how = 'left', left_on = 'order_id', right_on = 'order_id')
test_prob = pd.merge(test_prob, order_id_list, how = 'left', left_on = 'order_id', right_on = 'order_id')
product_id_list.columns = ['product_id', 'product_id_index']
train_prob = pd.merge(train_prob, product_id_list, how = 'left', left_on = 'product_id', right_on = 'product_id')
test_prob = pd.merge(test_prob, product_id_list, how = 'left', left_on = 'product_id', right_on = 'product_id')

In [118]:
order_index = train_prob['order_id_index'].values
product_index = train_prob['product_id_index'].values
external_features = np.expand_dims(train_prob['label_prob'].values,1)
output = np.expand_dims(train_prob['labels'].values,1)

In [None]:
product_index

In [139]:
## Neural Network Matrix Factorization country by product
from __future__ import print_function
import tensorflow as tf

tf.reset_default_graph()

# Parameters
learning_rate = 0.01
training_epochs = 200
batch_size = 20000
display_step = 1

# Network Parameters
beta = 1e-6
n_hidden_1 = 64
n_hidden_2 = 32
n_hidden_3 = 16
n_orders = order_id_list.shape[0]
n_product = product_id_list.shape[0]
n_hidden_dimension = 2
n_external_inputs = 1
# tf Graph input
x1 = tf.placeholder("int32", None)
x2 = tf.placeholder("int32", None)
external_inputs = tf.placeholder("float", [None, n_external_inputs])
y = tf.placeholder("float", None)

def multilayer_perceptron(x1, x2, external_inputs, weights, biases, hidden):
    out_layer1 = tf.nn.embedding_lookup(hidden['out1'], x1)
    out_layer2 = tf.nn.embedding_lookup(hidden['out2'], x2)   
    f_input_layer = tf.concat(values=[out_layer1, out_layer2, tf.multiply(out_layer1, out_layer2), external_inputs], axis=1)
    
    layer_1 = tf.matmul(f_input_layer, weights['h1']) + biases['b1']
    layer_1 = tf.nn.relu(layer_1)
    # Hidden layer with RELU activation
    layer_2 = tf.matmul(layer_1, weights['h2']) + biases['b2']
    layer_2 = tf.nn.relu(layer_2)
    layer_3 = tf.matmul(layer_2, weights['h3']) + biases['b3']
    layer_3 = tf.nn.relu(layer_3)
    out_layer = tf.reshape(tf.matmul(layer_3, weights['out']) + biases['out'], [-1])
    return out_layer, out_layer1, out_layer2

# Store layers weight & bias
hidden = {'out1': tf.Variable(tf.random_normal([n_orders, n_hidden_dimension], stddev = 0.1)),
          'out2': tf.Variable(tf.random_normal([n_product, n_hidden_dimension], stddev = 0.1)),
}
weights = {    
    'h1': tf.Variable(tf.random_normal([n_hidden_dimension*3+n_external_inputs, n_hidden_1], stddev=0.1)),
    'h2': tf.Variable(tf.random_normal([n_hidden_1, n_hidden_2], stddev=0.1)),
    'h3': tf.Variable(tf.random_normal([n_hidden_2, n_hidden_3], stddev=0.1)),
    'out': tf.Variable(tf.random_normal([n_hidden_3, 1], stddev=0.1)),
}
biases = {
    'b1': tf.Variable(tf.random_normal([n_hidden_1], stddev = 0.1)),
    'b2': tf.Variable(tf.random_normal([n_hidden_2], stddev = 0.1)),
    'b3': tf.Variable(tf.random_normal([n_hidden_3], stddev = 0.1)),
    'out': tf.Variable(tf.random_normal([1], stddev = 0.1))
}
eigenUnit = tf.Variable(tf.random_normal([n_hidden_dimension], stddev = 0.1)),

# Construct model
pred, order_low, product_low = multilayer_perceptron(x1, x2, external_inputs, weights, biases, hidden)

reg = tf.nn.l2_loss(weights['h1']) + tf.nn.l2_loss(weights['h2']) + tf.nn.l2_loss(weights['h3']) + tf.nn.l2_loss(weights['out'])

cost = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=pred, labels=y))+beta*reg
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(loss=cost)

# Initializing the variables
init = tf.global_variables_initializer()
saver = tf.train.Saver()

# Launch the graph
with tf.Session() as sess:
    sess.run(init)
    
    # Training cycle
    for epoch in range(training_epochs):
        avg_cost = 0.
        total_batch = int(output.shape[0]/batch_size)+1
        # Loop over all batches
        for i in range(total_batch):
            index = range(i*batch_size, i*batch_size+batch_size)
            if (i*batch_size+batch_size>output.shape[0]):
                index = range(i*batch_size, output.shape[0])
            batch_x1, batch_x2, batch_y, batch_external = order_index[index], \
                                      product_index[index], output[index], external_features[index]
            # Run optimization op (backprop) and cost op (to get loss value)
            prediction = sess.run(pred, feed_dict={x1: batch_x1, x2: batch_x2,
                                                          y: batch_y, external_inputs: batch_external})
            _, c = sess.run([optimizer, cost], feed_dict={x1: batch_x1, x2: batch_x2,
                                                          y: batch_y, external_inputs: batch_external})
            # Compute average loss
            avg_cost += c / total_batch
            accuracy = ((prediction>0.5).astype(int)==batch_y[:,0]).astype(int).sum()/batch_y.shape[0]
            f1_acc = f1_score(batch_y[:,0], (prediction>0.5).astype(int))
            print(accuracy, f1_acc)
        # Display logs per epoch step
        if epoch % display_step == 0:
            print("Epoch:", '%04d' % (epoch+1), "cost=", \
                "{:.9f}".format(avg_cost))
    save_path = saver.save(sess, "H:\ECI_revisit\python\model\model_trial.ckpt")
    print("Optimization Finished!")

  'precision', 'predicted', average, warn_for)


0.89 0.0
0.9039 0.0


KeyboardInterrupt: 

In [148]:
order_index.shape

(8474661,)

In [140]:
batch_y[:,0].shape

(10000,)

In [142]:
f1_score(batch_y, (prediction>0.5).astype(int))

  'precision', 'predicted', average, warn_for)


0.0

Maximize F1-Expectation
Estimate p(None|x) as (1-p_1)*(1-p_2)*...*(1-p_n)
Posteriors: ['p(L1|x)=0.3', 'p(L2|x)=0.2'] (n=2)
p(None|x)=0.5599999999999999
Prediction ['None', 'L1'] yields best E[F1] of 0.5633333333333332

Maximize F1-Expectation
Posteriors: ['p(L1|x)=0.3', 'p(L2|x)=0.2'] (n=2)
p(None|x)=0.57
Prediction ['None'] yields best E[F1] of 0.57

Maximize F1-Expectation
Estimate p(None|x) as (1-p_1)*(1-p_2)*...*(1-p_n)
Posteriors: ['p(L1|x)=0.9', 'p(L2|x)=0.6'] (n=2)
p(None|x)=0.039999999999999994
Prediction ['L1', 'L2'] yields best E[F1] of 0.8200000000000001

Maximize F1-Expectation
Estimate p(None|x) as (1-p_1)*(1-p_2)*...*(1-p_n)
Posteriors: ['p(L1|x)=0.5', 'p(L2|x)=0.4', 'p(L3|x)=0.35', 'p(L4|x)=0.33', 'p(L5|x)=0.31', 'p(L6|x)=0.3', 'p(L7|x)=0.29', 'p(L8|x)=0.27', 'p(L9|x)=0.25', 'p(L10|x)=0.2', 'p(L11|x)=0.15', 'p(L12|x)=0.1'] (n=12)
p(None|x)=0.015012410773814995
Prediction ['L1', 'L2', 'L3', 'L4', 'L5', 'L6', 'L7', 'L8', 'L9'] yields best E[F1] of 0.4636509302493286

Maxim