In [None]:
import tensorflow as tf
import pickle
import pandas as pd
import random
from collections import Counter
import numpy as np
import math
import time
import warnings

# Only let TensorFlow log at ERROR level and hide deprecation/future warnings
# so the notebook output stays readable.
tf.logging.set_verbosity(tf.logging.ERROR)
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

# Forward slashes are accepted on Windows as well as on POSIX systems, whereas
# a backslash-separated path only resolves on Windows.
data_path = "../data/dict_list.p"
labels_path = "../data/labels.p"

# Names of the features fed to the model. The same order is used when building
# the feature columns and when assembling the input tensors.
COLUMNS = [
    'FST_TFPD_Score', 'preST_TFPD_Score', 'postST_TFPD_Score',
    'FST_TFPT_Score', 'preST_TFPT_Score', 'postST_TFPT_Score',
    'FST_TFAT_Score', 'preST_TFAT_Score', 'postST_TFAT_Score',
    'FST_WFPD_Score', 'preST_WFPD_Score', 'postST_WFPD_Score',
    'FST_WFPT_Score', 'preST_WFPT_Score', 'postST_WFPT_Score',
    'FST_WFAT_Score', 'preST_WFAT_Score', 'postST_WFAT_Score',
    'FST_W2VPD_Score', 'preST_W2VPD_Score', 'postST_W2VPD_Score',
    'prodTitle_FsT_prob', 'prodTitle_length_feat', 'prodTitle_postST_prob', 'prodTitle_preST_prob',
    'prodDesc_FsT_prob', 'prodDesc_length_feat', 'prodDesc_postST_prob', 'prodDesc_preST_prob',
    'prodAttr_FsT_prob', 'prodAttr_length_feat', 'prodAttr_postST_prob', 'prodAttr_preST_prob',
]

# Hyper-parameters shared by the optimizers and the estimator.
dnn_hidden_layers_param = [10, 5]
learning_rate_param = 0.1
steps = 10000

# Both optimizers read the shared learning rate so that tuning
# `learning_rate_param` affects every optimizer consistently.
adam_op = tf.train.AdamOptimizer(learning_rate=learning_rate_param)
adagrad_op = tf.train.ProximalAdagradOptimizer(learning_rate=learning_rate_param,
                                               l1_regularization_strength=0.001,
                                               l2_regularization_strength=0.001)

In [None]:
def build_estimator(model_dir=None):
    """Build the combined linear + DNN regressor over every feature in ``COLUMNS``.

    One real-valued feature column is created per name in ``COLUMNS`` and the
    same columns are given to both the linear ("wide") and the DNN ("deep")
    part of the model.

    Args:
        model_dir: Value forwarded to the estimator's ``model_dir`` argument.
            ``None`` (the default) is passed through unchanged.

    Returns:
        An unfitted ``tf.contrib.learn.DNNLinearCombinedRegressor``.
    """
    feature_columns = [tf.contrib.layers.real_valued_column(name) for name in COLUMNS]

    # Separate list objects for the two halves, sharing the same column objects.
    wide_columns = list(feature_columns)
    deep_columns = list(feature_columns)

    estimator = tf.contrib.learn.DNNLinearCombinedRegressor(
        # Previously accepted by this function but never handed to the estimator.
        model_dir=model_dir,
        # wide settings
        linear_feature_columns=wide_columns,
        linear_optimizer=tf.train.FtrlOptimizer(learning_rate=learning_rate_param,
                                                l1_regularization_strength=0.001,
                                                l2_regularization_strength=0.001),
        # deep settings
        dnn_feature_columns=deep_columns,
        dnn_hidden_units=dnn_hidden_layers_param,
        dnn_optimizer=adagrad_op,
        config=tf.contrib.learn.RunConfig(save_checkpoints_secs=1800)
    )

    return estimator

def input_fn(data,labels):
    """Turn in-memory features and labels into constant tensors.

    Args:
        data: Mapping indexed by each name in ``COLUMNS``.
        labels: Values wrapped as the label tensor.

    Returns:
        A ``(features, labels)`` pair: a dict with one ``tf.constant`` per
        name in ``COLUMNS``, and a ``tf.constant`` built from ``labels``.
    """
    feature_tensors = {name: tf.constant(data[name]) for name in COLUMNS}
    label_tensor = tf.constant(labels)
    return feature_tensors, label_tensor

def train(training_data,training_labels,model_dir=None, train_steps=steps):
    """Build a fresh estimator and fit it on the given data.

    Args:
        training_data: Feature mapping handed to ``input_fn``.
        training_labels: Labels handed to ``input_fn``.
        model_dir: Forwarded to ``build_estimator``; previously it was accepted
            here but silently dropped.
        train_steps: Number of steps passed to the estimator's ``fit``.

    Returns:
        The fitted estimator.
    """
    m = build_estimator(model_dir)
    m.fit(input_fn=lambda: input_fn(training_data, training_labels), steps=train_steps)

    return m

def load_data(path):
    """Load and return the object pickled in the file at ``path``.

    Args:
        path: Location of a pickle file, opened in binary read mode.

    Returns:
        Whatever object ``pickle.load`` reconstructs from the file.
    """
    # NOTE(security): unpickling can execute arbitrary code; only use this on
    # files from a trusted source.
    # The context manager closes the handle even if unpickling raises.
    with open(path, "rb") as handle:
        return pickle.load(handle)

In [None]:
# Read the pickled feature table and its labels into memory once, up front.
print('Loading data')
training_data = load_data(path=data_path)
training_labels = load_data(path=labels_path)
print('Done')

In [None]:
# input: an ordered vector of relevance, output: Discounted Cumulative Gain
def DCG(vec):
    """Return the Discounted Cumulative Gain of a ranked relevance vector.

    Args:
        vec: Relevance values ordered by rank, first-ranked document first.

    Returns:
        The sum over ranks ``i = 1..len(vec)`` of
        ``(2**vec[i-1] - 1) / log2(i + 1)``; ``0`` for an empty vector.
    """
    # Ranks are 1-based and run through len(vec) inclusive, so the last-ranked
    # document contributes too (it used to be skipped).
    return sum((2 ** rel - 1) / math.log(rank + 1, 2)
               for rank, rel in enumerate(vec, start=1))
 
################################## Loading data and matrix #########################################
# Forward slashes keep the path valid on POSIX systems as well as on Windows.
df_all = pd.read_csv("../data/df_all.csv", encoding="ISO-8859-1")
################################## Main code #######################################################

# unique() keeps first-appearance order, so the candidate list (and therefore
# the seeded sampling below) is identical on every run, unlike iterating a set.
unique_search_term = df_all['search_term'].unique().tolist()
# Number of random train/test rounds
K = 3
scores_in_cross_DCG = np.empty(K)
scores_in_cross_RMSE = np.empty(K)
# Fraction of the unique queries set aside for testing in each round
test_percentage = 0.4
# Dedicated seeded generator: the splits are reproducible without touching the
# global `random` state.
SPLIT_SEED = 0
split_rng = random.Random(SPLIT_SEED)

for k in range(K):
    print("--------------- Starting round:" + str(k+1) + "/" + str(K) + "---------------")
    print("Spliting data randomly")
    # Split by query, so every row of a given search term lands on one side only.
    test_queries = split_rng.sample(unique_search_term,
                                    round(test_percentage * len(unique_search_term)))
    train_queries = list(set(unique_search_term) - set(test_queries))

    train_mask = df_all['search_term'].isin(train_queries)
    test_mask = df_all['search_term'].isin(test_queries)
    df_train = df_all.loc[train_mask]
    # Copy so that adding the prediction column does not write into a slice of df_all.
    df_test = df_all.loc[test_mask].copy()

    # Row labels of df_all are used to index training_labels / training_data.
    # NOTE(review): assumes df_all has the default 0..n-1 index aligned with
    # the pickled lists - confirm.
    ind_train = df_train.index.tolist()
    ind_test = df_test.index.tolist()

    labels_train = [training_labels[i] for i in ind_train]
    labels_test = [training_labels[i] for i in ind_test]
    features_train = {key: [training_data[key][i] for i in ind_train] for key in COLUMNS}
    features_test = {key: [training_data[key][i] for i in ind_test] for key in COLUMNS}

    ####################### train the model on the training split ##########################
    print("Training model")
    start = time.time()
    model = train(features_train, labels_train)
    end = time.time()
    print("Model trained in", end - start)

    ####################### testing #########################################################
    # Pair the test features with the test labels (the training labels were
    # passed here before, which did not match the test rows).
    estimated_relevance = model.predict(input_fn=lambda: input_fn(features_test, labels_test),
                                        as_iterable=False)
    print(estimated_relevance)
    # Predictions are clamped to the [1, 3] interval before scoring.
    df_test["est_relevance"] = np.clip(estimated_relevance, 1, 3)

    # Normalized DCG, averaged over the test queries that have more than one document.
    final_k_score_DCG = 0
    c = 0
    # groupby visits each query's rows once instead of rescanning df_test per query.
    for _, df_query in df_test.groupby('search_term', sort=False):
        if len(df_query) > 1:
            # DataFrame.sort was removed from pandas; sort_values is its replacement.
            ranked = df_query.sort_values('est_relevance', ascending=False)
            relevance = np.array(ranked.relevance)
            # Best achievable DCG: the true relevances in descending order.
            ideal_dcg = DCG(-np.sort(-relevance))
            if ideal_dcg > 0:
                c += 1
                final_k_score_DCG += DCG(relevance) / ideal_dcg
    # Avoid a division by zero when no query qualified for scoring.
    final_k_score_DCG = final_k_score_DCG / c if c else float('nan')

    # Root mean squared error between the true labels and the clamped predictions.
    y = np.array(labels_test)
    y_hat = np.array(df_test.est_relevance)
    final_k_score_RMSE = np.sqrt(np.mean(np.power(y - y_hat, 2)))

    print('DCG:' + str(final_k_score_DCG))
    print('RMSE:' + str(final_k_score_RMSE))

    # saving final score
    scores_in_cross_DCG[k] = final_k_score_DCG
    scores_in_cross_RMSE[k] = final_k_score_RMSE

print("Cross validation procedure completed")
print("DCG score vector " + str(scores_in_cross_DCG))
print("DCG mean score " + str(np.mean(scores_in_cross_DCG)))
print("RMSE score vector " + str(scores_in_cross_RMSE))
print("RMSE mean score " + str(np.mean(scores_in_cross_RMSE)))