# Machine Learning project
### Part 3: Test Recommender system for May predictions
##### Machine Learning - MEBDS, UNAV

*Authors:* Daniel Marchán, Francisco Álvarez, Yijie Hsieh and Jaime Fanjul.

##### Import packages and functions needed

In [3]:
# Data frame handling
import pandas as pd
# Numerical processing
import numpy as np


def mask_recommendations(users_2_recommend, raw_recommendations,
                         df_mask_products):
    """
    Multiply each user's row of raw recommendations by that user's mask.

    Parameters
    ----------
    users_2_recommend : iterable
        User ids to process. Each one is looked up with
        ``raw_recommendations.loc[user]``.
    raw_recommendations : pandas.DataFrame
        One row per user id. The row is multiplied by a 25-element mask,
        so it is expected to hold one value per product column.
    df_mask_products : pandas.DataFrame
        Mask rows indexed by user id. A row is converted to a NumPy array,
        so the multiplication is positional, not label-aligned.

    Returns
    -------
    pandas.DataFrame
        One row per processed user, columns ``ind_prod1`` .. ``ind_prod25``,
        index axis named ``cod_persona``.
    """
    product_columns = [f'ind_prod{i}' for i in range(1, 26)]

    # Users that have a mask row. A set keeps the membership test inside
    # the loop O(1) instead of scanning the whole index for every user.
    users_with_mask = set(df_mask_products.index)

    # Users without a mask row keep every value: multiply by ones.
    keep_all = np.ones(len(product_columns), dtype=int)

    rec = {}
    for user in users_2_recommend:
        if user in users_with_mask:
            mask = np.array(df_mask_products.loc[user])
        else:
            mask = keep_all
        rec[user] = raw_products_2_recommend(raw_recommendations.loc[user],
                                             mask)

    df_recommendation = pd.DataFrame.from_dict(rec, orient='index',
                                               columns=product_columns)

    return df_recommendation.rename_axis("cod_persona")


def raw_products_2_recommend(all_recommendations, mask):
    """
    Return ``all_recommendations`` multiplied by ``mask``.

    The product is whatever the ``*`` operator yields for the two operands.
    """
    masked = all_recommendations * mask
    return masked


def get_selected_month_products(df, selected_month, selected_year=2016):
    """
    Return the product columns of every row of ``df`` that falls in the
    selected month and year.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``mes`` (parseable by ``pd.to_datetime``),
        ``cod_persona`` and ``ind_prod1`` .. ``ind_prod25``.
        The frame is not modified.
    selected_month : int
        Month number to keep (1-12).
    selected_year : int, optional
        Year to keep. Defaults to 2016.

    Returns
    -------
    pandas.DataFrame
        The 25 product columns of the matching rows, sorted by and indexed
        on ``cod_persona``.
    """
    keep_cols = ['cod_persona'] + [f'ind_prod{i}' for i in range(1, 26)]

    # Parse the dates into a local Series so that no helper columns are
    # written into the caller's frame.
    dates = pd.to_datetime(df['mes'])
    in_period = (dates.dt.year == selected_year) & (dates.dt.month == selected_month)

    df_subset = df.loc[in_period, keep_cols]
    df_subset = df_subset.sort_values('cod_persona', ascending=True)

    return df_subset.set_index('cod_persona')


def get_inverted_matrix(df):
    """
    Return a copy of ``df`` in which every 0 is replaced by 1 and every 1
    by 0. Other values are left as they are.
    """
    swap_zero_and_one = {0: 1, 1: 0}
    return df.replace(swap_zero_and_one)


def users_bought_list(df):
    """
    Return the index labels of the rows with at least one purchase.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per user with the product indicator columns. The user id
        may be the index or a ``cod_persona`` column.

    Returns
    -------
    list
        Index labels of the rows whose product values sum to more than 0.

    Notes
    -----
    A ``prod_total`` column holding the per-row sum is written into ``df``
    in place; ``apk_score`` drops that column after calling this function.
    """
    # Sum every product column. 'cod_persona' is excluded when it is a
    # regular column (when it is the index there is nothing to skip, so the
    # first product column must be counted too). A 'prod_total' left by an
    # earlier call is excluded so repeated calls give the same totals.
    product_values = df.drop(columns=['cod_persona', 'prod_total'],
                             errors='ignore')
    df["prod_total"] = product_values.sum(axis=1)

    return df[df['prod_total'] > 0].index.to_list()


def apk(actual, predicted, k=7):
    """
    Average precision at k for a single pair of lists.

    Parameters
    ----------
    actual : list
        Relevant items; their order is irrelevant.
    predicted : list
        Ranked predictions, best first. Only the first ``k`` are scored.
    k : int, optional
        Cut-off for the number of predictions considered.

    Returns
    -------
    score : double
        Sum of the precision at every rank holding a new relevant item,
        divided by ``min(len(actual), k)``; 0.0 when ``actual`` is empty.
    """
    top_k = predicted[:k] if len(predicted) > k else predicted

    hits = 0.0
    precision_sum = 0.0

    for rank, item in enumerate(top_k, start=1):
        # Skip irrelevant items and repeats of an item already ranked above.
        if item not in actual or item in top_k[:rank - 1]:
            continue
        hits += 1.0
        precision_sum += hits / rank

    if not actual:
        return 0.0

    return precision_sum / min(len(actual), k)


def mapk(actual, predicted, k=7):
    """
    Mean average precision at k over paired lists of lists.

    Parameters
    ----------
    actual : list
        One list of relevant items per case (order inside each list is
        irrelevant).
    predicted : list
        One ranked list of predictions per case, paired with ``actual``
        by position.
    k : int, optional
        Cut-off passed on to ``apk``.

    Returns
    -------
    score : double
        ``np.mean`` of the ``apk`` value of every (actual, predicted) pair.
    """
    per_case_scores = []
    for relevant, ranked in zip(actual, predicted):
        per_case_scores.append(apk(relevant, ranked, k))

    return np.mean(per_case_scores)


def apk_score(df_mes, df_final_recommendation):
    """
    Print and return the mean average precision at 7 of the recommendations.

    Only users that appear in both frames and that ``users_bought_list``
    reports as having bought something are scored.

    Parameters
    ----------
    df_mes : pandas.DataFrame
        Real purchases, indexed by user id, one column per product.
        The frame is not modified.
    df_final_recommendation : pandas.DataFrame
        Indexed by user id; each row is converted to a list and used as
        that user's ranked predictions.

    Returns
    -------
    score : double
        The value returned by ``mapk`` with ``k=7``.
    """
    # users_bought_list writes a 'prod_total' column into the frame it is
    # given, so hand it a copy and strip the column again afterwards.
    df_mes = df_mes.copy()
    users_bought = set(users_bought_list(df_mes))
    df_mes = df_mes.drop('prod_total', axis=1, errors='ignore')

    # Sets give O(1) membership tests; the order of df_mes.index is kept.
    recommended_users = set(df_final_recommendation.index)
    users = [user for user in df_mes.index.to_list()
             if user in recommended_users and user in users_bought]

    y_real = []
    y_predicted = []
    for u in users:
        purchases = df_mes.loc[u]
        # Names of the products with a positive value for this user.
        y_real.append(purchases[purchases > 0].index.to_list())
        y_predicted.append(df_final_recommendation.loc[u].to_list())

    print(len(y_real), y_real)
    print(len(y_predicted), y_predicted)

    score = mapk(actual=y_real, predicted=y_predicted, k=7)

    print(f'Intelligent APK Score: {score} \n')

    return score



#### 1. Read the real values and the predictions

In [4]:
# Whole dataset
df = pd.read_csv('df_complete2.csv')

# Real values: sorted by user id, without the 'mes' and 'prediction'
# columns, and indexed by user id
df_month_to_eval = pd.read_csv('soluciones.csv')
df_month_to_eval = df_month_to_eval.sort_values('cod_persona')
df_month_to_eval = df_month_to_eval.drop(['mes', 'prediction'], axis=1)
df_month_to_eval = df_month_to_eval.set_index('cod_persona')

# Predicted values (the first CSV column becomes the index)
df_final_recommendation_may_ordered = pd.read_csv(
    'df_final_recommendation_may.csv',
    index_col=0,
)

#### 2. Prepare the data for comparison

In [5]:
# Get the products bought in the last known month
last_month = 4 # Month number within 2016 (4 == April, used to predict May)
df_last_month_products = get_selected_month_products(df, last_month) # Products each user bought in that last month (April)
df_mask_bought_products = get_inverted_matrix(df_last_month_products) # Swap 0 and 1: products bought last month become 0
# ------ Mask the products that were bought last month so they do not interfere with the true set used for comparison
users_2_recommend = df_month_to_eval.index.to_list()
df_month_to_eval_masked = mask_recommendations(users_2_recommend, df_month_to_eval, df_mask_bought_products)

#### 3. Compute the score

In [6]:
# --------------------------------------------
# Score the May predictions against the masked real purchases (prints the result)
apk_score(df_month_to_eval_masked, df_final_recommendation_may_ordered)

1426 [['ind_prod3'], ['ind_prod3'], ['ind_prod3'], ['ind_prod3', 'ind_prod7', 'ind_prod24'], ['ind_prod3'], ['ind_prod3'], ['ind_prod3'], ['ind_prod3'], ['ind_prod7'], ['ind_prod3'], ['ind_prod7'], ['ind_prod3'], ['ind_prod13'], ['ind_prod3'], ['ind_prod3', 'ind_prod7'], ['ind_prod3'], ['ind_prod3'], ['ind_prod3', 'ind_prod24'], ['ind_prod3'], ['ind_prod3'], ['ind_prod3'], ['ind_prod3'], ['ind_prod3', 'ind_prod24'], ['ind_prod7'], ['ind_prod24'], ['ind_prod24'], ['ind_prod3'], ['ind_prod7', 'ind_prod24'], ['ind_prod3'], ['ind_prod24'], ['ind_prod7'], ['ind_prod24'], ['ind_prod3'], ['ind_prod3', 'ind_prod24'], ['ind_prod3', 'ind_prod7'], ['ind_prod3'], ['ind_prod24'], ['ind_prod3', 'ind_prod22', 'ind_prod23'], ['ind_prod5', 'ind_prod24'], ['ind_prod3', 'ind_prod7'], ['ind_prod13'], ['ind_prod3'], ['ind_prod24'], ['ind_prod22', 'ind_prod23'], ['ind_prod24'], ['ind_prod24'], ['ind_prod24'], ['ind_prod3'], ['ind_prod3'], ['ind_prod5'], ['ind_prod7'], ['ind_prod5'], ['ind_prod3'], ['ind_pro