# hw2 task1

## Data Preprocessing

In [None]:
import numpy as np
import pandas as pd
import datetime
from collections import defaultdict
import matplotlib.pyplot as plt

In [None]:
# Load the training interactions; the first CSV row supplies the column names.
dataset = pd.read_csv('ntucsie-sdml2018-2-1/rating_train.csv', header=0)

In [None]:
# Preview the first rows of the raw interactions.
dataset.head()

In [None]:
# Report how many distinct users and foods appear in the interactions.
for column in ('userid', 'foodid'):
    print('unique {}:'.format(column), dataset[column].nunique(dropna=False))

In [None]:
# Raw rows as a NumPy array. `DataFrame.as_matrix()` was deprecated in pandas
# 0.23 and removed in 1.0; `.values` yields the same array on every version.
matrix = dataset.values

In [None]:
# Two-level counter: freq[a][b] is an int that starts at 0 the first time it is accessed.
freq = defaultdict(lambda: defaultdict(int))

In [None]:
# Count how many times each (column 1, column 2) pair occurs; the following
# cells treat those two columns as userid and foodid.
for record in matrix:
    userid, foodid = record[1], record[2]
    freq[userid][foodid] += 1

In [None]:
# Flatten the nested counter into rows of [userid, foodid, count].
train = np.array([
    [userid, foodid, count]
    for userid, foods in freq.items()
    for foodid, count in foods.items()
])

In [None]:
# Aggregated interactions as a frame: one row per (userid, foodid) with its count.
df = pd.DataFrame({
    'userid': train[:, 0],
    'foodid': train[:, 1],
    'freq': train[:, 2],
})

In [None]:
# Preview the aggregated (userid, foodid, freq) table.
df.head()

### For NCF

In [None]:
# Largest food id in the training data; negative sampling below draws ids from 0..n_food inclusive.
n_food = dataset['foodid'].max()
# Number of negative food ids written per test user.
n_neg_sample = 99

In [None]:
# Export every interaction as "userid<TAB>foodid<TAB>1<TAB>unix_timestamp".
with open('food.train.rating', 'w') as file:
    for index, row in dataset.iterrows():
        # `.timestamp()` replaces `strftime("%s")`, which is a platform-specific
        # extension (unavailable on Windows). Both interpret the naive date in
        # the local timezone, so the resulting integer is unchanged.
        timestamp = int(datetime.datetime.strptime(row['date'], '%Y-%m-%d').timestamp())
        file.write('{}\t{}\t1\t{}\n'.format(row['userid'], row['foodid'], timestamp))

In [None]:
# Build the NCF evaluation files: for every user hold out one random
# interaction (food.test.rating) and draw `n_neg_sample` food ids the user has
# never logged (food.test.negative).

# Foods already logged by each user, computed here from `dataset` so the cell
# does not rely on `history`, which is only created much later in the notebook.
consumed = {uid: set(foods) for uid, foods in dataset.groupby('userid')['foodid']}

with open('food.test.rating', 'w') as file, open('food.test.negative', 'w') as neg_file:
    # groupby visits every user id that actually occurs (including the largest
    # one, which `range(max)` skipped) and avoids re-scanning the whole frame
    # once per candidate id.
    for userid, user_df in dataset.groupby('userid'):
        sample = user_df.sample().iloc[0]
        # `.timestamp()` is the portable equivalent of `strftime("%s")`.
        timestamp = int(datetime.datetime.strptime(sample['date'], '%Y-%m-%d').timestamp())
        file.write('{}\t{}\t1\t{}\n'.format(sample['userid'], sample['foodid'], timestamp))

        seen = consumed[userid]
        # Rejection sampling below would never terminate if the user has
        # logged every id in 0..n_food, so fail loudly instead of hanging.
        if len(seen) > n_food and seen.issuperset(range(n_food + 1)):
            raise ValueError('user {} has no unseen food to sample negatives from'.format(userid))

        # Negative sampling (ids may repeat, as in the original loop)
        cnt = 0
        neg_file.write('({},{})'.format(sample['userid'], sample['foodid']))
        while cnt < n_neg_sample:
            n_foodid = np.random.randint(n_food + 1)
            if n_foodid not in seen:
                neg_file.write('\t{}'.format(n_foodid))
                cnt = cnt + 1
        neg_file.write('\n')

### Side-information

In [None]:
# Load the per-food side information.
food = pd.read_csv('ntucsie-sdml2018-2-1/food.csv')

In [None]:
# Preview the food table.
food.head()

In [None]:
# Keep the food id plus the numeric nutrition columns.
food_filter = food[['foodid',
                    'calories',
                    'fat',
                    'carbs',
                    'sodium',
                    'potassium',
                    'fiber',
                    'sugar',
                    'protein',
                    'calcium',
                    'iron']]
# '-' cells are treated as 0 so the whole table can be cast to integers.
food_filter = food_filter.replace('-', 0)
# `np.int` (an alias of the builtin `int`) was removed in NumPy 1.24 and
# `DataFrame.get_values()` in pandas 1.0; `astype(int).values` is equivalent.
food_v = food_filter.astype(int).values
# foodid -> array of the remaining columns, in the order listed above.
foodmap = {row[0]: row[1:] for row in food_v}

In [None]:
# Load the per-user side information.
user = pd.read_csv('ntucsie-sdml2018-2-1/user.csv')

In [None]:
# Preview the user table.
user.head()

In [None]:
# Keep the user id plus the attributes used as model features.
user_filter = user[['userid', 'age', 'gender', 'location', 'friends_count']]
# Missing values become 0. `np.NAN` was removed in NumPy 2.0; `np.nan` is the
# canonical spelling and works on every version.
user_filter = user_filter.replace(np.nan, 0)
# One-hot encode the categorical columns.
user_one_hot = pd.get_dummies(user_filter, columns=['gender', 'location'])
# `np.int` was removed in NumPy 1.24 and `get_values()` in pandas 1.0;
# `astype(int).values` is the equivalent supported form.
user_v = user_one_hot.astype(int).values
# userid -> array of the remaining (numeric + one-hot) columns.
usermap = {row[0]: row[1:] for row in user_v}

In [None]:
import pickle

# Persist the two side-information lookup tables (foodid -> feature array,
# userid -> feature array) so other scripts can load them.
with open('foodmap.pickle', 'wb') as f_file, open('usermap.pickle', 'wb') as u_file:
    pickle.dump(foodmap, f_file)
    pickle.dump(usermap, u_file)

## Data Analysis

In [None]:
from scipy import stats

In [None]:
# Number of interaction rows per user: x holds the user ids, y the matching counts.
per_user = dataset.groupby('userid').count()
x = per_user.index.tolist()
y = per_user['foodid'].tolist()

In [None]:
# Summary statistics of the per-user counts. Only `stats` is imported
# (`from scipy import stats`), so the bare name `scipy` would raise NameError.
stats.describe(y)

In [None]:
# Bar chart of y against x (the per-user interaction counts computed above).
plt.bar(x, y)
plt.title('User histogram')
plt.xlabel('user id')
plt.ylabel('freq')
plt.show()

In [None]:
# Number of interaction rows per food: x holds the food ids, y the matching counts.
per_food = dataset.groupby('foodid').count()
x = per_food.index.tolist()
y = per_food['userid'].tolist()

In [None]:
# Summary statistics of the per-food counts. Only `stats` is imported
# (`from scipy import stats`), so the bare name `scipy` would raise NameError.
stats.describe(y)

In [None]:
# Bar chart of the per-food interaction counts, drawn on a logarithmic y-axis.
plt.bar(x, y, log=True)
plt.title('Food histogram')
plt.xlabel('food id')
plt.ylabel('freq')
plt.show()

In [None]:
# Positions of `y` ordered from least to most frequent (only displayed when
# this expression is the last one in a cell).
np.array(y).argsort()
# `y` is a plain list ordered like `x`, so 4769 is a *position*, not a food id.
# Translate it through `x` before looking the food up; when the ids happen to
# be contiguous from 0 this selects the same row as before.
print(y[4769])
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(food.loc[food['foodid'] == x[4769]])

## Matrix factorization

In [None]:
# Dense interaction matrix: one row per distinct user, one column per food id
# in 0..max(foodid), stored as uint16 and initialised to zero.
X = np.zeros((len(dataset.userid.unique()), dataset.foodid.max()+1), dtype=np.uint16)

In [None]:
# Row position <-> user id, in order of first appearance in the ratings.
index2userid = dataset.userid.unique()
# A defaultdict(int) is kept on purpose: unknown user ids resolve to position 0.
userid2index = defaultdict(int)
userid2index.update((uid, pos) for pos, uid in enumerate(index2userid))

In [None]:
# Fill the implicit-feedback matrix: 1 where the user logged the food at least
# once. Every pair stored in `freq` was created by an increment, so its count
# is always > 0 and X is zero-initialised; the former `else: ... = 0` branch
# could never run. (For explicit feedback, store freq[userid][foodid] instead
# of 1.)
for userid in freq:
    row_index = userid2index[userid]
    for foodid in freq[userid]:
        # single [row, col] index instead of chained X[row][col]
        X[row_index, foodid] = 1

In [None]:
from sklearn.decomposition import NMF, TruncatedSVD, PCA

# Factorise X with 20 components. The commented-out lines are the alternative
# decompositions that can be swapped in for PCA.
# model = NMF(n_components=20, init='random', random_state=0)
# model = TruncatedSVD(n_components=20, n_iter=100, random_state=0)
model = PCA(n_components=20, random_state=0)
# W: transformed X, one 20-dimensional row per row of X.
W = model.fit_transform(X)
# H: the fitted components of the model.
H = model.components_

In [None]:
# Reconstruct the score matrix from the latent factors. PCA centres X before
# projecting, so a bare `np.dot(W, H)` drops the per-food mean and distorts the
# per-user ranking; `inverse_transform` adds it back and is also available on
# NMF and TruncatedSVD (where it reduces to W @ H).
pred = model.inverse_transform(W)

In [None]:
from itertools import islice

# Write, per user, the 20 highest-scoring foods the user has not logged yet as
# "userid,food1 food2 ... food20".
with open('pred.csv', 'w') as file:
    file.write('userid,foodid\n')
    for index, row in enumerate(pred):
        userid = index2userid[index]
        if userid not in freq:
            continue
        seen = freq[userid]
        # food ids ordered from highest to lowest predicted score
        ranked = row.argsort()[::-1]
        # islice stops after 20 hits; building the line with join guarantees
        # the trailing newline even if fewer than 20 unseen foods exist
        # (previously such a line ended with a space and no newline).
        top_unseen = islice((foodid for foodid in ranked if foodid not in seen), 20)
        file.write('{},{}\n'.format(userid, ' '.join(str(foodid) for foodid in top_unseen)))

## Implicit 

### Loading Data

In [None]:
import implicit
from scipy.sparse import coo_matrix

In [None]:
# Convert both id columns to pandas categoricals; their `.cat.codes` are used
# below as dense row/column positions of the sparse matrix.
df['userid'] = df['userid'].astype('category')
df['foodid'] = df['foodid'].astype('category')

In [None]:
# Confidence
alpha = 10
confidence = 1 + alpha * df['freq']

In [None]:
# Use the raw interaction counts as the confidence values.
confidence = df['freq']

In [None]:
# Sparse item x user matrix: rows are food category codes, columns are user
# category codes, values are the confidence cast to int.
item_user_data = coo_matrix((confidence.astype(int),
                   (df['foodid'].cat.codes,
                    df['userid'].cat.codes)))

In [None]:
# Category code <-> original user id, in both directions.
index2user = {pos: user for pos, user in enumerate(df['userid'].cat.categories)}
user2index = {user: pos for pos, user in index2user.items()}

In [None]:
# Category code <-> original food id, in both directions.
index2food = dict(enumerate(df['foodid'].cat.categories))
# The reverse map must be built from `index2food`; it was previously built from
# `index2user` (copy-paste slip), which made it a user lookup.
food2index = dict((food, index) for index, food in index2food.items())

In [None]:
# Print every aggregated row of user 6 without pandas truncating the output.
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(df.loc[df['userid'] == 6])

### Alternating Least Squares

In [None]:
# initialize a model
model = implicit.als.AlternatingLeastSquares(factors=50,
                                             iterations=1000,
                                             calculate_training_loss=True)

# train the model on a sparse matrix of item/user/confidence weights
model.fit(item_user_data)

# recommend items for a user
userid = 0
user_items = item_user_data.T.tocsr()
recommendations = model.recommend(userid, user_items, N=20)

# find related items
itemid = 0
related = model.similar_items(itemid, N=20)

### Bayesian Personalized Ranking (BPR)

In [None]:
# initialize a model
model = implicit.bpr.BayesianPersonalizedRanking(factors=10, iterations=2000)

# train the model on a sparse matrix of item/user/confidence weights
model.fit(item_user_data)

# recommend items for a user
userid = 0
user_items = item_user_data.T.tocsr()
recommendations = model.recommend(userid, user_items, N=20)

# find related items
related = model.similar_items(itemid, N=20)

### Make Recommendation

In [None]:
# Interaction rows per food id, most frequent food first.
foodfreq = (
    dataset.groupby('foodid')
    .count()['userid']
    .sort_values(ascending=False)
)

In [None]:
# userid -> set of food ids that user has logged at least once.
history = defaultdict(set)
for _, record in df.iterrows():
    user_foods = history[record['userid']]
    user_foods.add(record['foodid'])

In [None]:
import pickle

# Persist the per-user set of already-logged foods.
with open('history.pickle', 'wb') as file:
    pickle.dump(history, file)

In [None]:
# Write 20 recommendations per user to pred.csv. The first 10 slots of each
# model recommendation list are replaced by the most popular foods the user has
# not logged yet; the remaining slots keep the model's own picks.
with open('pred.csv', 'w') as file:
    file.write('userid,foodid\n')
    total_cnt = 0
    total_score = 0
    for user_index, userid in index2user.items():
        recommendations = model.recommend(user_index, user_items, N=20)
        picks = []
        for rank, (foodid, score) in enumerate(recommendations):
            if rank < 10:
                # Iterate the index (food ids, most popular first) with its own
                # loop variable: the previous `for index, freq in
                # foodfreq.iteritems()` clobbered the notebook-level `freq`
                # counter and the outer `index`, and `Series.iteritems` was
                # removed in pandas 2.0.
                for popular_id in foodfreq.index:
                    if popular_id not in history[userid]:
                        # mark as used so the next slot gets the next popular food
                        history[userid].add(popular_id)
                        foodid = popular_id
                        break
            picks.append(foodid)
            total_cnt = total_cnt + 1
            # NOTE(review): for replaced slots this is still the score of the
            # model's original pick, as before.
            total_score = total_score + score
        # One line per user; joining guarantees the newline even when the model
        # returns fewer than 20 items.
        file.write('{},{}\n'.format(userid, ' '.join(str(foodid) for foodid in picks)))

In [None]:
# Mean model score over all written recommendations.
# NOTE(review): raises ZeroDivisionError if no recommendation was written.
avg_score = total_score / total_cnt
print('Average score:', avg_score)

## SURPRISE

### Automatic cross-validation

In [None]:
from surprise import SVD
from surprise import Dataset
from surprise.model_selection import cross_validate

In [None]:
# Load the movielens-100k dataset (download it if needed),
data = Dataset.load_builtin('ml-100k')

# We'll use the famous SVD algorithm.
algo = SVD()

# Run 5-fold cross-validation and print results
cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

### Use a custom dataset

In [None]:
# Display the most recently computed recommendation list.
recommendations

In [None]:
from surprise import Reader

In [None]:
# Small toy frame with three (user, item) pairs.
a = pd.DataFrame({
    'user': [1, 2, 3],
    'item': [4, 5, 6],
})

In [None]:
# Show the sparse user x item matrix as a dense array.
user_items.toarray()

## NCF Model

In [None]:
from IPython.display import SVG

from keras.regularizers import l1, l2
from keras.models import Sequential, Model
from keras.layers import Dense, Lambda, Activation
from keras.layers import Embedding, Input, Dense, Reshape, Flatten, Dropout
from keras.layers import Concatenate, Multiply
from keras.utils.vis_utils import model_to_dot

In [None]:
def get_model(num_users, num_items, num_user_attrs, num_item_attrs, mf_dim=10, layers=(10,), reg_layers=(0,), reg_mf=0):
    """Build a two-branch (MF + MLP) Keras model with side information.

    The MF branch multiplies a user and an item embedding element-wise. The MLP
    branch concatenates a second pair of embeddings with the user and item
    attribute vectors and feeds them through dense ReLU layers. Both branches
    are concatenated and mapped to one sigmoid output.

    Args:
        num_users: size of the user embedding tables (input_dim).
        num_items: size of the item embedding tables (input_dim).
        num_user_attrs: length of the user attribute vector input.
        num_item_attrs: length of the item attribute vector input.
        mf_dim: embedding size of the MF branch.
        layers: MLP sizes. ``layers[0] // 2`` is the size of each MLP
            embedding; ``layers[1:]`` are the widths of the dense layers.
        reg_layers: L2 factors, same length as ``layers``. Only
            ``reg_layers[1:]`` are applied (to the dense layers).
        reg_mf: L2 factor for the MF embeddings (0 disables it).

    Returns:
        An uncompiled ``keras.models.Model`` taking
        ``[user_input, item_input, user_attr, item_attr]``.
    """
    # Defaults are tuples rather than lists so they cannot be mutated between calls.
    assert len(layers) == len(reg_layers)
    num_layer = len(layers)  # number of entries describing the MLP

    # Input variables
    user_input = Input(shape=(1,), dtype='int32', name='user_input')
    item_input = Input(shape=(1,), dtype='int32', name='item_input')
    user_attr = Input(shape=(num_user_attrs,), name='user_attr')
    item_attr = Input(shape=(num_item_attrs,), name='item_attr')

    # Embedding layers. `reg_mf` was previously accepted but never used; it is
    # now applied to the MF embeddings (a factor of 0 adds no penalty, so the
    # default behaviour is unchanged).
    MF_Embedding_User = Embedding(input_dim=num_users, output_dim=mf_dim, name='mf_embedding_user',
                                  embeddings_initializer='glorot_normal',
                                  embeddings_regularizer=l2(reg_mf))
    MF_Embedding_Item = Embedding(input_dim=num_items, output_dim=mf_dim, name='mf_embedding_item',
                                  embeddings_initializer='glorot_normal',
                                  embeddings_regularizer=l2(reg_mf))

    MLP_Embedding_User = Embedding(input_dim=num_users, output_dim=layers[0] // 2, name='mlp_embedding_user',
                                   embeddings_initializer='glorot_normal')
    MLP_Embedding_Item = Embedding(input_dim=num_items, output_dim=layers[0] // 2, name='mlp_embedding_item',
                                   embeddings_initializer='glorot_normal')

    # MF part
    mf_user_latent = Flatten()(MF_Embedding_User(user_input))
    mf_item_latent = Flatten()(MF_Embedding_Item(item_input))
    mf_vector = Multiply()([mf_user_latent, mf_item_latent])  # element-wise multiply

    # MLP part: embeddings plus the raw attribute vectors
    mlp_user_latent = Flatten()(MLP_Embedding_User(user_input))
    mlp_item_latent = Flatten()(MLP_Embedding_Item(item_input))
    mlp_vector = Concatenate()([mlp_user_latent, mlp_item_latent, user_attr, item_attr])
    for idx in range(1, num_layer):
        # `kernel_regularizer` is the Keras 2 name of the legacy `W_regularizer`.
        layer = Dense(layers[idx], kernel_regularizer=l2(reg_layers[idx]), activation='relu', name="layer%d" % idx)
        mlp_vector = layer(mlp_vector)

    # Concatenate MF and MLP parts
    predict_vector = Concatenate()([mf_vector, mlp_vector])

    # Final prediction layer (`kernel_initializer` is the Keras 2 name of `init`)
    prediction = Dense(1, activation='sigmoid', kernel_initializer='lecun_uniform', name="prediction")(predict_vector)

    # `inputs`/`outputs` are the Keras 2 names of the legacy `input`/`output`.
    model = Model(inputs=[user_input, item_input, user_attr, item_attr],
                  outputs=prediction)

    return model

In [None]:
# Build the network; attribute lengths are taken from the side-information
# vectors of id 6.
# NOTE(review): 9895 and 5532 are hard-coded embedding table sizes, presumably
# the user/food id ranges of this dataset; confirm they exceed the largest id.
model = get_model(9895, 5532, len(usermap[6]), len(foodmap[6]), layers=[512, 256, 128, 64], reg_layers=[0, 0, 0, 0])

In [None]:
# Print the layer-by-layer summary of the model.
model.summary()

In [None]:
# NOTE(review): `b` is not defined anywhere in the visible notebook; this cell
# raises NameError unless `b` is created elsewhere. The model expects four
# inputs: [user_input, item_input, user_attr, item_attr].
model.predict(b)

In [None]:
# Render the model graph inline as SVG. "HB" is not a valid Graphviz rank
# direction; "TB" (top-to-bottom) is the vertical layout it was presumably
# meant to be.
SVG(model_to_dot(model, show_shapes=True, show_layer_names=True, rankdir="TB").create(prog="dot", format="svg"))