## Load in data and set up folder hierarchy

In [None]:
# modules
import sys
import numpy as np
import matplotlib 
%matplotlib inline
from surprise import SVD, accuracy, Reader, Dataset
from surprise.model_selection import train_test_split,cross_validate
from surprise.prediction_algorithms.matrix_factorization import SVDpp
from surprise.prediction_algorithms.knns import KNNBasic
from scipy.io import loadmat
import os.path
import pandas as pd
import seaborn as sns


In [None]:
# directory & file hierarchy
# The project root is the parent of the current working directory; data files
# live in <project root>/data.
proj_dir = os.path.abspath('..')
code_dir = os.getcwd()
data_dir = os.path.join(proj_dir, 'data')

# exist_ok=True creates the folder only when it is missing, without the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(code_dir, exist_ok=True)
os.makedirs(data_dir, exist_ok=True)

In [None]:
## load in the data
# Build the path first so the file location is easy to inspect when debugging.
mat_path = os.path.join(data_dir, 'FruitData.mat')
raw_data = loadmat(mat_path)

## Data reshaping and preprocessing

In [None]:
## get ratings: average the 'FruitAssoc' array over its third axis
fcm_ratings = np.mean(raw_data['FruitAssoc'], axis=2)

## ordered list of concept names (used as the DataFrame column labels)
colnames = [
    'Mango', 'Watermelon', 'Honeydew', 'Cantaloupe',
    'Grapefruit', 'Strawberry', 'Raspberry', 'Blueberry',
    'Avocado', 'Orange', 'Lime', 'Lemon',
]

## one column per concept, one row per entry along the first axis
fcm_df = pd.DataFrame(fcm_ratings, columns=colnames)



In [None]:
## flatten data and create lists for surprise Reader
# One (color, concept) pair per cell of fcm_df. The pairs are generated row by
# row so they line up with the row-major output of fcm_df.values.flatten().
n_colors = len(fcm_df)  # taken from the data instead of a hard-coded 58

# The concept names repeat once per row ...
concept_id = list(colnames) * n_colors
# ... while each row index is repeated once per concept column.
color_id = [color for color in range(n_colors) for _ in colnames]

ratings = fcm_df.values.flatten()

In [None]:
# Long-format table: one row per (color, concept) pair with its rating.
ratings_dict = {
    'color_id': color_id,
    'concept_id': concept_id,
    'ratings': ratings,
}
ratings_df = pd.DataFrame(data=ratings_dict)

In [None]:
# Set ratings scale: ratings are declared to lie between 0 and 1
rating_bounds = (0, 1)
reader = Reader(rating_scale=rating_bounds)

In [None]:
# Select the id and rating columns explicitly before handing them to surprise.
surprise_columns = ['color_id', 'concept_id', 'ratings']
ratings_dataset = Dataset.load_from_df(ratings_df[surprise_columns], reader)

## Try different completion methods

### SVD

In [None]:
# SVD with its default arguments, plus a hold-out split that reserves
# 20% of the ratings for testing.
method = SVD()
trainset, testset = train_test_split(
    ratings_dataset,
    test_size=0.20,
)

In [None]:
# 10-fold cross-validation on the full dataset, reporting RMSE and MAE per fold.
cv_results = cross_validate(
    method,
    ratings_dataset,
    measures=['RMSE', 'MAE'],
    cv=10,
    verbose=True,
)
# Cell output: test RMSE averaged over the folds.
np.mean(cv_results['test_rmse'])

In [None]:
# Train on the training split and predict the held-out ratings in one chain
# (fit() returns the fitted algorithm itself).
predictions = method.fit(trainset).test(testset)

# Then compute RMSE on the held-out predictions
accuracy.rmse(predictions)

### SVD++

In [None]:
# SVD++ with its default arguments, scored by 10-fold cross-validation.
method = SVDpp()
# The hold-out split is refreshed here, although this cell itself only uses
# the cross-validation results below.
trainset, testset = train_test_split(
    ratings_dataset,
    test_size=0.20,
)
cv_results = cross_validate(
    method,
    ratings_dataset,
    measures=['RMSE', 'MAE'],
    cv=10,
    verbose=True,
)
# Cell output: test RMSE averaged over the folds.
np.mean(cv_results['test_rmse'])

### K-NN Basic

In [None]:
# Basic k-NN with its default arguments, scored by 10-fold cross-validation.
method = KNNBasic()
# The hold-out split is refreshed here, although this cell itself only uses
# the cross-validation results below.
trainset, testset = train_test_split(
    ratings_dataset,
    test_size=0.20,
)
cv_results = cross_validate(
    method,
    ratings_dataset,
    measures=['RMSE', 'MAE'],
    cv=10,
    verbose=True,
)
# Cell output: test RMSE averaged over the folds.
np.mean(cv_results['test_rmse'])

In [None]:
# Display the current `predictions` list as the cell output.
# NOTE(review): in the cells visible here, `predictions` is only assigned in the
# SVD hold-out cell, so this shows SVD results rather than anything from the
# KNNBasic model above — confirm that is intended.
predictions

In [None]:
# Predict one rating with the current `method` and print the details.
# NOTE(review): `uid` and `iid` are not assigned in any cell visible here, so
# this raises NameError unless they were defined elsewhere in the session —
# confirm which (color_id, concept_id) pair the rating 0.082269 belongs to.
pred = method.predict(
    uid,
    iid,
    r_ui=0.082269,
    verbose=True,
)

In [None]:
# Display the averaged ratings table (one column per concept) as the cell output.
fcm_df