## Load in data and set up folder hierarchy

In [1]:
# modules
import sys
import numpy as np
import matplotlib 
%matplotlib inline
from surprise import SVD, accuracy, Reader, Dataset
from surprise.model_selection import train_test_split,cross_validate
from surprise.prediction_algorithms.matrix_factorization import SVDpp
from surprise.prediction_algorithms.knns import KNNBasic
from scipy.io import loadmat
import os.path
import pandas as pd
import seaborn as sns


In [1]:
# directory & file hierarchy
proj_dir = os.path.abspath('..')
code_dir = os.getcwd()
data_dir = os.path.join(proj_dir,'data')

if not os.path.exists(code_dir):
    os.makedirs(code_dir)
    
if not os.path.exists(data_dir):
    os.makedirs(data_dir)

NameError: name 'os' is not defined

In [2]:
##load in the data
raw_data = loadmat('data\FruitData.mat')

NameError: name 'loadmat' is not defined

## Data reshaping and preprocessing

In [4]:
## get ratings
fcm_ratings = np.mean(raw_data['FruitAssoc'], axis=2)

##ordered list of concept names
colnames = ['Mango', 'Watermelon', 'Honeydew', 'Cantaloupe', 'Grapefruit', 'Strawberry', 'Raspberry', 'Blueberry',\
            'Avocado', 'Orange', 'Lime', 'Lemon']


fcm_df = pd.DataFrame(data=fcm_ratings, columns=colnames)



In [5]:
## flatten data and create lists for surprise Reader
concept_id = []
color_id = []

for i in range(58):
    for j in range(len(colnames)):
        concept_id.append(colnames[j])
        color_id.append(i)
ratings = fcm_df.values.flatten() 

In [6]:
ratings_dict = {'color_id':color_id,'concept_id':concept_id, 'ratings':ratings}
ratings_df = pd.DataFrame(ratings_dict)

In [7]:
# Set ratings scale
reader = Reader(rating_scale=(0,1))

In [8]:
ratings_dataset = Dataset.load_from_df(ratings_df[['color_id', 'concept_id', 'ratings']], reader)

## Try different completion methods

### SVD

In [9]:
method = SVD()
trainset, testset = train_test_split(ratings_dataset, test_size=.20)

In [10]:
cv_results = cross_validate(method, ratings_dataset, measures=['RMSE', 'MAE'], cv=10, verbose=True)
np.mean(cv_results['test_rmse'])

Evaluating RMSE, MAE of algorithm SVD on 10 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Fold 6  Fold 7  Fold 8  Fold 9  Fold 10 Mean    Std     
RMSE (testset)    0.1916  0.1790  0.2133  0.1746  0.1816  0.1856  0.1663  0.2087  0.2159  0.2137  0.1930  0.0175  
MAE (testset)     0.1502  0.1437  0.1492  0.1256  0.1435  0.1410  0.1286  0.1491  0.1540  0.1579  0.1443  0.0099  
Fit time          0.05    0.04    0.04    0.04    0.04    0.04    0.04    0.04    0.04    0.04    0.04    0.00    
Test time         0.00    0.00    0.00    0.00    0.00    0.00    0.00    0.00    0.00    0.00    0.00    0.00    


0.19303962605794936

In [11]:
method.fit(trainset)
predictions = method.test(testset)


accuracy.rmse(predictions)

RMSE: 0.2087


0.20866558340506716

### SVD++

In [12]:
method = SVDpp()
trainset, testset = train_test_split(ratings_dataset, test_size=.20)
cv_results = cross_validate(method, ratings_dataset, measures=['RMSE', 'MAE'], cv=10, verbose=True)
np.mean(cv_results['test_rmse'])

Evaluating RMSE, MAE of algorithm SVDpp on 10 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Fold 6  Fold 7  Fold 8  Fold 9  Fold 10 Mean    Std     
RMSE (testset)    0.1719  0.2108  0.1673  0.1586  0.1442  0.1897  0.1529  0.2133  0.1770  0.1847  0.1770  0.0219  
MAE (testset)     0.1243  0.1522  0.1244  0.1144  0.1102  0.1386  0.1127  0.1370  0.1362  0.1462  0.1296  0.0138  
Fit time          0.18    0.16    0.17    0.28    0.20    0.21    0.20    0.22    0.17    0.17    0.19    0.03    
Test time         0.00    0.00    0.00    0.00    0.00    0.00    0.00    0.00    0.00    0.00    0.00    0.00    


0.17704222470590927

### K-NN Basic

In [13]:
method = KNNBasic()
trainset, testset = train_test_split(ratings_dataset, test_size=.20)
cv_results = cross_validate(method, ratings_dataset, measures=['RMSE', 'MAE'], cv=10, verbose=True)
np.mean(cv_results['test_rmse'])

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNBasic on 10 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Fold 6  Fold 7  Fold 8  Fold 9  Fold 10 Mean    Std     
RMSE (testset)    0.1725  0.1966  0.1610  0.1655  0.1500  0.1495  0.1777  0.1944  0.1404  0.186

0.16942502240525423

### Hold out one subject, fit model to mean of remaining 53 subjects and compute error on held out subject

In [14]:
fcm_unrolled = raw_data['FruitAssoc']

In [15]:
train_subs = np.mean(fcm_unrolled[:,:,0:53], axis=2)
test_sub = fcm_unrolled[:,:,53]

In [16]:
train_dict = {'color_id':color_id,'concept_id':concept_id, 'ratings':train_subs.flatten()}
train_df = pd.DataFrame(train_dict)
test_dict = {'color_id':color_id,'concept_id':concept_id, 'ratings':test_sub.flatten()}
test_df =  pd.DataFrame(test_dict)

In [17]:
reader = Reader(rating_scale=(0,1))
method = SVD()
train_ds = Dataset.load_from_df(train_df[['color_id', 'concept_id', 'ratings']], reader)
train_ds = train_ds.build_full_trainset()

In [18]:
test_ds  = Dataset.load_from_df(test_df[['color_id', 'concept_id', 'ratings']], reader)
test_ds = train_ds.build_testset()

In [19]:
method.fit(train_ds)
predictions = method.test(test_ds)


accuracy.rmse(predictions)

RMSE: 0.1512


0.1512235225938616