In [1]:
import graphlab as gl

from sklearn.model_selection import train_test_split
from scipy.sparse import csr_matrix

import pandas as pd
import numpy as np

In [2]:
# Paths (relative to the notebook) of the 10k-review samples of the Amazon
# 5-core datasets.
musicdata = "./dataset/amazon5core/digitalmusic/10ksample.csv"
healthdata = "./dataset/amazon5core/healthcare/10ksample.csv"


def load_reviews(csv_path):
    """Read one Amazon 5-core review sample into an SFrame.

    Only the four columns used by this notebook are loaded. Column types are
    given as a name -> type mapping rather than a positional list, so each
    type stays attached to its column regardless of the column order in the
    CSV file.

    Parameters
    ----------
    csv_path : str
        Path to a comma-separated review file containing the columns
        ``asin``, ``reviewerID``, ``overall`` and ``reviewText``.

    Returns
    -------
    graphlab.SFrame
        The selected columns, with ``overall`` parsed as int and the other
        three columns parsed as str.
    """
    return gl.SFrame.read_csv(
        csv_path,
        delimiter=",",
        usecols=["asin", "reviewerID", "overall", "reviewText"],
        column_type_hints={
            "asin": str,
            "reviewerID": str,
            "overall": int,
            "reviewText": str,
        })


# To run the notebook on the digital-music sample instead, load it with
# `load_reviews(musicdata)`.
healthsf = load_reviews(healthdata)

This non-commercial license of GraphLab Create for academic use is assigned to rahulbali.mecse16@pec.edu.in and will expire on November 23, 2018.


[INFO] graphlab.cython.cy_server: GraphLab Create v2.1 started. Logging: C:\Users\cramdog\AppData\Local\Temp\graphlab_server_1529008964.log.0


In [3]:
# Split the loaded reviews into two views that share the (asin, reviewerID)
# key: the numeric ratings and the free-text reviews.
# To analyse the digital-music sample instead, build both views from the
# music SFrame rather than `healthsf`.
ratings = healthsf.select_columns(["asin", "reviewerID", "overall"])
reviews = healthsf.select_columns(["asin", "reviewerID", "reviewText"])

# Recommender-style hold-out: at most 100 users are sampled as test users
# and a share of their rated items goes to `test`; the rest is `train`.
split_pair = gl.recommender.util.random_split_by_user(
    dataset=ratings,
    user_id="reviewerID",
    item_id="asin",
    max_num_users=100)
train, test = split_pair


In [4]:
## Matrix-factorization recommender on the explicit ratings, fitted with
## alternating least squares (ALS) and 10 latent factors.
# NOTE(review): in the saved outputs below every recommended item has the same
# score and the displayed linear terms are all 0.0 -- the model may be
# predicting little more than a constant. Worth checking the regularization
# settings and the amount of training data per user/item.
model = gl.recommender.factorization_recommender.create(
    observation_data=train,
    user_id="reviewerID",
    item_id="asin",
    target="overall",
    num_factors=10,
    solver="als",
    verbose=1)

In [5]:
# Top-10 recommendations for a single reviewer; the bare name on the last
# line displays the resulting SFrame (reviewerID, asin, score, rank).
recs = model.recommend(users=["AFNCXMCX7VZWH"], k=10)
recs

reviewerID,asin,score,rank
AFNCXMCX7VZWH,B00GBQ3K54,4.2846825158,1
AFNCXMCX7VZWH,B0014YTACK,4.2846825158,2
AFNCXMCX7VZWH,B00K5NEORM,4.2846825158,3
AFNCXMCX7VZWH,B003V32UDI,4.2846825158,4
AFNCXMCX7VZWH,B008BQI8YA,4.2846825158,5
AFNCXMCX7VZWH,B000BABW5Q,4.2846825158,6
AFNCXMCX7VZWH,B004OZGG0K,4.2846825158,7
AFNCXMCX7VZWH,B00EDR6YB0,4.2846825158,8
AFNCXMCX7VZWH,B001J5H92C,4.2846825158,9
AFNCXMCX7VZWH,B002T3DBKW,4.2846825158,10


In [6]:
# Inspect the learned parameters: one SFrame per id column holding the
# linear term and the latent factor vector of each item / user.
model.get('coefficients')

{'asin': Columns:
 	asin	str
 	linear_terms	float
 	factors	array
 
 Rows: 6170
 
 Data:
 +------------+--------------+-------------------------------+
 |    asin    | linear_terms |            factors            |
 +------------+--------------+-------------------------------+
 | B002T3DBKW |     0.0      | [-1.41087621159e-05, 2.896... |
 | B001J5H92C |     0.0      | [-0.0704518184066, -2.1016... |
 | B00EDR6YB0 |     0.0      | [0.118054434657, -3.538033... |
 | B004OZGG0K |     0.0      | [-2.22086382564e-05, -3.48... |
 | B000BABW5Q |     0.0      | [-0.000109896995127, 1.792... |
 | B008BQI8YA |     0.0      | [-0.00374574260786, -0.031... |
 | B003V32UDI |     0.0      | [-8.45319809741e-05, -0.06... |
 | B00K5NEORM |     0.0      | [-0.00424741348252, -0.045... |
 | B0014YTACK |     0.0      | [0.0127840861678, -0.03964... |
 | B00GBQ3K54 |     0.0      | [0.00327599304728, 0.00018... |
 +------------+--------------+-------------------------------+
 [6170 rows x 3 columns]
 Note: Only the head of the SFrame is printed.

In [7]:
# TODO: try the interactive overview (model.views.overview) once a
# validation set is available.
# Ranking quality on the held-out split: precision and recall per user at
# each cutoff.
model.evaluate_precision_recall(test)

{'precision_recall_by_user': Columns:
 	reviewerID	str
 	cutoff	int
 	precision	float
 	recall	float
 	count	int
 
 Rows: 522
 
 Data:
 +----------------+--------+-----------+--------+-------+
 |   reviewerID   | cutoff | precision | recall | count |
 +----------------+--------+-----------+--------+-------+
 | A15FXTZ9PODUGO |   1    |    0.0    |  0.0   |   1   |
 | A15FXTZ9PODUGO |   2    |    0.0    |  0.0   |   1   |
 | A15FXTZ9PODUGO |   3    |    0.0    |  0.0   |   1   |
 | A15FXTZ9PODUGO |   4    |    0.0    |  0.0   |   1   |
 | A15FXTZ9PODUGO |   5    |    0.0    |  0.0   |   1   |
 | A15FXTZ9PODUGO |   6    |    0.0    |  0.0   |   1   |
 | A15FXTZ9PODUGO |   7    |    0.0    |  0.0   |   1   |
 | A15FXTZ9PODUGO |   8    |    0.0    |  0.0   |   1   |
 | A15FXTZ9PODUGO |   9    |    0.0    |  0.0   |   1   |
 | A15FXTZ9PODUGO |   10   |    0.0    |  0.0   |   1   |
 +----------------+--------+-----------+--------+-------+
 [522 rows x 5 columns]
 Note: Only the head of the SFrame is printed.

In [8]:
# Rating-prediction error on the held-out split, broken down by item and by
# user (the returned dict contains 'rmse_by_item' and 'rmse_by_user').
model.evaluate_rmse(dataset=test, target="overall")

{'rmse_by_item': Columns:
 	asin	str
 	count	int
 	rmse	float
 
 Rows: 31
 
 Data:
 +------------+-------+----------------+
 |    asin    | count |      rmse      |
 +------------+-------+----------------+
 | B0002274FK |   1   |  3.2846825158  |
 | B000VPBNJQ |   1   | 0.715317484201 |
 | B0002IXH0U |   1   | 0.715317484201 |
 | B0002407V6 |   1   | 0.715317484201 |
 | B00CLD7DDG |   1   | 0.239208195149 |
 | B00BZAIARE |   1   | 0.715317484201 |
 | B000LM0X0Y |   1   | 3.23584467897  |
 | B003ULK0HO |   1   | 0.708464774047 |
 | B0049POHK6 |   1   |  2.2846825158  |
 | B000LNI5VC |   1   | 0.715317484201 |
 +------------+-------+----------------+
 [31 rows x 3 columns]
 Note: Only the head of the SFrame is printed.
 You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns.,
 'rmse_by_user': Columns:
 	reviewerID	str
 	count	int
 	rmse	float
 
 Rows: 29
 
 Data:
 +----------------+-------+----------------+
 |   reviewerID   | count |      rmse      |
 +--------

In [None]:
## using topic modelling
