# Load Package

In [17]:
!pip install -U scikit-learn
!pip install surprise
!pip install networkx
import pandas as pd
import numpy as np
import seaborn as sns
import random
import matplotlib.pyplot as plt
import scipy
import sklearn
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics
from tqdm import tqdm
import warnings
import networkx as nx
warnings.filterwarnings('ignore')
from surprise import SVD, SVDpp, SlopeOne, NMF, NormalPredictor, KNNBaseline, KNNBasic, KNNWithMeans, KNNWithZScore, BaselineOnly, CoClustering
from surprise.model_selection import cross_validate
from surprise import NormalPredictor
from surprise import Dataset
from surprise import Reader

Requirement already up-to-date: scikit-learn in /usr/local/lib/python3.6/dist-packages (0.23.2)


# Load Data

In [28]:
ratings_Digital_Music= pd.read_csv('ratings_Digital_Music.csv', names=["user","item","rating","timestamp"])

In [29]:
ratings_Digital_Music.head()

Unnamed: 0,user,item,rating,timestamp
0,A2EFCYXHNK06IS,5555991584,5.0,978480000
1,A1WR23ER5HMAA9,5555991584,5.0,953424000
2,A2IR4Q0GPAFJKW,5555991584,4.0,1393545600
3,A2V0KUVAB9HSYO,5555991584,4.0,966124800
4,A1J0GL9HCA7ELW,5555991584,5.0,1007683200


In [30]:
print("There are {} users, {} ratings, {} items.".format(len(ratings_Digital_Music["user"].unique()), ratings_Digital_Music.shape[0], len(ratings_Digital_Music["item"].unique())))

There are 478235 users, 836006 ratings, 266414 items.


In [31]:
user_count = pd.DataFrame(ratings_Digital_Music.value_counts(subset=["user"]), columns=["count"])
user_count.reset_index(drop=False, inplace=True)
user_count.head()

Unnamed: 0,user,count
0,A3W4D8XOGLWUN5,1126
1,A9Q28YTLYREO7,713
2,ABDR6IJ93HFIO,489
3,A3HU0B9XUEVHIM,471
4,A1GN8UJIZLCA59,427


In [32]:
valid_user100 = user_count[user_count["count"] >= 100].user.tolist()
valid_user40 = user_count[user_count["count"] >= 40].user.tolist()

In [33]:
song_count = pd.DataFrame(ratings_Digital_Music.value_counts(subset=["item"]), columns=["count"])
song_count.reset_index(drop=False, inplace=True)
song_count.head()

Unnamed: 0,item,count
0,B004D1GZ2E,1953
1,B0026P3G12,1926
2,B0000AGWEC,1823
3,B004K4AUZW,1527
4,B000BGR18W,1386


In [34]:
valid_song100 = song_count[song_count["count"] >= 500].item.tolist()
valid_song40 = song_count[song_count["count"] >= 200].item.tolist()

# Simplify Data

In [35]:
rating_sample = ratings_Digital_Music.sample(frac=0.025, replace=False, random_state=1)
# rating_sample = rating_sample[rating_sample["item"].isin(valid_song40)]
rating_sample.reset_index(drop=True, inplace=True)
print("There are {} users, {} ratings, {} items.".format(len(rating_sample["user"].unique()), rating_sample.shape[0], len(rating_sample["item"].unique())))

There are 19524 users, 20900 ratings, 14946 items.


In [43]:
rating_small = ratings_Digital_Music[ratings_Digital_Music["user"].isin(valid_user100)]
rating_dense = rating_small.copy()
rating_dense = rating_dense[rating_dense["item"].isin(valid_song100)]
rating_large = ratings_Digital_Music[ratings_Digital_Music["user"].isin(valid_user40)]
rating_sparse = rating_large.copy()
rating_sparse = rating_sparse[rating_sparse["item"].isin(valid_song40)]

In [44]:
rating_small.reset_index(drop=True, inplace=True)
rating_dense.reset_index(drop=True, inplace=True)
rating_large.reset_index(drop=True, inplace=True)
rating_sparse.reset_index(drop=True, inplace=True)
print("There are {} users, {} ratings, {} items.".format(len(rating_small["user"].unique()), rating_small.shape[0], len(rating_small["item"].unique())))
print("There are {} users, {} ratings, {} items.".format(len(rating_dense["user"].unique()), rating_dense.shape[0], len(rating_dense["item"].unique())))
print("There are {} users, {} ratings, {} items.".format(len(rating_large["user"].unique()), rating_large.shape[0], len(rating_large["item"].unique())))
print("There are {} users, {} ratings, {} items.".format(len(rating_sparse["user"].unique()), rating_sparse.shape[0], len(rating_sparse["item"].unique())))

There are 100 users, 18217 ratings, 11961 items.
There are 67 users, 400 ratings, 53 items.
There are 515 users, 41486 ratings, 24025 items.
There are 378 users, 4235 ratings, 303 items.


# Using Surprise Package

In [45]:
df_set = [rating_sample, rating_small, rating_large]
df_name = ["rating_sample", "rating_small", "rating_large"]
for p in range(len(df_set)):
  print("Method: {}".format(df_name[p]))
  selected_data = df_set[p]
  reader = Reader(rating_scale=(1, 5))
  data = Dataset.load_from_df(selected_data[["user","item","rating"]], reader)

  dataset = data.build_full_trainset()
  from surprise.model_selection import train_test_split
  X_train, X_test = train_test_split(data, test_size=0.25)

  selected_algorithm = [SVD(), SVDpp(), KNNBaseline(), KNNWithZScore()]
  algorithm_name = ['SVD', 'SVDpp()', 'KNNBaseline()', 'KNNWithZScore()']

  from surprise import accuracy
  reader = Reader(rating_scale=(1, 5))
  trainset = X_train
  testset = X_test
  for i in range(len(selected_algorithm)):
    algo = selected_algorithm[i]
    name = algorithm_name[i]
    predictions = algo.fit(trainset).test(testset)
    print('{} has MAE {}'.format(name, accuracy.mae(predictions)))

Method: rating_sample
MAE:  0.6691
SVD has MAE 0.6690684940653835
MAE:  0.6677
SVDpp() has MAE 0.6676796774889613
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
MAE:  0.6704
KNNBaseline() has MAE 0.6704250740902837
Computing the msd similarity matrix...
Done computing similarity matrix.
MAE:  0.6751
KNNWithZScore() has MAE 0.6751222017636523
Method: rating_small
MAE:  0.5902
SVD has MAE 0.5901670616364287
MAE:  0.5849
SVDpp() has MAE 0.5849114293853507
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
MAE:  0.6144
KNNBaseline() has MAE 0.6144179103095413
Computing the msd similarity matrix...
Done computing similarity matrix.
MAE:  0.7470
KNNWithZScore() has MAE 0.7470262390847098
Method: rating_large
MAE:  0.5426
SVD has MAE 0.5426486596806863
MAE:  0.5309
SVDpp() has MAE 0.5308948954241041
Estimating biases using als...
Computing the msd similarity matrix...
Done computing s

# Using SVD

In [46]:
# https://zhuanlan.zhihu.com/p/94340100
def compute_svd(M, K):
  u, s, vh = np.linalg.svd(M, full_matrices=True)
  u1 = np.transpose(u)
  u1 = u1[0:K]
  u1 = np.transpose(u1)
  v1 = vh[0:K]
  s1 = np.zeros((K,K))
  # print(M.shape, u1.shape, s1.shape, v1.shape)
  for i in range(K):
    s1[i][i] = s[i]

  ss = np.zeros((u.shape[1], vh.shape[0]))
  for i in range(min(u.shape[1], vh.shape[0])):
    ss[i][i] = s[i]
  a = np.dot(np.dot(u1, s1), v1)
  aa = np.dot(np.dot(u, ss), vh)
  # print(sum(sum(M)))
  # print(sum(sum(a)))
  # print(sum(sum(aa)))
  return a

def check_change(origin_matrix, new_matrix):
  thre = 0.5
  sb = np.subtract(origin_matrix, new_matrix)
  rst = abs(sum(sum(sb)))
  # print("cur_error is {}.".format(rst))
  return rst > thre



In [47]:
from scipy.sparse import coo_matrix
import math as mt
from scipy.sparse.linalg import * #used for matrix multiplication
from scipy.sparse.linalg import svds
from scipy.sparse import csc_matrix

df_set = [rating_sparse, rating_dense]
df_name = ["rating_sparse", "rating_dense"]

for p in range(len(df_set)):
  small_set = df_set[p]
  user_codes = small_set.user.drop_duplicates().reset_index()
  song_codes = small_set.item.drop_duplicates().reset_index()

  user_codes.rename(columns={'index':'user_index'}, inplace=True)
  song_codes.rename(columns={'index':'song_index'}, inplace=True)
  
  song_codes['so_index_value'] = list(song_codes.index)
  user_codes['us_index_value'] = list(user_codes.index)

  small_set = pd.merge(small_set,song_codes,how='left')
  small_set = pd.merge(small_set,user_codes,how='left')

  mat_candidate = small_set[['us_index_value','so_index_value','rating']]


  df = mat_candidate.copy()

  transform_matrix = np.zeros((len(user_codes), len(song_codes)))
  for i in range(df.shape[0]):
    transform_matrix[df['us_index_value'][i]][df['so_index_value'][i]] = df['rating'][i]
  origin_matrix = transform_matrix.copy()
  test_idx = random.sample(list(range(df.shape[0])), int(df.shape[0]*0.2))
  for i in range(len(test_idx)):
    origin_matrix[df['us_index_value'][i]][df['so_index_value'][i]] = 0

  K = 15
  runtime = 300
  svd_matrix = compute_svd(origin_matrix, K)
  new_matrix = origin_matrix.copy()
  for i in range(len(test_idx)):
    new_matrix[df['us_index_value'][i]][df['so_index_value'][i]] = svd_matrix[df['us_index_value'][i]][df['so_index_value'][i]]
  for cnt in range(runtime):
    svd_matrix = compute_svd(new_matrix, K)
    new_matrix = origin_matrix.copy()
    for i in range(len(test_idx)):
      new_matrix[df['us_index_value'][i]][df['so_index_value'][i]] = svd_matrix[df['us_index_value'][i]][df['so_index_value'][i]]

  mae = 0
  count = 0

  for i in range(len(test_idx)):
    mae += transform_matrix[df['us_index_value'][i]][df['so_index_value'][i]] - new_matrix[df['us_index_value'][i]][df['so_index_value'][i]]
    count += 1
  mae /= count
  print("{} has MAE {}".format(df_name[p], mae))

rating_sparse has MAE 4.57512600781347
rating_dense has MAE 4.434284771028779


In [48]:
# https://www.jianshu.com/p/4e2b20333d38
from numpy import linalg as la
def cosSim(inA, inB):
    # print(np.dot(inA.T, inB))
    num = float(np.dot(inA.T, inB))
    denom = la.norm(inA) * la.norm(inB)
    return 0.5 + 0.5 * (num / denom)

def sigmaPct(sigma, percentage):
    sigma2 = sigma ** 2  
    sumsgm2 = sum(sigma2)  
    thre = sumsgm2 * percentage
    sumsgm3 = 0 
    k = 0
    for i in sigma2:
        sumsgm3 += i
        k += 1
        if sumsgm3 >= thre:
            return k
    return k


def svdEst(dataMat, user, simMeas, item, percentage):
    n = dataMat.shape[1]
    simTotal = 0.0;
    ratSimTotal = 0.0
    u, sigma, vt = la.svd(dataMat)
    k = sigmaPct(sigma, percentage)  
    xformedItems = compute_svd(dataMat, k)
    # print(dataMat.shape, xformedItems.shape)
    for j in range(n):
        userRating = dataMat[user, j]
        if userRating == 0 or j == item: continue
        similarity = simMeas(xformedItems.T[item], xformedItems.T[j])  
        simTotal += similarity  
        ratSimTotal += similarity * userRating  
    if simTotal == 0:
        return 0
    else:
        return ratSimTotal / simTotal 

In [49]:
df_set = [rating_sparse, rating_dense]
df_name = ["rating_sparse", "rating_dense"]

for p in range(len(df_set)):
  small_set = df_set[p]
  user_codes = small_set.user.drop_duplicates().reset_index()
  song_codes = small_set.item.drop_duplicates().reset_index()

  user_codes.rename(columns={'index':'user_index'}, inplace=True)
  song_codes.rename(columns={'index':'song_index'}, inplace=True)
  
  song_codes['so_index_value'] = list(song_codes.index)
  user_codes['us_index_value'] = list(user_codes.index)

  small_set = pd.merge(small_set,song_codes,how='left')
  small_set = pd.merge(small_set,user_codes,how='left')

  mat_candidate = small_set[['us_index_value','so_index_value','rating']]


  df = mat_candidate.copy()

  transform_matrix = np.zeros((len(user_codes), len(song_codes)))
  for i in range(df.shape[0]):
    transform_matrix[df['us_index_value'][i]][df['so_index_value'][i]] = df['rating'][i]
  origin_matrix = transform_matrix.copy()
  test_idx = random.sample(list(range(df.shape[0])), int(df.shape[0]*0.2))
  for i in range(len(test_idx)):
    origin_matrix[df['us_index_value'][i]][df['so_index_value'][i]] = 0
  for i in range(len(test_idx)):
    estimated_rating = svdEst(origin_matrix, df['us_index_value'][i], cosSim, df['so_index_value'][i], 0.85)
    origin_matrix[df['us_index_value'][i]][df['so_index_value'][i]] = estimated_rating
  
  mae = 0
  count = 0
  for i in range(len(test_idx)):
    mae += transform_matrix[df['us_index_value'][i]][df['so_index_value'][i]] - origin_matrix[df['us_index_value'][i]][df['so_index_value'][i]]
    count += 1
  mae /= count
  print("{} has MAE {}".format(df_name[p], mae))

rating_sparse has MAE 0.48105134036393543
rating_dense has MAE 0.8809364798235082


# Using Sklearn

In [50]:
rating_sample_copy = rating_sample.copy()
rating_small_copy = rating_small.copy()
rating_large_copy = rating_large.copy()

In [51]:
for dataset in [rating_sample_copy, rating_small_copy, rating_large_copy]:
  for col in ["user", "item"]:
    selCol = dataset[col]
    for i in range(len(selCol)):
      temp = ''
      for c in selCol[i]:
        if c.isdigit():
          temp += c
        else:
          temp += str(ord(c))
      selCol[i] = temp

In [52]:
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
for selected_data in [rating_sample_copy, rating_small_copy, rating_large_copy]:
  x_trainML = selected_data[["user","item"]]
  y_trainML = selected_data["rating"]
  X_train, X_test, y_train, y_test =sklearn.model_selection.train_test_split(x_trainML, y_trainML, test_size=0.25)

  selected_algorithm = [LinearSVC(), RandomForestClassifier(), MLPClassifier()]
  algorithm_name = ["LinearSVC()", "RandomForestClassifier()", "MLPClassifier()"]

  # from surprise import accuracy
  # reader = Reader(rating_scale=(1, 5))
  # trainset = X_train
  # testset = X_test
  for i in range(len(selected_algorithm)):
    algo = selected_algorithm[i]
    name = algorithm_name[i]
    algo.fit(X_train, y_train)
    predictions = algo.predict(X_test)
    print('{} has mae {}'.format(name, sklearn.metrics.mean_absolute_error(y_test, predictions, multioutput='uniform_average')))

LinearSVC() has mae 0.9088995215311004
RandomForestClassifier() has mae 0.5619138755980861
MLPClassifier() has mae 1.725933014354067
LinearSVC() has mae 0.8221734357848518
RandomForestClassifier() has mae 0.6331503841931942
MLPClassifier() has mae 0.8230515916575192
LinearSVC() has mae 0.6681450057848053
RandomForestClassifier() has mae 0.5673929811029695
MLPClassifier() has mae 0.6796182028538372


# Using tfidf

In [None]:
import json
data = []
with open("Digital_Music_5.json",'r') as load_f:
    temp = load_f.readline()
    while temp:
        row = []
        temp = json.loads(temp)
        row.append(temp['asin'])
        row.append(temp['reviewText'])
        row.append(temp['overall'])
        data.append(row)
        temp = load_f.readline()
#     break

In [None]:
comments_df = pd.DataFrame(data=data,columns=['item', 'review', 'rating'])
x_trainML = comments_df["review"]
y_trainML = comments_df["rating"]
X_train, X_test, y_train, y_test =sklearn.model_selection.train_test_split(x_trainML, y_trainML, test_size=0.25)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(analyzer = "word",  preprocessor = None, encoding='utf-8', ngram_range=(1, 4), sublinear_tf = True)
documentsVectors = vectorizer.fit_transform(x_trainML)

In [None]:
from sklearn import svm
size1 = len(X_train)
clf = svm.LinearSVC(C = 2.0)
clf.fit(documentsVectors[0:size1],y_train[0:size1])
prediction = clf.predict(documentsVectors[size1:])

In [None]:
sum = 0
for truth, pred in zip(y_test, prediction):
    sum += abs(truth-pred)
print('The mae is {}'.format(sum/len(prediction)))