## Load Libraries and Initital Dataframe

In [74]:
# Load libraries
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# make a list of column names to import from CSV
fields = ['username','app_name', 'nb_run_app', 'rating']

# Load dataframe
df = pd.read_csv('../CyVerseRecommendationApp/user_all_dataset_2020_05_01.csv',delimiter=',', usecols=fields)


## Calculate New Features through Normalization

#### App Use Frequency
- Feature: nb_run_app, is the number of times a user ran a particular app
- Total_app_use by all users is the sum of all values in the vector nb_run_app
- Where, App Use Frequency = nb_run_app/Total_app_use

#### Percent Ratings
- Feature: rating, this is the rating an individual user gave for a particular app
- Maximum rating is 5, minimum rating is 1 (possibly 0?)
- percent_rating = rating/5

In [75]:
# frequency app was run by user vs. all apps run by all users, as a percent
df['app_frequency'] = df['nb_run_app'].div(df['nb_run_app'].sum(axis=0), axis=0)

# percent ratings

df['percent_rating'] = df['rating'].div(5, axis=0)

## Make Predictions Using Surprise

### *Train-test split and the fit() method with SVD algorithm*

### *RMSE to evaluate train-test*


In [77]:
from surprise import SVD
from surprise import Dataset
from surprise import accuracy
from surprise import Reader
from surprise.model_selection import train_test_split
from scipy import stats


#make a dataframe for the app use frequency 
rdf_app = df.drop(columns=['nb_run_app', 'rating', 'percent_rating'])

#define rating scale
reader = Reader(rating_scale=(0, 1))

# Load the app use frequency dataset 
data1 = Dataset.load_from_df(rdf_app, reader)
# sample random trainset and testset
# test set is made of 25% of the ratings.
trainset1, testset1 = train_test_split(data1, test_size=.25)

# We'll use the famous SVD algorithm.
algo = SVD()

# Train the algorithm on the trainset, and predict ratings for the testset
algo.fit(trainset1)
predictions1 = algo.test(testset1)

# Then compute RMSE
appfreqout = accuracy.rmse(predictions1, verbose = False)

#========================================================================
## Now let's try it with the app rating as a percent
#========================================================================

#check for normality before using RMSE to confirm fit
stats.normaltest(df['percent_rating'])

#make a dataframe for user rating frequency
rdf_rat = df.drop(columns=['nb_run_app', 'rating', 'app_frequency'])

#define rating scale
reader = Reader(rating_scale=(0, 1))

# Load the app use frequency dataset 
data2 = Dataset.load_from_df(rdf_rat, reader)
# sample random trainset and testset
# test set is made of 25% of the ratings.
trainset2, testset2 = train_test_split(data2, test_size=.25)

# We'll use the famous SVD algorithm.
algo = SVD()

# Train the algorithm on the trainset, and predict ratings for the testset
algo.fit(trainset2)
predictions2 = algo.test(testset2)

# Then compute RMSE
ratingout = accuracy.rmse(predictions2, verbose = False)

#print results
print('RMSE using SVD on app ratings by percent: ', ratingout, '\n RMSE using SVD on app frequency as a percent: ', appfreqout)

RMSE using SVD on app ratings by percent:  0.24377941046147297 
 RMSE using SVD on app frequency as a percent:  0.04002215530826723
