## Recommendation as a rating prediction/classification problem

### Classification: Given the history of a user and a new beer, predict whether the user will give an overall rating >= 4 (success) or not (failure)
### Regression: Given the history of a user and a new beer, predict the overall rating the user will give

#### The approach can be to predict/classify all aspect-ratings

In [19]:
import fastparquet
import pandas as pd
import numpy as np
# User-side table; later cells read its 'review/profileName', 'review/count',
# 'user/Id' and 'beer/id' columns.
all_users = pd.read_parquet(path="userFinal.parquet")
# Product-side table keyed by 'beer/id'. The smaller variant is kept for reference:
# all_prods = pd.read_parquet("prod307Final.parquet")
all_prods = pd.read_parquet(path="prod2549Final.parquet")

In [20]:
# Keep only users whose summed 'review/count' is at least 500.
# Only 'review/count' is aggregated: summing every column of `all_users` is
# wasted work and can misbehave on non-numeric columns.
_grouped = all_users.groupby('review/profileName')[['review/count']].sum()
_indices = _grouped[_grouped['review/count'] >= 500].index
newUsers = all_users[all_users['review/profileName'].isin(_indices)]

In [21]:
# (number of retained rows, number of distinct profile names)
newUsers.shape[0], newUsers['review/profileName'].nunique()

(2501699, 1908)

### Stratified splitting (per user) of the data into an 80-20 ratio

In [22]:
# Draw 80% of each user's rows (without replacement) for training.
# The fixed seed makes the split, and every score computed from it,
# reproducible across runs.
training = newUsers.groupby('review/profileName').sample(
    frac=0.80, replace=False, random_state=42
)
len(training)/len(newUsers)

0.8000027181527434

In [23]:
# Every row whose index label was not drawn into `training` is held out for testing.
testing = newUsers[~newUsers.index.isin(training.index)]
len(testing) / len(newUsers)

0.1999972818472566

In [24]:

def get_usrdf(usr_id, sampleDF):
    """Return the rows of `sampleDF` whose 'user/Id' equals `usr_id`, sorted by 'beer/id'."""
    is_this_user = sampleDF['user/Id'] == usr_id
    user_rows = sampleDF.loc[is_this_user]
    return user_rows.sort_values(by='beer/id')
def get_usr_prod_df(usrdf):
    """Return the `all_prods` rows for the beers in `usrdf`, row-aligned with `usrdf`.

    The i-th returned row is the `all_prods` row whose 'beer/id' equals the
    i-th 'beer/id' of `usrdf`, so the features built from the result line up
    with the targets taken from `usrdf`. A plain `isin` mask would instead
    return rows in `all_prods` order and only once per beer, which is aligned
    only if `all_prods` happens to be sorted by 'beer/id' and `usrdf` has no
    repeated beers.

    Columns (and their positions) are those of `all_prods`; index labels are
    the original `all_prods` labels.

    Raises:
        KeyError: if `usrdf` contains a 'beer/id' that is absent from
            `all_prods` (such rows could never be aligned with a target).
    """
    wanted = usrdf['beer/id']
    known = wanted.isin(all_prods['beer/id'])
    if not known.all():
        unknown = wanted[~known].unique().tolist()
        raise KeyError(f"beer/id values not present in all_prods: {unknown[:10]}")
    # Positional lookup in `usrdf` order; avoids copying or re-indexing `all_prods`.
    positions = pd.Index(all_prods['beer/id']).get_indexer_for(wanted)
    return all_prods.iloc[positions]
    
def get_A(usr_prod_df):
    """Return the feature matrix: all rows, columns from position 10 onward."""
    return usr_prod_df.iloc[:, 10:]
def get_b(usrdf):
    """Return the target vector: the column at position 4 of `usrdf`."""
    return usrdf.iloc[:, 4]  #! only overall score

In [25]:
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from scipy.stats import uniform


def test(usr_id, regressor, verbose=True, classification = False):
    """Score a fitted estimator on one user's rows from the `testing` split.

    Args:
        usr_id: value matched against the 'user/Id' column of `testing`.
        regressor: fitted object exposing `score(X=..., y=...)`; despite the
            name it is also called with classifiers.
        verbose: when true, print a one-line summary without a trailing
            newline so the caller can append to the same line.
        classification: when true, targets are binarised to 1 (rating >= 4)
            or 0 before scoring.

    Returns:
        Whatever `regressor.score` returns for this user's held-out rows.
    """
    held_out = get_usrdf(usr_id, sampleDF=testing)
    features = get_A(get_usr_prod_df(held_out))
    targets = get_b(held_out)
    if classification:
        targets = np.where(targets >= 4, 1, 0)

    score = regressor.score(X=features, y=targets)
    if not verbose:
        return score

    n_samples = len(held_out)
    n_params = features.shape[1]
    shown_score = round(score, 3)
    print(
        f"[!] [Testing ] usr_id: {usr_id:^5} | Samples: {n_samples:^5} |"
        f" Parameters: {n_params:^5} | Test Score: {shown_score:>3}", end=""
    )
    return score

In [26]:
# Pick 5 distinct users. Sampling the raw column draws 5 *rows*, which can
# repeat a user id and favours users with more rows; de-duplicate first.
# The fixed seed keeps the chosen users stable across runs.
usr_ids = newUsers['user/Id'].drop_duplicates().sample(5, random_state=42)

### Training `Ridge Regressor` with `5-fold cross validation` and `random search`
#### Problem Statement: Given the history of a user and a new beer, predict the overall rating the user will give

In [27]:
# Fitted hyper-parameter searches keyed by user id.
# NOTE: despite the `_rf` suffix this dict holds the *Ridge* searches; the name
# is kept because later cells read it.
model_dict_rf = {}

print(r"<==*== Fitting Ridge regressor ==*==>".center(150),end="\n\n")

for usr_id in usr_ids:
    # Training rows of this user, the matching product features, and the ratings.
    usrdf = get_usrdf(usr_id, training)
    usr_prod_df = get_usr_prod_df(usrdf)
    A, b = get_A(usr_prod_df), get_b(usrdf)
    print(
        f"[!] [Training] usr_id: {usr_id:^5} | Samples: {len(usrdf):^5} |"
        f" Parameters: {A.shape[1]:^5} | ",
        end="",
    )
    # 5 random draws of alpha, each scored with 5-fold cross-validation.
    clf = RandomizedSearchCV(
        estimator=Ridge(),
        param_distributions={"alpha": uniform(loc=1.1, scale=3.9)},  # uniformly pick between 1.1 and 5
        n_iter=5,
        cv=5,
        verbose=0,
    )
    clf.fit(A, b)
    print(f"Best Score: {round(clf.best_score_,3)} | Best Param: {clf.best_params_} ")

    model_dict_rf[usr_id] = clf

                                                        <==*== Fitting Ridge regressor ==*==>                                                         

[!] [Training] usr_id: 16011 | Samples:  851  | Parameters: 2549  | Best Score: 0.321 | Best Param: {'alpha': 1.3885821740641209} 
[!] [Training] usr_id: 3035  | Samples: 2082  | Parameters: 2549  | Best Score: 0.238 | Best Param: {'alpha': 2.444668595146262} 
[!] [Training] usr_id: 8240  | Samples:  819  | Parameters: 2549  | Best Score: 0.324 | Best Param: {'alpha': 1.4782065008230552} 
[!] [Training] usr_id: 15603 | Samples: 6255  | Parameters: 2549  | Best Score: 0.435 | Best Param: {'alpha': 2.738984978716875} 
[!] [Training] usr_id: 7100  | Samples: 1510  | Parameters: 2549  | Best Score: 0.298 | Best Param: {'alpha': 2.001005817874194} 


In [28]:
# Held-out score for every fitted Ridge search, printed next to that search's
# `best_score_` (labelled "Best Training Score" in the output).
for usr_id, fitted_search in model_dict_rf.items():
    test(usr_id, regressor=fitted_search, classification=False)
    print(f" | Best Training Score: {round(fitted_search.best_score_,3):>3}")

[!] [Testing ] usr_id: 16011 | Samples:  213  | Parameters: 2549  | Test Score: 0.367 | Best Training Score: 0.321
[!] [Testing ] usr_id: 3035  | Samples:  520  | Parameters: 2549  | Test Score: 0.245 | Best Training Score: 0.238
[!] [Testing ] usr_id: 8240  | Samples:  205  | Parameters: 2549  | Test Score: 0.322 | Best Training Score: 0.324
[!] [Testing ] usr_id: 15603 | Samples: 1564  | Parameters: 2549  | Test Score: 0.427 | Best Training Score: 0.435
[!] [Testing ] usr_id: 7100  | Samples:  378  | Parameters: 2549  | Test Score: 0.248 | Best Training Score: 0.298


### Training `Random Forest Classifier` with `5-fold cross validation` and `grid search`
#### Problem Statement: Given the history of a user and a new beer, predict whether the user will give an overall rating >= 4 (success) or not (failure)

In [29]:
import warnings
# Silences every warning for the rest of the session, not only during this cell.
warnings.filterwarnings("ignore")

# Fitted hyper-parameter searches keyed by user id.
# NOTE: despite the `_ridge` suffix this dict holds the *random forest*
# searches; the name is kept because later cells read it.
model_dict_ridge = {}

print(r"<==*== Fitting Random Forest Classifier ==*==>".center(150),end="\n\n")


for usr_id in usr_ids:
    # Training rows of this user, the matching product features, and the
    # binarised target: 1 when the rating is >= 4, else 0.
    usrdf = get_usrdf(usr_id, training)
    usr_prod_df = get_usr_prod_df(usrdf)
    A = get_A(usr_prod_df)
    b = np.where(get_b(usrdf)>= 4, 1, 0)
    print(
        f"[!] [Training] usr_id: {usr_id:^5} | Samples: {len(usrdf):^5} |"
        f" Parameters: {A.shape[1]:^5} | ",
        end="",
    )
    # Exhaustive 4 x 4 grid, each candidate scored with 5-fold cross-validation.
    clf = GridSearchCV(
        estimator=RandomForestClassifier(),
        param_grid={"min_samples_split": [2, 5, 10, 15], "min_samples_leaf": [2, 5, 10, 15]},
        n_jobs=5,
        cv=5,
        verbose=0,
    )
    clf.fit(A, b)
    print(f"Best Score: {round(clf.best_score_,3):>3} | Best Param: {clf.best_params_} ")

    model_dict_ridge[usr_id] = clf

                                                    <==*== Fitting Random Forest Classifier ==*==>                                                    

[!] [Training] usr_id: 16011 | Samples:  851  | Parameters: 2549  | Best Score: 0.696 | Best Param: {'min_samples_leaf': 2, 'min_samples_split': 15} 
[!] [Training] usr_id: 3035  | Samples: 2082  | Parameters: 2549  | Best Score: 0.625 | Best Param: {'min_samples_leaf': 15, 'min_samples_split': 15} 
[!] [Training] usr_id: 8240  | Samples:  819  | Parameters: 2549  | Best Score: 0.656 | Best Param: {'min_samples_leaf': 15, 'min_samples_split': 5} 
[!] [Training] usr_id: 15603 | Samples: 6255  | Parameters: 2549  | Best Score: 0.67 | Best Param: {'min_samples_leaf': 15, 'min_samples_split': 10} 
[!] [Training] usr_id: 7100  | Samples: 1510  | Parameters: 2549  | Best Score: 0.699 | Best Param: {'min_samples_leaf': 5, 'min_samples_split': 15} 


In [30]:
# Held-out score for every fitted random-forest search, printed next to that
# search's `best_score_` (labelled "Best Training Score" in the output).
for usr_id, fitted_search in model_dict_ridge.items():
    test(usr_id, regressor=fitted_search, classification=True)
    print(f" | Best Training Score: {round(fitted_search.best_score_,3):>3}")

[!] [Testing ] usr_id: 16011 | Samples:  213  | Parameters: 2549  | Test Score: 0.7 | Best Training Score: 0.696
[!] [Testing ] usr_id: 3035  | Samples:  520  | Parameters: 2549  | Test Score: 0.656 | Best Training Score: 0.625
[!] [Testing ] usr_id: 8240  | Samples:  205  | Parameters: 2549  | Test Score: 0.639 | Best Training Score: 0.656
[!] [Testing ] usr_id: 15603 | Samples: 1564  | Parameters: 2549  | Test Score: 0.656 | Best Training Score: 0.67
[!] [Testing ] usr_id: 7100  | Samples:  378  | Parameters: 2549  | Test Score: 0.69 | Best Training Score: 0.699
