In [1]:
import pickle
import pandas as pd
import numpy as np
import os
import cornac
from scipy.sparse import coo_matrix, csr_matrix
from sklearn.metrics.pairwise import cosine_similarity
from cornac import Experiment
import datetime

FM model is only supported on Linux.
Windows executable can be found at http://www.libfm.org.


In [2]:
def get_similarity_dict(rating_path, contact_path):
    """Compute rating-based cosine similarity for each pair of users linked in the contact map.

    Parameters
    ----------
    rating_path : str
        CSV file read with ``pandas.read_csv``. It must provide the columns
        ``userId``, ``movieId`` and ``rating``.
    contact_path : str
        Pickle file holding the contact map. It is iterated as
        ``{group: {user: iterable_of_peers}}``.

    Returns
    -------
    tuple
        ``(sim_dict, contact, info)`` where ``sim_dict`` maps ``"<user>-<peer>"``
        to the cosine similarity of the two users' rating rows, ``contact`` is
        the unpickled contact map and ``info`` is the ratings DataFrame.
        Each unordered pair is stored once, under the orientation seen first.

    Raises
    ------
    KeyError
        If a user or peer from the contact map has no row in the ratings file.
    """
    info = pd.read_csv(rating_path)
    # SECURITY: pickle.load can execute arbitrary code; only open trusted files.
    with open(contact_path, 'rb') as f:
        contact = pickle.load(f)

    user_ids = info.loc[:, "userId"].astype(str)
    movie_ids = info.loc[:, "movieId"].astype(str)
    # Dense 0..n-1 indices in order of first appearance.
    user_dict = {user_id: idx for idx, user_id in enumerate(user_ids.drop_duplicates())}
    movie_dict = {movie_id: idx for idx, movie_id in enumerate(movie_ids.drop_duplicates())}

    row = [user_dict[user_id] for user_id in user_ids]
    col = [movie_dict[movie_id] for movie_id in movie_ids]
    data = info.loc[:, "rating"].astype(np.float64).values
    # Users x movies rating matrix; explicit shape keeps it tied to the id maps.
    D_sparse = coo_matrix(
        (data, (row, col)), shape=(len(user_dict), len(movie_dict))
    ).tocsr()
    similarity_matrix = cosine_similarity(D_sparse, dense_output=True)

    sim_dict = dict()
    for group in contact.values():
        for user, peers in group.items():
            for peer in peers:
                forward_key = str(user) + "-" + str(peer)
                # Same separator as the stored keys, so that a pair already
                # recorded as "<peer>-<user>" is recognised and skipped.
                reverse_key = str(peer) + "-" + str(user)
                if forward_key in sim_dict or reverse_key in sim_dict:
                    continue
                sim_dict[forward_key] = similarity_matrix[
                    user_dict[str(user)], user_dict[str(peer)]
                ]
    return sim_dict, contact, info

In [3]:
class ExperimentResult(list):
    """
    Result Class for an Experiment. A list of :obj:`cornac.experiment.Result`.

    ``str(result)`` renders one table row per stored result, with one column
    per key of the first result's ``metric_avg_results``.
    """

    # Used when no module-level ``NUM_FMT`` has been defined yet.
    _DEFAULT_NUM_FMT = "{:.4f}"

    def __str__(self):
        """Return the results as a text table; an empty container gives ``""``."""
        if not self:
            return ""

        # Honour a module-level NUM_FMT when present, otherwise fall back.
        num_fmt = globals().get("NUM_FMT", self._DEFAULT_NUM_FMT)
        headers = list(self[0].metric_avg_results.keys())
        data = [
            [num_fmt.format(r.metric_avg_results[m]) for m in headers] for r in self
        ]
        index = [r.model_name for r in self]
        return self._table_format(data, headers, index, h_bars=[1])

    @staticmethod
    def _table_format(data, headers=None, index=None, extra_spaces=0, h_bars=None):
        """Format rows of values as a ``|``-separated text table.

        Parameters
        ----------
        data : list of list
            Table body, one inner list per row. Not modified.
        headers : list, optional
            Column titles, placed as the first row.
        index : list, optional
            Row labels, placed as a left-aligned first column. Not modified.
        extra_spaces : int
            Extra width added to every column.
        h_bars : iterable of int, optional
            Row positions (counting the header row) before which a horizontal
            bar is drawn.

        Returns
        -------
        str
            The table, one line per row, each ending in a newline.
        """
        # Work on copies so the caller's lists are left untouched.
        rows = [list(row) for row in data]
        if headers is not None:
            rows.insert(0, list(headers))
        if index is not None:
            # The header row, when present, gets an empty label.
            labels = ([""] if headers is not None else []) + list(index)
            for label, row in zip(labels, rows):
                row.insert(0, label)

        if not rows:
            return ""

        column_widths = [max(len(str(v)) for v in col) for col in zip(*rows)]

        formats = ["{:>%d}" % (w + extra_spaces) for w in column_widths]
        if index is not None and formats:
            # Label column is left-aligned; value columns stay right-aligned.
            formats[0] = "{:<%d}" % (column_widths[0] + extra_spaces)
        row_fmt = " | ".join(formats) + "\n"

        output = ""
        for i, row in enumerate(rows):
            if h_bars is not None and i in h_bars:
                output += row_fmt.format(
                    *["-" * (w + extra_spaces) for w in column_widths]
                ).replace("|", "+")
            output += row_fmt.format(*row)
        return output


class CVExperimentResult(ExperimentResult):
    """
    Result Class for a cross-validation Experiment.

    ``str(result)`` is the string form of every contained element, joined by
    newlines.
    """

    def __str__(self):
        """Return the newline-joined string forms of the contained results.

        Producing the string no longer writes a banner to stdout.
        """
        return "\n".join(str(r) for r in self)

In [4]:
class MyExperiment(Experiment):
    """Cornac ``Experiment`` whose ``run`` prints the result tables and returns the test results.

    Parameters
    ----------
    eval_method
        Object whose ``evaluate(model, metrics, user_based, show_validation)``
        returns a ``(test_result, val_result)`` pair.
    models
        Models to evaluate; passed through ``self._validate_models``.
    metrics
        Metrics to compute; passed through ``self._validate_metrics``.
    user_based : bool
        Forwarded to ``eval_method.evaluate``.
    show_validation : bool
        Forwarded to ``eval_method.evaluate``.
    verbose : bool
        Stored on the instance and forwarded to the parent constructor.
    save_dir : str or None
        Passed to ``model.save`` after each evaluation.
    """

    def __init__(
            self,
            eval_method,
            models,
            metrics,
            user_based=True,
            show_validation=True,
            verbose=False,
            save_dir=None
        ):
        # Forward the caller's options instead of hard-coded defaults, so the
        # parent is set up with the same configuration as this instance.
        super().__init__(
            eval_method=eval_method,
            models=models,
            metrics=metrics,
            user_based=user_based,
            show_validation=show_validation,
            verbose=verbose,
            save_dir=save_dir
        )
        # Set explicitly because run() reads these attributes and the parent
        # constructor is not visible from this notebook.
        self.eval_method = eval_method
        self.models = self._validate_models(models)
        self.metrics = self._validate_metrics(metrics)
        self.user_based = user_based
        self.show_validation = show_validation
        self.verbose = verbose
        self.save_dir = save_dir
        self.result = None
        self.val_result = None

    def run(self):
        """Run the Cornac experiment.

        Evaluates every model, prints the validation table (when one is
        collected) and the test table, and returns ``self.result``.
        """
        # Inherited helper; expected to set self.result (and self.val_result).
        self._create_result()

        for model in self.models:
            test_result, val_result = self.eval_method.evaluate(
                model=model,
                metrics=self.metrics,
                user_based=self.user_based,
                show_validation=self.show_validation,
            )

            self.result.append(test_result)
            if self.val_result is not None:
                self.val_result.append(val_result)

            # NOTE(review): this tests against the notebook-local
            # CVExperimentResult, but the container built by _create_result was
            # observed to be cornac's own class - confirm the check can match.
            if not isinstance(self.result, CVExperimentResult):
                model.save(self.save_dir)

        output = ""
        if self.val_result is not None:
            output += "\nVALIDATION:\n...\n{}".format(self.val_result)
        output += "\nTEST:\n...\n{}".format(self.result)

        print(output)
        return self.result

        # timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S-%f")
        # save_dir = "." if self.save_dir is None else self.save_dir
        # output_file = os.path.join(save_dir, "CornacExp-{}.log".format(timestamp))
        # with open(output_file, "w") as f:
        #     f.write(output)

In [None]:
# NOTE: this cell reads `info` and `p`, which are only assigned by cells further
# down (the get_similarity_dict(...) call and the peer-selection cell).
# Run those first, or move this cell below them.
peer_ratings = info.loc[info["userId"].isin(p), ["userId", "movieId", "rating"]]

# Build (user, item, rating) triples column by column. Indexing row by row with
# .iloc[i][0] upcasts each mixed int/float row to float, which turns the ids
# into strings such as "22499.0", and it is slow on large frames.
data = list(
    zip(
        peer_ratings["userId"].astype(str).tolist(),
        peer_ratings["movieId"].astype(str).tolist(),
        peer_ratings["rating"].astype(float).tolist(),
    )
)
ratio_split = cornac.eval_methods.RatioSplit(data=data, test_size=0.4)

global_avg = cornac.models.GlobalAvg()
mf = cornac.models.MF(
    k=10,
    max_iter=30,
    learning_rate=0.01,
    lambda_reg=0.02,
    use_bias=True,
    early_stop=True,
    verbose=True,
)

mse = cornac.metrics.MSE()

# `re` holds the experiment result here (it is not the stdlib regex module).
re = MyExperiment(
    eval_method=ratio_split,
    models=[mf],
    metrics=[mse],
).run()

In [3]:
# Build the pair-similarity lookup and keep the unpickled contact map and the
# ratings DataFrame for the cells that follow.
sim_dict, contact, info = get_similarity_dict("./rating_7474.csv", "./contact_7474.pkl")

In [20]:
# with open("test_7474.pkl", 'wb') as f:
#     pickle.dump(sim_dict, f)

In [4]:
# Number of entries stored under each top-level key of the contact map.
item_size = [len(group) for group in contact.values()]

In [5]:
# Take the contact entry stored under key 6 and display it. The output shows it
# to be a dict of user id -> set of peer ids.
peer = contact[6]
peer

{98477: {30430, 55245},
 55245: {98477},
 126727: {19889},
 19889: {126727},
 124792: {141900},
 118999: {84374},
 84374: {118999},
 11940: {97811},
 97811: {11940},
 13302: {85033},
 90664: {43593, 57784, 67626, 75242, 91032, 112619, 113605},
 42759: {70369, 158786},
 70369: {42759, 158786},
 158786: {42759, 70369},
 20123: {4882,
  7730,
  10173,
  12643,
  28515,
  47745,
  78975,
  80908,
  118581,
  130885,
  135403},
 4882: {20123, 112600},
 112600: {4882},
 59353: {27440},
 64439: {70267},
 109046: {108585},
 108585: {109046, 151958},
 151958: {108585},
 151177: {4207, 73867},
 143784: {67265, 115074},
 115074: {67265, 143784},
 67265: {115074, 143784},
 25337: {15764},
 15764: {25337},
 107342: {108056},
 108056: {107342},
 4207: {73867, 151177},
 73867: {4207, 151177},
 43810: {144117, 159083},
 159083: {43810, 144117},
 144117: {43810, 159083},
 19509: {5210, 23209, 110437, 113405},
 23209: {5210, 19509, 110437, 113405},
 110437: {5210, 19509, 23209, 66071, 113405, 122214},
 

In [6]:
# Count how many users have each peer-set size (1..31), stopping at the first
# user with more than 30 peers and printing that user's id.
peer_info = dict.fromkeys(range(1, 32), 0)
for k, v in peer.items():
    l = len(v)
    if l <= 30:
        num = peer_info[l] + 1
        peer_info[l] = num
        continue
    print(k)
    break

22499


In [7]:
# User id printed by the previous cell (the first user with more than 30 peers).
t = 22499
# That user's peers followed by the user itself.
p = [*peer[t], t]


In [8]:
# Display the selected user ids (the peers of `t`, with `t` appended last).
p

[47745,
 152070,
 80908,
 142996,
 104724,
 110231,
 119935,
 113696,
 132390,
 70575,
 7730,
 73523,
 5557,
 118581,
 10173,
 139838,
 130885,
 51271,
 40137,
 136913,
 59481,
 142176,
 111329,
 127202,
 123488,
 12643,
 28515,
 135403,
 16494,
 86651,
 78975,
 22499]

In [9]:
# Ratings of the selected users only.
peer_ratings = info.loc[info["userId"].isin(p), ["userId", "movieId", "rating"]]

# Build (user, item, rating) triples column by column. Indexing row by row with
# .iloc[i][0] upcasts each mixed int/float row to float, which turns the ids
# into strings such as "22499.0", and it is slow on large frames.
data = list(
    zip(
        peer_ratings["userId"].astype(str).tolist(),
        peer_ratings["movieId"].astype(str).tolist(),
        peer_ratings["rating"].astype(float).tolist(),
    )
)
ratio_split = cornac.eval_methods.RatioSplit(data=data, test_size=0.4)

global_avg = cornac.models.GlobalAvg()
mf = cornac.models.MF(
    k=10,
    max_iter=30,
    learning_rate=0.01,
    lambda_reg=0.02,
    use_bias=True,
    early_stop=True,
    verbose=True,
)

mse = cornac.metrics.MSE()

# `re` holds the experiment result here (it is not the stdlib regex module).
re = MyExperiment(
    eval_method=ratio_split,
    models=[mf],
    metrics=[mse],
).run()

In [10]:

# ratio_split = cornac.eval_methods.RatioSplit(data=data, test_size=0.4, verbose=True, user_based=True)



In [12]:
# Instantiate cornac's SVD model with default arguments. The instance is not
# bound to a name, so this cell only displays its repr.
cornac.models.SVD()

<cornac.models.svd.recom_svd.SVD at 0x1047fbd60>

In [13]:
# Only MSE is active; the MAE and RMSE alternatives below are commented out.
# mae = cornac.metrics.MAE()
# rmse = cornac.metrics.RMSE()
mse = cornac.metrics.MSE()

In [155]:
# r = cornac.Experiment(
#     eval_method=ratio_split,
#     models=[global_avg, mf],
#     metrics=[mse],
#     user_based=True,
# ).run()

100%|██████████| 10/10 [00:00<00:00, 1669.71it/s, loss=107.23]

Optimization finished!

TEST:
...
          |    MSE | Train (s) | Test (s)
--------- + ------ + --------- + --------
GlobalAvg | 1.0422 |    0.0000 |   0.0181
MF        | 0.9034 |    0.0084 |   0.0265






'rating'

In [14]:
# Format string that ExperimentResult.__str__ applies to every metric value
# (four decimal places). It must be defined before that method is called.
NUM_FMT = "{:.4f}"


In [24]:
import datetime

100%|██████████| 30/30 [00:00<00:00, 1426.86it/s, loss=47.33]

Optimization finished!
--------------------------------------------- <class 'cornac.experiment.result.Result'>

TEST:
...
   |    MSE | Train (s) | Test (s)
-- + ------ + --------- + --------
MF | 0.9031 |    0.0230 |   0.0268






In [29]:
# `re` is the value returned by MyExperiment(...).run(); check its class.
type(re)     

cornac.experiment.result.ExperimentResult

In [34]:
# Render the experiment result as its text table.
str(re)

'   |    MSE | Train (s) | Test (s)\n-- + ------ + --------- + --------\nMF | 0.9031 |    0.0230 |   0.0268\n'

In [67]:
# Load the "100K" variant of the MovieLens feedback data through cornac's dataset helper.
d = cornac.datasets.movielens.load_feedback(variant="100K")

In [81]:
# First feedback entry; the output shows a (str, str, float) triple.
d[0]

('196', '242', 3.0)