
first version, can be used for collaborative filtering and very simple topic model
1 parent 2c837e1 commit 36603c21e6ecce6cd3539459e5d5094f95c99643 @whitefoxx committed Jul 29, 2012
Showing with 1,127 additions and 1 deletion.
  1. +32 −0 Makefile
  2. +32 −1 README.md
  3. +10 −0 src/config.h
  4. +47 −0 src/corpus.cpp
  5. +42 −0 src/corpus.h
  6. +78 −0 src/dataset.cpp
  7. +95 −0 src/dataset.h
  8. +53 −0 src/lda_cf.cpp
  9. +211 −0 src/model.cpp
  10. +61 −0 src/model.h
  11. +207 −0 src/model_tp.cpp
  12. +64 −0 src/model_tp.h
  13. +174 −0 src/utils.cpp
  14. +21 −0 src/utils.h
Makefile
@@ -0,0 +1,32 @@
+CC = g++
+CFLAGS = -O3 -Wall
+
+all: lda_cf.o model.o dataset.o utils.o model_tp.o corpus.o
+ mkdir -p bin
+ $(CC) $(CFLAGS) lda_cf.o model.o dataset.o utils.o model_tp.o corpus.o -o bin/lda
+
+lda_cf.o: src/lda_cf.cpp
+ $(CC) $(CFLAGS) -c src/lda_cf.cpp -o lda_cf.o
+
+model.o: src/model.cpp
+ $(CC) $(CFLAGS) -c src/model.cpp -o model.o
+
+model_tp.o: src/model_tp.cpp
+ $(CC) $(CFLAGS) -c src/model_tp.cpp -o model_tp.o
+
+dataset.o: src/dataset.cpp
+ $(CC) $(CFLAGS) -c src/dataset.cpp -o dataset.o
+
+corpus.o: src/corpus.cpp
+ $(CC) $(CFLAGS) -c src/corpus.cpp -o corpus.o
+
+utils.o: src/utils.cpp
+ $(CC) $(CFLAGS) -c src/utils.cpp -o utils.o
+
+clean:
+ rm -f *.o bin/lda
+
+## example ##
+#lda -CF -train_file E:/data/epinions/6/train_map.txt -test_file E:/data/epinions/6/test_map.txt -n_user 40163 -n_item 139738 -n_v 5 -n_iter 100 -alpha 0.5 -beta 0.1 -n_topic 10
+#lda -TP -train_file E:/workspace/GibbsLDA++-0.2/data/trndocs.dat -alpha 0.5 -beta 0.1 -n_topic 100 -n_iter 500 -savestep 100 -tpNwd 20 -docNtp 10
+#$ lda -CF -train_file E:/data/epinions/6/base_2_map -n_iter 100 -n_user 6040 -n_item 3706 -n_v 5 -test_file E:/data/epinions/6/test_2_map -alpha 1 -beta 0.1 0.1 0.35 0.35 0.1 -n_topic 5
+
README.md
@@ -1,4 +1,35 @@
LDA_CF
======
-LDA_CF is a c/c++ implementation of URP model(Modeling User Rating Profiles for Collaborative Filtering, Benjamin Marlin) using Gibbs sampling.
+1. Introduction
+===============
+ LDA_CF is a C/C++ implementation of the URP model ("Modeling User Rating Profiles for Collaborative Filtering", Benjamin Marlin) using Gibbs sampling. The original paper uses variational inference instead, which is harder to implement and often works less well than Gibbs sampling, because its result depends strongly on the initialization procedure, which is not easy to get right. Before you go deep into the implementation details, you may want to read the formula derivation in the paper "Regularized Gibbs sampling for user profiling with soft constraints".
+
+2. How to use LDA_CF
+====================
+ 2.1. data format
+ ----------------
+ The training data and the test data share the same format, as follows:
+ user_id|rating_count
+ user_id item_id rating timestamp
+ user_id item_id rating timestamp
+ ...
+ ...
+ Here "rating_count" is the number of ratings of the user with id "user_id", so exactly "rating_count" rating lines follow the "user_id|rating_count" line. Note that the timestamp field is optional; its presence is declared on the command line with "has_time". A concrete sample file is sketched after this diff.
+
+ 2.2. command line
+ -----------------
+ lda -CF -train_file <string> -alpha <double> -beta <double> -n_topic <int> -n_iter <int> -n_user <int> -n_item <int> -n_v <int> -test_file <string> -has_time -del <string>
+ n_topic: the number of topics
+ n_iter: the number of Gibbs sampling iterations
+ n_user: the number of users
+ n_item: the number of items
+ n_v: the number of distinct rating values (e.g. 5 for 1-5 star ratings)
+ train_file: the absolute path of the training file
+ test_file: the absolute path of the test file
+ has_time: pass this flag if the data set contains timestamps; omit it otherwise
+ del: the delimiter between the fields of a rating line; the default is a blank space
+ alpha, beta: the LDA hyperparameters; note that beta takes "n_v" values
+
+ It can also be used as a plain topic model:
+ lda -TP -train_file <string> -alpha <double> -beta <double> -eta <double> -n_topic <int> -savestep <int> -tpNwd <int> -docNtp <int>
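The sampler itself is a direct analogue of collapsed Gibbs sampling for LDA. As a sketch only (by analogy; see the cited "Regularized Gibbs sampling" paper for the exact derivation): for a rating with value v given by user u to item j, the attitude assignment z is resampled from

    P(z = k | rest) ∝ (C_uk + alpha) * (C_kjv + beta_v) / (C_kj + sum_v' beta_v')

where C_uk counts user u's ratings currently assigned to attitude k, C_kjv counts how often item j received value v under attitude k, C_kj sums C_kjv over all values, and every count excludes the rating being resampled. This is also why beta takes n_v values: one pseudo-count per rating level.

For concreteness, here is a tiny hypothetical training file in the format of section 2.1 (all ids, ratings, and timestamps are made up; the timestamp column is present, so "has_time" must be passed):

    0|3
    0 12 4 1341100800
    0 305 5 1341187200
    0 77 3 1341273600
    1|2
    1 12 2 1341360000
    1 940 4 1341446400

A matching, equally hypothetical invocation, with one beta value per rating level:

    lda -CF -train_file /data/train_map.txt -test_file /data/test_map.txt -n_user 2 -n_item 1000 -n_v 5 -n_iter 100 -alpha 0.5 -beta 0.1 0.1 0.35 0.35 0.1 -n_topic 10 -has_time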
src/config.h
@@ -0,0 +1,10 @@
+#ifndef _CONFIG_H_
+#define _CONFIG_H_
+
+#define PARSER_ERROR 1000
+#define READ_DATA_ERROR 2000
+#define PARAMETER_ERROR 3000
+#define OPEN_FILE_ERROR 4000
+
+#endif
+
src/corpus.cpp
@@ -0,0 +1,47 @@
+#include "corpus.h"
+#include <cstdlib>
+#include <cstdio>
+#include <cstring>
+#include <vector>
+#include "config.h"
+
+int corpus::read_data(string data_file)
+{
+ FILE *fp = fopen(data_file.c_str(), "r");
+ if (fp == NULL)
+ return OPEN_FILE_ERROR;
+
+ const int MAX_LEN = 2 << 14;
+ char line[MAX_LEN];
+ char *tok;
+ fgets(line, MAX_LEN, fp); // the first line gives the total number of documents
+ n_doc = atoi(line);
+ docs = new document*[n_doc];
+ vector<int> tmp;
+ int id;
+ for (int i = 0; i < n_doc; i++) {
+ fgets(line, MAX_LEN, fp);
+ line[strcspn(line, "\r\n")] = '\0'; // safely strip the trailing newline, if any
+ tok = strtok(line, " ");
+ while (tok) {
+ if (word2id.find(string(tok)) == word2id.end()) {
+ id = word2id.size();
+ word2id[string(tok)] = id;
+ id2word[id] = string(tok);
+ }
+ tmp.push_back(word2id[string(tok)]);
+ tok = strtok(NULL, " ");
+ }
+ document *doc = new document(tmp.size());
+ docs[i] = doc;
+ int n = 0;
+ for (vector<int>::iterator it = tmp.begin(); it != tmp.end(); it++) {
+ docs[i]->words[n++] = *it;
+ }
+ tmp.clear();
+ }
+ n_word = word2id.size();
+ fclose(fp);
+
+ return 0;
+}
src/corpus.h
@@ -0,0 +1,42 @@
+#ifndef _CORPUS_H_
+#define _CORPUS_H_
+
+#include <string>
+#include <map>
+
+using namespace std;
+
+class document {
+public:
+ int *words;
+ int length;
+ document() {
+ length = 0;
+ words = NULL;
+ }
+ document(int len) {
+ length = len;
+ words = new int[length];
+ }
+};
+
+class corpus {
+public:
+ int n_doc;
+ int n_word;
+ map<string, int> word2id;
+ map<int, string> id2word;
+
+ document **docs;
+
+ corpus() {
+ n_doc = 0;
+ n_word = 0;
+ docs = NULL;
+ }
+
+ int read_data(string train_file);
+};
+
+#endif
+
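A minimal usage sketch for the two classes above (the file name is hypothetical; the error code comes from src/config.h):

    #include <cstdio>
    #include "corpus.h"
    #include "config.h"

    int main()
    {
        corpus c;
        // read_data fills docs, the word2id/id2word maps, n_doc and n_word
        if (c.read_data("trndocs.dat") == OPEN_FILE_ERROR) {
            fprintf(stderr, "cannot open the training file\n");
            return 1;
        }
        printf("%d documents, %d distinct words\n", c.n_doc, c.n_word);
        // map the first token of the first document back to its string form
        if (c.n_doc > 0 && c.docs[0]->length > 0)
            printf("first token: %s\n", c.id2word[c.docs[0]->words[0]].c_str());
        return 0;
    }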
src/dataset.cpp
@@ -0,0 +1,78 @@
+#include "dataset.h"
+#include <cstdlib>
+#include <cstdio>
+#include <cstring>
+#include <map>
+#include "config.h"
+
+int dataset::read_ratings(string data_file, vector<int> *users)
+{
+ FILE *fp = fopen(data_file.c_str(), "r");
+ if (fp == NULL)
+ return OPEN_FILE_ERROR;
+ char line[100];
+ char *tok;
+ int user, item, star, timestamp, n;
+ while (fgets(line, 100, fp)) {
+ tok = strtok(line, "|"); // header line: user_id|rating_count
+ user = atoi(tok);
+ users->push_back(user);
+ tok = strtok(NULL, "|");
+ n = atoi(tok);
+ user_rs[user] = new rating_set(n);
+ double s = 0;
+ for (int i = 0; i < n; i++) {
+ fgets(line, 100, fp);
+ tok = strtok(line, del.c_str());
+ user = atoi(tok);
+ tok = strtok(NULL, del.c_str());
+ item = atoi(tok);
+ tok = strtok(NULL, del.c_str());
+ star = atoi(tok);
+ if (has_time) {
+ tok = strtok(NULL, del.c_str());
+ timestamp = atoi(tok);
+ }
+ else {
+ timestamp = 0;
+ }
+ user_rs[user]->ratings[i] = rating(item, star, timestamp);
+ s += (double)star;
+ }
+ user_rs[user]->avg = (n > 0) ? s / n : 0; // guard against users with no ratings
+ }
+ fclose(fp);
+
+ return 0;
+}
+
+
+int dataset::read_links(string link_file)
+{
+ FILE *fp = fopen(link_file.c_str(), "r");
+ if (fp == NULL)
+ return OPEN_FILE_ERROR;
+ char line[100];
+ char *tok;
+ vector<int> *all_links = new vector<int>[n_user];
+ int user_1, user_2;
+ while (fgets(line, 100, fp)) {
+ tok = strtok(line, " ");
+ user_1 = atoi(tok);
+ tok = strtok(NULL, " ");
+ user_2 = atoi(tok);
+ all_links[user_1].push_back(user_2);
+ all_links[user_2].push_back(user_1);
+ }
+ fclose(fp);
+
+ for (int i = 0; i < n_user; i++) {
+ user_ls[i] = new link_set(all_links[i].size());
+ for (int j = 0; j < user_ls[i]->length; j++) {
+ user_ls[i]->links[j] = all_links[i][j];
+ }
+ }
+ delete[] all_links; // free the temporary adjacency lists
+
+ return 0;
+}
+
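read_links is not exercised by the command line documented in the README; as the parser above shows, it expects a plain edge list with one "user_1 user_2" pair per line, stored in both directions. A hypothetical direct call (the file name is made up; the user/item counts are borrowed from the epinions example in the Makefile):

    #include <cstdio>
    #include "dataset.h"
    #include "config.h"

    int main()
    {
        dataset data(40163, 139738, 5, false, " ");
        if (data.read_links("/data/links.txt") == OPEN_FILE_ERROR) {
            fprintf(stderr, "cannot open the link file\n");
            return 1;
        }
        printf("user 0 has %d links\n", data.user_ls[0]->length);
        return 0;
    }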
src/dataset.h
@@ -0,0 +1,95 @@
+#ifndef _DATASET_H_
+#define _DATASET_H_
+
+#include <string>
+#include <vector>
+
+using namespace std;
+
+class link_set {
+public:
+ int *links;
+ int length;
+
+ link_set() {
+ links = NULL;
+ length = 0;
+ }
+
+ link_set(int len) {
+ length = len;
+ links = new int[length];
+ }
+};
+
+class rating {
+public:
+ int item;
+ int star;
+ int timestamp;
+
+ rating() {}
+
+ rating(int i, int s, int t) {
+ item = i;
+ star = s;
+ timestamp = t;
+ }
+};
+
+class rating_set {
+public:
+ rating *ratings;
+ int length;
+ double avg;
+ rating_set() {
+ length = 0;
+ avg = 0;
+ ratings = NULL;
+ }
+ rating_set(int len) {
+ length = len;
+ avg = 0;
+ ratings = new rating[length];
+ }
+};
+
+class dataset {
+public:
+ int n_user;
+ int n_item;
+ int n_v;
+ bool has_time;
+ string del;
+
+ rating_set **user_rs;
+ link_set **user_ls;
+
+ dataset() {
+ n_user = 0;
+ n_item = 0;
+ n_v = 0;
+ user_rs = NULL;
+ user_ls = NULL;
+ }
+
+ dataset(int nu, int ni, int nv, bool has_time, string del) {
+ n_user = nu;
+ n_item = ni;
+ n_v = nv;
+ user_rs = new rating_set*[n_user];
+ user_ls = new link_set*[n_user];
+ for (int i = 0; i < n_user; i++) {
+ user_rs[i] = NULL;
+ user_ls[i] = NULL;
+ }
+ this->has_time = has_time;
+ this->del = del;
+ }
+
+ int read_ratings(string train_file, vector<int> *users);
+ int read_links(string link_file);
+};
+
+#endif
+
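And a matching sketch for the rating reader (the path is hypothetical and the sizes are borrowed from one of the example commands in the Makefile; in the real binary this wiring presumably lives in src/lda_cf.cpp, driven by the command-line flags):

    #include <cstdio>
    #include <vector>
    #include "dataset.h"
    #include "config.h"

    int main()
    {
        // n_user, n_item, n_v, has_time, delimiter -- as given on the command line
        dataset data(6040, 3706, 5, false, " ");
        vector<int> users; // read_ratings appends every user id it reads
        if (data.read_ratings("/data/train_map.txt", &users) == OPEN_FILE_ERROR) {
            fprintf(stderr, "cannot open the ratings file\n");
            return 1;
        }
        if (!users.empty()) {
            int u = users[0];
            printf("user %d: %d ratings, average %.2f\n",
                   u, data.user_rs[u]->length, data.user_rs[u]->avg);
        }
        return 0;
    }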