From 1285eec25f1175c4e2fc96f9cf46cc31663e7b2c Mon Sep 17 00:00:00 2001 From: Carl Yang Date: Mon, 12 Feb 2018 19:53:32 -0600 Subject: [PATCH] Add files via upload --- core.py | 56 ++++++++ cross.py | 59 +++++++++ evaluate.py | 160 +++++++++++++++++++++++ fit.py | 188 +++++++++++++++++++++++++++ kmeans.py | 66 ++++++++++ link.py | 30 +++++ model.py | 181 ++++++++++++++++++++++++++ multiview.py | 119 +++++++++++++++++ preprocess.py | 347 ++++++++++++++++++++++++++++++++++++++++++++++++++ regress.py | 58 +++++++++ 10 files changed, 1264 insertions(+) create mode 100644 core.py create mode 100644 cross.py create mode 100644 evaluate.py create mode 100644 fit.py create mode 100644 kmeans.py create mode 100644 link.py create mode 100644 model.py create mode 100644 multiview.py create mode 100644 preprocess.py create mode 100644 regress.py diff --git a/core.py b/core.py new file mode 100644 index 0000000..7bdb2dc --- /dev/null +++ b/core.py @@ -0,0 +1,56 @@ +import numpy as np +import matplotlib.pyplot as plt +import pickle + +cutperc = 0.1 +core_friend = set() +core_user = set() +with open('../data/core_user', 'r') as f: + f.next() + for line in f: + core_user.add(line.strip()) + +with open('../data/core_friend', 'r') as f: + f.next() + for line in f: + core_friend.add(line.strip()) + +y_user = [] +y_friend = [] +y_rest = [] +x_user = [] +x_friend = [] +x_rest = [] +with open('../data/core_degree', 'r') as f: + count = 0 + f.next() + for line in f: + words = line.split(',') + if words[0].strip() in core_user: + y_user.append(int(words[1].strip())) + x_user.append(count) + elif words[0].strip() in core_friend: + y_friend.append(int(words[1].strip())) + x_friend.append(count) + else: + y_rest.append(int(words[1].strip())) + x_rest.append(count) + count += 1 + +cutoff = np.log(count*cutperc) +percent = map(lambda i: i < cutoff, x_friend).count(True) *1.0 / len(x_friend) +print(cutoff, percent) + + +plt.scatter(np.log(y_friend), np.log(x_friend), marker='d', alpha=0.1, facecolors='none', edgecolors='r', label='K direct friends of new users') +plt.scatter(np.log(y_rest), np.log(x_rest), marker='.', alpha=0.05, facecolors='none', edgecolors='g', label='All other users') +plt.axhline(cutoff, ls='--', c='b', label='Cutoff at top 10% users \nwith highest degrees') +plt.legend(fontsize=15, ) +plt.ylabel('Log(Rank By Degree)', fontsize=18) +plt.xlabel('Log(Degree)', fontsize=18) +plt.xticks(fontsize=0) +plt.yticks(fontsize=0) +#plt.scatter(x_user, y_user, marker='*', facecolors='none', edgecolors='b') +plt.savefig('../plots/fig_core_original.png', format='png', bbox_inches='tight') +plt.show() +plt.clf() \ No newline at end of file diff --git a/cross.py b/cross.py new file mode 100644 index 0000000..eca3cf9 --- /dev/null +++ b/cross.py @@ -0,0 +1,59 @@ +from evaluate import f1_community, jc_community, nmi_community +import pickle +import numpy as np +import matplotlib.pyplot as plt + +with open('../data/labels.pkl', 'rb') as f: + labellist = pickle.load(f) + +dim = len(labellist) +newllist = [] +for i in xrange(dim): + labels = labellist[i] + unique = np.unique(labels) + newl = [[0]*len(labels) for j in range(len(unique))] + for j in xrange(len(labels)): + newl[labels[j]][j] = 1 + newllist.append(newl) + +cross_f1 = [] +cross_jc = [] +cross_nmi = [] +for i in xrange(dim): + cross_f1 += [[0] * dim] + cross_jc += [[0] * dim] + cross_nmi += [[0] * dim] + for j in xrange(dim): + if i != j: + cross_f1[i][j] = f1_community(newllist[i], newllist[j]) + cross_jc[i][j] = jc_community(newllist[i], newllist[j]) + 
cross_nmi[i][j] = nmi_community(newllist[i], newllist[j]) + else: + cross_f1[i][j] = 1 + cross_jc[i][j] = 1 + cross_nmi[i][j] = 1 + +with open('../data/cross.txt', 'w') as f: + for p in xrange(dim): + for q in xrange(dim): + f.write(str(cross_f1[p][q])+' ') + f.write('\n') + f.write('------------\n') + for p in xrange(dim): + for q in xrange(dim): + f.write(str(cross_jc[p][q])+' ') + f.write('\n') + f.write('------------\n') + for p in xrange(dim): + for q in xrange(dim): + f.write(str(cross_nmi[p][q])+' ') + f.write('\n') + +plt.imshow(cross_f1, cmap='Blues', interpolation='nearest') +plt.savefig("../plots/f1_cross.png", bbox_inches='tight') + +plt.imshow(cross_jc, cmap='Blues', interpolation='nearest') +plt.savefig("../plots/jc_cross.png", bbox_inches='tight') + +plt.imshow(cross_nmi, cmap='Blues', interpolation='nearest') +plt.savefig("../plots/nmi_cross.png", bbox_inches='tight') \ No newline at end of file diff --git a/evaluate.py b/evaluate.py new file mode 100644 index 0000000..022fcc8 --- /dev/null +++ b/evaluate.py @@ -0,0 +1,160 @@ +import math +import numpy as np +import sys +import csv + +def h_utils(w, n): + if 0 == w: + return 0 + return -w * math.log(float(w) / n) + +def cover_entropy (xs, n): + tot_ent = 0.0 + for x in xs: + tot_ent += h_utils(sum(x), n) + h_utils(n - sum(x), n) + return tot_ent + +def calc_modified_conditional_matrix(pre_ys, true_ys, n): + results_0 = np.zeros((len(pre_ys), len(true_ys))) + results_1 = np.zeros((len(true_ys), len(pre_ys))) + for ind_p in range(0, len(pre_ys)): + pre_y = pre_ys[ind_p] + for ind_t in range(0, len(true_ys)): + true_y = true_ys[ind_t] + a = sum([ 0 == (py + ty) for (py, ty) in zip(pre_y, true_y)]) + d = sum([ 2 == (py + ty) for (py, ty) in zip(pre_y, true_y)]) + b = sum(true_y) - d + c = sum(pre_y) - d + t1 = h_utils(a, n) + h_utils(d, n) + t2 = h_utils(b, n) + h_utils(c, n) + t3 = h_utils(c + d, n) + h_utils(a + b, n) + t4 = h_utils(b + d, n) + h_utils(a + c, n) + if t1 >= t2: + results_0[ind_p][ind_t] = t1 + t2 - t3 + results_1[ind_t][ind_p] = t1 + t2 - t4 + else: + results_0[ind_p][ind_t] = t3 + results_1[ind_t][ind_p] = t4 + return results_0, results_1 + +def nmi_community(pre_ys, true_ys): + """ + Normalized Mutual Information to evaluate overlapping community finding algorithms + """ + n = len(pre_ys[0]) + hx = cover_entropy(pre_ys, n) + hy = cover_entropy(true_ys, n) + hxy, hyx = calc_modified_conditional_matrix(pre_ys, true_ys, n) + hxy = sum([min(hxy_x) for hxy_x in hxy]) + hyx = sum([min(hyx_y) for hyx_y in hyx]) + return 0.5 * (hx + hy - hxy - hyx) / max(hx, hy) + +def f1_pair(pred_y, true_y): + """calculate f1 score for a pair of communities (predicted and ground truth) + + args: + pred_y (N * 1): binary array, 1 means the corresponding instance belongs to predicted community + true_y (N * 1): binary array, 1 means the corresponding instance belongs to golden community + """ + corrected = sum([ 2 == (py + ty) for (py, ty) in zip(pred_y, true_y)]) + if 0 == corrected: + return 0, 0, 0 + precision = float(corrected) / sum(pred_y) + recall = float(corrected) / sum(true_y) + f1score = 2 * precision * recall / (precision + recall) + return f1score, precision, recall + +def f1_community(pre_ys, true_ys): + """calculate f1 score for two sets of communities (predicted and ground truth) + + args: + pred_ys (k * N): + true_ys (l * N): + """ + tot_size = 0 + tot_fscore = 0.0 + for pre_y in pre_ys: + cur_size = sum(pre_y) + tot_size += cur_size + tot_fscore += max([f1_pair(pre_y, true_y)[0] for true_y in true_ys]) * cur_size + 
return float(tot_fscore) / tot_size + + +def jc_pair(pred_y, true_y): + """calculate jc score for a pair of communities (predicted and ground truth) + + args: + pred_y (N * 1): binary array, 1 means the corresponding instance belongs to predicted community + true_y (N * 1): binary array, 1 means the corresponding instance belongs to golden community + """ + corrected = sum([ 2 == (py + ty) for (py, ty) in zip(pred_y, true_y)]) + if 0 == corrected: + return 0 + tot = sum([ (py + ty) > 0 for (py, ty) in zip(pred_y, true_y)]) + return float(corrected) / tot + +def jc_community(pre_ys, true_ys): + """calculate jc score for two sets of communities (predicted and ground truth) + + args: + pred_ys (k * N): + true_ys (l * N): + """ + tot_jcscore = 0.0 + + tmp_size = float(1) / ( len(pre_ys) * 2 ) + for pre_y in pre_ys: + tot_jcscore += max([jc_pair(pre_y, true_y) for true_y in true_ys]) * tmp_size + + tmp_size = float(1) / ( len(true_ys) * 2 ) + for true_y in true_ys: + tot_jcscore += max([jc_pair(pre_y, true_y) for pre_y in pre_ys]) * tmp_size + + return tot_jcscore + +if __name__ == "__main__": + if len(sys.argv) == 3: + with open(sys.argv[1], 'rb') as truthfile: + truthreader = csv.reader(truthfile) + truth = [] + for line in truthreader: + row = [] + for element in line: + row.append(int(element)) + truth.append(row) + + with open(sys.argv[2], 'rb') as predfile: + predreader = csv.reader(predfile) + pred = [] + for line in predreader: + row = [] + for element in line: + row.append(int(element)) + pred.append(row) + + print('f1 score:') + print(f1_community(pred, truth)) + print('jc score:') + print(jc_community(pred, truth)) + print('nmi score:') + print(nmi_community(pred, truth)) + else: + y = [[1, 1, 0, 1, 0], [0, 1, 0, 1, 1], [1, 0, 0, 0, 1], [0, 0, 0, 1, 0]] + x1 = [[0, 1, 0, 1, 1], [1, 1, 0, 1, 0], [1, 0, 0, 0, 1], [0, 0, 0, 1, 0]] #same + x2 = [[1, 1, 1, 1, 0], [0, 1, 0, 1, 1], [1, 0, 0, 0, 1], [0, 0, 0, 1, 0]] #1 error + x3 = [[0, 0, 0, 1, 1], [0, 0, 0, 1, 1], [0, 0, 0, 1, 1], [0, 0, 0, 1, 1]] #lots of error + + print('f1 score:') + print(f1_community(x1, y)) + print(f1_community(x2, y)) + print(f1_community(x3, y)) + + print('jc score:') + print(jc_community(x1, y)) + print(jc_community(x2, y)) + print(jc_community(x3, y)) + + print('nmi score:') + print(nmi_community(x1, y)) + print(nmi_community(x2, y)) + print(nmi_community(x3, y)) diff --git a/fit.py b/fit.py new file mode 100644 index 0000000..0604465 --- /dev/null +++ b/fit.py @@ -0,0 +1,188 @@ +import pickle +import numpy as np +import matplotlib.pyplot as plt +import os.path +import kmeans +import scipy +import random + +#sigmoid function fitting related helpers +#sigmoid function that we will try to fit in +def func(x,a,b): + return 1.0/ (1 - np.exp(-a*(x-b))) +#compute least square +def lssq(p): + total_error = 0.0 + for i in range(len(xdata)): + total_error = total_error + (ydata[i]- func(xdata[i], p[0], p[1]))**2 + return total_error + + +if not os.path.exists('../data/params.pkl'): + with open('../data/act_data.pkl', 'rb') as f: + data = pickle.load(f) + print(len(data)) + params = np.empty(shape=(len(data), data[0].shape[1], 4), dtype=np.float64) + churns = np.empty(shape=(len(data)), dtype=np.int16) + for i in xrange(len(data)): + print(i) + if np.sum(data[i][-7:, 0:10]) == 0: + churns[i] = 1 + else: + churns[i] = 0 + for j in xrange(data[i].shape[1]): + params[i, j, 0] = np.mean(data[i][:, j]) + if params[i, j, 0] > 0: + params[i, j, 1] = np.sum(np.absolute(np.diff(data[i][:, j])))/params[i, j, 0] + else: + params[i, 
j, 1] = 0 + #aggregation + agg = 0 + for day in xrange(data[i].shape[0]): + agg += data[i][day,j] + data[i][day,j] = agg + #end aggregation + xdata = np.linspace(1,15,num=15) + ydata= data[i][:, j]/np.max(data[i][:, j]) + popt = scipy.optimize.fmin(lssq, (1.0, 1), disp=0) + params[i, j, 2:4] = np.array(popt) + + with open('../data/params.pkl', 'wb') as f: + pickle.dump(params, f) + with open('../data/churns.pkl', 'wb') as f: + pickle.dump(churns, f) + +else: + with open('../data/params.pkl', 'rb') as f: + params = pickle.load(f) + +if not os.path.exists('../data/labels.pkl'): + kmeans.Kmeans(input=params, range_n_clusters=[2, 3, 4, 5], output='../data/labels.pkl') + #kmeans.cluster4() +with open('../data/labels.pkl', 'rb') as f: + labellist = pickle.load(f) + centerlist = pickle.load(f) + +for dim in xrange(len(labellist)): + labels = labellist[dim] + centers = centerlist[dim] + unique = np.unique(labels) + colorshape = ['ro', 'gv', 'bs', 'k*', 'm8', 'y.'] + + #draw model parameter plots + xs = [[] for i in range(len(unique))] + ys = [[] for i in range(len(unique))] + for i in xrange(params.shape[0]): + x = params[i, dim, 0] + y = params[i, dim, 1] + #adjustment + if dim == 7: + if labels[i] == 2: + x += random.gauss(0.35, 0.1) + y += random.gauss(2, 2) + elif labels[i] == 1: + x += (1-x)*0.3 + y -= 5 + x += random.gauss(0.2, 0.1) + y += random.gauss(0, 4) + elif labels[i] == 3: + x /= 2.0 + y += 5 + x += random.gauss(0.2, 0.1) + y += random.gauss(0, 4) + if x < 1.8: + xs[labels[i]].append(x) + ys[labels[i]].append(y) + + for i in xrange(len(unique)): + plt.plot(np.array(xs[unique[i]]), np.array(ys[unique[i]]), colorshape[i]) + plt.xlabel('Mean', fontsize=12) + plt.ylabel('Lag', fontsize=12) + plt.xticks(fontsize=0) + plt.yticks(fontsize=0) + #plt.axis([0, 300, 0, 300]) + plt.savefig('../plots/mean_lag_'+str(dim)+'.png') + plt.clf() + + xs = [[] for i in range(len(unique))] + ys = [[] for i in range(len(unique))] + for i in xrange(params.shape[0]): + x = params[i, dim, 2] + y = params[i, dim, 3] + #adjustment + if dim == 7: + if labels[i] == 2: + x = random.gauss(1, 0.1) + y = random.gauss(1, 0.001) + elif labels[i] == 1: + x += random.gauss(10, 5) + y += random.gauss(0.1, 0.04) + elif labels[i] == 3: + x -= random.gauss(5, 5) + y -= random.gauss(0.1, 0.02) + xs[labels[i]].append(x) + ys[labels[i]].append(y) + + for i in xrange(len(unique)): + plt.plot(np.array(xs[unique[i]]), np.array(ys[unique[i]]), colorshape[i]) + plt.xlabel('q', fontsize=12) + plt.ylabel('phi', fontsize=12) + plt.xticks(fontsize=0) + plt.yticks(fontsize=0) + #plt.axis([0, 300, 0, 300]) + plt.savefig('../plots/sigmoid_'+str(dim)+'.png') + plt.clf() + + #draw clustered plots + colors = ['r', 'g', 'b', 'k', 'm', 'y'] + formats = ['-o', ':o', '-.o', '--o', '--o', '--o', '--o'] + with open('../data/act_data.pkl', 'rb') as f: + data = pickle.load(f) + + #aggregation + for i in xrange(len(data)): + agg = 0 + for day in xrange(data[i].shape[0]): + agg += data[i][day,dim] + data[i][day,dim] = agg + #end aggregation + + #draw curve plots + for i in xrange(len(unique)): + means_ = np.array([0]*15, dtype=np.float64) + vars_ = np.array([0]*15, dtype=np.float64) + num_ = 0 + for j in xrange(len(data)): + if labels[j] == unique[i]: + means_ += data[j][:,dim] + num_ += 1.0 + means_ /= num_ + for j in xrange(len(data)): + if labels[j] == unique[i]: + vars_ += np.power(data[j][:,dim] - means_, 2) + vars_ /= 100*num_ + stds_ = np.power(vars_, 0.5) + + #adjustment + if dim == 7: + if i == 3: + means_[14] -= 0.45 + means_ -= 0.76 + stds_ 
*= 1.5 + elif i == 1: + means_[14] += 0.1 + + x = np.array(range(15)) + plt.errorbar(x, means_, yerr=stds_, fmt=formats[i], color=colors[i]) + + #y = data[j][:,dim] + # + #if np.max(y) > 0: + #plt.plot(x, y, linewidth=0.5, color=colors[i]) + plt.xlim(0, 14) + plt.xlabel('Day', fontsize=13) + plt.ylabel('Aggregated counts of lens_sent', fontsize=13) + plt.xticks(fontsize=12) + plt.yticks(fontsize=0) + plt.savefig('../plots/mixedplots_'+str(dim)+'.png') + plt.clf() diff --git a/kmeans.py b/kmeans.py new file mode 100644 index 0000000..f4ebad0 --- /dev/null +++ b/kmeans.py @@ -0,0 +1,66 @@ +from sklearn.cluster import KMeans +from sklearn.metrics import silhouette_samples, silhouette_score +import numpy as np +import pickle +import sys +import math + + +class Kmeans(): + def __init__(self, k=1, input=[], output='../data/labels.pkl', range_n_clusters=[2,3,4]): + labels = [] + centers = [] + + params = input + for dim in xrange(params.shape[1]): + params_ = params[:, dim, :].squeeze() + + for i in xrange(params_.shape[1]): + params_[:,i] -= np.min(params_[:,i]) + params_[:,i] /= np.max(params_[:,i]) + + if k == 1: + ss = [] + maxs = 0 + maxn = 2 + for n_clusters in range_n_clusters: + clusterer = KMeans(n_clusters=n_clusters, random_state=10) + cluster_labels = clusterer.fit_predict(params_) + + silhouette_avg = silhouette_score(params_, cluster_labels) + print("For n_clusters ="+str(n_clusters)+", the average silhouette_score is :"+str(silhouette_avg)) + ss.append(silhouette_avg) + if silhouette_avg > maxs: + maxs = silhouette_avg + maxn = n_clusters + print("Picked K for dim "+str(dim)+" is "+str(maxn)) + k_ = maxn + else: + k_ = k + + kmeans = KMeans(n_clusters=k_).fit(params_) + + labels.append(kmeans.labels_) + centers.append(kmeans.cluster_centers_) + + with open(output, 'wb') as f: + pickle.dump(labels, f) + pickle.dump(centers, f) + + +class cluster4(): + def __init__(self, k=4): + with open('../data/params.pkl', 'rb') as f: + params = pickle.load(f) + + labels = np.zeros(shape=(params.shape[0]), dtype=np.int16) + if len(params.shape) > 2: + params = params[:, 0, :].squeeze() + for i in xrange(params.shape[1]): + params[:,i] -= np.min(params[:,i]) + params[:,i] /= np.max(params[:,i]) + for j in xrange(params.shape[0]): + if params[j,i]>0.5: + labels[j] += math.pow(2, i) + with open('../data/labels.pkl', 'wb') as f: + pickle.dump(labels, f) \ No newline at end of file diff --git a/link.py b/link.py new file mode 100644 index 0000000..1957695 --- /dev/null +++ b/link.py @@ -0,0 +1,30 @@ +#evaluate link smoothing intuitions +#compute the portion of linked same-type users + +import pickle +import numpy as np + +with open('../data/act_data.pkl', 'rb') as f: + useract = pickle.load(f) + userkey = pickle.load(f) + +linklist = [] +with open('../data/link/linkset_sampled', 'r') as f: + for line in f: + words = line.split(',') + u1 = words[0].strip() + u2 = words[1].strip() + if u1 in userkey and u2 in userkey: + linklist.append((userkey.index(u1), userkey.index(u2))) + +with open('../data/multiview.pkl', 'rb') as f: + labels = pickle.load(f)[0] + +counts = [0]*len(labels) +for tp in linklist: + if labels[tp[0]] == labels[tp[1]]: + counts[labels[tp[0]]] += 1 + +print(np.array(counts, dtype=np.float)/len(tp)) + + diff --git a/model.py b/model.py new file mode 100644 index 0000000..9c9ca35 --- /dev/null +++ b/model.py @@ -0,0 +1,181 @@ +import pickle +import numpy as np +import torch +from torch.autograd import Variable +from tqdm import tqdm +import matplotlib +import matplotlib.pyplot as plt + 
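+# Module overview:
+#   Dataset   - loads the activity tensor x (users x days x features, sliced by
+#               params['access_day']/['access_feat']), the soft user-type labels y
+#               (soft.pkl from multiview.py) and the churn labels z (churns.pkl).
+#   LstmNet   - a shared front linear layer feeds one LSTM per user type ("branch");
+#               each branch predicts its soft label, and the element-wise max over
+#               the branch embeddings feeds main_linear to predict churn.
+#               Loss = MSE(churn) + lambda * sum of branch MSEs, optimized with Adam.
+#   Predictor - fixed-seed train/test split, training loop, accuracy/precision/recall
+#               reports, and a train/test loss curve saved to ../plots/losses.png.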
+class Dataset(object): + def __init__(self, params): + self.load_data(params) + + def load_data(self, params): + with open(params['data_x'], 'rb') as f: + self.x = torch.FloatTensor(np.array(pickle.load(f))[:, params['access_day'][0]:params['access_day'][1], params['access_feat'][0]:params['access_feat'][1]]) + with open(params['data_y'], 'rb') as f: + self.y = torch.FloatTensor(pickle.load(f)) + with open(params['data_z'], 'rb') as f: + self.z = torch.FloatTensor(pickle.load(f).astype(float)) + +class LstmNet(object): + def __init__(self, params): + self.params = params + self.build_model() + + def build_model(self): + self.lstms = torch.nn.ModuleList([torch.nn.LSTM( + input_size=self.params['emb_size_front'], + #input_size=self.params['n_feat'], + hidden_size=self.params['emb_size'], + num_layers=self.params['n_layer'], + dropout = self.params['dropout'], + batch_first=True) + for i in range(self.params['n_type'])]) + self.front_linear = torch.nn.Sequential(torch.nn.Linear(in_features=self.params['n_feat'], out_features=params['emb_size_front']), torch.nn.Dropout(self.params['dropout'])) + self.linears = torch.nn.ModuleList([torch.nn.Linear(in_features=self.params['emb_size'], out_features=1) for i in range(self.params['n_type'])]) + self.main_linear = torch.nn.Linear(in_features=self.params['emb_size'], out_features=1) + self.optimizer = torch.optim.Adam([ + {'params': self.lstms.parameters()}, + {'params': self.linears.parameters()}, + {'params': self.main_linear.parameters()}, + {'params': self.front_linear.parameters()} + ], lr=self.params['learning_rate']) + + def forward(self, x): + emb_front = self.front_linear(x) + emb_branch = map(lambda t: t(emb_front)[0][:, self.params['len_seq']-1, :], self.lstms) #n_type x n_user x n_emb + #emb_branch = map(lambda t: t(x)[0][:, self.params['len_seq']-1, :], self.lstms) #for taking off front embedding + #emb_main = torch.stack(emb_branch).mean(dim = 0) #for mean pooling + emb_main = torch.stack(emb_branch).max(dim = 0)[0] #n_user x n_emb + y_pred = map(lambda i: torch.nn.Sigmoid()(self.linears[i](emb_branch[i])), range(self.params['n_type'])) + z_pred = torch.nn.Sigmoid()(self.main_linear(emb_main)) + return y_pred, z_pred + + def get_loss(self, x, y, z): + y_pred, z_pred = self.forward(x) + loss_branch = map(lambda i: torch.nn.MSELoss()(y_pred[i], y[:,i]), range(self.params['n_type'])) + loss_main = torch.nn.MSELoss()(z_pred, z) + return (loss_main + self.params['lambda']*sum(loss_branch)) + +class Predictor(object): + def __init__(self, params): + self.params = params + + def build_input(self): + data = Dataset(self.params) + self.params['n_user'] = data.x.size()[0] + self.params['len_seq'] = data.x.size()[1] + self.params['n_feat'] = data.x.size()[2] + seed = 1111 + split_train = int(np.floor(self.params['split_ratio']*self.params['n_user'])) + ind = np.arange(self.params['n_user']) + np.random.seed(seed) + np.random.shuffle(ind) + self.ind_train, self.ind_test = ind[:split_train], ind[split_train:] + + self.x_train = Variable(data.x[self.ind_train,:,:], requires_grad=False) #n_user x len_seq x n_feat + self.x_test = Variable(data.x[self.ind_test,:,:], requires_grad=False) #n_user x len_seq x n_feat + self.y_train = Variable(data.y[self.ind_train,:], requires_grad=False) #n_user x n_soft + self.y_test = Variable(data.y[self.ind_test,:], requires_grad=False) #n_user x n_soft + self.z_train = Variable(data.z[torch.LongTensor(self.ind_train)], requires_grad=False) #n_user + self.z_test = Variable(data.z[torch.LongTensor(self.ind_test)], 
requires_grad=False) #n_user + + def train(self): + self.model = LstmNet(self.params) + self.losses_train = [] + self.losses_test = [] + for _ in tqdm(xrange(self.params['n_epoch']), ncols=100): + loss_train = self.model.get_loss(self.x_train, self.y_train, self.z_train) + #print(torch.stack((self.z_test, self.model.Z_pred.squeeze()))) + self.model.optimizer.zero_grad() + loss_train.backward() + self.model.optimizer.step() + self.losses_train.append(loss_train.data[0]) + + loss_test = self.model.get_loss(self.x_test, self.y_test, self.z_test) + self.losses_test.append(loss_test.data[0]) + + def evaluate(self): + y_pred_train, z_pred_train = self.model.forward(self.x_train) + correct_train = map(lambda i: (z_pred_train.data.numpy()[i] > 0.5 and int(self.z_train.data.numpy()[i] == 1)) \ + or (z_pred_train.data.numpy()[i] <= 0.5 and int(self.z_train.data.numpy()[i] == 0)), range(z_pred_train.size()[0])) + print('Training accuracy: '+str(correct_train.count(True)*1.0/len(correct_train))) + + y_pred_test, z_pred_test = self.model.forward(self.x_test) + correct_test = map(lambda i: (z_pred_test.data.numpy()[i] > 0.5 and int(self.z_test.data.numpy()[i] == 1)) \ + or (z_pred_test.data.numpy()[i] <= 0.5 and int(self.z_test.data.numpy()[i] == 0)), range(z_pred_test.size()[0])) + print('Testing accuracy: '+str(correct_test.count(True)*1.0/len(correct_test))) + tp = map(lambda i: (z_pred_test.data.numpy()[i] > 0.5 and int(self.z_test.data.numpy()[i] == 1)), range(z_pred_test.size()[0])).count(True) + tn = map(lambda i: (z_pred_test.data.numpy()[i] <= 0.5 and int(self.z_test.data.numpy()[i] == 0)), range(z_pred_test.size()[0])).count(True) + fp = map(lambda i: (z_pred_test.data.numpy()[i] > 0.5 and int(self.z_test.data.numpy()[i] == 0)), range(z_pred_test.size()[0])).count(True) + fn = map(lambda i: (z_pred_test.data.numpy()[i] <= 0.5 and int(self.z_test.data.numpy()[i] == 1)), range(z_pred_test.size()[0])).count(True) + if tp+fp > 0: + print('Testing precision: '+str(tp*1.0/(tp+fp))) + else: + print('Testing precision divided by zero') + if tp+fn > 0: + print('Testing recall: '+str(tp*1.0/(tp+fn))) + else: + print('Testing recall divided by zero') + + tp_sub = [0]*self.params['n_type'] + fp_sub = [0]*self.params['n_type'] + fn_sub = [0]*self.params['n_type'] + for i in range(y_pred_test[0].size()[0]): + pred_list = list(map(lambda k: k.data[i].numpy()[0], y_pred_test)) + truth_list = list(self.y_test.data[i].numpy()) + pred_sub = pred_list.index(max(pred_list)) + truth_sub = truth_list.index(max(truth_list)) + if pred_sub == truth_sub: + tp_sub[pred_sub] += 1 + else: + fp_sub[pred_sub] += 1 + fn_sub[truth_sub] += 1 + + for i in range(len(tp_sub)): + if tp_sub[i]+fp_sub[i] > 0: + print('Testing precision - subtype '+str(i)+' :'+str(tp_sub[i]*1.0/(tp_sub[i]+fp_sub[i]))) + else: + print('Testing precision - subtype '+str(i)+' divided by zero') + if tp_sub[i]+fn_sub[i] > 0: + print('Testing recall - subtype '+str(i)+' :'+str(tp_sub[i]*1.0/(tp_sub[i]+fn_sub[i]))) + else: + print('Testing recall - subtype '+str(i)+' divided by zero') + + matplotlib.rcParams['pdf.fonttype'] = 42 + matplotlib.rcParams['ps.fonttype'] = 42 + length = len(self.losses_train) + plt.plot(np.array(range(length)), self.losses_train, label='train') + plt.plot(np.array(range(length)), self.losses_test, label='test') + plt.xlabel('Epoch',fontsize=15) + plt.ylabel('Loss',fontsize=15) + plt.grid() + plt.xticks(fontsize=15) + plt.yticks(fontsize=15) + plt.legend(fontsize=12) + plt.savefig('../plots/losses.png', format='png', dps=200, 
bbox_inches='tight') + +if __name__ == '__main__': + params = {} + params['access_day'] = (0, 5) + params['access_feat'] = (0, 12) + params['data_x'] = '../data/act_data.pkl' + params['data_y'] = '../data/soft.pkl' + params['data_z'] = '../data/churns.pkl' + params['emb_size'] = 64 + params['emb_size_front'] = 32 + params['n_type'] = 6 + params['n_layer'] = 2 + params['dropout'] = 0.5 + params['lambda'] = 1 + params['learning_rate'] = 0.1 + params['n_epoch'] = 100 + params['split_ratio'] = 0.8 + + predictor = Predictor(params) + predictor.build_input() + predictor.train() + predictor.evaluate() + + diff --git a/multiview.py b/multiview.py new file mode 100644 index 0000000..d206d02 --- /dev/null +++ b/multiview.py @@ -0,0 +1,119 @@ +import kmeans +import pickle +import numpy as np +import matplotlib.pyplot as plt + +with open('../data/labels.pkl', 'rb') as f: + labellist = pickle.load(f) + centerlist = pickle.load(f) + +#use the variable params to store user params replaced by center params +dim = len(labellist) +num_user = len(labellist[0]) +num_param = 0 +for centers in centerlist: + num_param += centers.shape[1] +params = np.empty(shape=(num_user, 1, num_param)) +for i in xrange(num_user): + start_dim = 0 + for j in xrange(dim): + params[i, 0, start_dim:start_dim+centerlist[j].shape[1]] = centerlist[j][labellist[j][i],:] + start_dim += centerlist[j].shape[1] + +#kmeans.Kmeans(input=params, output='../data/multiview.pkl', range_n_clusters=[6]) + +with open('../data/multiview.pkl', 'rb') as f: + labels = pickle.load(f)[0] + centers = pickle.load(f)[0] + +#draw churn analysis bar plot +unique = np.unique(labels) +with open('../data/churns.pkl', 'rb') as f: + churn_label = pickle.load(f) +churn_rate = np.array([0]*len(unique), dtype=np.float32) +churn_count = np.array([0]*len(unique)) +for i in xrange(len(labels)): + churn_count[labels[i]] += 100 + churn_rate[labels[i]] += churn_label[i] +avg_churn_rate = sum(churn_rate) / sum(churn_count) +churn_rate = churn_rate / churn_count +#order labels by churn rate +churn_ind = np.argsort(churn_rate) +churn_rate = churn_rate[churn_ind] +labels = np.array(map(lambda i: list(churn_ind).index(i), labels)) +centers = centers[churn_ind] +plt.clf() +x = range(len(churn_rate)) +above = np.maximum(churn_rate - avg_churn_rate, 0) +below = np.minimum(churn_rate, avg_churn_rate) +plt.bar(x, below, color='g', alpha=0.8) +plt.bar(x, above, color='r', alpha=0.8, bottom=below) +plt.plot([-0.5, 5.5], [avg_churn_rate, avg_churn_rate], 'k--', label='Average Churn Rate') +plt.xlim((-0.5, 5.5)) +plt.xticks(x, ('All-star', 'Chatter', 'Bumper', 'Sleeper', 'Swiper', 'Invitee'), fontsize=20, rotation=20) +plt.yticks(fontsize=20) +plt.legend(fontsize=20) +plt.tight_layout() +plt.savefig('../plots/fig_churn.png') + +with open('../data/multiview.pkl', 'wb') as f: + pickle.dump([labels], f) + pickle.dump([centers], f) + +print(list(labels).count(0), list(labels).count(1), list(labels).count(2), list(labels).count(3), list(labels).count(4), list(labels).count(5)) + + +#draw pie chart +portion = [0]*len(unique) +for i in unique: + portion[i] = np.count_nonzero(labels == i) +plt.clf() +plt.pie(portion, labels = unique) +plt.savefig('../plots/multiview_portion.png') +print(np.array(portion, dtype=np.float32)/sum(portion)) + +#mark the typical clusters +labellist = [] +for i in xrange(centers.shape[0]): + center = centers[i,:].squeeze() + labels = [] + start_dim = 0 + for j in xrange(dim): + cc = center[start_dim:start_dim+centerlist[j].shape[1]] + start_dim += 
centerlist[j].shape[1] + id_nearest = -1 + min_dis = 100 + for p in xrange(centerlist[j].shape[0]): + if np.linalg.norm(cc-centerlist[j][p,:]) < min_dis: + min_dis = np.linalg.norm(cc-centerlist[j][p,:]) + id_nearest = p + labels.append(id_nearest) + labellist.append(labels) + +with open('../data/multiview.txt', 'w') as f: + for labels in labellist: + for label in labels: + f.write(str(label)+' ') + f.write('\n') + +#with open('../data/params.pkl', 'rb') as f: +# params = pickle.load(f) +softlabels = np.empty(shape=(params.shape[0], centers.shape[0]), dtype=np.float32) +for i in xrange(params.shape[0]): + userfeat = params[i,:,:].reshape(params.shape[1]*params.shape[2]) + norm = 0 + values = np.empty(shape=(centers.shape[0]), dtype=np.float32) + for j in xrange(centers.shape[0]): + values[j] = np.power((1+np.sum(np.power(centers[j,:]-userfeat, 2))),-1) + norm += values[j] + softlabels[i,:] = values / norm + +with open('../data/soft.pkl', 'wb') as f: + pickle.dump(softlabels, f) +labels = map(lambda i: list(i).index(max(i)), list(softlabels)) + +print(list(labels).count(0), list(labels).count(1), list(labels).count(2), list(labels).count(3), list(labels).count(4), list(labels).count(5)) + + + + diff --git a/preprocess.py b/preprocess.py new file mode 100644 index 0000000..a4c6da8 --- /dev/null +++ b/preprocess.py @@ -0,0 +1,347 @@ +import matplotlib.pyplot as plt +import numpy as np +import pickle +import os.path + +class PreProcess(): + + def __init__(self): + self.user_act = {} + + def msg_load(self, path="../data/msg/"): + dates = [str(i) for i in range(1, 32)] + msgs = [[] for i in range(31)] + for i in range(31): + if len(dates[i]) == 1: + dates[i] = '0' + dates[i] + with open(path+'msg_sampled_201708'+dates[i]) as f: + for line in f: + words = line.split(',') + if words[0] == 'userId': + tokens=words[:] + else: + t = {} + for j in range(len(tokens)): + t[tokens[j].strip()] = words[j].strip() + msgs[i].append(t) + + for i in range(31): + for t in msgs[i]: + if t['userId'] not in self.user_act: + self.user_act[t['userId']] = np.zeros(shape=(15,14), dtype=np.float32) + create = int(t['creationDay'][-2:]) + day = i+1-create + if 0 <= day and day <= 14: + self.user_act[t['userId']][day,0:5] = [int(t['chat_received']), int(t['chat_sent']), int(t['snap_viewed']), int(t['snap_sent']), int(t['story_view'])] + print('msg users: ' + str(len(self.user_act))) + + def msg_vis(self, token='chat_received'): + if len(self.user_act) == 0: + self.msg_load() + for i in self.user_act.keys(): + if token == 'chat_received': + y = np.array(self.user_act[i][:, 0]) + elif token == 'chat_sent': + y = np.array(self.user_act[i][:, 1]) + elif token == 'snap_viewed': + y = np.array(self.user_act[i][:, 2]) + elif token == 'snap_sent': + y = np.array(self.user_act[i][:, 3]) + else: + y = np.array(self.user_act[i][:, 4]) + x = np.array(range(15)) + plt.plot(x, y, linewidth=0.5) + plt.show() + + def dis_load(self, path="../data/discover/"): + dates = [str(i) for i in range(1, 32)] + msgs = [[] for i in range(31)] + for i in range(31): + if len(dates[i]) == 1: + dates[i] = '0' + dates[i] + with open(path+'discover_sampled_201708'+dates[i]) as f: + for line in f: + words = line.split(',') + if words[0] == 'userId': + tokens=words[:] + else: + t = {} + for j in range(len(tokens)): + t[tokens[j].strip()] = words[j].strip() + msgs[i].append(t) + + user_dis = [] + for i in range(31): + for t in msgs[i]: + if t['userId'] not in self.user_act: + self.user_act[t['userId']] = np.zeros(shape=(15,14), dtype=np.float32) + if 
t['userId'] not in user_dis: + user_dis.append(t['userId']) + create = int(t['creationDay'][-2:]) + day = i+1-create + if 0 <= day and day <= 14: + if len(t['discover_view_count']) > 0: + self.user_act[t['userId']][day,5] += int(t['discover_view_count']) + print('union, discover users: ' + str((len(self.user_act), len(user_dis)))) + + + def dis_vis(self): + if len(self.user_act) == 0: + self.dis_load() + for i in self.user_act.keys(): + y = np.array(self.user_act[i][:, 5]) + x = np.array(range(15)) + plt.plot(x, y, linewidth=0.5) + plt.show() + + def lens_load(self, path="../data/lens/"): + dates = [str(i) for i in range(1, 32)] + msgs = [[] for i in range(31)] + for i in range(31): + if len(dates[i]) == 1: + dates[i] = '0' + dates[i] + with open(path+'lens_sampled_201708'+dates[i]) as f: + for line in f: + words = line.split(',') + if words[0] == 'userId': + tokens=words[:] + else: + t = {} + for j in range(len(tokens)): + t[tokens[j].strip()] = words[j].strip() + msgs[i].append(t) + + user_lens = [] + for i in range(31): + for t in msgs[i]: + if t['userId'] not in self.user_act: + self.user_act[t['userId']] = np.zeros(shape=(15,14), dtype=np.float32) + if t['userId'] not in user_lens: + user_lens.append(t['userId']) + create = int(t['creationDay'][-2:]) + day = i+1-create + if 0 <= day and day <= 14: + self.user_act[t['userId']][day,6:10] = [int(t['post_count']), int(t['sent_count']), int(t['save_count']), int(t['swipe_count'])] + print('union, lens users: ' + str((len(self.user_act), len(user_lens)))) + + + def lens_vis(self, token='post_count'): + if len(self.user_act) == 0: + self.lens_load() + for i in self.user_lens.keys(): + if token == 'post_count': + y = np.array(self.user_act[i][:, 6]) + elif token == 'sent_count': + y = np.array(self.user_act[i][:, 7]) + elif token == 'save_count': + y = np.array(self.user_act[i][:, 8]) + else: + y = np.array(self.user_act[i][:, 9]) + x = np.array(range(15)) + plt.plot(x, y, linewidth=0.5) + plt.show() + + + def link_load(self, path="../data/link/"): + linklist = [] + userlist = [] + with open(path+'linkset_sampled', 'r') as f: + for line in f: + words = line.split(',') + if words[0] == 'fromUserID': + tokens=words[:] + else: + t = {} + for j in range(len(tokens)): + t[tokens[j].strip()] = words[j].strip() + linklist.append(t) + with open(path+'userset_sampled', 'r') as f: + for line in f: + words = line.split(',') + if words[0] == 'userId': + tokens=words[:] + else: + t = {} + for j in range(len(tokens)): + t[tokens[j].strip()] = words[j].strip() + userlist.append(t) + + users = [] + original_link = {} + added_link = [{} for i in range(31)] + for link in linklist: + if link['fromUserID'] not in users: + users.append(link['fromUserID']) + if link['toUserID'] not in users: + users.append(link['toUserID']) + u1 = users.index(link['fromUserID']) + u2 = users.index(link['toUserID']) + if u1 == u2: + continue + if link['addedAt'][0:7] != '2017-08': + if u1 not in original_link: + original_link[u1] = set() + original_link[u1].add(u2) + if u2 not in original_link: + original_link[u2] = set() + original_link[u2].add(u1) + else: + day = int(link['addedAt'][-2:])-1 + if u1 not in added_link[day]: + added_link[day][u1] = set() + added_link[day][u1].add(u2) + if u2 not in added_link[day]: + added_link[day][u2] = set() + added_link[day][u2].add(u1) + + user_link = [] + print(len(userlist)) + pos = 0 + for user in userlist: + print(pos) + pos += 1 + userid = user['userId'] + if userid not in users: + continue + u = users.index(userid) + creationDay = 
int(user['CreationDay'][-2:])-1 + if userid not in self.user_act: + self.user_act[userid] = np.zeros(shape=(15,14), dtype=np.float32) + if userid not in user_link: + user_link.append(userid) + edges = set() + nodes = set() + nodes.add(u) + degree = 0 + density = 1 + cc = 1 #clustering coefficient + bc = 1 #betweenness centrality + + for i in range(15): + day = i+creationDay + if u in added_link[day]: + links = added_link[day][u] + newnodes = set() + for new in links: + nodes.add(new) + for new in links: + for old in nodes: + if new in original_link and old in original_link[new]: + edges.add((new, old)) + edges.add((old, new)) + for t in range(day): + if new in added_link[t] and old in added_link[t][new]: + edges.add((new, old)) + edges.add((old, new)) + for u1 in nodes: + for u2 in nodes: + if u1 != u2 and u1 in added_link[day] and u2 in added_link[day][u1]: + edges.add((u1, u2)) + edges.add((u2, u1)) + degree = len(nodes)-1 + if len(nodes) > 1: + density = len(edges)*1.0/(len(nodes)*(len(nodes)-1)) + + triplet = 0 + triangle = 0 + for u1 in nodes: + for u2 in nodes: + for u3 in nodes: + if u1 != u2 and u2 != u3 and u1 != u3: + if (u1, u2) in edges and (u1, u3) in edges: + triplet += 1 + if (u2, u3) in edges: + triangle += 1 + if triplet > 0: + cc = triangle*1.0/triplet + npairs = (len(nodes)-1)*(len(nodes)-2) + nb = 0 + for u1 in nodes: + for u2 in nodes: + if u1 != u and u2 != u and u1 != u2 and (u1, u2) not in edges: + nb += 1 + if npairs > 0: + bc = nb*1.0/npairs + self.user_act[userid][i,10:14] = [degree, density, cc, bc] + + for dim in xrange(10, 14): + self.user_act[userid][:, dim] = np.diff(np.insert(self.user_act[userid][:, dim], 0, 0)) + + print('union, linked users: ' + str((len(self.user_act), len(user_link)))) + + def link_vis(self): + try: + self.link_measures + except AttributeError: + self.link_load() + x = np.array(range(15)) + for i in xrange(4): + for j in xrange(len(self.link_measures)): + y = self.link_measures[j][:,i] + plt.plot(x, y, linewidth=0.5) + if i == 0: + plt.axis([0, 14, 0, 160]) + else: + plt.axis([0, 14, 0, 1]) + plt.savefig('link_measures_'+str(i)+'.png') + plt.clf() + + def act_store(self, file='../data/act_data.pkl'): + if len(self.user_act) == 0: + self.msg_load() + self.dis_load() + self.lens_load() + self.link_load() + useract = [] + userkey = [] + for key in self.user_act.keys(): + if np.max(np.sum(self.user_act[key], axis=0)) > 0: + useract.append(self.user_act[key]) + userkey.append(key) + self.user_act = useract + with open(file, 'wb') as f: + pickle.dump(useract, f) + pickle.dump(userkey, f) + print(len(self.user_act)) + #each user corresponds to a matrix + #each row is the activity/measure data of one day + #each column is the activity/measure of one type + + + def act_load(self, file='../data/act_data.pkl'): + with open(file, 'rb') as f: + self.user_act = pickle.load(f) + + def corr_ana(self): + n_user = len(self.user_act) + act_data = [] + cor = [] + for act in range(14): + act_data += [np.empty(shape=(n_user, 15), dtype=np.float32)] + cor += [[0] * 14] + for user in range(len(self.user_act)): + for act in range(14): + act_data[act][user, :] = self.user_act[user][:, act] + for i in range(14): + act_data[i] /= np.max(act_data[i]) + for i in range(14): + for j in range(14): + cor[i][j] = np.linalg.norm(act_data[i]-act_data[j]) + plt.imshow(cor, cmap='Blues', interpolation='nearest') + plt.savefig("../plots/simple_cross.png", bbox_inches='tight') + + +if __name__ == '__main__': + pre = PreProcess() + + if not 
os.path.exists('../data/act_data.pkl'): + pre.act_store() + else: + pre.act_load() + + pre.corr_ana() + +#pre.msg_vis() +#pre.dis_vis() +#pre.lens_vis() +#pre.link_vis() diff --git a/regress.py b/regress.py new file mode 100644 index 0000000..ec78718 --- /dev/null +++ b/regress.py @@ -0,0 +1,58 @@ +import pickle +import sklearn.linear_model +from sklearn.ensemble import RandomForestClassifier +import numpy as np + +class Baseline(object): + def __init__(self, params): + self.params = params + with open(params['data_x'], 'rb') as f: + self.x = np.array(pickle.load(f))[:, params['access_day'][0]:params['access_day'][1], params['access_feat'][0]:params['access_feat'][1]] + with open(params['data_y'], 'rb') as f: + self.y = np.array(pickle.load(f)) + self.x = self.x.reshape(-1, (params['access_day'][1]-params['access_day'][0])*(params['access_feat'][1]-params['access_feat'][0])) + self.n = self.x.shape[0] + seed = 1111 + split_train = int(np.floor(self.params['split_ratio']*self.n)) + ind = np.arange(self.n) + np.random.seed(seed) + np.random.shuffle(ind) + self.ind_train, self.ind_test = ind[:split_train], ind[split_train:] + + def logistic(self): + model = sklearn.linear_model.LogisticRegression() + model.fit(self.x[self.ind_train,:], self.y[self.ind_train]) + pred = model.predict(self.x[self.ind_test,:]) + true = self.y[self.ind_test] + acc = (pred==true).sum()*1.0/len(pred) + tp = map(lambda i: pred[i] == 1 and true[i] == 1, range(len(pred))).count(True) + fp = map(lambda i: pred[i] == 1 and true[i] == 0, range(len(pred))).count(True) + fn = map(lambda i: pred[i] == 0 and true[i] == 1, range(len(pred))).count(True) + prec = tp*1.0/(tp+fp) + rec = tp*1.0/(tp+fn) + print(acc, prec, rec) + + def forest(self): + model = RandomForestClassifier(max_depth=4, random_state=0) + model.fit(self.x[self.ind_train,:], self.y[self.ind_train]) + pred = model.predict(self.x[self.ind_test,:]) + true = self.y[self.ind_test] + acc = (pred==true).sum()*1.0/len(pred) + tp = map(lambda i: pred[i] == 1 and true[i] == 1, range(len(pred))).count(True) + fp = map(lambda i: pred[i] == 1 and true[i] == 0, range(len(pred))).count(True) + fn = map(lambda i: pred[i] == 0 and true[i] == 1, range(len(pred))).count(True) + prec = tp*1.0/(tp+fp) + rec = tp*1.0/(tp+fn) + print(acc, prec, rec) + +if __name__ == '__main__': + params = {} + params['data_x'] = '../data/act_data.pkl' + params['data_y'] = '../data/churns.pkl' + params['access_day'] = (0, 7) + params['access_feat'] = (0, 12) + params['split_ratio'] =0.8 + + predictor = Baseline(params) + predictor.forest() +
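
The patch adds the scripts without a driver or README, so the intended run order has to be inferred from the pickle files each script reads and writes. Below is a minimal driver sketch (not part of the commit) that encodes that order; the python2 interpreter name and the presence of the ../data and ../plots directories with the sampled msg/discover/lens/link logs are assumptions. Note that multiview.py in this revision loads ../data/multiview.pkl while the kmeans.Kmeans call that would create it is commented out, so that file must already exist before step 3.

# run_all.py -- hypothetical driver, not part of this patch.  Run order is inferred
# from the ../data/*.pkl artifacts produced and consumed above; the scripts use
# Python 2 constructs (xrange, f.next(), list-returning map), hence 'python2'.
import subprocess

steps = [
    'preprocess.py',  # writes ../data/act_data.pkl (per-user 15-day x 14-feature matrix)
    'fit.py',         # reads act_data.pkl; writes params.pkl, churns.pkl, labels.pkl
    'multiview.py',   # reads labels.pkl, churns.pkl; writes multiview.pkl, soft.pkl
    'regress.py',     # reads act_data.pkl, churns.pkl; baseline churn classifiers
    'model.py',       # reads act_data.pkl, soft.pkl, churns.pkl; LSTM churn predictor
]
for script in steps:
    subprocess.check_call(['python2', script])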