Add files via upload

yangji9181 · Feb 13, 2018 · 1285eec · 1285eec
1 parent 9f3add8
commit 1285eec
Show file tree

Hide file tree

Showing 10 changed files with 1,264 additions and 0 deletions.
diff --git a/core.py b/core.py
@@ -0,0 +1,56 @@
+import numpy as np
+import matplotlib.pyplot as plt
+import pickle
+
+# Fraction of the ranked user list that counts as the high-degree "top" group.
+cutperc = 0.1
+
+
+def _load_ids(path):
+    """Return the set of stripped lines in *path*, skipping its first line."""
+    with open(path, 'r') as f:
+        # The first line is skipped (presumably a header -- verify against the data).
+        # next(f, None) works on Python 2 and 3 and tolerates an empty file.
+        next(f, None)
+        return set(line.strip() for line in f)
+
+
+core_user = _load_ids('../data/core_user')
+core_friend = _load_ids('../data/core_friend')
+
+# y_* hold degrees, x_* hold the 0-based row index in core_degree, which the
+# plot below labels as the rank by degree (presumably the file is sorted by
+# descending degree -- TODO confirm).
+y_user, y_friend, y_rest = [], [], []
+x_user, x_friend, x_rest = [], [], []
+with open('../data/core_degree', 'r') as f:
+    next(f, None)
+    count = 0
+    for line in f:
+        words = line.split(',')
+        name = words[0].strip()
+        degree = int(words[1].strip())
+        if name in core_user:
+            y_user.append(degree)
+            x_user.append(count)
+        elif name in core_friend:
+            y_friend.append(degree)
+            x_friend.append(count)
+        else:
+            y_rest.append(degree)
+            x_rest.append(count)
+        count += 1
+
+# The plot is drawn on a log(rank) axis, so the horizontal cutoff line sits at
+# log(rank_cutoff).  The fraction of friends above the cutoff must be computed
+# on the same scale: the ranks in x_friend are raw, so compare them with the
+# raw rank cutoff (equivalently log(rank) < cutoff), not with the log value.
+rank_cutoff = count * cutperc
+cutoff = np.log(rank_cutoff)
+if x_friend:
+    percent = sum(1 for rank in x_friend if rank < rank_cutoff) * 1.0 / len(x_friend)
+else:
+    # No friend rows were found; avoid dividing by zero.
+    percent = 0.0
+print(cutoff, percent)
+
+
+# NOTE: ranks start at 0, so np.log yields -inf for the first row of the file.
+plt.scatter(np.log(y_friend), np.log(x_friend), marker='d', alpha=0.1, facecolors='none', edgecolors='r', label='K direct friends of new users')
+plt.scatter(np.log(y_rest), np.log(x_rest), marker='.', alpha=0.05, facecolors='none', edgecolors='g', label='All other users')
+plt.axhline(cutoff, ls='--', c='b', label='Cutoff at top 10% users \nwith highest degrees')
+plt.legend(fontsize=15, )
+plt.ylabel('Log(Rank By Degree)', fontsize=18)
+plt.xlabel('Log(Degree)', fontsize=18)
+plt.xticks(fontsize=0)
+plt.yticks(fontsize=0)
+#plt.scatter(x_user, y_user, marker='*', facecolors='none', edgecolors='b')
+plt.savefig('../plots/fig_core_original.png', format='png', bbox_inches='tight')
+plt.show()
+plt.clf()
diff --git a/cross.py b/cross.py
@@ -0,0 +1,59 @@
+from evaluate import f1_community, jc_community, nmi_community
+import pickle
+import numpy as np
+import matplotlib.pyplot as plt
+
+# NOTE: pickle.load can execute arbitrary code while deserialising; only point
+# this at a labels file that comes from a trusted source.
+with open('../data/labels.pkl', 'rb') as f:
+    labellist = pickle.load(f)
+
+dim = len(labellist)
+
+# Convert each label vector into a list of binary membership rows, one row per
+# distinct label, which is the input format of the *_community metrics.
+newllist = []
+for labels in labellist:
+    # return_inverse maps every label to its position among the sorted unique
+    # labels, so labels need not be exactly 0..k-1 (a raw label of -1 would
+    # otherwise silently index the last row, and gaps would raise IndexError).
+    unique, inverse = np.unique(labels, return_inverse=True)
+    newl = [[0] * len(labels) for _ in range(len(unique))]
+    for j, row in enumerate(inverse):
+        newl[row][j] = 1
+    newllist.append(newl)
+
+# Pairwise scores between every two labelings; the diagonal is fixed at 1.
+cross_f1 = [[1] * dim for _ in range(dim)]
+cross_jc = [[1] * dim for _ in range(dim)]
+cross_nmi = [[1] * dim for _ in range(dim)]
+for i in range(dim):
+    for j in range(dim):
+        if i != j:
+            cross_f1[i][j] = f1_community(newllist[i], newllist[j])
+            cross_jc[i][j] = jc_community(newllist[i], newllist[j])
+            cross_nmi[i][j] = nmi_community(newllist[i], newllist[j])
+
+# Dump the three matrices (F1, Jaccard, NMI) separated by a dashed line; every
+# value is followed by a single space, one matrix row per text line.
+with open('../data/cross.txt', 'w') as f:
+    for index, matrix in enumerate((cross_f1, cross_jc, cross_nmi)):
+        if index:
+            f.write('------------\n')
+        for row in matrix:
+            f.write(''.join(str(value) + ' ' for value in row) + '\n')
+
+for matrix, out_path in ((cross_f1, "../plots/f1_cross.png"),
+                         (cross_jc, "../plots/jc_cross.png"),
+                         (cross_nmi, "../plots/nmi_cross.png")):
+    plt.imshow(matrix, cmap='Blues', interpolation='nearest')
+    plt.savefig(out_path, bbox_inches='tight')
+    # Start from an empty figure so heatmaps do not pile up on the same axes.
+    plt.clf()
diff --git a/evaluate.py b/evaluate.py
@@ -0,0 +1,160 @@
+import math
+import numpy as np
+import sys
+import csv
+
+def h_utils(w, n):
+    """Return ``-w * log(w / n)`` using the natural logarithm.
+
+    When ``w`` is 0 the result is 0 and the logarithm is never evaluated.
+    """
+    if w != 0:
+        ratio = float(w) / n
+        return -w * math.log(ratio)
+    return 0
+
+def cover_entropy(xs, n):
+    """Accumulate the h_utils terms of every row of ``xs``.
+
+    For each row, the row sum and its complement ``n - sum`` are each passed
+    to ``h_utils`` together with ``n``; the grand total is returned as a float.
+    """
+    entropy = 0.0
+    for members in xs:
+        size = sum(members)
+        entropy += h_utils(size, n) + h_utils(n - size, n)
+    return entropy
+
+def calc_modified_conditional_matrix(pre_ys, true_ys, n):
+    """Build the two pairwise matrices consumed by ``nmi_community``.
+
+    For every (pre_y, true_y) pair the positions are counted as
+    a (element sum is 0), d (element sum is 2), b = sum(true_y) - d and
+    c = sum(pre_y) - d, and combined through ``h_utils``.
+
+    Returns:
+        (results_0, results_1): numpy arrays of shape
+        (len(pre_ys), len(true_ys)) and (len(true_ys), len(pre_ys)).
+    """
+    num_pre, num_true = len(pre_ys), len(true_ys)
+    results_0 = np.zeros((num_pre, num_true))
+    results_1 = np.zeros((num_true, num_pre))
+    for ind_p, pre_y in enumerate(pre_ys):
+        pre_size = sum(pre_y)
+        for ind_t, true_y in enumerate(true_ys):
+            pair_sums = [py + ty for (py, ty) in zip(pre_y, true_y)]
+            a = sum([0 == s for s in pair_sums])
+            d = sum([2 == s for s in pair_sums])
+            b = sum(true_y) - d
+            c = pre_size - d
+            agree = h_utils(a, n) + h_utils(d, n)
+            differ = h_utils(b, n) + h_utils(c, n)
+            pre_term = h_utils(c + d, n) + h_utils(a + b, n)
+            true_term = h_utils(b + d, n) + h_utils(a + c, n)
+            # NOTE(review): results_0 subtracts pre_term (built from pre_y's
+            # own size, c + d) and also falls back to pre_term; a conditional
+            # entropy of pre_y given true_y would subtract true_term instead.
+            # Confirm the intended orientation before changing anything here.
+            if agree < differ:
+                results_0[ind_p][ind_t] = pre_term
+                results_1[ind_t][ind_p] = true_term
+            else:
+                results_0[ind_p][ind_t] = agree + differ - pre_term
+                results_1[ind_t][ind_p] = agree + differ - true_term
+    return results_0, results_1
+
+def nmi_community(pre_ys, true_ys):
+    """
+    Normalized Mutual Information to evaluate overlapping community finding algorithms
+
+    args:
+        pre_ys: list of binary membership rows (predicted communities)
+        true_ys: list of binary membership rows (ground truth communities)
+
+    The number of instances is taken from the length of ``pre_ys[0]``.
+
+    raises:
+        IndexError: if ``pre_ys`` is empty.
+        ValueError: if either argument is empty while the other is not
+            (``min`` over an empty matrix row).
+        ZeroDivisionError: if both cover entropies are zero.
+    """
+    n = len(pre_ys[0])
+    entropy_pre = cover_entropy(pre_ys, n)
+    entropy_true = cover_entropy(true_ys, n)
+    cond_0, cond_1 = calc_modified_conditional_matrix(pre_ys, true_ys, n)
+    # Keep, for every row, the smallest entry and add them up.
+    best_0 = sum(min(row) for row in cond_0)
+    best_1 = sum(min(row) for row in cond_1)
+    shared = entropy_pre + entropy_true - best_0 - best_1
+    return 0.5 * shared / max(entropy_pre, entropy_true)
+
+def f1_pair(pred_y, true_y):
+    """calculate f1 score for a pair of communities (predicted and ground truth)
+
+    args:
+        pred_y (N * 1): binary array, 1 means the corresponding instance belongs to predicted community
+        true_y (N * 1): binary array, 1 means the corresponding instance belongs to golden community
+
+    returns:
+        (f1score, precision, recall); ``(0, 0, 0)`` when the two communities
+        share no instance.
+    """
+    overlap = sum(1 for (py, ty) in zip(pred_y, true_y) if 2 == (py + ty))
+    if not overlap:
+        return 0, 0, 0
+    precision = float(overlap) / sum(pred_y)
+    recall = float(overlap) / sum(true_y)
+    return 2 * precision * recall / (precision + recall), precision, recall
+
+def f1_community(pre_ys, true_ys):
+    """calculate f1 score for two sets of communities (predicted and ground truth)
+
+    Each predicted community is matched with the ground truth community that
+    gives it the highest F1, and these best scores are averaged weighted by
+    the predicted community's size.
+
+    args:
+        pre_ys (k * N): binary membership rows of the predicted communities
+        true_ys (l * N): binary membership rows of the golden communities
+
+    returns:
+        the size-weighted average F1 as a float; 0.0 when the predicted
+        communities contain no instance at all (including an empty ``pre_ys``).
+
+    raises:
+        ValueError: if ``pre_ys`` is non-empty and ``true_ys`` is empty.
+    """
+    tot_size = 0
+    tot_fscore = 0.0
+    for pre_y in pre_ys:
+        cur_size = sum(pre_y)
+        tot_size += cur_size
+        tot_fscore += max([f1_pair(pre_y, true_y)[0] for true_y in true_ys]) * cur_size
+    if 0 == tot_size:
+        # Nothing was predicted: report 0 like f1_pair does for an empty
+        # overlap, instead of dividing by zero.
+        return 0.0
+    return float(tot_fscore) / tot_size
+
+
+def jc_pair(pred_y, true_y):
+    """calculate jc score for a pair of communities (predicted and ground truth)
+
+    args:
+        pred_y (N * 1): binary array, 1 means the corresponding instance belongs to predicted community
+        true_y (N * 1): binary array, 1 means the corresponding instance belongs to golden community
+
+    returns:
+        shared instances divided by instances in either community; 0 when
+        nothing is shared.
+    """
+    pair_sums = [py + ty for (py, ty) in zip(pred_y, true_y)]
+    shared = sum(1 for s in pair_sums if 2 == s)
+    if not shared:
+        return 0
+    either = sum(1 for s in pair_sums if s > 0)
+    return float(shared) / either
+
+def jc_community(pre_ys, true_ys):
+    """calculate jc score for two sets of communities (predicted and ground truth)
+
+    Every predicted community contributes its best Jaccard score against the
+    ground truth with weight 1 / (2 * k); every ground truth community
+    contributes its best score against the predictions with weight
+    1 / (2 * l).
+
+    args:
+        pre_ys (k * N): binary membership rows of the predicted communities
+        true_ys (l * N): binary membership rows of the golden communities
+
+    raises:
+        ZeroDivisionError: if either argument is empty.
+    """
+    score = 0.0
+
+    pre_weight = 1.0 / (2 * len(pre_ys))
+    for pre_y in pre_ys:
+        best = max(jc_pair(pre_y, true_y) for true_y in true_ys)
+        score += best * pre_weight
+
+    true_weight = 1.0 / (2 * len(true_ys))
+    for true_y in true_ys:
+        best = max(jc_pair(pre_y, true_y) for pre_y in pre_ys)
+        score += best * true_weight
+
+    return score
+
+def _read_int_matrix(path):
+    """Read the CSV file at *path* into a list of integer rows, skipping blank lines."""
+    # The csv module expects a binary-mode file on Python 2 and a text-mode
+    # file opened with newline='' on Python 3.
+    if sys.version_info[0] < 3:
+        handle = open(path, 'rb')
+    else:
+        handle = open(path, 'r', newline='')
+    with handle as csvfile:
+        return [[int(element) for element in line]
+                for line in csv.reader(csvfile) if line]
+
+
+if __name__ == "__main__":
+    if len(sys.argv) == 3:
+        # Usage: evaluate.py <truth.csv> <pred.csv>; one community per CSV row.
+        truth = _read_int_matrix(sys.argv[1])
+        pred = _read_int_matrix(sys.argv[2])
+
+        print('f1 score:')
+        print(f1_community(pred, truth))
+        print('jc score:')
+        print(jc_community(pred, truth))
+        print('nmi score:')
+        print(nmi_community(pred, truth))
+    else:
+        # Built-in demo: score three candidate covers against y.
+        y = [[1, 1, 0, 1, 0], [0, 1, 0, 1, 1], [1, 0, 0, 0, 1], [0, 0, 0, 1, 0]]
+        x1 = [[0, 1, 0, 1, 1], [1, 1, 0, 1, 0], [1, 0, 0, 0, 1], [0, 0, 0, 1, 0]] #same
+        x2 = [[1, 1, 1, 1, 0], [0, 1, 0, 1, 1], [1, 0, 0, 0, 1], [0, 0, 0, 1, 0]] #1 error
+        x3 = [[0, 0, 0, 1, 1], [0, 0, 0, 1, 1], [0, 0, 0, 1, 1], [0, 0, 0, 1, 1]] #lots of error
+
+        print('f1 score:')
+        print(f1_community(x1, y))
+        print(f1_community(x2, y))
+        print(f1_community(x3, y))
+
+        print('jc score:')
+        print(jc_community(x1, y))
+        print(jc_community(x2, y))
+        print(jc_community(x3, y))
+
+        print('nmi score:')
+        print(nmi_community(x1, y))
+        print(nmi_community(x2, y))
+        print(nmi_community(x3, y))