In [14]:
#load dependencies / mount drive
import numpy as np
import pandas as pd
import csv 
import json
import math
import time

In [15]:
dest_1_sm = 'CF_docl_matrix.json'
dest_2_bg = 'AB_docl_matrix.json'

# Load both datasets up front so either one can be selected below.
# JSON text is UTF-8, so decode it explicitly instead of relying on the
# platform's default encoding (which differs between operating systems).
with open(dest_1_sm, encoding='utf-8') as sm_file:
  small_data = json.load(sm_file)

with open(dest_2_bg, encoding='utf-8') as bg_file:
  big_data = json.load(bg_file)

# Select which dataset the following cells work on.
data = big_data

In [16]:
# Anonymized Twitter IDs, in the order they appear in the loaded data.
id_list = list(data)

# Same mapping as `data`, with every feature list converted to a set so the
# pairwise set operations used later are cheap.
data_set = {user_id: set(features) for user_id, features in data.items()}

# Scan every feature set once to find the overall smallest and largest
# feature value. The minimum is only reported, it is not used afterwards.
max_feature = float('-inf')
min_feature = float('inf')
for feature_set in data_set.values():
  max_feature = max(max_feature, max(feature_set))
  min_feature = min(min_feature, min(feature_set))

# Add 1 so that max_feature can be used as the vector length when feature
# values are zero-based positions.
# NOTE(review): this assumes the smallest feature value is 0, but the recorded
# run printed a minimum of 9045. Confirm whether positions below the minimum
# should really count as vector dimensions.
max_feature += 1

print("Total # of IDs:    ", len(id_list))
print("Feature Value Min: ", min_feature)
print("Feature Value Max: ", max_feature)

Total # of IDs:     9045
Feature Value Min:  9045
Feature Value Max:  45453


In [17]:
def calculate_r_value(set_a, set_b, max_feature):
  """Return the Pearson correlation distance (1 - r) between two binary vectors.

  Each vector has max_feature positions and is described by the set of
  positions that hold a 1; every other position holds a 0.

  Args:
    set_a: set of positions equal to 1 in the first vector ('x').
    set_b: set of positions equal to 1 in the second vector ('y').
    max_feature: total number of positions in each vector (must be positive).

  Returns:
    1 - r as a float. When either vector is constant (its set is empty or
    covers every position) the variance is zero and r is undefined; NaN is
    returned in that case instead of raising ZeroDivisionError.
  """
  # For 0/1 vectors the sum of the values equals the number of ones.
  sig_x = len(set_a)
  sig_y = len(set_b)

  # Mean of each vector = number of ones / number of positions.
  avg_x = sig_x / max_feature
  avg_y = sig_y / max_feature

  # Only the intersection needs a real set operation; the sizes of the other
  # three scenarios follow from it by arithmetic.
  count_both = len(set_a.intersection(set_b))             # x = 1, y = 1
  count_only_a = sig_x - count_both                       # x = 1, y = 0
  count_only_b = sig_y - count_both                       # x = 0, y = 1
  count_neither = max_feature - (sig_x + sig_y - count_both)  # x = 0, y = 0

  # Scenario A: x = 1, y = 1
  numerator = count_both * ((1-avg_x) * (1-avg_y))
  denom_x = count_both * ((1-avg_x)**2)
  denom_y = count_both * ((1-avg_y)**2)

  # Scenario B: x = 1, y = 0
  numerator += count_only_a * ((1-avg_x) * (-avg_y))
  denom_x += count_only_a * ((1-avg_x)**2)
  denom_y += count_only_a * ((-avg_y)**2)

  # Scenario C: x = 0, y = 1
  numerator += count_only_b * ((-avg_x) * (1-avg_y))
  denom_x += count_only_b * ((-avg_x)**2)
  denom_y += count_only_b * ((1-avg_y)**2)

  # Scenario D: x = 0, y = 0
  numerator += count_neither * (avg_x * avg_y)
  denom_x += count_neither * (avg_x**2)
  denom_y += count_neither * (avg_y**2)

  denominator = math.sqrt(denom_x * denom_y)

  # A zero denominator means at least one vector has no variance.
  if denominator == 0:
    return float('nan')

  return 1 - (numerator / denominator)

In [18]:
'''
Normal Execution
'''
def create_PCC_matrix(max_feature, id_list, data_set):
  """Compute the Pearson correlation distance for every unordered pair of IDs.

  Args:
    max_feature: vector length forwarded to calculate_r_value.
    id_list: sequence of IDs; its order defines the matrix row/column order.
    data_set: mapping from each ID in id_list to its set of features.

  Returns:
    A tuple (adjacency_list, adjacency_matrix):
      adjacency_list: one [id_1, id_2, distance] entry per pair, with id_1
        appearing before id_2 in id_list.
      adjacency_matrix: len(id_list) x len(id_list) numpy array. Only the
        entries above the diagonal are filled; the diagonal and the lower
        triangle stay 0.

  Also prints the elapsed wall-clock time in milliseconds.
  """
  # perf_counter is a monotonic clock intended for measuring intervals.
  start_time = time.perf_counter()
  id_count = len(id_list)
  adjacency_matrix = np.zeros((id_count, id_count))
  adjacency_list = []

  # Resolve every ID's feature set once instead of on each of the O(n^2)
  # inner-loop iterations.
  feature_sets = [data_set[current_id] for current_id in id_list]

  for x in range(id_count):
    id_x = id_list[x]
    set_x = feature_sets[x]
    for y in range(x + 1, id_count):
      adjacency_matrix[x, y] = calculate_r_value(set_x, feature_sets[y], max_feature)
      adjacency_list.append([id_x, id_list[y], adjacency_matrix[x, y]])

  print("Time to Execute (ms): ", str((time.perf_counter() - start_time)*1000))
  return adjacency_list , adjacency_matrix

# Run the pairwise distance computation over every ID in the selected dataset.
# The recorded run reported roughly 5.58 million ms (about 93 minutes).
adjacency_list, adjacency_matrix = create_PCC_matrix(
  max_feature=max_feature,
  id_list=id_list,
  data_set=data_set,
)

Time to Execute (ms):  5581196.880578995


In [19]:
# Write the distance matrix: a header row of IDs, then one row of values per
# ID in the same order (rows carry no ID label of their own).
# newline='' hands line-ending control to the csv module; without it, extra
# blank rows appear on platforms that translate '\n' to '\r\n'.
with open('adj_matrix.csv', 'w', newline='') as fh:
    writer = csv.writer(fh, delimiter=',')
    writer.writerow(id_list)
    writer.writerows(adjacency_matrix)

# Write the pair list: one (id_1, id_2, distance) row per pair.
csv_columns = ['id_1','id_2', 'PCD']
with open('adj_list.csv', 'w', newline='') as fh:
    writer = csv.writer(fh, delimiter=',')
    writer.writerow(csv_columns)
    writer.writerows(adjacency_list)