In [1]:
import json, os, re
from metadata_metrics import readability_score, completeness_score_img, completeness_score_jsonxml, completeness_score_keyword, completeness_score_netcdf, completeness_score_tabular, tfidf_score
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.cbook as cbook
import pickle as pkl

# Getting Data

In [3]:

def get_metadata_metrics(dequeue_list, file_to_index_dict, tfidf_precomputed, threshold, base_dir='/home/cc/CDIACMetadataExtract'):
	"""Score the extracted-metadata files for the leading entries of `dequeue_list`.

	Parameters
	----------
	dequeue_list : iterable of (file, extractor, cost)
		Entries are visited in order. `extractor` must be one of 'keyword',
		'tabular', 'json/xml', 'netcdf' or 'image'.
	file_to_index_dict : dict
		Maps `file` to the string that is appended to the output file name.
	tfidf_precomputed : dict
		Cache of tfidf scores keyed by output path. It is read and updated in
		place so repeated calls do not recompute a score.
	threshold : int or float
		Number of leading entries to consider. Iteration stops as soon as the
		number of entries consumed reaches or exceeds this value, so a
		fractional value such as ``0.1 * len(dequeue_list)`` is honoured.
	base_dir : str, optional
		Root directory that holds the per-extractor output folders.

	Returns
	-------
	tuple
		``(readability_scores, completeness_scores, tfidf_scores, cost_scores,
		extractor_count, file_count)``. Readability, tfidf and cost are keyed
		by output path; completeness and `file_count` are keyed by `file`;
		`extractor_count` is keyed by extractor name.

	Raises
	------
	KeyError
		If a `file` with a recognised extractor is missing from
		`file_to_index_dict`.
	"""
	# extractor name -> (output sub-directory, file-name suffix, completeness scorer)
	extractor_specs = {
		'keyword': ('CDIACKeywordExtract', 'KWXtract', completeness_score_keyword),
		'tabular': ('CDIACTabularExtracted', 'TabXtract', completeness_score_tabular),
		'json/xml': ('CDIACJSONXMLExtracted', 'JSONXMLXtract', completeness_score_jsonxml),
		'netcdf': ('CDIACNETCDFExtracted', 'NetCDFXtract', completeness_score_netcdf),
		'image': ('CDIACImgPredictions', 'ImgXtract', completeness_score_img),
	}

	readability_scores = {}
	completeness_scores = {}
	tfidf_scores = {}
	cost_scores = {}
	extractor_count = {name: 0 for name in extractor_specs}
	file_count = {}

	for position, (file, extractor, cost) in enumerate(dequeue_list):
		# `>=` rather than `==`: the caller passes fraction * len(dequeue_list),
		# which is usually not a whole number and would never compare equal.
		if position >= threshold:
			break

		spec = extractor_specs.get(extractor)
		if spec is None:
			# Nothing can be scored for an unknown extractor. Skip the entry
			# (it still counts toward `threshold`) instead of falling through
			# with a stale or undefined output path.
			print('Something went wrong: ', extractor)
			continue

		subdir, suffix, completeness_scorer = spec
		filename = file.rsplit("/", 1)[-1]
		filepath = base_dir + '/' + subdir + '/' + filename + suffix + file_to_index_dict[file] + '.json'

		# Keyed by `file`, not by output path: when a file appears with several
		# extractors, the most recent completeness score replaces the earlier one.
		completeness_scores[file] = completeness_scorer(filepath)

		file_count[file] = file_count.get(file, 0) + 1
		extractor_count[extractor] += 1
		readability_scores[filepath] = readability_score(filepath)

		if filepath not in tfidf_precomputed:
			tfidf_precomputed[filepath] = tfidf_score(filepath)
		tfidf_scores[filepath] = tfidf_precomputed[filepath]

		cost_scores[filepath] = cost

	return readability_scores, completeness_scores, tfidf_scores, cost_scores, extractor_count, file_count

# Experimentation

In [4]:
# Load the stored mapping and flip it so that the stored values become the
# lookup keys. NOTE(review): presumably the JSON is stored as index -> file
# despite its name; verify against the file.
with open("EstimateTime/file_to_index.json", "r") as fp:
	stored_mapping = json.load(fp)
file_to_index_dict = {value: key for key, value in stored_mapping.items()}

In [5]:
# Fractions of the dequeue list evaluated by the experiment loop.
X = [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]

# Per-threshold counts of usable scores.
Y_readability, Y_completeness, Y_tfidf = [], [], []

# threshold -> list of usable scores.
readability_total, completeness_total, tfidf_total = {}, {}, {}

# Per-threshold mean of the usable scores.
average_readability, average_completeness, average_tfidf = [], [], []

# One dict per threshold.
extractor_counts, file_counts = [], []

# Cache of tfidf scores shared across thresholds to avoid recomputation.
tfidf_computed = {}

In [6]:
def _valid_scores(scores, invalid):
	"""Return the values of `scores` that differ from the `invalid` sentinel."""
	return [value for value in scores.values() if value != invalid]


def _mean_or_zero(values):
	"""Return the arithmetic mean of `values`, or 0 when `values` is empty."""
	return sum(values) / len(values) if values else 0


# NOTE(review): pickle.load can execute arbitrary code; only load a file you trust.
# The file is closed as soon as the list is read instead of staying open for the
# whole experiment.
with open('Experiment4/dequeue_list.pkl', 'rb') as fp:
	dequeue_list = pkl.load(fp)

# NOTE: this cell appends to the accumulators created in the initialisation cell;
# re-run that cell first if this one is executed again.
for threshold in X:
	print("Threshold: ", threshold)
	# Convert the fraction to a whole number of entries: a fractional cutoff can
	# never equal the integer counter used inside get_metadata_metrics.
	input_threshold = int(round(threshold * len(dequeue_list)))

	readability_scores, completeness_scores, tfidf_scores, cost_scores, extractor_count, file_count = get_metadata_metrics(dequeue_list, file_to_index_dict, tfidf_computed, input_threshold)

	extractor_counts.append(extractor_count)
	file_counts.append(file_count)

	# Readability: 'no strings' marks an output without a usable score.
	readable = _valid_scores(readability_scores, 'no strings')
	Y_readability.append(float(len(readable)))
	readability_total[threshold] = readable
	avg_readability = _mean_or_zero(readable)
	print("Readability: ", avg_readability)
	average_readability.append(avg_readability)

	# Completeness: zero scores are excluded.
	complete_valid = _valid_scores(completeness_scores, 0)
	Y_completeness.append(float(len(complete_valid)))
	completeness_total[threshold] = complete_valid
	avg_completeness = _mean_or_zero(complete_valid)
	print("Completeness", avg_completeness)
	average_completeness.append(avg_completeness)

	# TFIDF: zero scores are excluded.
	tfidf_valid = _valid_scores(tfidf_scores, 0)
	Y_tfidf.append(float(len(tfidf_valid)))
	tfidf_total[threshold] = tfidf_valid
	avg_tfidf = _mean_or_zero(tfidf_valid)
	print("TFIDF", avg_tfidf)
	average_tfidf.append(avg_tfidf)

	print("Precomputed length: ", len(tfidf_computed))



Threshold:  0
Readability:  0
Completeness 0
TFIDF 0
Precomputed length:  0
Threshold:  0.1


In [None]:
# Sanity check: the x axis and every y series should have the same length.
for series in (X, Y_readability, Y_completeness, Y_tfidf):
	print(len(series))


In [None]:
# Convert the collected series to arrays for plotting.
X = np.asarray(X)
Y_readability = np.asarray(Y_readability)
Y_completeness = np.asarray(Y_completeness)
Y_tfidf = np.asarray(Y_tfidf)

print(len(Y_readability))

# One figure per metric: number of usable scores at each threshold.
for title, ylabel, series in (
	("Percent Extracted v. Readability", "Readable files", Y_readability),
	("Percent Extracted v. Completeness", "Somewhat complete files", Y_completeness),
	("Percent Extracted v. TFIDF", "Nonzero TFIDF files", Y_tfidf),
):
	plt.title(title)
	plt.xlabel("Percent Extracted")
	plt.ylabel(ylabel)
	plt.scatter(X, series)
	plt.plot(X, series)
	plt.show()

# Costs in insertion order; `cost_scores` is the dict left over from the last
# threshold evaluated by the experiment loop.
cost_values = list(cost_scores.values())
plt.title("Reward of file over time")
plt.xlabel("File Order Index")
plt.ylabel("Cost in bytes/sec")
plt.plot(range(len(cost_values)), cost_values)
plt.show()

### Processing of average readability

In [None]:
# Scale the per-threshold mean readability by its maximum.
average_readability = np.array(average_readability, dtype=float)
peak_readability = np.max(average_readability)
# When every mean is 0, dividing by the maximum would produce NaN; leave the
# zeros untouched in that case.
if peak_readability != 0:
	average_readability = average_readability / peak_readability
print(average_readability)

### Processing of average completeness

In [None]:
# Sanity check: number of non-zero completeness scores kept at the 0.1 threshold.
# The keys of completeness_total are the numeric thresholds from X, so the lookup
# uses the float 0.1 (the string '0.1' raises KeyError).
print(len(completeness_total[0.1]))

# Sum of the kept completeness scores for each threshold, in insertion order.
cum_completeness = np.array(
	[np.sum(scores) for scores in completeness_total.values()],
	dtype=float,
)

print(cum_completeness)

In [None]:
# One figure per summary series; the labels are kept exactly as in each original plot.
for series, xlabel, ylabel, title in (
	(
		average_readability,
		"File Extractor Pairs Processed",
		"Average Readability (Normalized)",
		"File Extractor Pairs Processed vs Avg Readability",
	),
	(
		cum_completeness,
		"File Extractor Pairs Processed",
		"Cumulative Completeness",
		"File Extractor Pairs Processed vs Cumulative Completeness",
	),
	(
		average_tfidf,
		"Percent Extracted",
		"Average TFIDF",
		"Percent Extracted vs Avg TFIDF",
	),
):
	plt.scatter(X, series)
	plt.plot(X, series)
	plt.xlabel(xlabel)
	plt.ylabel(ylabel)
	plt.title(title)
	plt.show()

In [None]:
# Show the extractor tallies collected by the experiment loop (one dict per threshold).
print(extractor_counts)

In [None]:
# Per-threshold series, one list per extractor.
keyword_count, tabular_count, jsonxml_count, netcdf_count, image_count = [], [], [], [], []

# Route each tally to the list that belongs to its extractor name.
series_by_extractor = {
	"keyword": keyword_count,
	"tabular": tabular_count,
	"netcdf": netcdf_count,
	"json/xml": jsonxml_count,
	"image": image_count,
}

for tallies in extractor_counts:
	for extractor_name, total in tallies.items():
		target = series_by_extractor.get(extractor_name)
		if target is None:
			print("Something went wrong")
		else:
			target.append(total)

# Number of distinct files seen at each threshold.
file_count_list = [len(per_file) for per_file in file_counts]
	

In [None]:
# Print each per-threshold series on its own line.
for tally in (keyword_count, tabular_count, jsonxml_count, netcdf_count, image_count, file_count_list):
	print(tally)

In [None]:
# Extraction counts per extractor, plus the number of distinct files, at each threshold.
for series, label in (
	(keyword_count, "Keyword Extractions"),
	(tabular_count, "Tabular Extractions"),
	(jsonxml_count, "JSON/XML Extractions"),
	(netcdf_count, "NetCDF Extractions"),
	(image_count, "Image Extractions"),
	(file_count_list, "Unique Files extracted"),
):
	plt.plot(X, series, label=label)
# Title and axis labels so the figure can be read on its own.
plt.xlabel("Percent Extracted")
plt.ylabel("Count")
plt.title("Percent Extracted vs Extractions per Extractor")
plt.legend()
plt.show()

In [None]:
# One box plot per metric; only the readability plot overrides the whisker percentiles.
for totals, ylabel, title, box_options in (
	(
		readability_total,
		"Readability Score",
		"Box and Whisker plots of Percent Extract vs. Readability Scores",
		{"whis": [5, 95]},
	),
	(
		completeness_total,
		"Completeness Score",
		"Box and Whisker plots of Percent Extract vs. Completeness Scores",
		{},
	),
	(
		tfidf_total,
		"TFIDF Score",
		"Box and Whisker plots of Percent Extract vs. TFIDF Scores",
		{},
	),
):
	# Transpose the (threshold, scores) items into parallel label and data tuples.
	labels, data = zip(*totals.items())
	plt.boxplot(data, **box_options)
	plt.xticks(range(1, len(labels) + 1), labels)
	plt.xlabel("Percent Extracted")
	plt.ylabel(ylabel)
	plt.title(title)
	plt.show()



In [None]:
# Hard-coded timings, one per threshold.
# NOTE(review): presumably wall-clock seconds from an earlier run; confirm the source.
Threshold = [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]
Time = [
	0,
	192.54913187026978,
	201.7842311859131,
	287.3370723724365,
	437.8477611541748,
	585.3103694915771,
	591.996896982193,
	598.9227044582367,
	606.2875552177429,
	614.1233706474304,
	967.380571603775,
]

plt.plot(Threshold, Time)
plt.title("threshold vs time")
plt.xlabel("threshold")
plt.ylabel("time")
plt.show()