In [1]:
import json, os, re
from metadata_metrics import readability_score, completeness_score_img, completeness_score_jsonxml, completeness_score_keyword, completeness_score_netcdf, completeness_score_tabular, tfidf_score
import numpy as np
import matplotlib.pyplot as plt
import pickle as pkl

# Getting Data

In [9]:

def get_metadata_metrics(dequeue_list, file_to_index_dict, tfidf_precomputed=None):
	"""Score the extracted metadata of every entry in a dequeue list.

	Args:
		dequeue_list: iterable of (file, extractor, cost) triples. `extractor`
			is one of 'keyword', 'tabular', 'json/xml', 'netcdf' or 'image'.
		file_to_index_dict: maps each `file` to the index embedded in the name
			of its extracted-metadata JSON. The index is concatenated into the
			path, so it must be a string.
		tfidf_precomputed: optional dict used as a TF-IDF cache, keyed by the
			extracted-metadata path. It is updated in place so the same dict
			can be passed to successive calls. Defaults to a fresh cache.

	Returns:
		Four dicts keyed by `file`, in this order: readability scores,
		completeness scores, TF-IDF scores and costs.

	Entries with an unrecognised extractor are reported on stdout and left out
	of all four dicts, because there is no metadata file to score for them.
	"""
	if tfidf_precomputed is None:
		tfidf_precomputed = dict()

	# extractor name -> (output directory, filename suffix, completeness scorer)
	extractor_outputs = {
		'keyword': ('/home/cc/CDIACMetadataExtract/CDIACKeywordExtract/', 'KWXtract', completeness_score_keyword),
		'tabular': ('/home/cc/CDIACMetadataExtract/CDIACTabularExtracted/', 'TabXtract', completeness_score_tabular),
		'json/xml': ('/home/cc/CDIACMetadataExtract/CDIACJSONXMLExtracted/', 'JSONXMLXtract', completeness_score_jsonxml),
		'netcdf': ('/home/cc/CDIACMetadataExtract/CDIACNETCDFExtracted/', 'NetCDFXtract', completeness_score_netcdf),
		'image': ('/home/cc/CDIACMetadataExtract/CDIACImgPredictions/', 'ImgXtract', completeness_score_img),
	}

	readability_scores = dict()
	completeness_scores = dict()
	tfidf_scores = dict()
	cost_scores = dict()

	for file, extractor, cost in dequeue_list:
		if extractor not in extractor_outputs:
			# Without a known extractor there is no metadata path; scoring a
			# path left over from the previous entry would be wrong.
			print('Something went wrong: ', extractor)
			continue

		directory, suffix, completeness_fn = extractor_outputs[extractor]
		filename = file.split("/")[-1]
		filepath = directory + filename + suffix + file_to_index_dict[file] + '.json'

		completeness_scores[file] = completeness_fn(filepath)
		readability_scores[file] = readability_score(filepath)

		# The cache is read and written under the same key (the metadata path),
		# and the result is always stored under `file` like the other metrics.
		if filepath not in tfidf_precomputed:
			tfidf_precomputed[filepath] = tfidf_score(filepath)
		tfidf_scores[file] = tfidf_precomputed[filepath]

		cost_scores[file] = cost

	return readability_scores, completeness_scores, tfidf_scores, cost_scores

### Small Test

In [None]:
# One pickled dequeue list is enough for a quick check of the metrics.
# NOTE: unpickling runs code embedded in the file; only load trusted pickles.
with open("Experiment3/dequeue_list_threshold_0.0005.pkl", 'rb') as fp:
	test_dequeue_list = pkl.load(fp)

# The JSON mapping is inverted on load: its values become the keys used for
# lookups and its keys become the looked-up values.
with open("EstimateTime/file_to_index.json", "r") as fp:
	file_to_index_dict = {value: key for key, value in json.load(fp).items()}

In [None]:
# get_metadata_metrics also takes a TF-IDF cache; the small test starts from an empty one.
read, complete, tfidf, cost_scores = get_metadata_metrics(test_dequeue_list, file_to_index_dict, dict())

# Experimentation

In [3]:
# The JSON mapping is inverted on load: its values become the keys used for
# lookups and its keys become the looked-up values.
with open("EstimateTime/file_to_index.json", "r") as fp:
	file_to_index_dict = {value: key for key, value in json.load(fp).items()}

In [10]:
# One entry per threshold, in the order the dequeue lists are visited.
X = []
Y_readability = []    # number of files with a readability score
Y_completeness = []   # number of files with completeness > 0
Y_tfidf = []          # number of files with TF-IDF > 0

# threshold (as written in the filename) -> list of valid scores
readability_total = dict()
completeness_total = dict()
tfidf_total = dict()

average_readability = []
average_completeness = []
average_tfidf = []

# TF-IDF cache shared by every get_metadata_metrics call below.
tfidf_computed = dict()


def _mean_or_nan(values):
	"""Return the arithmetic mean of `values`, or NaN when it is empty."""
	return sum(values) / len(values) if values else float('nan')


for subdir, dirs, files in os.walk("Experiment3/"):
	for file in sorted(files):
		# Only pickled dequeue lists carry a threshold in their name;
		# Times.txt and any other file are skipped.
		threshold_match = re.search(r"dequeue_list_threshold_(.*)\.pkl", file)
		if threshold_match is None:
			continue

		filepath = os.path.join(subdir, file)
		threshold = threshold_match.group(1)
		print("Threshold: ", threshold)
		X.append(float(threshold))
		# NOTE: unpickling runs code embedded in the file; only load trusted pickles.
		with open(filepath, "rb") as fp:
			dequeue_list = pkl.load(fp)

		readability_scores, completeness_scores, tfidf_scores, cost_scores = get_metadata_metrics(dequeue_list, file_to_index_dict, tfidf_computed)

		# Readability: 'no strings' marks a file without a score.
		readable = [value for value in readability_scores.values() if value != 'no strings']
		Y_readability.append(float(len(readable)))
		readability_total[threshold] = readable

		avg_readability = _mean_or_nan(readable)
		print("Readability: ", avg_readability)
		average_readability.append(avg_readability)

		# Completeness: only strictly positive scores count.
		complete_valid = [value for value in completeness_scores.values() if value > 0]
		Y_completeness.append(float(len(complete_valid)))
		completeness_total[threshold] = complete_valid

		avg_completeness = _mean_or_nan(complete_valid)
		print("Completeness", avg_completeness)
		average_completeness.append(avg_completeness)

		# TF-IDF: count the strictly positive scores (the counter used to be
		# reset to 0 on every hit, so the plotted count was always 0).
		tfidf_valid = [value for value in tfidf_scores.values() if value > 0]
		Y_tfidf.append(float(len(tfidf_valid)))
		tfidf_total[threshold] = tfidf_valid

		avg_tfidf = _mean_or_nan(tfidf_valid)
		print("TFIDF", avg_tfidf)
		average_tfidf.append(avg_tfidf)




Threshold:  0.1
Calculating TI/IDF of: /home/cc/CDIACMetadataExtract/CDIACNETCDFExtracted/SOCAT_tracks_gridded_monthly.ncNetCDFXtract67.json
Calculating TI/IDF of: /home/cc/CDIACMetadataExtract/CDIACNETCDFExtracted/SOCAT_tracks_gridded_decades_v2.ncNetCDFXtract11.json
Calculating TI/IDF of: /home/cc/CDIACMetadataExtract/CDIACTabularExtracted/SOCAT_tracks_gridded_yearly_v2.ncTabXtract57.json
Calculating TI/IDF of: /home/cc/CDIACMetadataExtract/CDIACNETCDFExtracted/SOCAT_tracks_gridded_monthly_v4.ncNetCDFXtract69.json
Calculating TI/IDF of: /home/cc/CDIACMetadataExtract/CDIACNETCDFExtracted/SOCAT_tracks_gridded_decades_v3.ncNetCDFXtract47.json
Calculating TI/IDF of: /home/cc/CDIACMetadataExtract/CDIACNETCDFExtracted/SOCAT_tracks_gridded_month_clim_v2.ncNetCDFXtract28.json
Calculating TI/IDF of: /home/cc/CDIACMetadataExtract/CDIACTabularExtracted/SOCATv1.5_TropicalAtlantic_2011_09_22.txtTabXtract94.json
Calculating TI/IDF of: /home/cc/CDIACMetadataExtract/CDIACNETCDFExtracted/SOCAT_tracks

In [None]:
X = np.asarray(X)
Y_readability = np.asarray(Y_readability)
Y_completeness = np.asarray(Y_completeness)
Y_tfidf = np.asarray(Y_tfidf)

# One scatter + line plot per metric: number of files with a valid score at each X.
for plot_title, y_label, counts in [
	("Percent Extracted v. Readability", "Readable files", Y_readability),
	("Percent Extracted v. Completeness", "Somewhat complete files", Y_completeness),
	("Percent Extracted v. TFIDF", "Nonzero TFIDF files", Y_tfidf),
]:
	plt.title(plot_title)
	plt.xlabel("Percent Extracted")
	plt.ylabel(y_label)
	plt.scatter(X, counts)
	plt.plot(X, counts)
	plt.show()

# cost_scores is the dict most recently returned by get_metadata_metrics
# (file -> cost). Plot its values in insertion order; passing the dict itself
# as y-data does not work.
plt.title("Costs of file over time")
plt.xlabel("File Order Index")
plt.ylabel("Cost in bytes/sec")
plt.plot(range(0, len(cost_scores)), list(cost_scores.values()))
plt.show()

In [None]:
# Average score per metric against X, one figure each.
for averages, y_label, plot_title in (
	(average_readability, "Average Readability", "Percent Extracted vs Avg Readability"),
	(average_completeness, "Average Completeness", "Percent Extracted vs Avg Completeness"),
	(average_tfidf, "Average TFIDF", "Percent Extracted vs Avg TFIDF"),
):
	plt.scatter(X, averages)
	plt.plot(X, averages)
	plt.xlabel("Percent Extracted")
	plt.ylabel(y_label)
	plt.title(plot_title)
	plt.show()

In [None]:
# One box-and-whisker figure per metric, with one box per threshold.
# Each entry: (threshold -> scores dict, y-axis label, title, extra boxplot kwargs).
boxplot_specs = [
	(readability_total, "Readability Score", "Box and Whisker plots of Percent Extract vs. Readability Scores", dict(meanline=True)),
	(completeness_total, "Completeness Score", "Box and Whisker plots of Percent Extract vs. Completeness Scores", dict()),
	# The TFIDF figure used to carry the Completeness label and title.
	(tfidf_total, "TFIDF Score", "Box and Whisker plots of Percent Extract vs. TFIDF Scores", dict()),
]

for scores_by_threshold, y_label, plot_title, boxplot_kwargs in boxplot_specs:
	labels, data = [*zip(*scores_by_threshold.items())]  # 'transpose' items to parallel key, value lists
	plt.boxplot(data, **boxplot_kwargs)
	plt.xticks(range(1, len(labels) + 1), labels)
	plt.xlabel("Percent Extracted")
	plt.ylabel(y_label)
	plt.title(plot_title)
	plt.show()

