In [9]:
# General purpose libraries
import boto3
import copy
import csv
import datetime
import json
import numpy as np
import pandas as pd
import s3fs
from collections import defaultdict
import time
import re
import random
from sentence_transformers import SentenceTransformer
import sentencepiece
from scipy.spatial import distance
from json import JSONEncoder
import sys
sys.path.append("/Users/dafirebanks/Projects/policy-data-analyzer/")
from tasks.data_loading.src.utils import *

### 1. Set up AWS

In [2]:
def aws_credentials_from_file(f_name):
    """Read the AWS key pair stored in a JSON credentials file.

    The file must contain a JSON object with an ``"aws"`` entry that holds
    the ``"id"`` and ``"secret"`` values.

    Args:
        f_name: Path of the JSON credentials file.

    Returns:
        A ``(id, secret)`` tuple taken from the ``"aws"`` entry.

    Raises:
        KeyError: If ``"aws"``, ``"id"`` or ``"secret"`` is missing.
    """
    with open(f_name, "r") as handle:
        aws_section = json.load(handle)["aws"]

    return aws_section["id"], aws_section["secret"]


In [5]:
# NOTE(review): absolute, user-specific path; this cell only runs on a machine
# that has the credentials file at this exact location.
credentials_file = '/Users/dafirebanks/Documents/credentials.json'
# The key pair is read from disk so it never appears in the notebook itself.
aws_id, aws_secret = aws_credentials_from_file(credentials_file)
region = 'us-east-1'

# S3 resource handle used by the cells below to list and read bucket objects.
s3 = boto3.resource(
    service_name = 's3',
    region_name = region,
    aws_access_key_id = aws_id,
    aws_secret_access_key = aws_secret
)

### 2. Load sentences from folder

In [6]:
# Accumulates the parsed JSON content of every sentence file in the folder.
policy_dict = {}
objs = []  # NOTE(review): not used by any visible cell; kept in case another cell reads it
language = "english"
bucket_name = 'wri-nlp-policy'
sents_folder = f"{language}_documents/sentences"

# Walk every object stored under the sentences folder of the bucket.
for i, obj in enumerate(s3.Bucket(bucket_name).objects.filter(Prefix=sents_folder)):
#     print(obj.key)
    # Keys ending in "/" (e.g. the folder entry itself) carry no JSON payload.
    if not obj.key.endswith("/"):
        serializedObject = obj.get()['Body'].read()
        # Merge in place: rebuilding the dict with {**a, **b} for every file
        # copies everything loaded so far and makes the loop quadratic.
        policy_dict.update(json.loads(serializedObject))

        # Uncomment this for testing purposes
#         if i == 10:
#             break

english_documents/sentences/
english_documents/sentences/0002304f1f671ea916ae0a1f784484eb4874ceaa_sents.json
english_documents/sentences/0002a815db93aaba959b04dbeaa17e87f8585734_sents.json
english_documents/sentences/0005bd689cd9cc6ab99194b7bb5aed32fe0bbb47_sents.json
english_documents/sentences/0007adab1d0acfc53274596b784bd85cdfbe502e_sents.json
english_documents/sentences/000cc1ce72ee0d48230e5da68fe69c11ac428cfa_sents.json
english_documents/sentences/0010b6d44c3d296ba1bfaa24c718ab16d9ae017b_sents.json
english_documents/sentences/00175bb6c2a2b3368a9cfdfbd895ecac48920ccb_sents.json
english_documents/sentences/00198951ca0fcb94619e41d5256c31e8ce57d70f_sents.json
english_documents/sentences/0020eabfcbd08554df1bd58438dcfda7f454d8c7_sents.json
english_documents/sentences/00274763715091c9c1186a4d13f61ef5b773b923_sents.json


In [8]:
# Display the top-level keys that were merged from the sentence files.
policy_dict.keys()

dict_keys(['0002304f1f671ea916ae0a1f784484eb4874ceaa', '0002a815db93aaba959b04dbeaa17e87f8585734', '0005bd689cd9cc6ab99194b7bb5aed32fe0bbb47', '0007adab1d0acfc53274596b784bd85cdfbe502e', '000cc1ce72ee0d48230e5da68fe69c11ac428cfa', '0010b6d44c3d296ba1bfaa24c718ab16d9ae017b', '00175bb6c2a2b3368a9cfdfbd895ecac48920ccb', '00198951ca0fcb94619e41d5256c31e8ce57d70f', '0020eabfcbd08554df1bd58438dcfda7f454d8c7', '00274763715091c9c1186a4d13f61ef5b773b923'])

#### 2.1 The sentences were loaded per document; now we merge all of them into a single dictionary

In [10]:
def labeled_sentences_from_dataset(dataset):
    """Flatten a per-document dataset into one sentence mapping.

    Args:
        dataset: Mapping whose values are document dicts, each exposing a
            ``'sentences'`` entry.

    Returns:
        A new dict holding the entries of every document's ``'sentences'``
        entry. When two documents share a key, the later document wins.
    """
    merged = {}
    per_document = (document['sentences'] for document in dataset.values())
    for document_sentences in per_document:
        merged.update(document_sentences)
    return merged

# One flat mapping holding the sentence entries of all loaded documents.
sentences = labeled_sentences_from_dataset(policy_dict)

#### OPTIONAL: 2.2 Now, we'll randomly sample 10 sentences for testing purposes

In [19]:
# Draw up to 10 distinct sentence ids for a quick test run. The size is
# clamped because random.sample raises ValueError when asked for more items
# than the population holds (e.g. when only a few sentences were loaded).
# NOTE(review): no random seed is set, so the sample differs between runs.
sample_sentence_ids = random.sample(list(sentences), min(10, len(sentences)))
sample_sentences = {s_id: sentences[s_id] for s_id in sample_sentence_ids}

### 3. Compute embeddings

In [13]:
def create_sentence_embeddings(model, sentences_dict):
    """Encode the lower-cased text of every sentence with ``model``.

    Args:
        model: Object exposing ``encode(text, show_progress_bar=...)``.
        sentences_dict: Mapping of sentence id to a dict with a ``'text'`` entry.

    Returns:
        Dict mapping each sentence id to whatever ``model.encode`` returned
        for that sentence's lower-cased text.
    """
    return {
        sentence_id: model.encode(sentence_map['text'].lower(), show_progress_bar=False)
        for sentence_id, sentence_map in sentences_dict.items()
    }

In [20]:
# Start of the timed section; it covers both loading the model and encoding.
Ti = time.perf_counter()

# We will use only one transformer to compute embeddings
transformer_name = 'xlm-r-bert-base-nli-stsb-mean-tokens'

model = SentenceTransformer(transformer_name)
# Only the sampled sentences are embedded here, not the full `sentences` mapping.
embs = create_sentence_embeddings(model, sample_sentences)

Tf = time.perf_counter()

print(f"Time taken for creating the embeddings: {Tf - Ti:0.4f} seconds")

Time taken for creating the embeddings: 7.1771 seconds


### 4. Calculate cosine similarity

In [42]:
def sentence_similarity_search(model, queries, sentence_embeddings, sentences, similarity_limit, results_limit, filename):
    """Run a similarity search for every query and dump the results to JSON.

    Each query is scored against the sentences with ``get_distance``; the
    first ``results_limit`` matches are kept per query. The mapping is then
    written to ``../../output/<filename>.json`` (relative to the working
    directory) and also returned.

    Args:
        model: Encoder passed through to ``get_distance``.
        queries: Iterable of query strings.
        sentence_embeddings: Mapping of sentence id to its embedding.
        sentences: Mapping of sentence id to a dict with a ``'text'`` entry.
        similarity_limit: Score a sentence must exceed to be kept.
        results_limit: Maximum number of matches kept per query.
        filename: Output file name without the ``.json`` extension.

    Returns:
        Dict mapping each query to its list of
        ``[sentence_id, score, text]`` matches.
    """
    import os  # local import so the function does not depend on the import cell

    results = {}
    for query in queries:
        Ti = time.perf_counter()
        similarities = get_distance(model, sentence_embeddings, sentences, query, similarity_limit)
        results[query] = similarities[:results_limit]
        Tf = time.perf_counter()
        print(f"similarity search for query {query} has been done in {Tf - Ti:0.4f} seconds")

    output_dir = "../../output/"
    # Create the folder if needed: without it, open() fails only after every
    # search has already run and the computed results are lost.
    os.makedirs(output_dir, exist_ok=True)
    output_file = os.path.join(output_dir, filename + ".json")
    with open(output_file, 'w') as fp:
        json.dump(results, fp, indent=4)
    return results

def get_distance(model, sentence_emb, sentences_dict, query, similarity_treshold):
    """Rank sentences by cosine similarity to ``query``.

    Args:
        model: Object exposing ``encode(text, show_progress_bar=...)``.
        sentence_emb: Mapping of sentence id to its embedding vector.
        sentences_dict: Mapping of sentence id to a dict with a ``'text'`` entry.
        query: Query string; it is lower-cased before being encoded.
        similarity_treshold: Score a sentence must strictly exceed to be kept.

    Returns:
        List of ``[sentence_id, score, text]`` entries sorted by descending
        score, where score is the cosine similarity rounded to 4 decimals.
    """
    query_vector = model.encode(query.lower(), show_progress_bar=False)

    matches = []
    for sentence_id in sentences_dict:
        similarity = round(1 - distance.cosine(sentence_emb[sentence_id], query_vector), 4)
        # Negated ">" (rather than "<=") so a NaN score is dropped as well.
        if not (similarity > similarity_treshold):
            continue
        matches.append([sentence_id, similarity, sentences_dict[sentence_id]['text']])

    matches.sort(key=lambda match: match[1], reverse=True)
    return matches

In [43]:
# NOTE(review): this loads the same transformer name that the embedding cell
# above already loaded into `model`.
transformer_name ='xlm-r-bert-base-nli-stsb-mean-tokens'
model = SentenceTransformer(transformer_name)
# Only sentences scoring above this cosine similarity are kept.
similarity_threshold = 0.2
# Upper bound on the number of matches kept per query.
search_results_limit = 1000
queries = ["We will offer 10 dollars to each farmer that plants 1 seed per hour", "A number of instrumental equipment for farming will be provided to every farmer in need"]
# File name (without extension) of the JSON dump written by the search.
out_fname = "test_similarity_file"

# Searches the sampled sentences only (`embs` / `sample_sentences`).
results_dict = sentence_similarity_search(model, queries, embs, sample_sentences, similarity_threshold, search_results_limit, out_fname)

similarity search for query We will offer 10 dollars to each farmer that plants 1 seed per hour has been done in 0.0949 seconds
similarity search for query A number of instrumental equipment for farming will be provided to every farmer in need has been done in 0.0960 seconds


In [44]:
# Inspect the search output: query -> list of [sentence_id, score, text].
results_dict

{'We will offer 10 dollars to each farmer that plants 1 seed per hour': [['00274763715091c9c1186a4d13f61ef5b773b923_sent_4310',
   0.2974,
   '(i) In units Oahu--Coastal--Unit 9, Oahu--Coastal--Unit 11, and Oahu--Coastal--Unit 12, the physical and biological features of critical habitat are: (A) Elevation: Less than 980 ft (300 m).'],
  ['00274763715091c9c1186a4d13f61ef5b773b923_sent_4239',
   0.2152,
   '(B) Annual precipitation: 50 to 75 in (130 to 190 cm).'],
  ['00274763715091c9c1186a4d13f61ef5b773b923_sent_1240',
   0.2113,
   'The following activities could potentially result in a violation of section 9 of the Act; this list is not comprehensive: (1) Unauthorized collecting, handling, possessing, selling, delivering, carrying, or transporting of the species, including import or export across State lines and international boundaries, except for properly documented antique specimens of these taxa at least 100 years old, as defined by section 10(h)(1) of the Act.']],
 'A number of i

### 5. Output results in CSV format to S3 for labeling 

In [75]:
def save_results_as_separate_csv(results_dictionary, aws_id, aws_secret):
    """Write the matches of each query to its own CSV file on S3.

    Files are named ``query_<i>_results.csv``, where ``i`` is the position of
    the query in ``results_dictionary``, and are stored under
    ``s3://wri-nlp-policy/english_documents/assisted_labeling``.

    Args:
        results_dictionary: Mapping of query to a list of
            ``[sentence_id, similarity_score, text]`` rows.
        aws_id: AWS access key id, passed to pandas as the storage ``key``.
        aws_secret: AWS secret key, passed to pandas as the storage ``secret``.
    """
    path = "s3://wri-nlp-policy/english_documents/assisted_labeling"
    col_headers = ["sentence_id", "similarity_score", "text"]
    # Iterate over the argument itself. The earlier body read the notebook
    # global `results_dict`, so the data passed by the caller was ignored.
    for i, rows in enumerate(results_dictionary.values()):
        filename = f"{path}/query_{i}_results.csv"
        pd.DataFrame(rows, columns=col_headers).to_csv(filename, storage_options={"key": aws_id, "secret": aws_secret})
    

In [76]:
# Upload the search results to S3 as CSV files for labeling.
save_results_as_separate_csv(results_dict, aws_id, aws_secret)

### 6. Optimized full loop

In [None]:
def aws_credentials_from_file(f_name):
    """Load the AWS id/secret pair from a JSON credentials file.

    NOTE(review): a function with this name is already defined earlier in
    the notebook; this definition replaces it when the cell runs.

    Args:
        f_name: Path of a JSON file whose ``"aws"`` object holds ``"id"``
            and ``"secret"``.

    Returns:
        Tuple ``(id, secret)``.
    """
    with open(f_name, "r") as credentials_handle:
        parsed = json.load(credentials_handle)

    aws_entry = parsed["aws"]
    return aws_entry["id"], aws_entry["secret"]

def load_all_sentences(language, s3, bucket_name):
    """Load every sentence file of a language from S3 into one mapping.

    All objects under ``<language>_documents/sentences`` are read, parsed as
    JSON and merged; the merged documents are then flattened with
    ``labeled_sentences_from_dataset``.

    Args:
        language: Language name used as the folder prefix (e.g. ``"english"``).
        s3: boto3 S3 resource used to list and read the objects.
        bucket_name: Name of the bucket that holds the sentence files.

    Returns:
        The flattened sentence mapping returned by
        ``labeled_sentences_from_dataset``.
    """
    policy_dict = {}
    sents_folder = f"{language}_documents/sentences"

    for obj in s3.Bucket(bucket_name).objects.filter(Prefix=sents_folder):
        # Keys ending in "/" are folder entries without a JSON payload.
        if obj.key.endswith("/"):
            continue
        # Merge in place: rebuilding the dict with {**a, **b} for every file
        # copies everything loaded so far and makes the loop quadratic.
        policy_dict.update(json.loads(obj.get()['Body'].read()))

    return labeled_sentences_from_dataset(policy_dict)

def save_results_as_separate_csv(results_dictionary, aws_id, aws_secret):
    """Write the matches of each query to its own CSV file on S3.

    Files are named ``query_<i>_results.csv``, where ``i`` is the position of
    the query in ``results_dictionary``, and are stored under
    ``s3://wri-nlp-policy/english_documents/assisted_labeling``.

    Args:
        results_dictionary: Mapping of query to a list of
            ``[sentence_id, similarity_score, text]`` rows.
        aws_id: AWS access key id, passed to pandas as the storage ``key``.
        aws_secret: AWS secret key, passed to pandas as the storage ``secret``.
    """
    path = "s3://wri-nlp-policy/english_documents/assisted_labeling"
    col_headers = ["sentence_id", "similarity_score", "text"]
    # Use the argument, not the notebook global `results_dict`: on a fresh
    # kernel that global does not exist, and otherwise it holds older results.
    for i, rows in enumerate(results_dictionary.values()):
        filename = f"{path}/query_{i}_results.csv"
        pd.DataFrame(rows, columns=col_headers).to_csv(filename, storage_options={"key": aws_id, "secret": aws_secret})

def labeled_sentences_from_dataset(dataset):
    """Merge the ``'sentences'`` entry of every document into one dict.

    NOTE(review): a function with this name is already defined earlier in
    the notebook; this definition replaces it when the cell runs.

    Args:
        dataset: Mapping whose values are document dicts with a
            ``'sentences'`` entry.

    Returns:
        A new dict with all sentence entries; later documents overwrite
        earlier ones on a key clash.
    """
    all_sentences = {}
    for doc in dataset.values():
        doc_sentences = doc['sentences']
        all_sentences.update(doc_sentences)
    return all_sentences

In [None]:
# Set up AWS
# NOTE(review): absolute, user-specific path; this cell only runs on a machine
# that has the credentials file at this exact location.
credentials_file = '/Users/dafirebanks/Documents/credentials.json'
aws_id, aws_secret = aws_credentials_from_file(credentials_file)
region = 'us-east-1'

# S3 resource handle passed to load_all_sentences below.
s3 = boto3.resource(
    service_name = 's3',
    region_name = region,
    aws_access_key_id = aws_id,
    aws_secret_access_key = aws_secret
)

In [None]:
# Get all sentence documents
# `language` selects the "<language>_documents/sentences" folder of the bucket.
language = "english"
bucket_name = 'wri-nlp-policy'
# Flat mapping of sentence id -> sentence entry across all documents.
sentences = load_all_sentences(language, s3, bucket_name)

In [None]:
# TODO: Define queries (list of query strings).
# While this list is empty, no query embeddings are computed and no
# similarities are stored by the cells below.
queries = []

In [73]:
# 0. Define params
# Sentence-transformer used to embed both the queries and the sentences.
transformer_name = 'xlm-r-bert-base-nli-stsb-mean-tokens'
model = SentenceTransformer(transformer_name)

# Only sentences scoring strictly above this cosine similarity are stored.
similarity_threshold = 0.2
# NOTE(review): not referenced by the visible full-loop cells below.
search_results_limit = 1000

In [None]:
# 1. Embed every (lower-cased) query once and keep the vectors keyed by the query text
query_embeddings = {
    query: model.encode(query.lower(), show_progress_bar=False)
    for query in queries
}

In [None]:
# 2. Embed each sentence once, then score it against every stored query embedding
query_similarities = defaultdict(list)

for sentence_id, sentence in sentences.items():
    sentence_text = sentence['text']
    sentence_embedding = model.encode(sentence_text.lower(), show_progress_bar=False)

    for query_text, query_embedding in query_embeddings.items():
        score = round(1 - distance.cosine(sentence_embedding, query_embedding), 4)
        # Negated ">" (rather than "<=") so a NaN score is skipped as well.
        if not (score > similarity_threshold):
            continue
        query_similarities[query_text].append([sentence_id, score, sentence_text])

In [None]:
# 3. Order each query's matches from most to least similar (index 1 holds the score)
for query in query_similarities:
    query_similarities[query].sort(key=lambda match: match[1], reverse=True)

In [None]:
# 4. Store results
# Uploads the per-query results to S3 as CSV files using the AWS key pair.
save_results_as_separate_csv(query_similarities, aws_id, aws_secret)