# Protocollo Vectorization into Words


In [1]:
import random
import json

random.seed(12345)

In [2]:
input_filename = './protocollo_2017.csv'
output_filename = './protocollo_2017_vectorized.csv'

In [3]:
def read_data(input_path, separator=",", contains_header=False):
    with open(input_filename, 'r') as f:
        lines = f.read().splitlines()
    
    if contains_header:
        headers = lines[0].split(separator)
        lines = lines[1:]
    else:
        headers = []
    
    data = [l.split(separator) for l in lines]
    return headers, data

In [4]:
headers, data = read_data(input_filename, separator="|", contains_header=True)

In [5]:
headers

['ANNO', 'NUMERO', 'OGGETTO', 'CLASSIFICA']

In [6]:
data

[['2017', '6', '"agli APE 2016_12_28_02267130488_006"', '"P"'],
 ['2017', '6', '"ape condominio via pa"', '"P"'],
 ['2017', '6', '"APE 2016_12_28_02267130488_006"', '"P"'],
 ['2017', '6', '"APE 2016_12_28_02267130488_006"', '"P"'],
 ['2017', '6', '"APE 2016_12_28_02267130488_006"', '"P"']]

Now we need to process the `OGGETTO` column

## Preprocessing

- filters: list (or concatenation) of characters to filter out, such as punctuation. Default: '!"#$%&()*+,-./:;<=>?@[\]^_`{|}~\t\n' , includes basic punctuation, tabs, and newlines. These are replaced with spaces.
- split by space
- replace words with integers
- save the new dataset with id and the dictionary used for the mapping

Define the list of transformation to apply:

In [7]:
def preprocess(text,
               filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
               lower=True, split = ' '):
    """apply the transformations to the given text.

    # Arguments
        text: Input text (string).
        filters: Sequence of characters to filter out.
        lower: Whether to convert the input to lowercase.
        split: Sentence split marker (string).
        
        return a string of token separated by split
    """
    translate_map = str.maketrans(filters, split * len(filters))
    text = text.translate(translate_map)
    
    return text.strip()

In [8]:
def preprocess_dataset(data, column=2):
    data_processed = []
    for v in data:
        v[column] = preprocess(v[column])
        data_processed.append(v)
    return data_processed

In [9]:
data_processed = preprocess_dataset(data, column=2)
data_processed = preprocess_dataset(data, column=3)
data_processed

[['2017', '6', 'agli APE 2016 12 28 02267130488 006', 'P'],
 ['2017', '6', 'ape condominio via pa', 'P'],
 ['2017', '6', 'APE 2016 12 28 02267130488 006', 'P'],
 ['2017', '6', 'APE 2016 12 28 02267130488 006', 'P'],
 ['2017', '6', 'APE 2016 12 28 02267130488 006', 'P']]

Now we need to create the dictionary used for the token mapping. 

In [10]:
def extract_tokens_dict(data, column=2):
    """
        return a pair (index_to_token, token_to_word)
    """
    tokens = set()
    for row in data:
        for v in row[column].split(' '):
            tokens.add(v)
    random.shuffle(list(tokens))
    
    index_to_token = dict(enumerate(tokens))
    index_to_token[len(index_to_token)] = 'UNK'
    token_to_index = {v:k for k,v in index_to_token.items()}
    return index_to_token, token_to_index

In [11]:
index_to_token, token_to_index = extract_tokens_dict(data_processed)

In [12]:
print(index_to_token)
print(token_to_index)

{0: 'via', 1: '02267130488', 2: 'ape', 3: '12', 4: 'agli', 5: '006', 6: '28', 7: 'APE', 8: 'pa', 9: '2016', 10: 'condominio', 11: 'UNK'}
{'via': 0, '02267130488': 1, 'ape': 2, '12': 3, 'agli': 4, '006': 5, '28': 6, 'APE': 7, 'pa': 8, '2016': 9, 'condominio': 10, 'UNK': 11}


## Save the dictionary mapping

In [13]:
with open('index_to_token.json','w') as f:
    json.dump(index_to_token,f)
    
with open('token_to_index.json', 'w') as f:
    json.dump(token_to_index,f)

### Update the CSV

In [14]:
def vectorize(token_to_index, data, column=2):
    data_transformed = []

    for row in data:
        transformed = [token_to_index[v] for v in row[column].split(' ') if v in token_to_index]
        row[column] = ' '.join([str(x) for x in transformed])
        data_transformed.append(row)
    
    return data_transformed

In [15]:
data_transformed = vectorize(token_to_index, data)

In [16]:
data_transformed

[['2017', '6', '4 7 9 3 6 1 5', 'P'],
 ['2017', '6', '2 10 0 8', 'P'],
 ['2017', '6', '7 9 3 6 1 5', 'P'],
 ['2017', '6', '7 9 3 6 1 5', 'P'],
 ['2017', '6', '7 9 3 6 1 5', 'P']]

In [17]:
with open(output_filename, 'w') as f:
    for line in data_transformed:
        f.write(','.join(line) + '\n')