# Protocollo Vectorization into Chars


In [1]:
import random
import json
import string

random.seed(12345)

This notebooks used `string.printable` chars as basis for the vectorization/

In [2]:
string.printable

'0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~ \t\n\r\x0b\x0c'

In [3]:
input_filename = './protocollo_2017.csv'
output_filename = './protocollo_2017_vectorized_chars.csv'

In [4]:
def read_data(input_path, separator=",", contains_header=False):
    with open(input_filename, 'r') as f:
        lines = f.read().splitlines()
    
    if contains_header:
        headers = lines[0].split(separator)
        lines = lines[1:]
    else:
        headers = []
    
    data = [l.split(separator) for l in lines]
    return headers, data

In [5]:
headers, data = read_data(input_filename, separator="|", contains_header=True)

In [6]:
headers

['ANNO', 'NUMERO', 'OGGETTO', 'CLASSIFICA']

In [7]:
data

[['2017', '6', '"agli APE 2016_12_28_02267130488_006"', '"P"'],
 ['2017', '6', '"ape condominio via pa"', '"P"'],
 ['2017', '6', '"APE 2016_12_28_02267130488_006"', '"P"'],
 ['2017', '6', '"APE 2016_12_28_02267130488_006"', '"P"'],
 ['2017', '6', '"APE 2016_12_28_02267130488_006"', '"P"']]

Now we need to process the `OGGETTO` column

## Preprocessing

- filters: list (or concatenation) of characters to filter out, such as punctuation. Default: '!"#$%&()*+,-./:;<=>?@[\]^_`{|}~\t\n' , includes basic punctuation, tabs, and newlines. These are replaced with spaces.
- split by space
- replace words with integers
- save the new dataset with id and the dictionary used for the mapping

Define the list of transformation to apply:

In [8]:
def preprocess(text,
               filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
               lower=True, split = ' '):
    """apply the transformations to the given text.

    # Arguments
        text: Input text (string).
        filters: Sequence of characters to filter out.
        lower: Whether to convert the input to lowercase.
        split: Sentence split marker (string).
        
        return a string of token separated by split
    """
    translate_map = str.maketrans(filters, split * len(filters))
    text = text.translate(translate_map)
    
    return text.strip()

In [9]:
def preprocess_dataset(data, column=2):
    data_processed = []
    for v in data:
        v[column] = preprocess(v[column])
        data_processed.append(v)
    return data_processed

In [10]:
data_processed = preprocess_dataset(data, column=2)
data_processed = preprocess_dataset(data, column=3)
data_processed

[['2017', '6', 'agli APE 2016 12 28 02267130488 006', 'P'],
 ['2017', '6', 'ape condominio via pa', 'P'],
 ['2017', '6', 'APE 2016 12 28 02267130488 006', 'P'],
 ['2017', '6', 'APE 2016 12 28 02267130488 006', 'P'],
 ['2017', '6', 'APE 2016 12 28 02267130488 006', 'P']]

Now we need to create the dictionary used for the token mapping. 

In [11]:
def extract_chars_dict(data, column=2):
    chars = [ch for ch in string.printable]
    random.shuffle(chars)
    
    index_to_token = dict(enumerate(chars))
    index_to_token[len(index_to_token)] = 'UNK'
    token_to_index = {v:k for k,v in index_to_token.items()}
    return index_to_token, token_to_index

In [12]:
index_to_chars, chars_to_index = extract_chars_dict(data_processed)

In [13]:
print(index_to_chars)
print(chars_to_index)

{0: '"', 1: 'j', 2: '@', 3: '^', 4: '_', 5: 's', 6: '8', 7: '/', 8: 'K', 9: 'X', 10: 'D', 11: '{', 12: 't', 13: 'Y', 14: '[', 15: 'u', 16: 'c', 17: 'p', 18: 'S', 19: 'h', 20: '!', 21: 'z', 22: 'r', 23: 'U', 24: '>', 25: '5', 26: '\r', 27: '4', 28: '\x0b', 29: 'E', 30: 'O', 31: 'g', 32: '<', 33: 'd', 34: 'v', 35: '7', 36: 'M', 37: '2', 38: ',', 39: '\n', 40: ']', 41: 'e', 42: '(', 43: 'P', 44: 'A', 45: '.', 46: '\x0c', 47: '$', 48: ':', 49: '6', 50: '}', 51: '?', 52: 'B', 53: 'N', 54: 'a', 55: 'G', 56: 'V', 57: '0', 58: '\\', 59: '`', 60: 'I', 61: 'w', 62: 'Z', 63: '%', 64: "'", 65: 'W', 66: '3', 67: 'F', 68: 'H', 69: ' ', 70: '9', 71: 'q', 72: 'i', 73: 'l', 74: '#', 75: '-', 76: 'Q', 77: '&', 78: 'b', 79: 'J', 80: 'n', 81: ')', 82: ';', 83: 'm', 84: '=', 85: '*', 86: 'x', 87: '|', 88: 'f', 89: '\t', 90: 'k', 91: 'T', 92: '+', 93: 'y', 94: 'o', 95: 'L', 96: 'C', 97: '1', 98: '~', 99: 'R', 100: 'UNK'}
{'"': 0, 'j': 1, '@': 2, '^': 3, '_': 4, 's': 5, '8': 6, '/': 7, 'K': 8, 'X': 9, 'D': 1

## Save the dictionary mapping

In [14]:
with open('index_to_chars.json','w') as f:
    json.dump(index_to_chars,f)
    
with open('chars_to_index.json', 'w') as f:
    json.dump(chars_to_index,f)

### Update the CSV

In [15]:
def vectorize(chars_to_index, data, column=2):
    data_transformed = []

    for row in data:
        transformed = [chars_to_index[v] for v in row[column] if v in chars_to_index]
        row[column] = ' '.join([str(x) for x in transformed])
        data_transformed.append(row)
    
    return data_transformed

In [17]:
data_transformed = vectorize(chars_to_index, data)

In [18]:
data_transformed

[['2017',
  '6',
  '54 31 73 72 69 44 43 29 69 37 57 97 49 69 97 37 69 37 6 69 57 37 37 49 35 97 66 57 27 6 6 69 57 57 49',
  'P'],
 ['2017',
  '6',
  '54 17 41 69 16 94 80 33 94 83 72 80 72 94 69 34 72 54 69 17 54',
  'P'],
 ['2017',
  '6',
  '44 43 29 69 37 57 97 49 69 97 37 69 37 6 69 57 37 37 49 35 97 66 57 27 6 6 69 57 57 49',
  'P'],
 ['2017',
  '6',
  '44 43 29 69 37 57 97 49 69 97 37 69 37 6 69 57 37 37 49 35 97 66 57 27 6 6 69 57 57 49',
  'P'],
 ['2017',
  '6',
  '44 43 29 69 37 57 97 49 69 97 37 69 37 6 69 57 37 37 49 35 97 66 57 27 6 6 69 57 57 49',
  'P']]

In [19]:
with open(output_filename, 'w') as f:
    for line in data_transformed:
        f.write(','.join(line) + '\n')