# Protocollo Vectorization


In [1]:
import numpy as np
import random
import json

random.seed(12345)

In [18]:
input_filename = './protocollo_2017.csv'
output_filename = './protocollo_2017_vectorized.csv'

In [19]:
with open(input_filename, 'r') as f:
    lines = f.read().splitlines()

In [20]:
headers = lines[0].split(",")
headers

['ANNO', 'NUMERO', 'OGGETTO', 'CLASSIFICA']

In [21]:
data = [l.split(",") for l in lines[1:]]

In [22]:
data

[['2017', '6', '"APE 2016_12_28_02267130488_006"', '"P"'],
 ['2017', '6', '"ape condominio via pa"', '"P"'],
 ['2017', '6', '"APE 2016_12_28_02267130488_006"', '"P"'],
 ['2017', '6', '"APE 2016_12_28_02267130488_006"', '"P"'],
 ['2017', '6', '"APE 2016_12_28_02267130488_006"', '"P"']]

Now we need to process the `OGGETTO` column

## Preprocessing

- replace _ with ' '
- split by space
- replace words with integers
- save the new dataset with id and the dictionary used for the mapping


In [23]:
column = 2

def replace_dash(value):
    return value.replace('_',' ')

def process_dataset(data, column=2):
    data_processed = []
    for v in data:
        v[column] = replace_dash(v[column])
        v[column] = v[column].replace('"','')
        v[column] = v[column].lower()
        data_processed.append(v)
    return data_processed

In [24]:
data_processed = process_dataset(data)
data_processed

[['2017', '6', 'ape 2016 12 28 02267130488 006', '"P"'],
 ['2017', '6', 'ape condominio via pa', '"P"'],
 ['2017', '6', 'ape 2016 12 28 02267130488 006', '"P"'],
 ['2017', '6', 'ape 2016 12 28 02267130488 006', '"P"'],
 ['2017', '6', 'ape 2016 12 28 02267130488 006', '"P"']]

Now we need to create the dictionary used for the token mapping. 

In [25]:
def token_dict(data, column=2):
    """
        return a pair (index_to_token, token_to_word)
    """
    tokens = set()
    for row in data:
        for v in row[column].split(' '):
            tokens.add(v)
    random.shuffle(list(tokens))
    
    index_to_token = dict(enumerate(tokens))
    token_to_index = {v:k for k,v in index_to_token.items()}
    return index_to_token, token_to_index

In [26]:
index_to_token, token_to_index = token_dict(data_processed)

In [27]:
print(index_to_token)
print(token_to_index)

{0: '28', 1: '006', 2: 'condominio', 3: '02267130488', 4: '2016', 5: 'ape', 6: '12', 7: 'via', 8: 'pa'}
{'28': 0, '006': 1, 'condominio': 2, '02267130488': 3, '2016': 4, 'ape': 5, '12': 6, 'via': 7, 'pa': 8}


## Save the dictionary mapping

In [28]:
with open('index_to_token.json','w') as f:
    json.dump(index_to_token,f)
    
with open('token_to_index.json', 'w') as f:
    json.dump(token_to_index,f)

### Update the Csv

In [29]:
def vectorize(token_to_index, data, column=2):
    data_transformed = []

    for row in data:
        transformed = [token_to_index[v] for v in row[column].split(' ') if v in token_to_index]
        row[column] = ' '.join([str(x) for x in transformed])
        data_transformed.append(row)
    
    return data_transformed

In [30]:
data_transformed = vectorize(token_to_index, data)

In [31]:
data_transformed

[['2017', '6', '5 4 6 0 3 1', '"P"'],
 ['2017', '6', '5 2 7 8', '"P"'],
 ['2017', '6', '5 4 6 0 3 1', '"P"'],
 ['2017', '6', '5 4 6 0 3 1', '"P"'],
 ['2017', '6', '5 4 6 0 3 1', '"P"']]

In [33]:
with open(output_filename, 'w') as f:
    for line in data_transformed:
        f.write(','.join(line) + '\n')