# Protocollo Vectorization into Words removing stopwords

in order to run this notebook you need to install the following package

```
pip install stop-words
```


In [24]:
import random
import json
from stop_words import get_stop_words

random.seed(12345)

load the stop words for italian language

In [25]:
stopwords = get_stop_words('it')
print("the first 20 stopword %s of %d" % (stopwords[:20],len(stopwords)))

the first 20 stopword ['a', 'abbia', 'abbiamo', 'abbiano', 'abbiate', 'ad', 'adesso', 'agl', 'agli', 'ai', 'al', 'all', 'alla', 'alle', 'allo', 'allora', 'altre', 'altri', 'altro', 'anche'] of 308


In [26]:
input_filename = './protocollo_2017.csv'
output_filename = './protocollo_2017_vectorized_no_stopwords.csv'

In [27]:
def read_data(input_path, separator=",", contains_header=False):
    with open(input_filename, 'r') as f:
        lines = f.read().splitlines()
    
    if contains_header:
        headers = lines[0].split(separator)
        lines = lines[1:]
    else:
        headers = []
    
    data = [l.split(separator) for l in lines]
    return headers, data

In [28]:
headers, data = read_data(input_filename, separator="|", contains_header=True)

In [29]:
headers

['ANNO', 'NUMERO', 'OGGETTO', 'CLASSIFICA']

In [30]:
data

[['2017', '6', '"agli APE 2016_12_28_02267130488_006"', '"P"'],
 ['2017', '6', '"ape condominio via pa"', '"P"'],
 ['2017', '6', '"APE 2016_12_28_02267130488_006"', '"P"'],
 ['2017', '6', '"APE 2016_12_28_02267130488_006"', '"P"'],
 ['2017', '6', '"APE 2016_12_28_02267130488_006"', '"P"']]

Now we need to process the `OGGETTO` column

## Preprocessing

- filters: list (or concatenation) of characters to filter out, such as punctuation. Default: '!"#$%&()*+,-./:;<=>?@[\]^_`{|}~\t\n' , includes basic punctuation, tabs, and newlines. These are replaced with spaces.
- split by space
- replace words with integers
- save the new dataset with id and the dictionary used for the mapping

Define the list of transformation to apply:

In [31]:
def preprocess(text,
               filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
               lower=True, split = ' ', stopwords_lang = 'it'):
    """apply the transformations to the given text.

    # Arguments
        text: Input text (string).
        filters: Sequence of characters to filter out.
        lower: Whether to convert the input to lowercase.
        split: Sentence split marker (string).
        
        return a string of token separated by split
    """
    translate_map = str.maketrans(filters, split * len(filters))
    text = text.translate(translate_map)
    
    if stopwords_lang is not None:
        stopwords = get_stop_words('it')
        text_tokens = [w for w in text.split(split) if w not in stopwords]
        text = ' '.join(text_tokens)
        
    return text.strip()

In [32]:
column = 2
    
def preprocess_dataset(data, column=2):
    data_processed = []
    for v in data:
        v[column] = preprocess(v[column])
        data_processed.append(v)
    return data_processed

In [33]:
data_processed = preprocess_dataset(data, column)
data_processed = preprocess_dataset(data, column=3)
data_processed

[['2017', '6', 'APE 2016 12 28 02267130488 006', 'P'],
 ['2017', '6', 'ape condominio via pa', 'P'],
 ['2017', '6', 'APE 2016 12 28 02267130488 006', 'P'],
 ['2017', '6', 'APE 2016 12 28 02267130488 006', 'P'],
 ['2017', '6', 'APE 2016 12 28 02267130488 006', 'P']]

We can see that it removes `agli` from the first row.

Now we need to create the dictionary used for the token mapping. 

In [34]:
def extract_tokens_dict(data, column=2):
    """
        return a pair (index_to_token, token_to_word)
    """
    tokens = set()
    for row in data:
        for v in row[column].split(' '):
            tokens.add(v)
    random.shuffle(list(tokens))
    
    index_to_token = dict(enumerate(tokens))
    index_to_token[len(index_to_token)] = 'UNK'
    token_to_index = {v:k for k,v in index_to_token.items()}
    return index_to_token, token_to_index

In [35]:
index_to_token, token_to_index = extract_tokens_dict(data_processed)

In [36]:
print(index_to_token)
print(token_to_index)

{0: '2016', 1: 'ape', 2: 'pa', 3: 'via', 4: '28', 5: 'APE', 6: '006', 7: '12', 8: '02267130488', 9: 'condominio', 10: 'UNK'}
{'2016': 0, 'ape': 1, 'pa': 2, 'via': 3, '28': 4, 'APE': 5, '006': 6, '12': 7, '02267130488': 8, 'condominio': 9, 'UNK': 10}


## Save the dictionary mapping

In [37]:
with open('index_to_token_no_stopwords.json','w') as f:
    json.dump(index_to_token,f)
    
with open('token_to_index_no_stopwords.json', 'w') as f:
    json.dump(token_to_index,f)

### Update the CSV

In [38]:
def vectorize(token_to_index, data, column=2):
    data_transformed = []

    for row in data:
        transformed = [token_to_index[v] for v in row[column].split(' ') if v in token_to_index]
        row[column] = ' '.join([str(x) for x in transformed])
        data_transformed.append(row)
    
    return data_transformed

In [39]:
data_transformed = vectorize(token_to_index, data)

In [40]:
data_transformed

[['2017', '6', '5 0 7 4 8 6', 'P'],
 ['2017', '6', '1 9 3 2', 'P'],
 ['2017', '6', '5 0 7 4 8 6', 'P'],
 ['2017', '6', '5 0 7 4 8 6', 'P'],
 ['2017', '6', '5 0 7 4 8 6', 'P']]

In [41]:
with open(output_filename, 'w') as f:
    for line in data_transformed:
        f.write(','.join(line) + '\n')