In [59]:
import numpy as np
import pandas as pd

import json

In [60]:
train_file = 'data/SENTIPOLC Sentiment Polarity Classification - Evalita 2016.csv'
test_file = 'data/test_set_sentipolc16_gold2000.csv'

## Load Train and Test Dataset

In [61]:
df_test = pd.read_csv(test_file)
df_test.head()

Unnamed: 0,idtwitter,subj,opos,oneg,iro,lpos,lneg,top,text
0,507074506880712705,0,0,0,0,0,0,2,Tra 5 minuti presentazione piano scuola del go...
1,507075789456961536,1,1,0,0,1,0,2,@matteorenzi: Alle 10 appuntamento su http://t...
2,507077511902425088,1,0,1,0,0,1,2,#labuonascuola gli #evangelisti #digitali non ...
3,507079183315787777,0,0,0,0,0,0,2,Riforma scuola Tutto il discorso di Renzi su ...
4,507080190225563648,1,0,0,0,0,0,2,.@matteorenzi @MiurSocial #labuonascuola basta...


In [62]:
df_test.lneg.value_counts() / df_test.shape[0]

0    0.6795
1    0.3205
Name: lneg, dtype: float64

In [63]:
df_train = pd.read_csv(train_file)
df_train.head()

Unnamed: 0,idtwitter,subj,opos,oneg,iro,lpos,lneg,top,text
0,122449983151669248,1,0,1,0,0,1,1,Intanto la partita per Via Nazionale si compli...
1,125485104863780865,1,0,1,0,0,1,1,"False illusioni, sgradevoli realtà Mario Monti..."
2,125513454315507712,1,0,1,0,0,1,1,"False illusioni, sgradevoli realtà #editoriale..."
3,125524238290522113,1,0,1,0,0,1,1,Mario Monti: Berlusconi risparmi all'Italia il...
4,125527933224886272,1,0,1,0,0,1,1,Mario Monti: Berlusconi risparmi all'Italia il...


Show the distribution of negative and positive tweets

In [64]:
df_train.lneg.value_counts() / df_train.shape[0]

0    0.653711
1    0.346289
Name: lneg, dtype: float64

The number of negative and positive tweets is equally distributed for the training and the test set.

In [50]:
columns = ['text', 'lpos']
train_data_raw = df_train[columns].as_matrix()
test_data_raw = df_test[columns].as_matrix()

In [8]:
print(train_data_raw[:2])
print(test_data_raw[:2])

[['Intanto la partita per Via Nazionale si complica. #Saccomanni dice che "mica tutti sono Mario #Monti" http://t.co/xPtNz4X7 via @linkiesta'
  0]
 ['False illusioni, sgradevoli realtà Mario Monti http://t.co/WOmMCITs via @AddToAny'
  0]]
[['Tra 5 minuti presentazione piano scuola del governo #Renzi. #passodopopasso #labuonascuola Stay tuned'
  0]
 ["@matteorenzi: Alle 10 appuntamento su http://t.co/YphnXknDML #italiariparte #labuonascuola'  #Grandinsegnanti ... #Buonlavoro"
  1]]


## Data Preprocessing

We preprocess the tweets with [p/processor](https://github.com/s/preprocessor#available-options).
We replace URL, MENTION, HASHTAG, EMOJi, and NUMBER with keywords.
The list of positive and negative emoticons.

In [9]:
import preprocessor as p

# check the options at https://github.com/s/preprocessor#available-options
p.set_options(p.OPT.URL, p.OPT.MENTION, p.OPT.HASHTAG, p.OPT.EMOJI, p.OPT.NUMBER)

In [10]:
smiley_pos = '$SMILEY_POS$'
smiley_neg = '$SMILEY_NEG$'

POSITIVE = {"*O", "*-*", "*O*", "*o*", "* *",
            ":P", ":D", ":d", ":p",
            ";P", ";D", ";d", ";p",
            ":-)", ";-)", ":=)", ";=)",
            ":<)", ":>)", ";>)", ";=)",
            "=}", ":)", "(:;)",
            "(;", ":}", "{:", ";}",
            "{;:]",
            "[;", ":')", ";')", ":-3",
            "{;", ":]",
            ";-3", ":-x", ";-x", ":-X",
            ";-X", ":-}", ";-=}", ":-]",
            ";-]", ":-.)",
            "^_^", "^-^"}

NEGATIVE = {":(", ";(", ":'(",
            "=(", "={", "):", ");",
            ")':", ")';", ")=", "}=",
            ";-{{", ";-{", ":-{{", ":-{",
            ":-(", ";-(",
            ":,)", ":'{",
            "[:", ";]"
           }

In [13]:
def preprocess(data):
    """
    :data a list of tweets with their sentiment
    :return the tweets preprocessed and split by space
    """
    for text, lpos in data:
        sentence = p.tokenize(text).split(' ')
        result = []
        for word in sentence:
            if word in POSITIVE:
                result.append(smiley_pos)
            elif word in NEGATIVE:
                result.append(smiley_neg)
            else:
                result.append(word)
        
        yield [result, lpos]

In [14]:
train_data_preproccesed = list(preprocess(train_data_raw))
test_data_preprocessed = list(preprocess(test_data_raw))

In [15]:
print(train_data_preproccesed[18])
print(test_data_preprocessed[18])

[['$MENTION$', 'sono', "piu'", 'tranquillo', 'ora', '$SMILEY_POS$', 'buona', 'giornata.', 'Cmq', 'ci', 'vorrebbe', 'Mario', 'Monti'], 1]
[['È', 'online', 'il', 'rapporto', 'del', 'governo', '$HASHTAG$.', 'Si', 'trova', 'a', 'questo', 'link:', '$URL$.', '$HASHTAG$!', '$HASHTAG$', '$MENTION$'], 0]


Helper to find the tweet with positive and/or negative smiley. From the result we can see that there aren't so many.

In [16]:
def find_smiley(data, pos_or_neg):
    idx = []
    for i, x in enumerate(data):
        if len(set(pos_or_neg).intersection(set(x[0].split()))) > 0:
            idx.append(i)
    return idx

idx_pos = find_smiley(train_data_raw, POSITIVE)
idx_neg = find_smiley(train_data_raw, NEGATIVE)

print('number of positive {}'.format(len(idx_pos)))
print('number of negative {}'.format(len(idx_neg)))

number of positive 228
number of negative 25


In [17]:
for tweet, class_value in train_data_preproccesed[18:22]:
    print(tweet, class_value)

['$MENTION$', 'sono', "piu'", 'tranquillo', 'ora', '$SMILEY_POS$', 'buona', 'giornata.', 'Cmq', 'ci', 'vorrebbe', 'Mario', 'Monti'] 1
['Mario', '$HASHTAG$:', 'La', 'lira', 'non', 'era', 'una', 'moneta', 'strana,', 'ma', 'era', 'il', "più'", 'delle', 'volte', 'una', 'moneta', 'debole,', 'perche', 'rifletteva', 'caratteristiche', "dell'Italia"] 0
['Mario', 'Monti', 'a', 'Berlusconi,', "l'euro", 'non', 'è', 'in', 'crisi:', 'ROMA,', '$NUMBER$', 'OTT', '?', "''L'euro", 'non', 'è', 'in', "crisi'',", "''è", 'stabile', 'in', 'termin...', '$URL$'] 1
['Un', 'parere', 'autorevole', 'e', 'non', 'demagogico', "sull'euro", 'da', 'mario', 'monti', '$URL$'] 1


In [18]:
x_train = np.array([s for s, _ in train_data_preproccesed])
y_train = np.array([c for _, c in train_data_preproccesed])

x_test = np.array([s for s, _ in test_data_preprocessed])
y_test = np.array([c for _, c in test_data_preprocessed])

### Save the dataset

finally, we save the dataset an np compressed array

In [19]:
np.savez_compressed('data/sentipolc.npz', x_train=x_train, y_train=y_train, x_test=x_test, y_test=y_test)

In [20]:
x_train

array([list(['Intanto', 'la', 'partita', 'per', 'Via', 'Nazionale', 'si', 'complica.', '$HASHTAG$', 'dice', 'che', '"mica', 'tutti', 'sono', 'Mario', '$HASHTAG$"', '$URL$', 'via', '$MENTION$']),
       list(['False', 'illusioni,', 'sgradevoli', 'realtà', 'Mario', 'Monti', '$URL$', 'via', '$MENTION$']),
       list(['False', 'illusioni,', 'sgradevoli', 'realtà', '$HASHTAG$', 'di', 'Mario', 'Monti', 'sul', 'Corriere', 'della', 'Sera:', '$URL$', '$HASHTAG$', 'stampa']),
       ...,
       list(['$MENTION$', 'Consolati,', 'il', 'governo', '$HASHTAG$', 'ha', 'messo', 'una', 'tassa', 'sulla', 'fortuna.', 'Te', 'non', 'la', 'pagherai', 'mai.', '$MENTION$', 'grazie', 'per', "l'aiuto", 'fratello!']),
       list(['$MENTION$', 'beh,', 'beate', 'loro!', 'Io', 'nn', 'possiedo', 'nulla', 'di', 'tutto', 'ciò..', 'Devo', 'preoccuparmi?!', '$HASHTAG$']),
       list(['Caro', '$HASHTAG$,se', '$HASHTAG$', 'spaccava', 'i', 'computer', 'e', 'ora', 'è', 'il', 'blogger', 'più', 'seguito,forse', 'è', 'più', 

In [21]:
x_train_text = [' '.join(text) for text in x_train]
x_test_text = [' '.join(text) for text in x_test]

### Encode the Dataset into Sequence of IDs per Word

prepare the dataset into a sequence of words and generate a dictionary that maps word to index

In [22]:
from keras.preprocessing.text import Tokenizer
from collections import OrderedDict

Using TensorFlow backend.


In [23]:
word_tokenizer = Tokenizer(oov_token=True)
word_tokenizer.fit_on_texts(x_train_text)

Most common words

In [25]:
most_commons = [(v,k) for k, v in word_tokenizer.word_counts.items()]
most_commons = sorted(most_commons, key=lambda x: x[0], reverse=True)
most_commons[:20]

[(5616, 'hashtag'),
 (3227, 'mention'),
 (2675, 'il'),
 (2630, 'monti'),
 (2503, 'di'),
 (2295, 'url'),
 (2135, 'governo'),
 (2098, 'e'),
 (1873, 'a'),
 (1828, 'che'),
 (1795, 'la'),
 (1527, 'non'),
 (1326, 'è'),
 (1272, 'number'),
 (1133, 'per'),
 (1088, 'mario'),
 (1084, 'un'),
 (1000, 'del'),
 (961, 'in'),
 (860, 'i')]

Save the dictionary

In [26]:
word_index_path = 'data/sentipolc_word_index.json'

In [31]:
list(word_tokenizer.word_index.items())[:10]

[('hashtag', 1),
 ('mention', 2),
 ('il', 3),
 ('monti', 4),
 ('di', 5),
 ('url', 6),
 ('governo', 7),
 ('e', 8),
 ('a', 9),
 ('che', 10)]

In [32]:
with open(word_index_path, 'w') as f:
    json.dump(word_tokenizer.word_index, f)

In [33]:
x_train_seq = word_tokenizer.texts_to_sequences(x_train)
x_test_seq = word_tokenizer.texts_to_sequences(x_test)

#### Save the Dataset as Encoded Sequences

In [34]:
np.savez_compressed('data/sentipolc_seq.npz', 
                    x_train=x_train_seq, y_train=y_train, x_test=x_test_seq, y_test=y_test)

### Encode the Dataset into Sequence of IDs per Char

prepare the dataset into a sequence of words and generate a dictionary that maps char to index

In [35]:
import string

In [36]:
string.printable

'0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~ \t\n\r\x0b\x0c'

In [37]:
char_tokenizer = Tokenizer(char_level=True, oov_token=True)

In [38]:
char_tokenizer.fit_on_texts(x_train_text)

#### Save the char dictionary

In [39]:
char_index_path = 'data/sentipolc_char_index.json'

In [40]:
list(char_tokenizer.word_index.items())[:10]

[(' ', 1),
 ('i', 2),
 ('a', 3),
 ('o', 4),
 ('e', 5),
 ('n', 6),
 ('t', 7),
 ('r', 8),
 ('$', 9),
 ('l', 10)]

In [41]:
with open(char_index_path, 'w') as f:
    json.dump(char_tokenizer.word_index,f)

In [42]:
x_train_seq_char = char_tokenizer.texts_to_sequences(x_train_text)
x_test_seq_char = char_tokenizer.texts_to_sequences(x_test_text)

#### Save the dataset

In [43]:
np.savez_compressed('data/sentipolc_char_seq.npz', 
                    x_train=x_train_seq_char, y_train=y_train, x_test=x_test_seq_char, y_test=y_test)