# Claim2Features

## Util

### Ignore Warning

In [None]:
import warnings
# Install a global "ignore" filter so no warning of any category is shown
# by the cells that follow.
warnings.filterwarnings("ignore")

### My Hot Encoder Decoder

In [None]:
import numpy as np
from collections import OrderedDict
from operator import itemgetter


def myHotEncode(input_data, max_vocab=0, vocab2idx=None):
    """Encode every sample as a token-count ("multi-hot") vector.

    Parameters
    ----------
    input_data : sequence of iterables
        One iterable of hashable tokens per sample.  It is traversed twice
        when the vocabulary has to be built, so it must be re-iterable.
    max_vocab : int, optional
        When greater than zero, keep only the ``max_vocab`` most frequent
        tokens.  Ties keep first-seen order.  Ignored if ``vocab2idx`` is
        supplied.
    vocab2idx : dict, optional
        Existing token -> column index mapping.  When ``None`` a new mapping
        is built from ``input_data`` with the most frequent token at index 0.

    Returns
    -------
    (list of numpy.ndarray, dict)
        One ``float32`` vector of length ``len(vocab2idx)`` per sample, whose
        entries count how often each known token occurs (tokens absent from
        the vocabulary are skipped), and the vocabulary mapping that was used.
    """
    if vocab2idx is None:
        frequencies = {}
        for sample in input_data:
            for token in sample:
                frequencies[token] = frequencies.get(token, 0) + 1
        # sorted() is stable, so equally frequent tokens keep first-seen order.
        ranked = sorted(frequencies.items(), key=itemgetter(1), reverse=True)
        if max_vocab > 0:
            ranked = ranked[:max_vocab]
        vocab2idx = {token: idx for idx, (token, _) in enumerate(ranked)}

    vocab_size = len(vocab2idx)
    data_ret = []
    for sample in input_data:
        # Accumulate counts directly instead of summing rows of a dense
        # identity matrix; this also gives samples with no known token the
        # same float32 dtype as every other sample.
        vector = np.zeros(vocab_size, dtype='float32')
        for token in sample:
            if token in vocab2idx:
                vector[vocab2idx[token]] += 1
        data_ret.append(vector)
    return data_ret, vocab2idx


def myHotDecode(input_data, vocab2idx):
    """Turn encoded vectors back into their tokens and column indices.

    Parameters
    ----------
    input_data : iterable of sequences
        Encoded vectors; every vector must have ``len(vocab2idx)`` entries.
    vocab2idx : dict
        Token -> column index mapping used to produce the vectors.

    Returns
    -------
    (list of list, list of list) or None
        For each vector, the tokens whose entry is greater than zero and the
        matching column indices, both in column order.  If a vector's length
        differs from the vocabulary size an error message is printed and
        ``None`` is returned.

    Raises
    ------
    IndexError
        If an active column has no token in ``vocab2idx``.
    """
    # Invert the mapping once so each active column is a single dict lookup
    # instead of a scan over the whole vocabulary.  setdefault keeps the
    # first token when several tokens share an index.
    idx2vocab = {}
    for token, idx in vocab2idx.items():
        idx2vocab.setdefault(idx, token)

    data_ = []
    data_idx = []
    for vector in input_data:
        if len(vector) != len(vocab2idx):
            print('Erro:', 'The vocab2idx not fit the input data!')
            return
        tokens = []
        positions = []
        for position, value in enumerate(vector):
            if value > 0:
                if position not in idx2vocab:
                    raise IndexError(
                        'no token is mapped to index %d' % position)
                tokens.append(idx2vocab[position])
                positions.append(position)
        data_.append(tokens)
        data_idx.append(positions)
    return data_, data_idx

### Confusion Matrix

In [None]:
import matplotlib.pyplot as plt
import itertools


def plot_confusion_matrix(cm, classes,
                          normalize=True,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """Draw a confusion matrix on the current matplotlib figure.

    Parameters
    ----------
    cm : array-like, 2-D
        Confusion matrix; rows are labelled 'True label' and columns
        'Predicted label'.
    classes : sequence
        Tick labels, used for both axes.
    normalize : bool, optional
        When true, every row is divided by its sum before plotting.  Rows
        that sum to zero are plotted as zeros instead of NaN.
    title : str, optional
        Plot title.
    cmap : matplotlib colormap, optional
        Colormap handed to ``plt.imshow``.
    """
    cm = np.asarray(cm)
    if normalize:
        row_totals = cm.sum(axis=1, keepdims=True)
        # Guard the division: an all-zero row would otherwise produce NaN
        # cells and a NaN colour threshold below.
        cm = np.divide(cm.astype('float'), row_totals,
                       out=np.zeros(cm.shape, dtype='float'),
                       where=row_totals != 0)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    # The 'd' format only accepts integers, so fall back to two decimals for
    # a non-normalized matrix that already holds floats.
    is_integer = np.issubdtype(cm.dtype, np.integer)
    fmt = 'd' if is_integer and not normalize else '.2f'
    # Cells darker than half of the maximum get white text for contrast.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

### Treat texts

#### Treat text or a list of texts

In [None]:
def treat_text(text, removes=('\n', '\r', '\t'), strip=True):
    """Remove unwanted substrings from a text or from every text in a list.

    Parameters
    ----------
    text : str or list
        A single string, or a (possibly nested) list of strings.
    removes : iterable of str, optional
        Substrings deleted from the text; line breaks and tabs by default.
    strip : bool, optional
        When true, leading and trailing whitespace is stripped afterwards.

    Returns
    -------
    str or list
        The cleaned string, or a list with the same structure as ``text``.
    """
    if isinstance(text, list):
        # Forward the caller's options; dropping them here would clean list
        # elements with the defaults regardless of what was requested.
        return [treat_text(item, removes=removes, strip=strip)
                for item in text]
    cleaned = text
    for remove in removes:
        cleaned = cleaned.replace(remove, '')
    return cleaned.strip() if strip else cleaned

#### Split pure words

In [None]:
def pure_words(text, dividers=('-', '_'), uppercase=True, low_upper=True):
    """Split identifier-like text (camelCase, kebab-case, snake_case) into words.

    The text is scanned character by character:

    * at the start of the text or right after a space, a divider is dropped
      and any other character is copied unchanged;
    * elsewhere, a divider becomes a space, an ASCII uppercase letter is
      preceded by a space (when ``uppercase`` is true), and any other
      character is copied unchanged.

    A divider directly followed by an uppercase letter therefore yields two
    spaces, e.g. ``'foo-Bar'`` -> ``'foo  bar'``.

    Parameters
    ----------
    text : str
        Text to split.
    dividers : container of str, optional
        Characters treated as word separators.
    uppercase : bool, optional
        Whether an uppercase letter inside a word starts a new word.
    low_upper : bool, optional
        Whether such a word-starting uppercase letter is lower-cased.

    Returns
    -------
    str
        The text with the word boundaries expressed as spaces.
    """
    # Full A-Z alphabet; the previous hand-typed literal had no 'X'.
    upper = frozenset('ABCDEFGHIJKLMNOPQRSTUVWXYZ')
    pieces = []
    last_space = True  # the start of the text counts as "after a space"
    for c in text:
        if last_space:
            if c not in dividers:
                pieces.append(c)
        elif c in dividers:
            pieces.append(' ')
        elif uppercase and c in upper:
            pieces.append(' ' + (c.lower() if low_upper else c))
        else:
            pieces.append(c)
        last_space = c == ' '
    return ''.join(pieces)

## Pre-processing

In [None]:
%%time
import pandas as pd
import ast

data_path = 'dataset.csv'
df = pd.read_csv(data_path)

columns = ['tag', 'text', 'attrs', 'level', 'brother_tag', 'brother_text', 'brother_attrs', 'url', 'site', 'label']
words_context = []
brother_words_context = []


def _context_words(text, attrs_literal):
    """Build the space-separated word context of one element.

    ``text`` is included unless it equals the 'None Text' placeholder.
    ``attrs_literal`` is parsed with ``ast.literal_eval`` into a dict; each
    value contributes itself when it is a string, otherwise each of its
    items.  Every piece is cleaned with ``treat_text``, split with
    ``pure_words`` and followed by a single space.
    """
    pieces = []
    if text != 'None Text':
        pieces.append(text)
    for attr in ast.literal_eval(attrs_literal).values():
        if isinstance(attr, str):
            pieces.append(attr)
        else:
            pieces.extend(attr)
    return ''.join(pure_words(treat_text(str(piece))) + ' ' for piece in pieces)


for row in df[columns].values:
    # Each instance is a dictionary dic
    dic = dict(zip(columns, row))
    # The element and its sibling ("brother") are processed the same way.
    words_context.append(_context_words(dic['text'], dic['attrs']))
    brother_words_context.append(
        _context_words(dic['brother_text'], dic['brother_attrs']))

# The raw attribute columns are replaced by the derived word contexts.
del df['attrs']
del df['brother_attrs']
df['words_context'] = words_context
df['brother_words_context'] = brother_words_context

## Features

### Bag-of-words for context words

In [None]:
%%time
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS


# Upper bound on the bag-of-words vocabulary (passed as max_features).
vocab_size = 2000
# 'english' selects scikit-learn's built-in English stop-word list. The
# stop_words parameter is documented as 'english', a list or None, so the
# string is used rather than passing the ENGLISH_STOP_WORDS frozenset.
vec = CountVectorizer(max_features=vocab_size, stop_words='english')
vec.fit(df['words_context'])