In [1]:
import xml.etree.ElementTree as ET
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
import re
from sklearn.metrics import accuracy_score

In [2]:
class XML2DataFrame:
    def __init__(self, xml_path):
        xml_data = open(xml_path)
        self.root = ET.XML(xml_data.read())[1]

    def parse_root(self, root):
        return [self.parse_element(child) for child in iter(root)]

    def parse_element(self, element, parsed=None):
        if parsed is None:
            parsed = dict()
        for key in element.keys():
            parsed[key] = element.attrib.get(key)
        if element.text:
            parsed[element.attrib["name"]] = None if element.text == "NULL" else element.text
        for child in list(element):
            self.parse_element(child, parsed)
        return parsed

    def process_data(self):
        structure_data = self.parse_root(self.root)
        return pd.DataFrame(structure_data)

In [3]:
def from_sentiment_vectors_to_numbers(labels):
    return [list(set(label[label!=0]))[0] if len(label[label!=0]) > 0 else 0 for label in labels]

In [4]:
def clean_text(text):
    cleaning_regex = r'((http|https)://(.+?)(\s|$))|(RT @(.+?) )|(@(.+?)\s)|\n|\t|&amp'
    return re.sub(cleaning_regex, ' ', text.lower()).strip()

<h2>Data preparation</h2>

<h3>TKK data</h3>

<h4>Train</h4>

In [175]:
train_tkk = XML2DataFrame("data/tkk_train_2016.xml").process_data().fillna(0)

In [176]:
train_texts_tkk = train_tkk["text"].values
train_labels_tkk = np.array(train_tkk[["beeline", "komstar", "mts", "rostelecom", "skylink", "tele2"]].values).astype(int)

In [177]:
train_labels_tkk = from_sentiment_vectors_to_numbers(train_labels_tkk)

<h4>Test</h4>

In [178]:
test_tkk = XML2DataFrame("data/tkk_test_etalon.xml").process_data().fillna(0)

In [179]:
test_texts_tkk = test_tkk["text"].values
test_labels_tkk = np.array(test_tkk[["beeline", "komstar", "mts", "rostelecom", "skylink", "tele2"]].values).astype(int)

In [180]:
test_labels_tkk = from_sentiment_vectors_to_numbers(test_labels_tkk)

<h3>Bank data</h3>

<h4>Train</h4>

In [181]:
train_bank = XML2DataFrame("data/bank_train_2016.xml").process_data().fillna(0)

In [182]:
train_texts_bank = train_bank["text"].values
train_labels_bank = np.array(train_bank[["alfabank", "bankmoskvy", "gazprom", "raiffeisen", "rshb", "sberbank", "uralsib", "vtb"]].values).astype(int)

In [183]:
train_labels_bank = from_sentiment_vectors_to_numbers(train_labels_bank)

<h4>Test</h4>

In [184]:
test_bank = XML2DataFrame("data/banks_test_etalon.xml").process_data().fillna(0)

In [185]:
test_texts_bank = test_bank["text"].values
test_labels_bank = np.array(test_bank[["alfabank", "bankmoskvy", "gazprom", "raiffeisen", "rshb", "sberbank", "uralsib", "vtb"]].values).astype(int)

In [186]:
test_labels_bank = from_sentiment_vectors_to_numbers(test_labels_bank)

<h3>Join and clean data</h3>

<h4>Train</h4>

In [187]:
train_texts = [clean_text(text) for text in np.concatenate((train_texts_tkk, train_texts_bank),axis=0)]
train_labels = train_labels_tkk + train_labels_bank

<h4>Test</h4>

In [188]:
test_texts = [clean_text(text) for text in np.concatenate((test_texts_tkk, test_texts_bank), axis=0)]
test_labels = test_labels_tkk + test_labels_bank

<h2>Task1: TF-IDF + LogisticRegression</h2>

<h3>Training</h3>

In [20]:
vectorizer = TfidfVectorizer(min_df=1)

In [21]:
train_tfidf_texts = vectorizer.fit_transform(train_texts)

In [22]:
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression(solver='lbfgs', multi_class='multinomial')
clf.fit(train_tfidf_texts, train_labels)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='multinomial',
          n_jobs=None, penalty='l2', random_state=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)

<h4>Evaluation</h4>

In [23]:
test_tfidf_texts = vectorizer.transform(test_texts)

In [24]:
predicted_labels = clf.predict(test_tfidf_texts)

In [25]:
accuracy_score(test_labels, predicted_labels)

0.7111510791366906

<h2>Task 2: Word2Vec + CNN</h2>

In [206]:
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
import gensim
from pymystem3 import Mystem
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from sklearn.preprocessing import LabelBinarizer

In [189]:
EMBEDDING_DIM = 100
MAX_SEQUENCE_LENGTH = 50

In [78]:
nltk.download('stopwords')
stop_words = set(stopwords.words('russian'))

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/tutelaris/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


<h4>Tokenize and delete stop-words</h4>

In [190]:
tknzr = RegexpTokenizer(r'\w+')

In [191]:
tokenized_train_texts = [tknzr.tokenize(text) for text in train_texts]

In [192]:
tokenized_train_texts = [[word for word in tokens if word not in stop_words] for tokens in tokenized_train_texts]

<h4>Prepare data for CNN</h4>

In [193]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_texts)
train_features_cnn = tokenizer.texts_to_sequences(train_texts)
test_features_cnn = tokenizer.texts_to_sequences(test_texts)

In [194]:
train_features_cnn = pad_sequences(train_features_cnn, maxlen=MAX_SEQUENCE_LENGTH)
test_features_cnn = pad_sequences(test_features_cnn, maxlen=MAX_SEQUENCE_LENGTH)
train_labels_cnn = to_categorical(train_labels)
test_labels_cnn = to_categorical(test_labels)

In [207]:
encoder = LabelBinarizer()
encoder.fit(train_labels)
train_labels_cnn = encoder.transform(train_labels)
test_labels_cnn = encoder.transform(test_labels)

In [195]:
train_word_index = tokenizer.word_index

<h4>Get embeddings</h4>

In [196]:
model = gensim.models.Word2Vec(tokenized_train_texts, size=EMBEDDING_DIM, window=5, min_count=1, workers=4)

In [268]:
embedding_matrix = np.zeros((len(train_word_index) + 1, EMBEDDING_DIM))
for word, i in tokenizer.word_index.items():
    if word in model.wv.vocab:
        embedding_vector = model.wv.get_vector(word)
        embedding_matrix[i] = embedding_vector

<h4>Create CNN</h4>

In [235]:
from keras.models import Sequential, Model
from keras.layers import Activation, Dense, Dropout
from keras.layers import Input, Dense, Flatten, Conv1D, MaxPooling1D, Dropout, Activation, Merge, GlobalMaxPooling1D
from keras.callbacks import EarlyStopping

In [219]:
early_stopping = EarlyStopping(monitor='val_loss', min_delta=0.01, patience=4, verbose=1)
callbacks_list = [early_stopping]

In [292]:
def MyConvNet(embeddings, number_of_words, sequence_length, embedding_dim):
    convs = []
    input_layer = Input(shape=(sequence_length,), dtype='int32')
    embedding_layer = Embedding(number_of_words, 
                                embedding_dim, 
                                input_length=sequence_length, 
                                weights=[embeddings], 
                                trainable=False)(input_layer)

    for size, filters_count in [(2, 10), (4, 10), (8, 10), (10, 10)]:
        for i in range(filters_count):
            conv_layer = Conv1D(filters=1, kernel_size=size, padding='valid', activation='relu')(embedding_layer)
            max_pool_layer = GlobalMaxPooling1D()(conv_layer)
            convs.append(max_pool_layer)
    convs_output = Merge(mode='concat', concat_axis=1)(convs)
    dropout1 = Dropout(0.5)(convs_output)
    dense1 = Dense(128)(dropout1)
    dropout2 = Dropout(0.5)(dense1)
    dense2 = Dense(3, activation='sigmoid')(dropout2)
    out = Model(input_layer, dense2)
    out.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['acc'])
    out.summary()
    return out

In [293]:
mycnn = MyConvNet(np.array(embedding_matrix), len(train_word_index)+1, MAX_SEQUENCE_LENGTH, EMBEDDING_DIM)

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_38 (InputLayer)           (None, 50)           0                                            
__________________________________________________________________________________________________
embedding_39 (Embedding)        (None, 50, 100)      2379400     input_38[0][0]                   
__________________________________________________________________________________________________
conv1d_307 (Conv1D)             (None, 49, 1)        201         embedding_39[0][0]               
__________________________________________________________________________________________________
conv1d_308 (Conv1D)             (None, 49, 1)        201         embedding_39[0][0]               
__________________________________________________________________________________________________
conv1d_309

  from ipykernel import kernelapp as app


In [294]:
hist = mycnn.fit(train_features_cnn, train_labels_cnn, epochs=10, callbacks=callbacks_list, validation_split=0.1, shuffle=True, batch_size=32)

Train on 16231 samples, validate on 1804 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 00005: early stopping


In [295]:
mycnn.evaluate(test_features_cnn, test_labels_cnn)



[0.5140216706896857, 0.744964040183335]