# HW \#2

In [9]:
import xml.etree.ElementTree as ET
import pandas as pd
import gensim
import numpy as np
import math
import nltk
from nltk.stem.snowball import SnowballStemmer 
from nltk.corpus import stopwords
import spacy
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.layers import Embedding
from keras.layers import Conv1D, GlobalAveragePooling1D, MaxPooling1D
from gensim.models import FastText
import warnings
warnings.filterwarnings('ignore')

## Parse xml

In [10]:
class XML2DataFrame:
    def __init__(self, xml_path):
        xml_data = open(xml_path, encoding='utf-8')
        self.root = ET.XML(xml_data.read())[1]

    def parse_root(self, root):
        return [self.parse_element(child) for child in iter(root)]

    def parse_element(self, element, parsed=None):
        if parsed is None:
            parsed = dict()
        for key in element.keys():
            parsed[key] = element.attrib.get(key)
        if element.text:
            parsed[element.attrib["name"]] = None if element.text == "NULL" else element.text
        for child in list(element):
            self.parse_element(child, parsed)
        return parsed

    def process_data(self):
        structure_data = self.parse_root(self.root)
        return pd.DataFrame(structure_data)

In [11]:
tkk_train_raw = XML2DataFrame("SentiRuEval/tkk_train_2016.xml").process_data().fillna(0)
bank_train_raw = XML2DataFrame("SentiRuEval/bank_train_2016.xml").process_data().fillna(0)

tkk_test_raw = XML2DataFrame("SentiRuEval/tkk_test_etalon.xml").process_data().fillna(0)
bank_test_raw = XML2DataFrame("SentiRuEval/banks_test_etalon.xml").process_data().fillna(0)

## Construct train and test

In [12]:
tkk_train_text = tkk_train_raw['text'].values
tkk_test_text = tkk_test_raw['text'].values
tkk_train_labels = tkk_train_raw[["beeline", "komstar", "mts", "rostelecom", "skylink", "tele2"]].astype(int).sum(axis=1).values
tkk_test_labels = tkk_test_raw[["beeline", "komstar", "mts", "rostelecom", "skylink", "tele2"]].astype(int).sum(axis=1).values

bank_train_text = bank_train_raw['text'].values
bank_test_text = bank_test_raw['text'].values
bank_train_labels = bank_train_raw[['alfabank','bankmoskvy','gazprom','raiffeisen','rshb','sberbank','uralsib','vtb']].astype(int).sum(axis=1).values
bank_test_labels = bank_test_raw[['alfabank','bankmoskvy','gazprom','raiffeisen','rshb','sberbank','uralsib','vtb']].astype(int).sum(axis=1).values

## Normalize labels

In [25]:
# ~= np.array([math.copysign(x/x, x) if x !=0 else 0 for x in s]) 
def normalize_number(x):
    return int(math.copysign(x/x, x)) if x !=0 else 0
def normalize_series(s):
    return np.array([normalize_number(x) for x in s])

In [14]:
tkk_train_labels = normalize_series(tkk_train_labels)
tkk_test_labels = normalize_series(tkk_test_labels)

bank_train_labels = normalize_series(bank_train_labels)
bank_test_labels = normalize_series(bank_test_labels)

## Preprocess text: tokenize, stem, delete stopwords

In [17]:
stop_words = stopwords.words('russian')
sb_stemmer = SnowballStemmer('russian')

def tokenize(text):
    tokens = nltk.tokenize.RegexpTokenizer(r'\w+').tokenize(text)
    tokens = [sb_stemmer.stem(token) for token in tokens if token not in stop_words]
    text = " ".join(tokens)
    
    return text

In [18]:
tkk_train_text = [tokenize(text) for text in tkk_train_text]
tkk_test_text = [tokenize(text) for text in tkk_test_text]

bank_train_text = [tokenize(text) for text in bank_train_text]
bank_test_text = [tokenize(text) for text in bank_test_text]

# CNNs

## TKK

In [19]:
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential
from keras.layers import Activation, Dense, Dropout
from sklearn.preprocessing import LabelBinarizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Flatten, LSTM, Conv1D, MaxPooling1D, Dropout, Activation

### Data prepration

In [22]:
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(tkk_train_text)

tkk_X_train = pad_sequences(tokenizer.texts_to_sequences(tkk_train_text), maxlen=100)
tkk_X_test = pad_sequences(tokenizer.texts_to_sequences(tkk_test_text), maxlen=100)

word_index = tokenizer.word_index

In [35]:
tkk_X_train.shape

(8643, 100)

In [27]:
encoder = LabelBinarizer()
encoder.fit(tkk_train_labels)
tkk_y_train = encoder.transform(tkk_train_labels)
tkk_y_test = encoder.transform(tkk_test_labels)

LabelBinarizer(neg_label=0, pos_label=1, sparse_output=False)

In [29]:
tkk_y_train.shape

(8643, 3)

In [33]:
tkk_y_train[:5]

array([[0, 1, 0],
       [0, 1, 0],
       [1, 0, 0],
       [0, 0, 1],
       [1, 0, 0]])

### Train model

In [59]:
model = Sequential()

model.add(Embedding(10000, 1000, input_length=100))
model.add(Dropout(0.2))
model.add(Conv1D(256, 5, activation='relu'))
model.add(MaxPooling1D(pool_size=4))
model.add(LSTM(100))
model.add(Dense(3, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

model.fit(tkk_X_train, tkk_y_train, batch_size=32, epochs=5)

Epoch 1/5




Epoch 2/5




Epoch 3/5




Epoch 4/5




Epoch 5/5






<keras.callbacks.History at 0x2b84f8cfa90>

### Accuracy for tkk trained

In [60]:
tkk_y_pred = model.predict_classes(tkk_X_test) - 1
accuracy_score(tkk_y_pred, tkk_test_labels)

0.6555407209612817

## Using pretrained fasttext vectors

In [42]:
from gensim.models import FastText

### Prepare weights for embedding layer

In [43]:
emb_model = FastText.load('emb/araneum_none_fasttextskipgram_300_5_2018.model')

In [44]:
word_vectors = emb_model.wv

In [45]:
EMBEDDING_DIM = 300
NUM_WORDS = 20000
vocabulary_size = len(word_index)+1
embedding_matrix = np.zeros((vocabulary_size, EMBEDDING_DIM))
for word, i in word_index.items():
    if i>=NUM_WORDS:
        continue
    try:
        embedding_vector = word_vectors[word]
        embedding_matrix[i] = embedding_vector
    except KeyError:
        embedding_matrix[i]=np.random.normal(0,np.sqrt(0.25),EMBEDDING_DIM)

In [47]:
model = Sequential()

model.add(Embedding(vocabulary_size, EMBEDDING_DIM, weights=[embedding_matrix], trainable=True, input_length=100))
model.add(Dropout(0.2))
model.add(Conv1D(256, 5, activation='relu'))
model.add(MaxPooling1D(pool_size=4))
model.add(LSTM(100))
model.add(Dense(3, activation='sigmoid'))


model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

model.fit(tkk_X_train, tkk_y_train, batch_size=32, epochs=5)

Epoch 1/5




Epoch 2/5




Epoch 3/5




Epoch 4/5




Epoch 5/5






<keras.callbacks.History at 0x2b84e23e400>

### Accuracy for tkk pretrained

In [48]:
tkk_predictions = model.predict_classes(tkk_X_test) - 1
accuracy_score(tkk_predictions, tkk_test_labels)

0.6644414775255897

# Bank

## Without pretrained

### Data prepration

In [54]:
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(bank_train_text)

bank_X_train = pad_sequences(tokenizer.texts_to_sequences(bank_train_text), maxlen=100)
bank_X_test = pad_sequences(tokenizer.texts_to_sequences(bank_test_text), maxlen=100)

bank_word_index = tokenizer.word_index

encoder = LabelBinarizer()
encoder.fit(bank_train_labels)
bank_y_train = encoder.transform(bank_train_labels)
bank_y_test = encoder.transform(bank_test_labels)

LabelBinarizer(neg_label=0, pos_label=1, sparse_output=False)

### Train model

In [52]:
model = Sequential()

model.add(Embedding(10000, 1000, input_length=100))
model.add(Dropout(0.2))
model.add(Conv1D(256, 5, activation='relu'))
model.add(MaxPooling1D(pool_size=4))
model.add(LSTM(100))
model.add(Dense(3, activation='sigmoid'))


model.compile(loss='binary_crossentropy', optimizer='adam',    metrics=['accuracy'])

model.fit(bank_X_train, bank_y_train, batch_size=32, epochs=5)

Epoch 1/5




Epoch 2/5




Epoch 3/5




Epoch 4/5




Epoch 5/5






<keras.callbacks.History at 0x2b84e2d38d0>

### Accuracy for bank trained

In [53]:
bank_predictions = model.predict_classes(bank_X_test) - 1
accuracy_score(bank_predictions, bank_test_labels)

0.7120434651373377

## Using pretrained

In [55]:
EMBEDDING_DIM = 300
NUM_WORDS = 20000
vocabulary_size = len(bank_word_index) + 1
embedding_matrix = np.zeros((vocabulary_size, EMBEDDING_DIM))
for word, i in bank_word_index.items():
    if i>=NUM_WORDS:
        continue
    try:
        embedding_vector = word_vectors[word]
        embedding_matrix[i] = embedding_vector
    except KeyError:
        embedding_matrix[i]=np.random.normal(0,np.sqrt(0.25),EMBEDDING_DIM)

In [57]:
model = Sequential()

model.add(Embedding(vocabulary_size, EMBEDDING_DIM, weights=[embedding_matrix], trainable=True, input_length=100))
model.add(Dropout(0.2))
model.add(Conv1D(256, 5, activation='relu'))
model.add(MaxPooling1D(pool_size=4))
model.add(LSTM(100))
model.add(Dense(3, activation='sigmoid'))


model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

model.fit(bank_X_train, bank_y_train, batch_size=64, epochs=5)

Epoch 1/5
Epoch 2/5


Epoch 3/5


Epoch 4/5


Epoch 5/5




<keras.callbacks.History at 0x2b84fdfda20>

### Accuracy for bank pretrained

In [58]:
bank_predictions = model.predict_classes(bank_X_test) - 1
accuracy_score(bank_predictions, bank_test_labels)

0.6884998490793842