# HW \#1

In [6]:
import xml.etree.ElementTree as ET
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression

from nltk.tokenize import RegexpTokenizer
from nltk.stem.snowball import SnowballStemmer 
from nltk.corpus import stopwords
import gensim
from sklearn.feature_extraction.text import TfidfVectorizer

import warnings
warnings.filterwarnings('ignore')

## Parse xml

In [7]:
class XML2DataFrame:
    def __init__(self, xml_path):
        xml_data = open(xml_path, encoding='utf-8')
        self.root = ET.XML(xml_data.read())[1]

    def parse_root(self, root):
        return [self.parse_element(child) for child in iter(root)]

    def parse_element(self, element, parsed=None):
        if parsed is None:
            parsed = dict()
        for key in element.keys():
            parsed[key] = element.attrib.get(key)
        if element.text:
            parsed[element.attrib["name"]] = None if element.text == "NULL" else element.text
        for child in list(element):
            self.parse_element(child, parsed)
        return parsed

    def process_data(self):
        structure_data = self.parse_root(self.root)
        return pd.DataFrame(structure_data)

In [8]:
tkk_train_raw = XML2DataFrame("SentiRuEval/tkk_train_2016.xml").process_data().fillna(0)
bank_train_raw = XML2DataFrame("SentiRuEval/bank_train_2016.xml").process_data().fillna(0)

tkk_test_raw = XML2DataFrame("SentiRuEval/tkk_test_etalon.xml").process_data().fillna(0)
bank_test_raw = XML2DataFrame("SentiRuEval/banks_test_etalon.xml").process_data().fillna(0)

## Construct train and test

In [9]:
tkk_train_text = tkk_train_raw['text'].values
tkk_test_text = tkk_test_raw['text'].values
tkk_train_labels = tkk_train_raw[["beeline", "komstar", "mts", "rostelecom", "skylink", "tele2"]].astype(int).sum(axis=1).values
tkk_test_labels = tkk_test_raw[["beeline", "komstar", "mts", "rostelecom", "skylink", "tele2"]].astype(int).sum(axis=1).values

bank_train_text = bank_train_raw['text'].values
bank_test_text = bank_test_raw['text'].values
bank_train_labels = bank_train_raw[['alfabank','bankmoskvy','gazprom','raiffeisen','rshb','sberbank','uralsib','vtb']].astype(int).sum(axis=1).values
bank_test_labels = bank_test_raw[['alfabank','bankmoskvy','gazprom','raiffeisen','rshb','sberbank','uralsib','vtb']].astype(int).sum(axis=1).values

## Preprocess text: tokenize, stem, delete stopwords

In [10]:
stop_words = stopwords.words('russian')
sb_stemmer = SnowballStemmer('russian')

def tokenize(text):
    tokens = RegexpTokenizer(r'\w+').tokenize(text)
    tokens = [sb_stemmer.stem(token) for token in tokens if token not in stop_words]    
    return tokens

In [11]:
tkk_train_text = [tokenize(text) for text in tkk_train_text]
tkk_test_text = [tokenize(text) for text in tkk_test_text]

bank_train_text = [tokenize(text) for text in bank_train_text]
bank_test_text = [tokenize(text) for text in bank_test_text]

## Train Word2Vec and TF-IDF

In [37]:
tkk_w2v = gensim.models.Word2Vec(tkk_train_text, min_count=25, size=100)
tkk_w2v.train(tkk_test_text, total_examples=tkk_w2v.corpus_count, epochs=tkk_w2v.epochs)

(50997, 130195)

In [38]:
tkk_vect = TfidfVectorizer()
tkk_vect.fit([' '.join(text) for text in tkk_train_text])
tkk_vect.fit([' '.join(text) for text in tkk_test_text])

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [39]:
bank_w2v = gensim.models.Word2Vec(bank_train_text, min_count=25, size=100)
bank_w2v.train(bank_test_text, total_examples=bank_w2v.corpus_count, epochs=bank_w2v.epochs)

(60864, 193745)

In [40]:
bank_vect = TfidfVectorizer()
bank_vect.fit([' '.join(text) for text in bank_train_text])
bank_vect.fit([' '.join(text) for text in bank_test_text])

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)


## LogReg on TF-IDF

In [41]:
tkk_X_train = tkk_vect.transform([' '.join(text) for text in tkk_train_text])
tkk_X_test = tkk_vect.transform([' '.join(text) for text in tkk_test_text])

In [42]:
tkk_logreg = LogisticRegression().fit(tkk_X_train, tkk_train_labels)
tkk_predicted_labels = tkk_logreg.predict(tkk_X_test)
print('Accuracy for tkk on TF-IDF LogReg: ', 
accuracy_score(tkk_test_labels, tkk_predicted_labels))

# 0.6528704939919893, if we do not delete stopwords

Accuracy for tkk on TF-IDF LogReg:  0.6386292834890965


In [43]:
bank_X_train = bank_vect.transform([' '.join(text) for text in bank_train_text])
bank_X_test = bank_vect.transform([' '.join(text) for text in bank_test_text])

bank_logreg = LogisticRegression().fit(bank_X_train, bank_train_labels)
bank_predicted_labels = bank_logreg.predict(bank_X_test)
print('Accuracy for bank on TF-IDF LogReg: ', 
accuracy_score(bank_test_labels, bank_predicted_labels))

# 0.7494717778448536, if we do not delete stopwords

Accuracy for bank on TF-IDF LogReg:  0.7398128584364624


## Feature matrix for Word2Vec+IDF

In [44]:
def get_features_of_text(text, feature_names, w2v, tf_idf):
    output_vector = np.zeros(w2v.layer1_size)
    for word in text:
        try:
            # each word vector in sum is weighted by IDF coef.
            output_vector += w2v.wv[word]*tf_idf.idf_[feature_names.index(word)]
        except:
            pass
    return output_vector

In [45]:
tkk_feature_names = tkk_vect.get_feature_names()
tkk_X_train = np.array([get_features_of_text(text, tkk_feature_names, tkk_w2v, tkk_vect) 
                        for text in tkk_train_text])
tkk_X_test = np.array([get_features_of_text(text, tkk_feature_names, tkk_w2v, tkk_vect) 
                       for text in tkk_test_text])

In [46]:
bank_feature_names = bank_vect.get_feature_names()
bank_X_train = np.array([get_features_of_text(text, bank_feature_names, bank_w2v, bank_vect) 
                        for text in bank_train_text])
bank_X_test = np.array([get_features_of_text(text, bank_feature_names, bank_w2v, bank_vect) 
                       for text in bank_test_text])

## LogReg on Word2Vec+IDF

In [47]:
tkk_logreg = LogisticRegression()
tkk_logreg.fit(tkk_X_train, tkk_train_labels)
tkk_predicted_y = tkk_logreg.predict(tkk_X_test)
print('Accuracy for tkk on Word2Vec-IDF: ', accuracy_score(tkk_predicted_y, tkk_test_labels))

# 0.5287049399198932, if we do not delete stopwords

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

Accuracy for tkk on Word2Vec-IDF:  0.57543391188251


In [49]:
bank_logreg = LogisticRegression()
bank_logreg.fit(bank_X_train, bank_train_labels)
bank_predicted_y = bank_logreg.predict(bank_X_test)
print('Accuracy for bank on Word2Vec-IDF: ', 
      accuracy_score(bank_predicted_y, bank_test_labels))

# 0.6779354059764564, if we do not delete stopwords

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

Accuracy for bank on Word2Vec-IDF:  0.7153637186839722
