In [1]:
import os
import gensim
import numpy as np
import pandas as pd
import numpy as np
from math import log
from lxml import html
from matplotlib import pyplot as plt
from sklearn.decomposition import TruncatedSVD, NMF
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from collections import Counter, defaultdict
from string import punctuation
from nltk.corpus import stopwords
from pymystem3 import Mystem

mystem = Mystem()
punct = punctuation+'«»—…“”*№–'
stops = set(stopwords.words('russian'))

def normalize(text):
    
    words = [word.strip(punct) for word in text.lower().split()]
    words = [mystem.lemmatize(word)[0] for word in words if word and word not in stops]
    words = [word for word in words if word not in stops]

    return words

def tokenize(text):
    
    words = [word.strip(punct) for word in text.lower().split()]

    return words

In [2]:
data_rt = pd.read_csv('news_texts.csv', error_bad_lines=False)

In [3]:
data_rt.dropna(inplace=True)

In [5]:
data_rt['content_norm'] = data_rt['content_norm'].apply(str.split)
data_rt['tokenized'] = data_rt['content'].apply(tokenize)

In [6]:
data_rt.to_csv('news_texts.tsv', sep='\t', index=None)

In [7]:
data_rt.head()

Unnamed: 0,content,content_norm,tokenized
0,Канцлер Германии Ангела Меркель в ходе брифинг...,"[канцлер, германия, ангел, меркель, ход, брифи...","[канцлер, германии, ангела, меркель, в, ходе, ..."
1,Российские и белорусские войска успешно заверш...,"[российский, белорусский, войско, успешно, зав...","[российские, и, белорусские, войска, успешно, ..."
2,"Дзюба, Шатов и Анюков оказались не нужны «Зени...","[дзюба, шат, анюк, оказаться, нужный, зенит, р...","[дзюба, шатов, и, анюков, оказались, не, нужны..."
3,"В Испанию без фанатов\nПожалуй, главной пятнич...","[испания, фанат, пожалуй, главный, пятничный, ...","[в, испанию, без, фанатов, пожалуй, главной, п..."
4,"Постпред России при ООН Виталий Чуркин, говоря...","[постпред, россия, оон, виталий, чуркин, говор...","[постпред, россии, при, оон, виталий, чуркин, ..."


Коллекция состоит из пар предложения (заголоков статей) и метки класса (-1,0,1). -1 не парафраз, 1 - парафраз, 0 - что-то непонятное.

In [8]:
corpus_xml = html.fromstring(open('paraphrases.xml', 'rb').read())
texts_1 = []
texts_2 = []
classes = []

for p in corpus_xml.xpath('//paraphrase'):
    texts_1.append(p.xpath('./value[@name="text_1"]/text()')[0])
    texts_2.append(p.xpath('./value[@name="text_2"]/text()')[0])
    classes.append(p.xpath('./value[@name="class"]/text()')[0])
    
data = pd.DataFrame({'text_1':texts_1, 'text_2':texts_2, 'label':classes})

data['text_1_norm'] = data['text_1'].apply(normalize)
data['text_2_norm'] = data['text_2'].apply(normalize)
data['text_1_tokenized'] = data['text_1'].apply(tokenize)
data['text_2_tokenized'] = data['text_2'].apply(tokenize)

data.to_csv('paraphrases.tsv', sep='\t', index=None)

In [9]:
data_rt = pd.read_csv('news_texts.tsv', sep='\t')
data = pd.read_csv('paraphrases.tsv', sep='\t')

In [10]:
count = CountVectorizer(min_df=3, max_df=0.4, max_features=1000, lowercase=False, tokenizer=lambda x: x)
X_count = count.fit_transform(data_rt['content_norm'])

In [11]:
tfidf = TfidfVectorizer(min_df=3, max_df=0.4, max_features=1000, lowercase=False, tokenizer=lambda x: x)
X_tfidf = tfidf.fit_transform(data_rt['content_norm'])

In [12]:
def get_embedding(text, model, dim, n_documents=None, inv_idx=None):
#     text = text.split()
    words = Counter(text)
    total = len(text)
    vectors = np.zeros((len(words), dim))
    
    for i, word in enumerate(words):
        try:
            v = model.wv[word]
            if inv_idx:
                vectors[i] = v * (words[word] / total) * log(n_documents / inv_idx[word])
            else:
                vectors[i] = v
        except (KeyError, ValueError):
            continue
    
    if vectors.any():
        vector = np.average(vectors, axis=0)
    else:
        vector = np.zeros((dim))
    
    return vector       

In [44]:
def transform_texts(data, model, dim, inv_idx=None):
    n_documents = len(data)
    X_text = np.zeros((n_documents, dim))
                
    for i, text in enumerate(data):
        X_text[i] = get_embedding(text, model, dim, n_documents * 2, inv_idx)
    
    return X_text

In [14]:
def get_inv_idx(data):
    inverted_index = defaultdict(list)
    for i, doc in enumerate(data):
        for word in doc: #.split():
            inverted_index[word].append(i)
    
    inv_idx = {word:len(inverted_index[word]) for word in inverted_index}
    return inv_idx

In [15]:
def similarity(v1, v2):
    v1_norm = gensim.matutils.unitvec(np.array(v1))
    v2_norm = gensim.matutils.unitvec(np.array(v2))
    return np.dot(v1_norm, v2_norm)

In [16]:
def get_similarity(model, data_1, data_2, dim, vect=None, embeddings_needed=False,
                                           weighted_tfidf=False, tokenized=False):
    if embeddings_needed:
        
        if weighted_tfidf:
            if tokenized:
                X_text_1 = transform_texts(data_1.values, model, dim, inv_idx_tokenized)
                X_text_2 = transform_texts(data_2.values, model, dim, inv_idx_tokenized)
            else:
                X_text_1 = transform_texts(data_1.values, model, dim, inv_idx)
                X_text_2 = transform_texts(data_2.values, model, dim, inv_idx)
        else:    
            X_text_1 = transform_texts(data_1.values, model, dim)
            X_text_2 = transform_texts(data_2.values, model, dim)
    else:
        
        X_text_1 = model.transform(vect.transform(data_1))
        X_text_2 = model.transform(vect.transform(data_2))
        
    sim = [similarity(v1, v2) for v1, v2 in zip(X_text_1, X_text_2)]
    return sim

In [17]:
dim = 50

In [18]:
nmf_count = NMF(dim)
nmf_count.fit(X_count)
res_nmf_count = get_similarity(nmf_count, data['text_1_norm'], data['text_2_norm'], dim, vect=count)

In [19]:
nmf_tfidf = NMF(dim)
nmf_tfidf.fit(X_tfidf)
res_nmf_tfidf = get_similarity(nmf_tfidf, data['text_1_norm'], data['text_2_norm'], dim, vect=tfidf)

In [20]:
svd_count = TruncatedSVD(dim)
svd_count.fit(X_count)
res_svd_count = get_similarity(svd_count, data['text_1_norm'], data['text_2_norm'], dim, vect=count)

In [21]:
svd_tfidf = TruncatedSVD(dim)
svd_tfidf.fit(X_tfidf)
res_svd_tfidf = get_similarity(svd_tfidf, data['text_1_norm'], data['text_2_norm'], dim, vect=tfidf)

In [22]:
inv_idx = get_inv_idx(np.concatenate([data['text_1_norm'], data['text_2_norm']], axis=0))
inv_idx_tokenized = get_inv_idx(np.concatenate([data['text_1_tokenized'], data['text_1_tokenized']], axis=0))

In [23]:
fast_text = gensim.models.FastText(data_rt['tokenized'], size=dim, min_n=4, max_n=8)

In [45]:
res_ft = get_similarity(fast_text, data['text_1_tokenized'], data['text_2_tokenized'], dim, 
                        embeddings_needed=True)

In [46]:
res_ft_tfidf = get_similarity(fast_text, data['text_1_tokenized'], data['text_2_tokenized'], dim, 
                        embeddings_needed=True, weighted_tfidf=True, tokenized=True)

In [26]:
fast_text_norm = gensim.models.FastText(data_rt['content_norm'],
                                        size=50, min_n=4, max_n=8)

In [47]:
res_ft_norm = get_similarity(fast_text_norm, data['text_1_norm'], data['text_2_norm'], dim, 
                        embeddings_needed=True)

In [48]:
res_ft_norm_tfidf = get_similarity(fast_text_norm, data['text_1_norm'], data['text_2_norm'], dim, 
                        embeddings_needed=True, weighted_tfidf=True)

In [67]:
w2v = gensim.models.Word2Vec(data_rt['content_norm'], size=dim, sg=1)

In [68]:
res_w2v = get_similarity(w2v, data['text_1_norm'], data['text_2_norm'], dim, 
                        embeddings_needed=True)

In [69]:
res_w2v_tfidf = get_similarity(w2v, data['text_1_norm'], data['text_2_norm'], dim, 
                        embeddings_needed=True, weighted_tfidf=True)

In [55]:
final_data = pd.DataFrame({'nmf_count': res_nmf_count, 'nmf_tfidf': res_nmf_tfidf,
                           'svd_count': res_svd_count, 'svd_tfidf': res_svd_tfidf,
                           'ft': res_ft, 'ft_tfidf': res_ft_tfidf,
                           'ft_norm': res_ft_norm, 'ft_norm_tfidf': res_ft_norm_tfidf,
                           'w2v': res_w2v, 'w2v_tfidf': res_w2v_tfidf})
final_data.head()

Unnamed: 0,ft,ft_norm,ft_norm_tfidf,ft_tfidf,nmf_count,nmf_tfidf,svd_count,svd_tfidf,w2v,w2v_tfidf
0,0.99435,0.992412,0.991473,0.997758,0.0,0.0,0.0,0.0,0.998637,0.996784
1,0.988949,0.986961,0.99786,0.99745,0.0,0.0,0.0,0.0,0.997773,0.999353
2,0.986031,0.996695,0.997308,0.996961,0.0,0.0,0.0,0.0,0.999345,0.998966
3,0.992464,0.969356,0.98521,0.996097,0.0,0.0,0.0,0.0,0.996024,0.995628
4,0.96708,0.967738,0.991649,0.995313,0.0,0.0,0.0,0.0,0.995571,0.997886


In [58]:
logreg = LogisticRegression(class_weight='balanced')
cross_val_score(logreg, final_data, data['label'], cv=5, scoring='f1_micro', n_jobs=-1).mean()

0.43972562264702686

In [60]:
random_forest = RandomForestClassifier(n_estimators=500, class_weight='balanced')
cross_val_score(random_forest, final_data, data['label'], cv=5, scoring='f1_micro', n_jobs=-1).mean()

0.49050360555015005

Увеличим размерность в ```w2v``` до 100:

In [61]:
w2v = gensim.models.Word2Vec(data_rt['content_norm'], size=100, sg=1)
res_w2v = get_similarity(w2v, data['text_1_norm'], data['text_2_norm'], dim, 
                        embeddings_needed=True)
res_w2v_tfidf = get_similarity(w2v, data['text_1_norm'], data['text_2_norm'], dim, 
                        embeddings_needed=True, weighted_tfidf=True)

In [62]:
final_data.update(pd.DataFrame({'w2v': res_w2v, 'w2v_tfidf': res_w2v_tfidf}))

In [63]:
logreg = LogisticRegression(class_weight='balanced')
cross_val_score(logreg, final_data, data['label'], cv=5, scoring='f1_micro', n_jobs=-1).mean()

0.4361299709454693

In [64]:
random_forest = RandomForestClassifier(n_estimators=500, class_weight='balanced')
cross_val_score(random_forest, final_data, data['label'], cv=5, scoring='f1_micro', n_jobs=-1).mean()

0.4700255770143123

Немного падает качество! Вернем прошлые значения:

In [71]:
final_data.update(pd.DataFrame({'w2v': res_w2v, 'w2v_tfidf': res_w2v_tfidf}))

Увеличим размерность в ```svd``` до 100:

In [72]:
dim = 100

svd_count = TruncatedSVD(dim)
svd_count.fit(X_count)
res_svd_count2 = get_similarity(svd_count, data['text_1_norm'], data['text_2_norm'], dim, vect=count)

svd_tfidf = TruncatedSVD(dim)
svd_tfidf.fit(X_tfidf)
res_svd_tfidf2 = get_similarity(svd_tfidf, data['text_1_norm'], data['text_2_norm'], dim, vect=tfidf)

In [73]:
final_data.update(pd.DataFrame({'svd_count': res_svd_count2, 'svd_tfidf': res_svd_tfidf2}))

In [74]:
cross_val_score(logreg, final_data, data['label'], cv=5, scoring='f1_micro', n_jobs=-1).mean()

0.43986393549834435

In [75]:
cross_val_score(random_forest, final_data, data['label'], cv=5, scoring='f1_micro', n_jobs=-1).mean()

0.48939432744334643

Качество немного улучшается! Увеличим размерность в ```nmf``` до 100:

In [76]:
dim = 100

nmf_count = NMF(dim)
nmf_count.fit(X_count)
res_nmf_count2 = get_similarity(nmf_count, data['text_1_norm'], data['text_2_norm'], dim, vect=count)

nmf_tfidf = NMF(dim)
nmf_tfidf.fit(X_tfidf)
res_nmf_tfidf2 = get_similarity(nmf_tfidf, data['text_1_norm'], data['text_2_norm'], dim, vect=tfidf)

In [77]:
final_data.update(pd.DataFrame({'nmf_count': res_nmf_count2, 'nmf_tfidf': res_nmf_tfidf2}))

In [78]:
cross_val_score(logreg, final_data, data['label'], cv=5, scoring='f1_micro', n_jobs=-1).mean()

0.43986393549834435

In [79]:
cross_val_score(random_forest, final_data, data['label'], cv=5, scoring='f1_micro', n_jobs=-1).mean()

0.49063885555332465

У ```random forest``` качество ещё немного улучшается! Уменьшим значение ```min_n``` до 2 в ```fast_text```:

In [80]:
fast_text = gensim.models.FastText(data_rt['tokenized'], size=dim, min_n=2, max_n=8)
res_ft2 = get_similarity(fast_text, data['text_1_tokenized'], data['text_2_tokenized'], dim, 
                        embeddings_needed=True)
res_ft_tfidf2 = get_similarity(fast_text, data['text_1_tokenized'], data['text_2_tokenized'], dim, 
                        embeddings_needed=True, weighted_tfidf=True, tokenized=True)
fast_text_norm = gensim.models.FastText(data_rt['content_norm'],
                                        size=50, min_n=4, max_n=8)
res_ft_norm2 = get_similarity(fast_text_norm, data['text_1_norm'], data['text_2_norm'], dim, 
                        embeddings_needed=True)
res_ft_norm_tfidf2 = get_similarity(fast_text_norm, data['text_1_norm'], data['text_2_norm'], dim, 
                        embeddings_needed=True, weighted_tfidf=True)

In [81]:
final_data.update(pd.DataFrame({'ft': res_ft2, 'ft_tfidf': res_ft_tfidf2,
                                'ft_norm': res_ft_norm2, 'ft_norm_tfidf': res_ft_norm_tfidf2}))

In [82]:
cross_val_score(logreg, final_data, data['label'], cv=5, scoring='f1_micro', n_jobs=-1).mean()

0.4332233949612233

In [83]:
cross_val_score(random_forest, final_data, data['label'], cv=5, scoring='f1_micro', n_jobs=-1).mean()

0.47819703103777744

Качество не улучшается!  
  
В итоге помогло увеличение размерности ```dim = 100``` для ```svd``` и ```nmf```.