# Imports and functions

In [None]:
!pip install natasha

In [None]:
import numpy as np
from numpy import dot
from numpy.linalg import norm
import pandas as pd
import urllib.request
from bs4 import BeautifulSoup

import re
import requests
import logging
import zipfile
from tqdm import tqdm
import json

import nltk.data 
import nltk
from nltk.tokenize import sent_tokenize, RegexpTokenizer
from nltk.corpus import stopwords
nltk.download('punkt')
nltk.download('stopwords')

import gensim
from gensim.models import word2vec
from gensim.models.phrases import Phrases, Phraser
from gensim.models import KeyedVectors

from spacy.lang.ru.stop_words import STOP_WORDS
from sklearn import preprocessing
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

import matplotlib.pyplot as plt
import networkx as nx
from networkx.algorithms.community import greedy_modularity_communities

In [None]:
from natasha import (
    Segmenter,
    MorphVocab,
    
    NewsEmbedding,
    NewsMorphTagger,
    NewsSyntaxParser,
    NewsNERTagger,
    
    PER,
    NamesExtractor,

    Doc
)


# Natasha components are created once at module level and reused by the functions below.
segmenter = Segmenter()  # used by to_wordlist via Doc.segment
morph_vocab = MorphVocab()  # used by to_wordlist via token.lemmatize

# One embedding object is shared by the tagger/parser objects constructed from it.
emb = NewsEmbedding()
morph_tagger = NewsMorphTagger(emb)  # used by to_wordlist via Doc.tag_morph
# NOTE(review): syntax_parser, ner_tagger and names_extractor are not referenced in the visible cells - confirm they are still needed.
syntax_parser = NewsSyntaxParser(emb)
ner_tagger = NewsNERTagger(emb)

names_extractor = NamesExtractor(morph_vocab)

In [None]:
def to_wordlist(sentences, tokenizer, remove_stopwords=False):
    """Turn raw sentences into lists of lemmas using the Natasha pipeline.

    Each sentence is wrapped in a ``Doc``, segmented with the module-level
    ``segmenter``, morphologically tagged with ``morph_tagger`` and every
    token is lemmatized with ``morph_vocab``.

    Parameters
    ----------
    sentences : iterable of str
        Sentences to process; iterated once under a tqdm progress bar.
    tokenizer : any
        Not used by this function; kept so existing calls keep working.
    remove_stopwords : bool, default False
        When True, lemmas found in ``STOP_WORDS`` (imported from
        ``spacy.lang.ru.stop_words``) are dropped.

    Returns
    -------
    list of list
        One list of lemmas per input sentence, in input order.
    """
    wordlist = []
    for sent in tqdm(sentences):
        doc = Doc(sent)
        doc.segment(segmenter)
        doc.tag_morph(morph_tagger)

        lemmas = []
        for token in doc.tokens:
            token.lemmatize(morph_vocab)
            # Read the lemma by name instead of relying on it being the last
            # field yielded when the token record is iterated.
            lemmas.append(token.lemma)

        if remove_stopwords:  # drop stop words
            lemmas = [w for w in lemmas if w not in STOP_WORDS]
        wordlist.append(lemmas)
    return wordlist

def to_sentences(df, tokenizer):
    """Split every article in ``df['Text']`` into sentences.

    Each article is stripped of surrounding whitespace and passed to
    ``tokenizer.tokenize``; the resulting sentences of all articles are
    returned as one flat list, in article order.
    """
    collected = []
    for article in tqdm(df['Text']):
        collected.extend(tokenizer.tokenize(article.strip()))
    return collected

def preprocess(sentences):
  """Parse stringified token lists back into lists of strings.

  Every item is expected to look like ``"['a', 'b']"``: surrounding square
  brackets are removed, the remainder is split on commas, and each piece has
  leading double quotes, then leading spaces/single quotes, then trailing
  single quotes stripped. Returns one list of cleaned pieces per item.
  """
  def _clean(piece):
      return piece.lstrip('"').lstrip(" '").rstrip("'")

  return [
      [_clean(piece) for piece in raw.strip('[').rstrip(']').split(',')]
      for raw in sentences
  ]

In [None]:
scaler = preprocessing.MinMaxScaler()

In [None]:
tokenizer = nltk.data.load('tokenizers/punkt/russian.pickle')

# RBC & Lenta.ru: data

This notebook uses data that has already been collected and preprocessed.
For details on the data collection process, see the `data_collection.ipynb` file in this repository.

In [None]:
#retrieving the dataset of articles from RBC
#folder_url is a placeholder in this copy of the notebook and has to be set before the cell can run
folder_url = #######
file_url = 'df_rbc_filtered.csv'
#request to the Yandex.Disk public-resources API for a download link to file_url inside the public folder
url = 'https://cloud-api.yandex.net/v1/disk/public/resources/download' + '?public_key=' + urllib.parse.quote(folder_url) + '&path=/' + urllib.parse.quote(file_url)

#the 'href' field of the JSON reply is what gets passed to pd.read_csv below
r = requests.get(url) 
h = json.loads(r.text)['href'] 

rbc = pd.read_csv(h).reset_index(drop=True)

#retrieving the dataset of articles from Lenta.ru (same steps as for RBC above)
folder_url_2 = #######
file_url_2 = 'df_lenta_filtered.csv'
url_2 = 'https://cloud-api.yandex.net/v1/disk/public/resources/download' + '?public_key=' + urllib.parse.quote(folder_url_2) + '&path=/' + urllib.parse.quote(file_url_2)

r_2 = requests.get(url_2) 
h_2 = json.loads(r_2.text)['href'] 

lenta = pd.read_csv(h_2).reset_index(drop=True)

In [None]:
df = pd.DataFrame()
df['Text'] = pd.concat([rbc['Text'], lenta['Text']], ignore_index = True)

In [None]:
sentences = to_sentences(df, tokenizer)

In [None]:
words = to_wordlist(sentences, tokenizer, remove_stopwords = True)

# Model

In [None]:
# NOTE(review): the path ends in .csv but is read with pd.read_excel - confirm the file's real format (pd.read_csv may be what is needed).
# NOTE(review): the tokenized sentences are loaded from a file rather than taken from `words` computed above; the step that wrote this file is not shown in this notebook.
sentences = pd.read_excel('/content/words_model 2.csv')[0] #reading the data (column 0 holds one stringified token list per sentence, as expected by preprocess)
sents = preprocess(sentences) #parsing each stringified token list back into a list of tokens

In [None]:
bigram = Phrases(sents, min_count=4) #detecting pairs of adjacent words that frequently co-occur and joining them into bigram tokens; min_count=4 makes gensim ignore words and pairs counted fewer than 4 times
trigram = Phrases(bigram[sents], min_count=4) #fitting a second Phrases pass on the bigram-transformed sentences so that 3-word collocations can be detected

# NOTE(review): trigram was fitted on bigram[sents] but is applied to the raw sents here; gensim's documented pattern is trigram[bigram[sents]] - confirm which one is intended.
# NOTE(review): `size=` is the gensim 3.x keyword (gensim 4 calls it `vector_size`) - confirm the gensim version this notebook is pinned to.
model_bi = word2vec.Word2Vec(trigram[sents], workers=4, size=300, min_count=10, window=10, sample=1e-3) #training the model on the phrase-merged dataset

Since the model was trained on texts about international relations and politics, we can manually check whether the similarities and dissimilarities between vectors in the model's vocabulary agree with common sense.

In [None]:
print(model_bi.wv.most_similar(positive=["самопровозглашенный"], negative=["россия"], topn=1))
print(model_bi.wv.most_similar("россия", topn=3))

[('республика_днр', 0.6437524557113647)]
[('рф', 0.4528544545173645), ('москва', 0.42486968636512756), ('российский', 0.36448073387145996)]


In [None]:
model_bi.wv.save('vectors.kv')
reloaded_word_vectors = KeyedVectors.load('vectors.kv')

In [None]:
model_path = "news.model"

print("Saving model...")
model_bi.save(model_path)

# Fine-tuning and visualizations

## Data

In [None]:
model_path = 'news.model'
!wget ### #retrieving the model from a remote repository

## Analysis example for one country

In [None]:
model_1 = word2vec.Word2Vec.load(model_path) #the baseline model is loaded

# NOTE(review): df_1 is not defined in any visible cell of this notebook - it must hold the country's interview texts in a 'Text' column (that is what to_sentences reads).
sentences_1 = to_sentences(df_1, tokenizer) #interview texts for the country are segmented into sentences
words_1 = to_wordlist(sentences_1, tokenizer, remove_stopwords = True) #sentences are turned into lists of lemmas with stop words removed

bigram = Phrases(words_1, min_count=3) #bigrams are formed (due to lower amount of texts, 3 cases of cooccurrence are taken to mean that the pair is a collocation)
trigram = Phrases(bigram[words_1], min_count=3) #a second Phrases pass is fitted on the bigram-transformed sentences to form trigrams

# NOTE(review): trigram was fitted on bigram[words_1] but is applied to the raw words_1 below; gensim's documented pattern is trigram[bigram[words_1]] - confirm which one is intended.
model_1.build_vocab(trigram[words_1], update=True) #vocabulary is updated
model_1.train(trigram[words_1], total_examples=model_1.corpus_count, epochs=50) #the model is trained to fit interview data

In [None]:
print(model_1.wv.most_similar("российский", topn=10)) #sanity check

[('россия', 0.8896118402481079), ('абхазия', 0.8636842370033264), ('культура', 0.8600834608078003), ('сказать', 0.8559765815734863), ('спорт', 0.8559607267379761), ('интервью', 0.842548131942749), ('местный', 0.8394073247909546), ('оказывать_поддержка', 0.8354763388633728), ('спрашивать', 0.8348627090454102), ('субъект', 0.8347859382629395)]


### Graph visualization

In [None]:
def dummy_preprocessor(doc):
    """Return ``doc`` unchanged.

    Used as both the ``tokenizer`` and the ``preprocessor`` of a scikit-learn
    vectorizer so that documents which are already token lists are not
    processed again.
    """
    return doc
def whitelist_preprocessor(doc):
    """Return the tokens of ``doc`` that are members of ``whitelist``.

    NOTE(review): ``whitelist`` is looked up as a module-level name when the
    function is called; no visible cell assigns a global with that name, so
    the caller has to define it first - confirm where it is meant to come from.
    """
    return [x for x in doc if x in whitelist]

def get_colors(communities):
  """Return a list of matplotlib colors, one per community.

  This is a deliberately simple solution: since only about 100 words are
  visualized, more than 8 communities are considered unlikely. Eight distinct
  colors are always returned; every community beyond the eighth, should there
  be any, falls back to blue ('b').
  """
  palette = ['b', 'g', 'r', 'c', '#c20078', 'y', 'k', '#f97306']
  overflow = len(communities) - len(palette)
  palette.extend('b' for _ in range(overflow))
  return palette

def words_for_pics(corpus, n_words=100):
  """Select the set of words to show in the graph visualization.

  ``corpus`` is an iterable of already tokenized documents (lists of tokens);
  ``dummy_preprocessor`` is passed as tokenizer and preprocessor so the
  vectorizer uses the tokens as they are. Terms occurring in more than 95% of
  the documents or in fewer than 2 documents are discarded.

  The remaining terms are ranked by their summed TF-IDF weight and the top
  ``n_words`` are returned as a set.

  NOTE(review): ``max_features=n_words`` already caps the vocabulary at
  ``n_words`` terms, and scikit-learn picks those by corpus term frequency,
  so the TF-IDF ranking orders but does not further narrow the vocabulary.
  Drop ``max_features`` if selection by TF-IDF weight is what is wanted.

  Parameters
  ----------
  corpus : iterable of list of str
  n_words : int, default 100
      Maximum number of words to return.

  Returns
  -------
  set of str
  """
  tfidf_vectorizer = TfidfVectorizer(
        tokenizer=dummy_preprocessor,
        preprocessor=dummy_preprocessor,
        max_df=0.95, min_df=2,
        max_features=n_words
    )
  X = tfidf_vectorizer.fit_transform(corpus)
  # Column indices sorted by descending summed TF-IDF weight.
  importance = np.argsort(np.asarray(X.sum(axis=0)).ravel())[::-1]
  # get_feature_names() was removed in scikit-learn 1.2; use its replacement when available.
  if hasattr(tfidf_vectorizer, 'get_feature_names_out'):
    tf_feature_names = np.asarray(tfidf_vectorizer.get_feature_names_out())
  else:
    tf_feature_names = np.array(tfidf_vectorizer.get_feature_names())
  return set(tf_feature_names[importance[:n_words]].tolist())

def vecs_for_pics(corpus, whitelist,max_df, min_df, tr):
  """Build a thresholded correlation matrix between whitelisted words.

  A count vectorizer creates, for every whitelisted word, a vector of its
  counts across the documents of ``corpus``. Words whose count patterns are
  similar are assumed to be similar themselves, so the Pearson correlation
  between those vectors is used as the link strength.

  Parameters
  ----------
  corpus : iterable of list of str
      Already tokenized documents.
  whitelist : collection of str
      Only these tokens are kept before counting.
  max_df, min_df : float or int
      Passed to ``CountVectorizer`` so that words that are too common or too
      rare in the corpus are left out.
  tr : float
      Correlations below this threshold are treated as insignificant and set
      to zero.

  Returns
  -------
  (A, feature_names)
      ``A`` is the square correlation matrix with a zeroed diagonal and
      sub-threshold values set to 0; ``feature_names`` lists the words in the
      same order as the rows/columns of ``A``.
  """
  allowed = set(whitelist)

  def keep_whitelisted(doc):
    # Filter against the `whitelist` argument instead of a module-level name,
    # so the function works on a fresh kernel and with any whitelist passed in.
    return [token for token in doc if token in allowed]

  count_vectorizer = CountVectorizer(
        tokenizer=keep_whitelisted,
        preprocessor=keep_whitelisted,
        max_df=max_df, min_df=min_df)

  mat = count_vectorizer.fit_transform(corpus)
  # get_feature_names() was removed in scikit-learn 1.2; use its replacement when available.
  if hasattr(count_vectorizer, 'get_feature_names_out'):
    feature_names = count_vectorizer.get_feature_names_out().tolist()
  else:
    feature_names = np.array(count_vectorizer.get_feature_names()).tolist()

  # Rows become words, columns become documents, as np.corrcoef expects.
  mat = np.transpose(mat.toarray())
  A = np.corrcoef(mat)
  np.fill_diagonal(A, 0.0)  # no self-links
  A[A < tr] = 0
  return A, feature_names

def net_graph(A, feature_names, short_country_name, h):
  """Draw, save and show a community-coloured graph of word correlations.

  A graph is built from the matrix ``A``, communities are detected with
  ``greedy_modularity_communities`` and node positions come from a spring
  layout. Nodes and labels of each community are drawn in the colour that
  ``get_colors`` assigns to it; edges are drawn in grey. The figure is saved
  as ``<short_country_name>.svg`` and then displayed.

  Parameters
  ----------
  A : square matrix accepted by ``nx.Graph``
      Link strengths between words.
  feature_names : sequence of str
      Node labels; the item at position i labels node i.
  short_country_name : str
      Used in the plot title (capitalized) and as the output file name.
  h : number
      Used for both the width and the height of the figure.
  """
  G = nx.Graph(A)
  communities = greedy_modularity_communities(G)
  coord = nx.spring_layout(G, k = 0.15)

  plt.figure(figsize=(h,h))

  labels = dict(enumerate(feature_names))
  colors = get_colors(communities)
  for community, color in zip(communities, colors):
    nx.draw_networkx_nodes(G, coord, community, node_size = 10, node_color = color)

    members = set(community)  # built once per community, not once per label
    label_dict = {k: v for (k,v) in labels.items() if k in members}
    nx.draw_networkx_labels(G, coord, labels=label_dict, font_color=color,
                            font_size = 13,
                            font_family = 'Liberation Sans')

  nx.draw_networkx_edges(G, pos=coord, alpha=0.5, edge_color = 'grey')
  plt.title("{} discourse graph".format(short_country_name.capitalize()))
  plt.savefig("{}.svg".format(short_country_name), format="svg")
  plt.show()

In [None]:
whitelist_1 = words_for_pics(trigram[words_1])

In [None]:
A_1, fn_1 = vecs_for_pics(trigram[words_1], whitelist_1,0.95,3,0.5)
#the correlation matrix and feature names for the example country are computed
#max_df=0.95 and min_df=3 are passed to CountVectorizer: the words kept occur in at most 95% of the sentences in our corpus and in at least 3 sentences
#to be present on the graph as a link between two words, their vectors have to be strongly correlated (correlations below tr=0.5 are set to zero)

In [None]:
net_graph(A_1, #matrix
          fn_1, #feature names
          '1', #name of the group
          25 #height of the image
          )