# Imports

In [None]:
!pip install gensim==3.4

In [None]:
!pip install python-docx

In [None]:
!pip install natasha

In [None]:
!pip install seaborn

In [None]:
!pip install transformers

In [None]:
!pip install spacy
!python -m spacy download ru_core_news_sm

In [None]:
import urllib
import requests
import json

In [None]:
import nltk.data 
import nltk

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import AgglomerativeClustering


from scipy.special import logsumexp
import gensim.models.phrases
from gensim.models.phrases import Phrases, Phraser
from docx import Document
import pandas as pd
import os
from tqdm import tqdm
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from spacy.lang.ru.stop_words import STOP_WORDS
from collections import defaultdict
import torch
from wordcloud import (WordCloud, get_single_color_func)



In [None]:
from natasha import (
    Segmenter,
    MorphVocab,
    
    NewsEmbedding,
    NewsMorphTagger,
    NewsSyntaxParser,
    NewsNERTagger,
    
    PER,
    NamesExtractor,

    Doc
)


# Shared Natasha pipeline objects, built once and reused by the helper functions below.
# segmenter is passed to Doc.segment(); morph_vocab to token.lemmatize() / span.normalize().
segmenter = Segmenter()
morph_vocab = MorphVocab()

# A single embedding instance is handed to the morphology tagger, syntax parser and NER tagger.
# NOTE(review): syntax_parser is not referenced by the cells visible here.
emb = NewsEmbedding()
morph_tagger = NewsMorphTagger(emb)
syntax_parser = NewsSyntaxParser(emb)
ner_tagger = NewsNERTagger(emb)

# NOTE(review): names_extractor is not referenced by the cells visible here.
names_extractor = NamesExtractor(morph_vocab)

In [None]:
# TF-IDF vectorizer with default settings.
# NOTE(review): not referenced by the cells visible here - confirm it is still needed.
vectorizer = TfidfVectorizer()

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Hugging Face checkpoint id shared by the tokenizer and the sequence classifier.
SENTIMENT_CHECKPOINT = "sismetanin/rubert-ru-sentiment-rusentiment"

tokenizer = AutoTokenizer.from_pretrained(SENTIMENT_CHECKPOINT)
model = AutoModelForSequenceClassification.from_pretrained(SENTIMENT_CHECKPOINT)


@torch.no_grad()
def predict(text):
    """Return the most probable sentiment class index for ``text``.

    The input is encoded with the notebook-level ``tokenizer`` (padded,
    truncated to at most 512 tokens, returned as PyTorch tensors) and passed
    to the notebook-level ``model`` with gradient tracking disabled. The
    logits are soft-maxed along dimension 1 and the arg-max of each row is
    taken.

    Returns:
        NumPy array of class indices, one per encoded sequence. Other cells of
        this notebook treat 2 as positive and 0 as negative.
    """
    encoded = tokenizer(text, max_length=512, padding=True, truncation=True, return_tensors='pt')
    logits = model(**encoded).logits
    probabilities = torch.nn.functional.softmax(logits, dim=1)
    return probabilities.argmax(dim=1).numpy()

# Functions

In [None]:
def getText(filename):
    """Read a .docx file and return all of its paragraphs as a single string.

    Paragraph texts are taken in document order and joined with a newline
    followed by one space.
    """
    paragraphs = Document(filename).paragraphs
    return '\n '.join(paragraph.text for paragraph in paragraphs)

In [None]:
def normalize_tokens(df):
    """Lemmatize every text in ``df['Text']``.

    Each text is wrapped in a Natasha ``Doc``, segmented and morphologically
    tagged with the notebook-level ``segmenter`` / ``morph_tagger``; every
    token is then lemmatized with ``morph_vocab``.

    Args:
        df: mapping/DataFrame whose ``'Text'`` entry is an iterable of strings.

    Returns:
        A list with one entry per text; each entry is the list of token
        lemmas of that text, in token order.
    """
    normalized = []
    for text in tqdm(df['Text']):
        doc = Doc(text)
        doc.segment(segmenter)
        doc.tag_morph(morph_tagger)
        lemmas = []
        for token in doc.tokens:
            token.lemmatize(morph_vocab)
            # Read the lemma by name instead of relying on it being the last
            # field of the token record.
            lemmas.append(token.lemma)
        normalized.append(lemmas)
    return normalized

In [None]:
def get_tags(df):
    """Extract named entities from ``df['Text']`` and count their occurrences.

    Every text is segmented, morphologically tagged, lemmatized and NER-tagged
    with the notebook-level Natasha objects; each recognised span is
    normalized with ``morph_vocab`` and counted under its normalized form.

    All spans of a text are counted. (Previously only the first span of each
    text was kept, so any further entities in the same text were silently
    dropped from the frequencies.)

    Args:
        df: mapping/DataFrame whose ``'Text'`` entry is an iterable of strings.

    Returns:
        (tcs, tags_count)
        tcs: list of ``(named entity, frequency)`` tuples sorted by frequency,
            most frequent first.
        tags_count: dict mapping each named entity to its frequency.
    """
    tags_count = {}
    for text in tqdm(df['Text']):
        doc = Doc(text)
        doc.segment(segmenter)
        doc.tag_morph(morph_tagger)
        for token in doc.tokens:
            token.lemmatize(morph_vocab)
        doc.tag_ner(ner_tagger)
        for span in doc.spans:
            span.normalize(morph_vocab)
            # Count by the normalized form so inflected variants share a key.
            tags_count[span.normal] = tags_count.get(span.normal, 0) + 1

    tcs = sorted(tags_count.items(), key=lambda item: item[1], reverse=True)
    return tcs, tags_count

In [None]:
def filterSent(normalized_tokens):
    """Remove stop-words and digits from tokenized documents.

    Tokens found in ``STOP_WORDS`` are dropped. Digit characters are removed
    from the remaining tokens; a token that consists only of digits would
    become an empty string and is therefore dropped too, so no empty "words"
    reach the bigram model or the frequency counts.

    Args:
        normalized_tokens: iterable of token lists (one list per document).

    Returns:
        (filtered_sentences, stroka)
        filtered_sentences: list of cleaned token lists, one per input
            document (a document may end up with an empty list).
        stroka: all kept tokens of all documents as one string, each token
            followed by a single space.
    """
    filtered_sentences = []
    all_kept = []
    for doc in tqdm(normalized_tokens):
        kept = []
        for token in doc:
            if token in STOP_WORDS:
                continue
            cleaned = ''.join(ch for ch in token if not ch.isdigit())
            if not cleaned:
                # Purely numeric token: nothing left after stripping digits.
                continue
            kept.append(cleaned)
        filtered_sentences.append(kept)
        all_kept.extend(kept)
    # Built in one pass instead of repeated string concatenation.
    stroka = ''.join(token + ' ' for token in all_kept)
    return filtered_sentences, stroka

In [None]:
def bigram(filtered_sentences):
    """Merge frequent two-word collocations and count the resulting tokens.

    A gensim ``Phrases`` model is fitted on ``filtered_sentences`` with
    ``min_count=4`` and frozen into a ``Phraser``; the phraser is then applied
    to the same sentences.

    Returns:
        (freq_sorted, bigram_sentences, stroka)
        freq_sorted: list of ``(token, frequency)`` tuples over the
            transformed sentences, most frequent first.
        bigram_sentences: the phraser applied to ``filtered_sentences``.
        stroka: every transformed token followed by one space, concatenated.
    """
    phraser = Phraser(Phrases(filtered_sentences, min_count=4, progress_per=10000))

    word_freq = defaultdict(int)
    pieces = []
    for sentence in phraser[filtered_sentences]:
        for token in sentence:
            word_freq[token] += 1
            pieces.append(token + ' ')
    stroka = ''.join(pieces)

    freq_sorted = sorted(word_freq.items(), key=lambda pair: pair[1], reverse=True)

    # The phraser is applied a second time for the return value rather than
    # reusing the object iterated above - presumably because the transformed
    # corpus may be single-pass; TODO confirm before "simplifying" this.
    return freq_sorted, phraser[filtered_sentences], stroka

In [None]:
def top_sent_word(df_pred):
    """Rank the words that accompany the tracked names in positive/negative rows.

    Reads the notebook-level ``names`` collection (it is not a parameter).
    For every name, each row of ``df_pred`` whose ``'Text'`` token list
    contains that name is selected when its ``'Sentiment label'`` is 2
    (counted as positive) or 0 (counted as negative); rows with any other
    label are ignored. A row that contains several names is selected once per
    name, so its words are counted once per name as well.

    Only the ``'Text'`` and ``'Sentiment label'`` columns are needed. The
    previous version also read a ``'Sentence'`` column whose value was never
    used in the result and raised ``KeyError`` when that column was absent.

    Args:
        df_pred: table with ``'Text'`` (list of tokens per row) and
            ``'Sentiment label'`` columns of equal length.

    Returns:
        (pos, neg, count, neg_word, pos_word)
        pos: ``(word, frequency)`` tuples for positive rows, most frequent first.
        neg: ``(word, frequency)`` tuples for negative rows, most frequent first.
        count: dict mapping each matched name to
            ``[positive row count, negative row count]``.
        neg_word: unsorted dict word -> frequency in negative rows.
        pos_word: unsorted dict word -> frequency in positive rows.
    """
    # Iterate by position so the result does not depend on the frame's index labels.
    rows = list(zip(df_pred['Text'], df_pred['Sentiment label']))

    matched = []  # (label, tokens) for every (name, row) hit, in discovery order
    count = {}
    for name in names:
        pos_hits = 0
        neg_hits = 0
        for tokens, label in rows:
            if name not in tokens:
                continue
            if label == 2:
                pos_hits += 1
                matched.append((2, tokens))
            elif label == 0:
                neg_hits += 1
                matched.append((0, tokens))
        if pos_hits != 0 or neg_hits != 0:
            count[name] = [pos_hits, neg_hits]

    neg_word = {}
    pos_word = {}
    for label, tokens in matched:
        target = pos_word if label == 2 else neg_word
        for word in tokens:
            # The tracked names themselves are not ranked.
            if word not in names:
                target[word] = target.get(word, 0) + 1

    pos = sorted(pos_word.items(), key=lambda item: item[1], reverse=True)
    neg = sorted(neg_word.items(), key=lambda item: item[1], reverse=True)

    return pos, neg, count, neg_word, pos_word

In [None]:
def bar_h(bigram_sentences):
    """Draw a horizontal bar chart of the most frequent words.

    Args:
        bigram_sentences: indexable whose first element is a sequence of
            ``(label, value)`` pairs; only the first 30 pairs are plotted.
            (The notebook passes the tuple returned by ``bigram``.)
    """
    top_pairs = bigram_sentences[0][:30]
    labels = [pair[0] for pair in top_pairs]
    counts = [pair[1] for pair in top_pairs]

    plt.figure(figsize=(15, 10))
    plt.barh(range(len(counts)), counts, tick_label=labels)
    plt.yticks(fontsize = 16)
    plt.xticks(fontsize = 14)
    plt.tight_layout()
    plt.show()

In [None]:
def colors(pos,neg):
    """Build the colour configuration for word clouds.

    Returns:
        (color_to_words, default_color): a dict mapping the green hex colour
        to ``pos`` and the red hex colour to ``neg``, plus the fallback
        colour name for words listed under neither.
    """
    green = '#90EE90'
    red = '#ff5349'
    return {green: pos, red: neg}, 'grey'

In [None]:
def get_df(number, folder):
    """Download one interview from a public Yandex.Disk folder and return the
    expert's sentences as a DataFrame.

    The file ``'Инт <number>.docx'`` is resolved through the Yandex.Disk
    public-resources download API, saved locally as ``file_<number>.txt`` in
    the current working directory, read back with ``getText`` and split into
    sentences with the notebook-level ``segmenter``.

    Sentence selection:
      * a sentence containing ``'Эксперт'`` starts (or continues) an expert
        answer and increments the question counter;
      * while inside an answer, every sentence is kept until one containing
        ``'Интервьюер'`` is met, which ends the answer and is itself skipped;
      * a leading ``'Эксперт'`` speaker label is removed from kept sentences.

    Args:
        number: interview number used in the remote and local file names.
        folder: public link (public key) of the Yandex.Disk folder.

    Returns:
        DataFrame with columns ``'Номер интервью'``, ``'Номер вопроса'`` and
        ``'Предложение'``, one row per kept sentence (empty if none).

    Raises:
        requests.HTTPError: if either HTTP request returns an error status.
    """
    import urllib.parse  # plain `import urllib` does not guarantee the submodule is loaded

    file_name = 'Инт {}.docx'.format(number)
    api_url = ('https://cloud-api.yandex.net/v1/disk/public/resources/download'
               + '?public_key=' + urllib.parse.quote(folder)
               + '&path=/' + urllib.parse.quote(file_name))

    # First request resolves the one-off download link, second fetches the file.
    link_response = requests.get(api_url, timeout=60)
    link_response.raise_for_status()
    download_response = requests.get(link_response.json()['href'], timeout=60)
    download_response.raise_for_status()

    # Write and read through the same path (previously the file was written
    # relative to the working directory but read from a hard-coded '/content/').
    local_path = 'file_{}.txt'.format(number)
    with open(local_path, 'wb') as f:
        f.write(download_response.content)

    doc = Doc(getText(local_path))
    doc.segment(segmenter)

    speaker = 'Эксперт'
    records = []
    counter = 0
    in_answer = False
    for sent in doc.sents:
        sentence = sent.text
        if speaker in sentence:
            counter += 1
            in_answer = True
        if not in_answer:
            continue
        if 'Интервьюер' in sentence:
            in_answer = False
            continue
        # Remove the speaker label as a prefix. str.lstrip('Эксперт') strips any
        # leading characters from that set and mangled e.g. 'Это ...' into 'о ...'.
        if sentence.startswith(speaker):
            sentence = sentence[len(speaker):]
        records.append({
            'Номер интервью': number,
            "Номер вопроса": counter,
            "Предложение": sentence})

    # Build the frame once instead of appending row by row.
    return pd.DataFrame(records)

# Analysis example for one country

In [None]:
# Retrieve the data from the raw .docx interview files (interviews 1..20).
# TODO: set the public Yandex.Disk link of the folder that holds "Инт <n>.docx".
FOLDER_URL = ''

frames = []
for i in tqdm(range(1, 21)):
    frames.append(get_df(str(i), FOLDER_URL))
# Concatenate once instead of growing the frame inside the loop.
df_1 = pd.concat(frames, ignore_index=True)
# get_df stores the sentences under 'Предложение', while the cells below read 'Text'.
df_1 = df_1.rename(columns={'Предложение': 'Text'})

100%|██████████| 20/20 [01:03<00:00,  3.17s/it]


In [None]:
# t_1: (entity frequencies sorted descending, entity -> frequency dict) as returned by get_tags.
t_1 = get_tags(df_1)
# n_1: one list of lemmas per row of df_1['Text'].
n_1 = normalize_tokens(df_1)

In [None]:
# First element of filterSent's result: per-row token lists without stop-words and digits.
filt_1 = filterSent(n_1)[0]

In [None]:
# b_1: (token frequencies sorted descending, bigram-merged sentences, joined text) from bigram.
b_1 = bigram(filt_1)
# Same string filterSent returns as its second element (every kept token followed by
# a space), derived from filt_1 instead of re-running the filter over the whole corpus.
text_1 = ''.join(token + ' ' for tokens in filt_1 for token in tokens)

In [None]:
# Classify the sentences one at a time; predict() yields a one-element array per call.
predicted = [predict(sentence)[0] for sentence in tqdm(df_1['Text'])]

# 'Text' holds the filtered token lists, 'Текст' the original sentence text.
df_pred = pd.DataFrame()
df_pred['Text'] = filt_1
df_pred['Sentiment label'] = predicted
df_pred['Текст'] = df_1['Text']
print(df_pred['Sentiment label'].value_counts())

In [None]:
# Word cloud over the tokens of the rows labelled 2 (treated as positive).
positive_rows = df_pred[df_pred['Sentiment label']==2]['Text']
# Each token followed by a space, built in one pass instead of repeated concatenation.
pos_1_text = ''.join(token + ' ' for tokens in positive_rows for token in tokens)

wordcloud_pos_1 = WordCloud(stopwords=STOP_WORDS,
                      font_path = '/content/Lato-Black.ttf',
                      background_color="white",
                      max_words=30,
                      max_font_size=30,
                      width = 400,
                      height = 200,
                      colormap = 'Greens'
                     ).generate(pos_1_text)
plt.imshow(wordcloud_pos_1, interpolation='bilinear')
plt.axis("off")
plt.show()
# The SVG contains Cyrillic words: write it as UTF-8 rather than the platform default encoding.
with open("1_pos.svg", "w", encoding="utf-8") as fp:
    fp.write(wordcloud_pos_1.to_svg())

In [None]:
# Word cloud over the tokens of the rows labelled 0 (treated as negative).
negative_rows = df_pred[df_pred['Sentiment label']==0]['Text']
# Each token followed by a space, built in one pass instead of repeated concatenation.
neg_1_text = ''.join(token + ' ' for tokens in negative_rows for token in tokens)
# NOTE: this mutates the shared STOP_WORDS set, so it also affects any cell run afterwards.
STOP_WORDS.update(['т'])
wordcloud_neg_1 = WordCloud(stopwords=STOP_WORDS,
                      font_path = '/content/Lato-Black.ttf',
                      background_color="white",
                      max_words=30,
                      max_font_size=30,
                      width = 400,
                      height = 200,
                      colormap = 'Reds'
                     ).generate(neg_1_text)
plt.imshow(wordcloud_neg_1, interpolation='bilinear')
plt.axis("off")
plt.show()
# The SVG contains Cyrillic words: write it as UTF-8 rather than the platform default encoding.
with open("1_neg.svg", "w", encoding="utf-8") as fp:
    fp.write(wordcloud_neg_1.to_svg())

In [None]:
# Plot the 30 most frequent tokens of the bigram result computed above
# (the previous argument, b_kz, is not defined anywhere in this notebook).
bar_h(b_1)