In [None]:
#!pip install pymorphy2 ratelimit
#!pip install pymorphy2[fast]
#!pip install wordcloud
#!pip install transliterate

In [2]:
#These are very useful for data analysis!
import numpy as np
import pandas as pd

#This is for working with raw Russian texts
import pymorphy2
import nltk

#This is for talking to the web
import requests
import ratelimit
import json
from bs4 import BeautifulSoup
import unicodedata
import re

#This is for us to create a word cloud
import wordcloud
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
%matplotlib inline

#Text vectorization
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

#Text clustering
from sklearn.metrics import pairwise_distances
from sklearn.metrics import silhouette_score
from sklearn.manifold import MDS
from sklearn.cluster import MiniBatchKMeans
from sklearn.cluster import DBSCAN

#Networks
import transliterate
import networkx as nx
import gc

In [None]:
@ratelimit.sleep_and_retry
@ratelimit.limits(calls = 5, period=1)
def _riaFetch(url):
    """Download `url` and return the response body as text.

    The rate limit lives on this helper (not on the generator function) so
    that every individual HTTP request is throttled; `sleep_and_retry` makes
    the call wait for the next window instead of raising when the limit is hit.

    Raises:
        ValueError: if the status code is not 200 or the body is empty.
    """
    # A timeout keeps a stalled connection from hanging the notebook forever.
    resp = requests.get(url, timeout=30)
    if resp.status_code != 200:
        raise ValueError("API request returned bad status code " + str(resp.status_code))
    if resp.text == "":
        raise ValueError("API request returned an empty response")
    return resp.text


def riaParser(nArt = 100):
    """Lazily yield the text of up to `nArt` articles from the ria.ru news feed.

    Starting from the "more" widget page, the generator extracts article links
    from the feed HTML, downloads each article, concatenates the text of its
    `article__block` elements (blocks containing a <script> tag are skipped),
    applies NFKC Unicode normalization and yields the result. When a feed page
    is exhausted it follows the last "more" link found on that page.

    Args:
        nArt: maximum number of articles to yield.

    Yields:
        str: normalized article text, one string per article.

    Raises:
        ValueError: on a non-200 / empty response, or when more articles are
            needed but the feed page contains no link to the next page.
    """
    siteRoot = "https://ria.ru"
    procArt = 0
    reqURL = siteRoot + "/services/ria_ru/widget/more.html"
    while procArt < nArt:
        feedHtml = _riaFetch(reqURL)

        # NOTE(review): the same link may occur more than once in the feed
        # markup; duplicates are not filtered here - confirm that is intended.
        newsURLs = re.findall(r"/\d{8}/\d*?\.html", feedHtml)

        for url in newsURLs:
            # Stop before downloading an article that would not be yielded.
            if procArt >= nArt:
                return
            artHtml = _riaFetch(siteRoot + url)

            # Name the parser explicitly so the result does not depend on
            # which optional parsers happen to be installed.
            artSoup = BeautifulSoup(artHtml, "html.parser")
            artTextDivs = artSoup.find_all(attrs={"class" : "article__block"})
            artText = "".join(
                div.text + "\n"
                for div in artTextDivs
                if len(div.find_all("script")) == 0
            )

            yield unicodedata.normalize("NFKC", artText)
            procArt += 1

        if procArt >= nArt:
            return

        # The next page is only needed when more articles are still required.
        nextLinks = re.findall(r"/services/ria_ru/widget/more\.html.*?type=lenta", feedHtml)
        if not nextLinks:
            raise ValueError("API response does not contain a link to the next page")
        # The link is taken from HTML, so the ampersands arrive entity-encoded.
        reqURL = siteRoot + nextLinks[-1].replace("&amp;", "&")

In [None]:
# Smoke test: fetch two articles and print each one followed by a separator line.
for sampleArticle in riaParser(2):
    print(sampleArticle, "\n====", sep="\n")

In [None]:
# Materialize the corpus: 200 article texts pulled from the generator.
texts = list(riaParser(200))
texts[0]

In [None]:
# Keep only runs of Cyrillic/Latin letters (a plain "\w+" pattern would also keep digits and underscores).
# "ё"/"Ё" are listed explicitly: they sit outside the contiguous а-я / А-Я code point ranges,
# so without them a word containing "ё" would be split into two tokens.
tokenizer = nltk.tokenize.RegexpTokenizer("[а-яА-ЯёЁa-zA-Z]+")
tokenizedTexts = [tokenizer.tokenize(text) for text in texts]
"|".join(tokenizedTexts[0])

In [None]:
morphA = pymorphy2.MorphAnalyzer()
# Lemmatize every document: each token is replaced by the normal form of the
# first parse the analyzer returns for it.
normalizedTexts = [
    [morphA.parse(tok)[0].normal_form for tok in docTokens]
    for docTokens in tokenizedTexts
]
"|".join(normalizedTexts[0])

In [None]:
# Read the whitespace-separated stop word list; the context manager makes sure
# the file handle is closed once the content has been read.
with open("stopwords.txt", encoding="utf8") as stopwordsFile:
    stopwords = set(stopwordsFile.read().split())

# Extra stop words added on top of the file's list.
stopwords.update(("риа", "новость", "фотобанк"))

# Drop stop words from every lemmatized document.
swTexts = [
    [word for word in words if word not in stopwords]
    for words in normalizedTexts
]
"|".join(swTexts[0])

In [None]:
# Raw term counts: the filtered lemmas of each document are re-joined with
# spaces because the vectorizer expects plain strings.
dtm1fact = CountVectorizer()
dtm1 = dtm1fact.fit_transform([" ".join(text) for text in swTexts])
"%d documents, %d terms" % dtm1.shape

In [None]:
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when the installed version provides it and fall back to the old name otherwise.
dtm1Terms = getattr(dtm1fact, "get_feature_names_out", None) or dtm1fact.get_feature_names

# Total count of every term over all documents, flattened to a plain 1-D array.
wordfreq = pd.DataFrame({"count": np.asarray(dtm1.sum(axis=0)).ravel()}, index=dtm1Terms())
wordfreq.sort_values(by=['count'], ascending=False).head(10)

In [None]:
# TF-IDF weighted document-term matrix built from the same stop-word-filtered lemmas.
dtm2fact = TfidfVectorizer()
dtm2 = dtm2fact.fit_transform([" ".join(text) for text in swTexts])
"%d documents, %d terms" % dtm2.shape

In [None]:
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when the installed version provides it and fall back to the old name otherwise.
dtm2Terms = getattr(dtm2fact, "get_feature_names_out", None) or dtm2fact.get_feature_names

# Mean TF-IDF weight of every term over all documents, flattened to a plain 1-D array.
wordtfidf = pd.DataFrame({"score": np.asarray(dtm2.mean(axis=0)).ravel()}, index=dtm2Terms())
wordtfidf.sort_values(by=['score'], ascending=False).head(10)

In [None]:
# Word cloud sized by mean TF-IDF score, rendered on a 4:3 canvas.
termScores = wordtfidf["score"].to_dict()
cloud1 = wordcloud.WordCloud(width = 4*400, height = 3*400, background_color="white")
cloud1.fit_words(termScores)

plt.figure(figsize = (4*3, 3*3))
plt.axis("off")
plt.imshow(cloud1, aspect='auto', interpolation="bicubic")

In [None]:
# Pairwise cosine distances between the TF-IDF document vectors.
dist = pairwise_distances(dtm2, metric = "cosine")

# Non-metric MDS embedding of the distance matrix, used only for plotting.
# The random_state is fixed so the layout is the same on every run.
distEmbed = MDS(dissimilarity='precomputed', metric=False, random_state=42)
distCoords = distEmbed.fit_transform(dist)

plt.axis("on")
plt.scatter(distCoords[:,0], distCoords[:,1])

In [None]:
# Scan candidate cluster counts and record the silhouette score of each.
cluNums = []
cluScores = []
# The silhouette score needs fewer clusters than documents, so the scan is
# capped by the corpus size (this only matters for corpora under 100 documents).
maxClusters = min(100, dtm2.shape[0])
for nc in range(2, maxClusters):
    # Fixed seed so the curve is reproducible between runs.
    clustT = MiniBatchKMeans(n_clusters = nc, random_state = 42)
    clustersT = clustT.fit_predict(dtm2)
    scoreT = silhouette_score(dtm2, clustersT)
    cluNums.append(nc)
    cluScores.append(scoreT)
plt.plot(cluNums, cluScores)

In [None]:
# Final k-means clustering. It is fitted on the TF-IDF matrix - the same input
# the silhouette scan evaluated - rather than on the rows of the distance
# matrix, so the chosen number of clusters refers to the model actually used.
# The seed is fixed so cluster labels stay stable between runs.
clust = MiniBatchKMeans(n_clusters = 30, random_state = 42)
clusters = clust.fit_predict(dtm2)
plt.axis("on")
plt.scatter(distCoords[:,0], distCoords[:,1], c=clusters, cmap="tab20")

In [None]:
# Number of documents assigned to each k-means cluster label.
clusterN, counts = np.unique(clusters, return_counts=True)
{label: size for label, size in zip(clusterN, counts)}

In [None]:
# Show one randomly chosen document from the selected k-means cluster.
cluster = 20
clusterMembers = np.flatnonzero(clusters == cluster)
if clusterMembers.size == 0:
    # Fail with an explicit message instead of numpy's generic empty-input error.
    raise ValueError("cluster %d contains no documents; available labels: %s"
                     % (cluster, np.unique(clusters).tolist()))
# A single draw is enough - only one document is displayed.
textN = np.random.choice(clusterMembers)
texts[textN]

In [None]:
# Density-based clustering that works directly on the precomputed distance matrix.
clustNP = DBSCAN(metric = "precomputed", eps = 0.7)
clustersNP = clustNP.fit(dist).labels_
xCoords, yCoords = distCoords[:,0], distCoords[:,1]
plt.scatter(xCoords, yCoords, c=clustersNP, cmap="tab20")

In [None]:
# Number of documents per DBSCAN label.
clusterNNP, countsNP = np.unique(clustersNP, return_counts=True)
{label: size for label, size in zip(clusterNNP, countsNP)}

In [None]:
# Show one randomly chosen document from the selected DBSCAN cluster.
cluster = 10
clusterMembersNP = np.flatnonzero(clustersNP == cluster)
if clusterMembersNP.size == 0:
    # The set of DBSCAN labels depends on the data, so the requested label may
    # not exist; fail with an explicit message listing the labels that do.
    raise ValueError("cluster %d contains no documents; available labels: %s"
                     % (cluster, np.unique(clustersNP).tolist()))
# A single draw is enough - only one document is displayed.
textN = np.random.choice(clusterMembersNP)
texts[textN]

In [None]:
#Networks
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when the installed version provides it and fall back to the old name otherwise.
dtm2Terms = getattr(dtm2fact, "get_feature_names_out", None) or dtm2fact.get_feature_names

# Node labels: terms transliterated to Latin script with every non-word character stripped.
# NOTE(review): distinct terms could end up with the same (or an empty) label,
# in which case the in-place relabel below would merge their nodes - confirm.
wordNames = [transliterate.translit(str(word), language_code='ru', reversed=True) for word in dtm2Terms()]
wordNames = [re.sub(r'\W+', '', word) for word in wordNames]

# Term-by-term weight matrix: entry (i, j) is the dot product of the TF-IDF
# columns of terms i and j. The dense copy is materialized only once.
termMatrix = dtm2.toarray()
wordNetM = termMatrix.T @ termMatrix
nodeNamesMap = dict(enumerate(wordNames))

# from_numpy_matrix() was removed in networkx 3.0; from_numpy_array() exists
# in both the 2.x and 3.x series.
wordNet = nx.from_numpy_array(wordNetM)
nx.relabel_nodes(wordNet, nodeNamesMap, copy = False)

nx.write_pajek(wordNet, "wordnet.net")

# Drop the references to the large intermediates so their memory can be reclaimed.
wordNet = None
wordNetM = None
termMatrix = None
nodeNamesMap = None
gc.collect()

NameError: name 'nx' is not defined