In [1]:
import re

import nltk

In [2]:
# Download the NLTK "punkt" tokenizer models and the "stopwords" corpus
# (nltk.download reports "already up-to-date" when they are present).
# NOTE(review): the code below filters tokens with its own STOP_WORDS list;
# no use of the NLTK stopwords corpus is visible in this notebook.
nltk.download("punkt")
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /home/titart/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/titart/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# Build dataset from plain text

In [3]:
def check_str(s):
    """Return True if ``s`` is neither empty nor made up only of whitespace.

    The emptiness check compares by value. The previous ``s is not ""``
    compared object identity, which depends on interpreter string interning
    and treats an empty unicode string as non-empty on Python 2.

    :param s: string to test.
    :returns: bool.
    """
    return s != "" and not s.isspace()

In [37]:
"abc ..\n"[-4:-1]

' ..'

In [47]:
# Tokens dropped from every sentence before it is written out:
# em dash, colon, guillemets and square brackets.
STOP_WORDS = [u"—", u":", u"«", u"»", u"[", u"]"]
# Tokens counted as punctuation marks when discarding two-token
# "word + punctuation" lines.
PUNCTS = [u".", u",", u"?", u"!"]

def convert_line_to_sents(line, line_proc):
    """Turn one line of raw text into a list of normalized sentence lines.

    The line is split into sentences with NLTK, and every sentence is split
    again on ";" (empty pieces are discarded) to get more, shorter lines.
    Each piece is word-tokenized, tokens listed in STOP_WORDS are removed and
    the rest are lower-cased. A piece is kept only if it has between 2 and 19
    tokens and is not just "<word> <punctuation mark>".

    Kept pieces are joined with single spaces, terminated with a newline,
    have standalone digit runs replaced by "_num_", and are finally passed
    through ``line_proc``.

    :param line: raw text line.
    :param line_proc: callable applied to each finished line; its return
        value is what ends up in the result.
    :returns: list of processed lines (possibly empty).
    """
    fragments = [
        piece
        for sentence in nltk.tokenize.sent_tokenize(line)
        for piece in sentence.split(";")
        if piece
    ]

    converted = []
    for fragment in fragments:
        tokens = [
            token.lower()
            for token in nltk.tokenize.word_tokenize(fragment)
            if token not in STOP_WORDS
        ]
        n_tokens = len(tokens)

        # Keep only fragments with 2..19 tokens.
        if n_tokens <= 1 or n_tokens >= 20:
            continue

        # A lone word followed by a punctuation mark ("someword .") is noise.
        if n_tokens == 2 and tokens[1] in PUNCTS:
            continue

        text = re.sub(r"\b\d+\b", "_num_", " ".join(tokens) + "\n")
        converted.append(line_proc(text))

    return converted

In [48]:

def text2sents(source, target, line_proc=lambda s: s):
    """Convert a plain-text file into a file with one sentence per line.

    Every line of ``source`` is decoded from UTF-8, converted with
    ``convert_line_to_sents`` and the resulting sentence lines are written
    to ``target`` encoded as UTF-8. Lines that yield no sentences are skipped.

    Both files are managed by a single ``with`` statement, so the target is
    closed (and flushed) even if reading or conversion raises.

    NOTE: relies on Python 2 file semantics, where iterating a file opened
    with mode "r" yields byte strings that have ``.decode``.

    :param source: path of the input text file.
    :param target: path of the output file; it is overwritten.
    :param line_proc: optional callable applied to every produced sentence
        line (identity by default).
    """
    with open(target, "w") as target_file, open(source, "r") as source_file:
        for raw_line in source_file:
            sents = convert_line_to_sents(raw_line.decode("utf-8"), line_proc)
            if not sents:
                continue

            target_file.writelines([s.encode("utf-8") for s in sents])

## Convert file with text of tales to file with sentences

In [142]:
# Raw text of the tales corpus and the sentence-per-line file built from it.
SOURCE_TALES = "data/tales/tales-text-full.txt"
TARGET_TALES = "data/tales/tales-sents-full.txt"

# No line_proc is passed, so sentences are written without extra post-processing.
text2sents(SOURCE_TALES, TARGET_TALES)

## Convert file with text of news to file with sentences

In [143]:

# Raw text of the news corpus and the sentence-per-line file built from it.
# NOTE(review): the scraping cell further down writes "meduza-news.txt" into
# the working directory, not into data/news/, and it appears after this cell;
# presumably the file was moved by hand before this cell was run. Confirm.
SOURCE_NEWS = "data/news/meduza-news.txt"
TARGET_NEWS = "data/news/news-sents.txt"

text2sents(SOURCE_NEWS, TARGET_NEWS)

In [50]:

# Raw text of the Dostoevsky corpus and the sentence-per-line file built from it.
SOURCE_DOSTOEVSKY = "data/dostoevsky/dostoevsky-all.txt"
TARGET_DOSTOEVSKY = "data/dostoevsky/dostoevsky-sents.txt"

def dots_processing(line):
    """Remove ellipsis leftovers from a sentence line (Dostoevsky texts).

    Two clean-ups are applied:

    * every single-character ellipsis is deleted;
    * a trailing " .." right before the final newline is cut off
      (presumably what remains of a "..." after tokenization).

    The trailing case is detected with ``endswith``. The previous ``in`` test
    matched " ..\\n" anywhere in the string yet always chopped the last four
    characters, which would corrupt a line whose match is not at the end.

    :param line: sentence line, expected to end with a newline.
    :returns: the cleaned line.
    """
    line = line.replace(u"…", "")
    if line.endswith(u" ..\n"):
        # Drop the four trailing characters " ..\n" and restore the newline.
        line = line[:-4] + "\n"

    return line


# Build the Dostoevsky sentence file; dots_processing is applied to every sentence line.
text2sents(SOURCE_DOSTOEVSKY, TARGET_DOSTOEVSKY, line_proc=dots_processing)

In [55]:

# Raw text of the Marinina corpus and the sentence-per-line file built from it.
SOURCE_MARININA = "data/marinina/marinina-all.txt"
TARGET_MARININA = "data/marinina/marinina-sents.txt"

def hyphen_processor(s):
    """Strip en-dash markers from a sentence line, then clean up ellipses.

    Every occurrence of an en dash followed by a space is removed, and the
    result is handed to ``dots_processing``.

    :param s: sentence line.
    :returns: the line returned by ``dots_processing``.
    """
    without_dashes = s.replace(u"– ", "") if u"–" in s else s
    return dots_processing(without_dashes)

# Build the Marinina sentence file; hyphen_processor is applied to every sentence line.
text2sents(SOURCE_MARININA, TARGET_MARININA, hyphen_processor)

# Parse news corpus from <a href="http://meduza.io">Meduza</a> API

In [83]:
import requests
import json
from bs4 import BeautifulSoup

In [88]:
API_PATH = "https://meduza.io/api/v3"

def get_collection(topic="news", locale="ru", page=0, page_size=100):
    """Fetch one page of search results from the API.

    Sends a GET request to ``API_PATH + "/search"`` and returns the
    "collection" field of the JSON response.

    :param topic: value sent as the "chrono" query parameter.
    :param locale: value sent as the "locale" query parameter.
    :param page: value sent as the "page" query parameter.
    :param page_size: value sent as the "per_page" query parameter.
    :returns: the decoded "collection" value of the response.
    """
    query = {
        "chrono": topic,
        "locale": locale,
        "page": page,
        "per_page": page_size,
    }
    response = requests.get(API_PATH + "/search", params=query)
    payload = json.loads(response.content)
    return payload["collection"]

def get_content(post_path):
    """Download one post and return its body as plain text.

    The post is requested from ``API_PATH + "/" + post_path``; the HTML found
    under ``root.content.body`` of the JSON response is converted to text
    and normalized so that every non-empty chunk sits on its own line.

    :param post_path: path of the post relative to ``API_PATH``.
    :returns: the post text, chunks joined with "\\n" and without a trailing
        newline.
    :raises: whatever ``requests``/``json`` raise, or ``KeyError`` when the
        response lacks the expected fields.
    """
    req = requests.get(API_PATH + "/" + post_path)
    body = json.loads(req.content)["root"]["content"]["body"]
    # Name the parser explicitly: otherwise BeautifulSoup picks whichever
    # parser is installed (and warns), so results could vary between machines.
    soup = BeautifulSoup(body, "html.parser")
    text = soup.get_text()

    # break into lines and remove leading and trailing space on each
    lines = (line.strip() for line in text.splitlines())
    # break multi-headlines into a line each
    chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
    # drop blank lines
    text = '\n'.join(chunk for chunk in chunks if chunk)
    return text

In [95]:
# NOTE(review): the file is written to the working directory, while the
# conversion cell reads "data/news/meduza-news.txt"; presumably it has to be
# moved there by hand. Confirm.
NEWS_DATA_FILE = "meduza-news.txt"
PAGES = 40

# WARNING: it takes a lot of time!!!

# Number of posts successfully downloaded and written so far.
it = 0
with open(NEWS_DATA_FILE, "w") as f:
    for page in range(PAGES):
        collection = get_collection(page=page)
        for path in collection:
            try:
                # get_content returns text without a trailing newline; add one
                # so the last line of this post is not fused with the first
                # line of the next post.
                f.write((get_content(path) + u"\n").encode("utf-8"))
            except Exception:
                # Best effort: skip posts that cannot be fetched or parsed.
                # Unlike a bare "except:", this lets KeyboardInterrupt through,
                # so the long-running loop can still be stopped.
                continue
            it += 1
            if it % 5 == 0:
                print("%d lines have been read" % it)

5 lines have been read
10 lines have been read
15 lines have been read
20 lines have been read
25 lines have been read
30 lines have been read
35 lines have been read
40 lines have been read
45 lines have been read
50 lines have been read
55 lines have been read
60 lines have been read
65 lines have been read
70 lines have been read
75 lines have been read
80 lines have been read
85 lines have been read
90 lines have been read
95 lines have been read
100 lines have been read
105 lines have been read
110 lines have been read
115 lines have been read
120 lines have been read
125 lines have been read
130 lines have been read
135 lines have been read
140 lines have been read
145 lines have been read
150 lines have been read
155 lines have been read
160 lines have been read
165 lines have been read
170 lines have been read
175 lines have been read
180 lines have been read
185 lines have been read
190 lines have been read
195 lines have been read
200 lines have been read
205 lines have been 

1625 lines have been read
1630 lines have been read
1635 lines have been read
1640 lines have been read
1645 lines have been read
1650 lines have been read
1655 lines have been read
1660 lines have been read
1665 lines have been read
1670 lines have been read
1675 lines have been read
1680 lines have been read
1685 lines have been read
1690 lines have been read
1695 lines have been read
1700 lines have been read
1705 lines have been read
1710 lines have been read
1715 lines have been read
1720 lines have been read
1725 lines have been read
1730 lines have been read
1735 lines have been read
1740 lines have been read
1745 lines have been read
1750 lines have been read
1755 lines have been read
1760 lines have been read
1765 lines have been read
1770 lines have been read
1775 lines have been read
1780 lines have been read
1785 lines have been read
1790 lines have been read
1795 lines have been read
1800 lines have been read
1805 lines have been read
1810 lines have been read
1815 lines h

3205 lines have been read
3210 lines have been read
3215 lines have been read
3220 lines have been read
3225 lines have been read
3230 lines have been read
3235 lines have been read
3240 lines have been read
3245 lines have been read
3250 lines have been read
3255 lines have been read
3260 lines have been read
3265 lines have been read
3270 lines have been read
3275 lines have been read
3280 lines have been read
3285 lines have been read
3290 lines have been read
3295 lines have been read
3300 lines have been read
3305 lines have been read
3310 lines have been read
3315 lines have been read
3320 lines have been read
3325 lines have been read
3330 lines have been read
3335 lines have been read
3340 lines have been read
3345 lines have been read
3350 lines have been read
3355 lines have been read
3360 lines have been read
3365 lines have been read
3370 lines have been read
3375 lines have been read
3380 lines have been read
3385 lines have been read
3390 lines have been read
3395 lines h

# Split dataset files into train, dev and test subsets

In [57]:
def write_part(source, target, count):
    """Copy up to ``count`` lines from ``source`` to ``target``.

    Lines are read with ``readline`` so that consecutive calls on the same
    ``source`` continue where the previous call stopped. Copying ends early
    when ``source`` is exhausted.

    :param source: readable file-like object.
    :param target: writable file-like object.
    :param count: maximum number of lines to copy.
    """
    copied = 0
    while copied < count:
        line = source.readline()
        if not line:
            # End of input: nothing more to copy.
            break
        target.write(line)
        copied += 1

def count_lines(path):
    """Return the number of lines in the text file at ``path``.

    :param path: path of the file to read.
    :returns: line count as an int (0 for an empty file).
    """
    total = 0
    with open(path, "r") as handle:
        for _line in handle:
            total += 1
    return total

def create_dataset(source, target_path, postfix, dev_rat=0.14, test_rat=0.28):
    """Split a sentence file into consecutive train, dev and test files.

    The lines of ``source`` are counted first. The dev and test parts get
    ``int(count * dev_rat)`` and ``int(count * test_rat)`` lines; the train
    part gets the rest. Lines are copied in file order, without shuffling:
    train first, then dev, then test.

    Output files are ``<target_path>/sentiment.train.<postfix>``,
    ``<target_path>/sentiment.dev.<postfix>`` and
    ``<target_path>/sentiment.test.<postfix>``; existing files are
    overwritten.

    :param source: path of the sentence-per-line input file.
    :param target_path: directory that receives the three files.
    :param postfix: suffix appended to each output file name.
    :param dev_rat: fraction of lines for the dev file.
    :param test_rat: fraction of lines for the test file.
    """
    total = count_lines(source)
    dev_count = int(total * dev_rat)
    test_count = int(total * test_rat)
    train_count = total - dev_count - test_count

    # (file name template, number of lines) in the order the parts are cut.
    plan = (
        ("%s/sentiment.train.%s", train_count),
        ("%s/sentiment.dev.%s", dev_count),
        ("%s/sentiment.test.%s", test_count),
    )

    with open(source, "r") as source_file:
        for template, n_lines in plan:
            with open(template % (target_path, postfix), "w") as part_file:
                write_part(source_file, part_file, n_lines)
        
        

In [59]:
# news -> tales pair: news sentences are written with postfix "0",
# tales sentences with postfix "1".
# create_dataset only opens files inside this directory, so it must already exist.
FINAL_DATASET_PATH = "data/news2tales/"

create_dataset("data/news/news-sents.txt", FINAL_DATASET_PATH, "0")
create_dataset("data/tales/tales-sents-full.txt", FINAL_DATASET_PATH, "1")

In [61]:
# news -> Dostoevsky pair: Dostoevsky sentences are written with postfix "1",
# news sentences with postfix "0".
# NOTE: FINAL_DATASET_PATH is rebound in every dataset cell, so its value
# depends on which of these cells ran last.
FINAL_DATASET_PATH = "data/news2dostoevsky/"

create_dataset("data/dostoevsky/dostoevsky-sents.txt", FINAL_DATASET_PATH, "1")
create_dataset("data/news/news-sents.txt", FINAL_DATASET_PATH, "0")

In [62]:
# Marinina -> Dostoevsky pair: Marinina sentences are written with postfix "0",
# Dostoevsky sentences with postfix "1".
FINAL_DATASET_PATH = "data/marinina2dostoevsky/"

create_dataset("data/marinina/marinina-sents.txt", FINAL_DATASET_PATH, "0")
create_dataset("data/dostoevsky/dostoevsky-sents.txt", FINAL_DATASET_PATH, "1")

In [63]:
# Marinina -> tales pair: Marinina sentences are written with postfix "0",
# tales sentences with postfix "1".
FINAL_DATASET_PATH = "data/marinina2tales/"

create_dataset("data/marinina/marinina-sents.txt", FINAL_DATASET_PATH, "0")
create_dataset("data/tales/tales-sents-full.txt", FINAL_DATASET_PATH, "1")