### russpelling module
Before 1917, all Russian texts were written in the pre-reform orthography, which can affect further processing, e.g. lemmatization and annotation.
Below is code that rewrites texts from the pre-reform orthography to the modern one using the Python module russpelling.

Module documentation:
https://github.com/ingoboerner/russpelling  
Developers: Ingo Boerner and David J. Birnbaum

In [1]:
import os
import re
from russpelling import *
import pandas as pd

In [2]:
# Smoke test: normalize one short pre-reform phrase and show the result
test = 'изъ Франціи и Италіи'
normalized_test = normalize(test)
print(normalized_test)

из Франции и Италии


#### Table data

In [5]:
# Toy table: two pre-reform sentences, each keyed by a numeric id
sample_texts = [
    'его попеченіями, юные Художники пріобрѣтаютъ извѣстность',
    'въ домѣ Марса на Невскомъ проспектѣ',
]
df = pd.DataFrame(data={'id': [1, 2], 'text': sample_texts})
df

Unnamed: 0,id,text
0,1,"его попеченіями, юные Художники пріобрѣтаютъ и..."
1,2,въ домѣ Марса на Невскомъ проспектѣ


In [9]:
# Run normalize() on every value of `text` and keep the result in a new `norm` column
normalized_text = df['text'].apply(normalize)
df['norm'] = normalized_text
df

Unnamed: 0,id,text,norm
0,1,"его попеченіями, юные Художники пріобрѣтаютъ и...","его попечениями, юные Художники приобретают из..."
1,2,въ домѣ Марса на Невскомъ проспектѣ,в доме Марса на Невском проспекте


In [3]:
# Load the two input tables; the paths are relative to the current working directory
est = pd.read_csv("../../data/MFW_months_8k_est_goub_ved_decades.csv")
liv = pd.read_csv("../../data/MFW_months_8k_liv_goub_ved_decades.csv")

In [None]:
# Preview the first rows of `est`
est.head()

In [16]:
# Report the number of rows in each table, `est` first and `liv` second
for frame in (est, liv):
    print(len(frame))

8123716
6823478


In [7]:
# Rewrite every `word` value that ends in 'iя' before it is passed to normalize().
# NOTE(review): the capture group is never referenced in the replacement, so the whole
# matched text (stem included) becomes the literal 'iя * ' — confirm this is intended.
est['word'] = est['word'].str.replace(r'(.*)iя$', 'iя * ', regex=True)

In [8]:
# Apply normalize() to each `word` value and store the result in a new `norm` column
est['norm'] = est['word'].apply(normalize)

FileNotFoundError: [Errno 2] No such file or directory: 'adj-with-ija.txt'

In [20]:
# Preview the first rows of `est` to inspect the `norm` column
est.head()

Unnamed: 0.1,Unnamed: 0,word,month_id,n,norm
0,1,въ,ekmteataja188908,5828,в
1,2,и,ekmteataja188908,4923,и
2,3,отд,ekmteataja190611,4097,отд
3,4,въ,ekmteataja189307,3474,в
4,5,въ,ekmteataja189406,3453,в


In [24]:
# Save `est` to a new CSV file (the "_NF" suffix keeps the input file untouched).
# NOTE(review): to_csv() also writes the row index as an extra unnamed first column by
# default; pass index=False if downstream readers should not get it — confirm with them.
est.to_csv("../../data/MFW_months_8k_est_goub_ved_decades_NF.csv")

The same steps for `liv`:

In [33]:
# Drop a word-final 'ыя', 'ия' or 'ые' from `word`, then remove any trailing whitespace
liv['word'] = (
    liv['word']
    .str.replace(r'ыя$|ия$|ые$', '', regex=True)
    .str.replace(r'\s+$', '', regex=True)
)

In [34]:
# Apply normalize() to each `word` value, store the result in `norm`, then preview the table
liv['norm'] = liv['word'].apply(normalize)
liv.head()

FileNotFoundError: [Errno 2] No such file or directory: 'adj-with-ija.txt'

In [None]:
# Save `liv` to a new CSV file (the "_NF" suffix keeps the input file untouched).
# NOTE(review): to_csv() also writes the row index as an extra unnamed first column by
# default; pass index=False if downstream readers should not get it — confirm with them.
liv.to_csv("../../data/MFW_months_8k_liv_goub_ved_decades_NF.csv")

In [18]:
# Load a third table into `rev`; the path is relative to the current working directory
rev = pd.read_csv("../../data/test.csv")

In [19]:
# Preview the first rows of `rev`
rev.head()

Unnamed: 0.1,Unnamed: 0,X,word,decade.x,n
0,1,1,въ,1900,400573
1,2,2,и,1900,323184
2,3,3,въ,1890,315570
3,4,4,и,1890,245926
4,5,5,въ,1880,200698


In [20]:
# Apply normalize() to each `word` value and store the result in a new `norm` column
rev['norm'] = rev['word'].apply(normalize)

FileNotFoundError: [Errno 2] No such file or directory: 'adj-with-ija.txt'

#### File normalization

In [12]:
# Test one file
# NB: change filename!
# Print a file line by line in the modern orthography.
# The encoding is given explicitly so the result does not depend on the platform's
# locale default (assumes the text files are stored as UTF-8).
with open('test.txt', encoding='utf-8') as text:
    for line in text:
        print(normalize(line)) # no ъ, i or ѣ should be found

Общество для поощрения отечественных художников быстро достигает своей благородной и патриотической цели. Юные Художники в недрах сего Общества, как на отеческом лоне, находят покровительство, мудрые и беспристрастные советы, ученую и благонамеренную критику своих произведений: опытные Художники находят в оных просвещенных ценителей своих дарований. Не распространяясь о пользе, принесенной сим Обществом Изящным Художествам, и о пособиях, доставленных многим Артистам, довольно упомянуть, что некоторые молодые люди с отличными дарованиями путешествуют на счет Общества для усовершенствования своих способностей, другие воспитываются или учатся разным Художествам.


In [14]:
# Convert one text file to the modern orthography and save the result next to it.
fh = 'test.txt' # change !
# Build the output name from the stem: with str.replace() a name without '.txt'
# would come back unchanged and the 'w' open below would truncate the input file.
new_name = os.path.splitext(fh)[0] + '_new-orf.txt' # new file name
# Explicit encoding so reading/writing does not depend on the platform's locale
# default (assumes the text files are stored as UTF-8).
with open(fh, 'r', encoding='utf-8') as text:
    with open(new_name, 'w', encoding='utf-8') as new_file:
        for line in text: # read the existing file line by line
            # normalize() converts the line to the modern orthography.
            # Terminate with '\n', not os.linesep: a text-mode file already
            # translates '\n' to the platform line ending when writing.
            new_file.write(normalize(line) + '\n')
# Report once per file, after everything has been written.
print('Text in the modern orthography is written in', new_name)

Text in the modern orthography is written in test_new-orf.txt


#### Folder rewriting

In [None]:
# change directory to the folder with texts
folder = 'test_folder/'
os.chdir(folder)

In [None]:
# Dry run: print the names in the current directory that pass the '.txt' filter.
# Adjust the condition here first if some texts are not listed.
txt_names = [name for name in os.listdir() if name.endswith('.txt')]
for name in txt_names:
    print(name)

In [None]:
# Print every entry of the current directory, to compare against the
# '.txt'-filtered listing
print(os.listdir())

In [None]:
# Convert every .txt file in the current directory to the modern orthography,
# writing each result next to its source as <name>_new-orf.txt.
out_suffix = '_new-orf.txt'
for file in os.listdir(): # the listing is taken once, before any output is created
    # Skip non-.txt entries and outputs of an earlier run (they also end in '.txt'
    # and would otherwise be converted again into *_new-orf_new-orf.txt).
    if not file.endswith('.txt') or file.endswith(out_suffix):
        continue
    # Replace only the trailing extension; str.replace() would rewrite every
    # '.txt' occurring inside the name.
    result_name = file[:-len('.txt')] + out_suffix
    # Explicit encoding so reading/writing does not depend on the platform's
    # locale default (assumes the text files are stored as UTF-8).
    with open(file, 'r', encoding='utf-8') as text: # read the existing file
        with open(result_name, 'w', encoding='utf-8') as new_file: # create/overwrite the output
            for line in text: # input to normalize() is a string
                # Terminate with '\n', not os.linesep: a text-mode file already
                # translates '\n' to the platform line ending when writing.
                new_file.write(normalize(line) + '\n')
    # Report once per file, after it has been fully written.
    print('Text in the modern orthography is written in', result_name)
                    

Done?