# Imports

In [1]:
import os
from io import StringIO
from pprint import pprint

import convertapi
import pandas as pd
import requests
import translators as ts
from bs4 import BeautifulSoup

Using Canada server backend.


# Functions

In [2]:
def get_source(url, timeout=30):
    """Download ``url`` and return the raw response body.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the notebook indefinitely.

    Returns:
        The response body as ``bytes``, or ``None`` if the request failed
        (the error is printed rather than raised).
    """
    try:
        # The context manager closes the session's pooled connections even
        # when the request raises.
        with requests.Session() as session:
            response = session.get(url, timeout=timeout)
            return response.content
    except requests.exceptions.RequestException as e:
        print("Error ->", e)
        return None


def get_local_sources(html_filenames):
    """Read every file in ``html_filenames`` and return their contents.

    Files are opened in text mode with the platform default encoding.
    The returned list has one string per filename, in the same order.
    """
    def _read_text(path):
        # Each file is closed as soon as its contents have been read.
        with open(path) as handle:
            return handle.read()

    return [_read_text(path) for path in html_filenames]


def download_and_get_doc_filenames(urls, timeout=30):
    """Download each URL to a numbered ``.doc`` file in the working directory.

    Files are named ``word_doc001.doc``, ``word_doc002.doc``, ... in the
    order the URLs are given.

    Args:
        urls: Iterable of document URLs.
        timeout: Seconds to wait for each server response.

    Returns:
        List of the filenames written, in download order.

    Raises:
        requests.exceptions.RequestException: If a download fails or the
            server answers with an HTTP error status.
    """
    filenames = []
    for filename_suffix, url in enumerate(urls, start=1):
        filename = f"word_doc{filename_suffix:03d}.doc"
        response = requests.get(url, timeout=timeout)
        # Fail loudly instead of saving an HTTP error page as a ".doc" file.
        response.raise_for_status()
        with open(filename, 'wb') as file:
            file.write(response.content)
        filenames.append(filename)
    return filenames
    
        
def get_urls(soup):
    """Collect the ``href`` of every ``<a class="link_info">`` element in ``soup``.

    Args:
        soup: Parsed page exposing ``find_all``.

    Returns:
        List of ``href`` values in document order. Anchors whose ``href`` is
        missing or empty are skipped instead of raising ``KeyError``.
    """
    doc_links = soup.find_all('a', {'class': 'link_info'})
    return [link['href'] for link in doc_links if link.get('href')]


def convert_and_get_html_filenames(doc_filenames):
    """Convert each ``.doc`` file to HTML through ConvertAPI.

    The module-level ``CONVERT_API_KEY`` is installed as the ConvertAPI
    secret before any conversion is requested.

    Args:
        doc_filenames: Paths of the Word documents to convert.

    Returns:
        List of HTML filenames, one per input, each being the input name
        with its extension replaced by ``.html``.
    """
    convertapi.api_secret = CONVERT_API_KEY
    html_filenames = []
    for doc_filename in doc_filenames:
        # splitext copes with any extension length; slicing off the last
        # four characters mangled names such as "report.docx".
        html_filename = f'{os.path.splitext(doc_filename)[0]}.html'
        convertapi.convert('html', {
            'File': doc_filename
        }, from_format = 'doc').save_files(html_filename)
        html_filenames.append(html_filename)
    return html_filenames

def get_local_soups(local_sources):
    """Parse each HTML source with the ``html.parser`` backend, preserving order."""
    soups = []
    for markup in local_sources:
        soups.append(BeautifulSoup(markup, 'html.parser'))
    return soups


def get_first_table(table):
    """Return the translated text of the first two ``<td>`` cells of ``table``.

    Newlines are stripped from each cell's text before it is passed to
    ``ts.google``. The result is a ``(left_text, right_text)`` tuple.
    """
    cells = table.find_all('td')
    # Clean both cells first, then translate left before right.
    left_text, right_text = (
        cell.get_text().replace('\n', '') for cell in (cells[0], cells[1])
    )
    return ts.google(left_text), ts.google(right_text)

def get_table_df(table):
    """Parse one HTML table into a DataFrame, using its first row as the header.

    Args:
        table: Table element (or HTML string); ``str(table)`` must yield the
            table's markup.

    Returns:
        The first DataFrame that ``pandas.read_html`` finds in that markup.
    """
    # read_html expects a path or file-like object; passing literal HTML
    # text directly is deprecated in recent pandas releases.
    return pd.read_html(StringIO(str(table)), header=0)[0]

def google_translate(thing):
    """Translate ``thing`` with ``ts.google``, falling back to the input on failure.

    Args:
        thing: Value to translate.

    Returns:
        The translation, or ``thing`` unchanged if the translation call
        raised an exception.
    """
    try:
        return ts.google(thing)
    except Exception:
        # Best-effort: keep the original value when translation fails.
        # Catching Exception rather than using a bare except lets
        # KeyboardInterrupt through, so interrupting the kernel still stops
        # a long-running translation loop.
        return thing
    
    
def get_table_dfs_en(table_dfs):
    """Translate the cells and column headers of each DataFrame.

    Every cell and header goes through ``google_translate``, so a value that
    cannot be translated is kept as-is rather than aborting the whole run.

    Args:
        table_dfs: Iterable of DataFrames to translate.

    Returns:
        List of new, translated DataFrames in the same order; the inputs are
        not modified.
    """
    table_dfs_en = []
    for table_df in table_dfs:
        # na_action='ignore' leaves missing cells untouched instead of
        # sending them to the translator.
        table_df_en = table_df.applymap(google_translate, na_action='ignore')
        # Assign the headers directly: set_axis(..., inplace=True) was
        # removed in pandas 2.0.
        table_df_en.columns = [google_translate(column) for column in table_df_en.columns]
        table_dfs_en.append(table_df_en)
    return table_dfs_en

def get_person_name(soup):
    """Return the translated text of the fourth centered paragraph in ``soup``.

    The paragraph is selected as index 3 among ``<p>`` elements whose
    ``style`` attribute is exactly ``text-align:center``.

    Args:
        soup: Parsed document exposing ``find_all``.

    Returns:
        The translated text, or ``''`` if the paragraph is missing or the
        translation fails.
    """
    try:
        name = soup.find_all('p', {'style':'text-align:center'})[3].get_text()
        return ts.google(name)
    except Exception:
        # Best-effort lookup; Exception (not a bare except) keeps a kernel
        # interrupt from being swallowed.
        return ''

# Global Variables
- `URL`: The URL of the page whose word documents we will be scraping
- `CONVERT_API_KEY`: The script converts Word (`.doc`) files into `.html` files for easy scraping. This service is provided by `convertapi.com`. The free tier is currently used; the paid tier should be considered if the limit is reached or requests are throttled.
- `START`, `STOP`: The start and stop indices of the `.doc` files to capture (`START` inclusive, `STOP` exclusive; the first file has index 0). For example, `START = 0; STOP = 1` captures only the first doc file, `START = 0; STOP = 3` captures the first three, and `START = 3; STOP = 5` captures the fourth and fifth. The maximum value for `STOP` is `368`.

In [3]:
# Index page listing the Word documents to scrape.
URL = 'http://www.council.gov.ru/structure/machinery/vacancies/property/'
# Prefer the CONVERT_API_KEY environment variable so the secret does not have
# to live in the notebook.
# NOTE(review): the literal fallback keeps existing runs working, but it is
# visible to anyone who can read this file. Rotate the key and drop the
# fallback once the environment variable is set everywhere.
CONVERT_API_KEY = os.environ.get('CONVERT_API_KEY', 'Ez96VL6BxeWjmniA')
# Slice bounds applied to the list of document URLs: START is inclusive,
# STOP is exclusive, and index 0 is the first document.
START = 1
STOP = 2  # 368 is max

# Script

## Download `.doc` Files and convert to `.html`

In [4]:
# Fetch the index page and collect the document links it contains.
source = get_source(URL)
if source is None:
    # get_source has already printed the request error. Stop here rather
    # than let BeautifulSoup fail on None with an unrelated-looking TypeError.
    raise RuntimeError(f"Could not download the index page: {URL}")
soup = BeautifulSoup(source, 'html.parser')
urls = get_urls(soup)

# Keep only the configured window of documents, then download and convert them.
urls = urls[START:STOP]
doc_filenames = download_and_get_doc_filenames(urls)
html_filenames = convert_and_get_html_filenames(doc_filenames)

## Parse HTML and Translate
- The `translators.google` function (imported here as `ts.google`) is used to translate from Russian to English. However, this free service is slow, so a paid Google tier should be considered. The paid service is available here: https://cloud.google.com/translate

In [5]:
# Load the converted HTML files and parse them.
local_sources = get_local_sources(html_filenames)
local_soups = get_local_soups(local_sources)

# Only the first converted document is processed in this cell.
person_name = get_person_name(local_soups[0])
tables = local_soups[0].find_all('table', {'class':'MsoNormalTable'})

# The first table is handled as a pair of cells; the rest become DataFrames.
first_table = get_first_table(tables[0])
table_dfs = list(map(get_table_df, tables[1:]))
table_dfs_en = get_table_dfs_en(table_dfs)

# Viewing the Data We've Captured
- During the testing phase, we only view the tables of one doc file. Later, we will iterate over all files and save this data to CSV.
- Should we be capturing the text outside the tables too?

In [6]:
# Translated name extracted from the first converted document.
person_name 

'Agaltsova Valery Stanislavovich'

In [7]:
# Translated (left cell, right cell) pair from the document's first table.
first_table

('Replaced position',
 'Deputy Head of the Department of International Relations')

In [8]:
# First translated DataFrame (the document's second table); here it lists declared income.
table_dfs_en[0]

Unnamed: 0,№ n/p,Type of income,The amount of income (rub.)
0,1.0,Declated annual income of the Federal State Ci...,1843996.0
1,2.0,Declared annual income of family members,
2,,Spouses,1369600.0


In [9]:
# Second translated DataFrame; here it lists property by owner.
table_dfs_en[1]

Unnamed: 0,№ n/p,Type of property,Square (sq.m),Square (sq.m) .1,Land of location,The owner of the property,Owner of property. 1
0,Land:,Land:,Land:,Land:,Land:,Land:,Land:
1,1,for individual housing construction,1514.00,Russian Federation,Russian Federation,Russian Federation,Agaltsov V.S.
2,2,for gardening and gardening,1354.00,Russian Federation,Russian Federation,Russian Federation,wife
3,3,for individual housing construction,1504.00,Russian Federation,Russian Federation,Russian Federation,wife
4,Apartments:,Apartments:,Apartments:,Apartments:,Apartments:,Apartments:,Apartments:
5,4,flat,164.00,Russian Federation,Russian Federation,Russian Federation,Agaltsov V.S.
6,5,flat,53.80,Russian Federation,Russian Federation,Russian Federation,wife
7,Cottages:,Cottages:,Cottages:,Cottages:,Cottages:,Cottages:,Cottages:
8,6,cottage with farm buildings,100.00,Russian Federation,Russian Federation,Russian Federation,wife
9,,,,,,,


In [10]:
# Third translated DataFrame; here it lists property in use.
table_dfs_en[2]

Unnamed: 0,№ n/p,№ n/p.1,Type of property,Square (sq.m.),Square (sq.m.). 1,Square (sq.m.). 2,Land of location,Who is in use,Who has in use. 1,Unnamed: 9
0,,Land:,Land:,Land:,Land:,Land:,Land:,Land:,Land:,Land:
1,,1,Land plot,Land plot,2500.00,Russian Federation,Russian Federation,Russian Federation,wife,wife
2,,Apartments:,Apartments:,Apartments:,Apartments:,Apartments:,Apartments:,Apartments:,Apartments:,Apartments:
3,,2,flat,flat,53.80,Russian Federation,Russian Federation,Russian Federation,Minor daughter,Minor daughter
4,,3,flat,flat,164.00,Russian Federation,Russian Federation,Russian Federation,Minor son,Minor son
5,,,,,,,,,,


In [11]:
# Fourth translated DataFrame; here it lists vehicles by owner.
table_dfs_en[3]

Unnamed: 0,№ n/p,Type and brand of vehicles,The owner of the vehicle
0,Passenger cars:,Passenger cars:,Passenger cars:
1,1,Mercedes-Benz Jil 350 Sidi 4matic,Agaltsov V.S.
2,2,Audi Kew5,wife
