# Ukrainian Migrants

In [2]:
import os
import pandas as pd

In [3]:
import requests
from requests import get
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
import re
import csv

In [4]:
from bs4 import NavigableString, Tag

**re** is not required if you don't use regex to extract text. Python3 works better for this script; Python2 doesn't handle utf-8 too well.

In [None]:
from time import sleep
from random import randint

These imports are used to pause between requests, which slows down the rate at which we query the website.

## Scraping multiple pages

### Take a look at the structure of the url links:

### 1. page:

In [None]:
https://www.bac-lac.gc.ca/eng/discover/immigration/immigration-records/immigrants-ukraine-1891-1930/Pages/list.aspx?YearOfImmigration=191*&p_ID=0

### 2. page:

In [None]:
https://www.bac-lac.gc.ca/eng/discover/immigration/immigration-records/immigrants-ukraine-1891-1930/Pages/list.aspx?YearOfImmigration=191*&p_ID=30

### 3. page:

In [None]:
https://www.bac-lac.gc.ca/eng/discover/immigration/immigration-records/immigrants-ukraine-1891-1930/Pages/list.aspx?YearOfImmigration=191*&p_ID=60

### Last page:

In [None]:
https://www.bac-lac.gc.ca/eng/discover/immigration/immigration-records/immigrants-ukraine-1891-1930/Pages/list.aspx?YearOfImmigration=191*&p_ID=1710

With our search in "YearOfImmigration=191*" we need to cover multiple pages. 
- The first part of the url is always the same: "https://www.bac-lac.gc.ca/eng/discover/immigration/immigration-records/immigrants-ukraine-1891-1930/Pages/list.aspx?YearOfImmigration=191*&p_ID=", 
- But the tail differs! It starts with 0, ends with 1710, and changes in steps of 30 (because there are 30 entries per page).

Therefore we will create a numpy array that contains all these numbers to add to the static url-head
- There are 3 arguments: start, stop, step
- The stop value is not included in the array returned by `np.arange`, so we have to go one step past the last offset (1710) and set it to 1740

In [5]:
# Page offsets 0, 30, ..., 1710 (the stop value 1740 itself is excluded).
urltails = np.arange(start=0, stop=1740, step=30)

In [6]:
# Display the offsets to check that they run from 0 to 1710 in steps of 30.
urltails

array([   0,   30,   60,   90,  120,  150,  180,  210,  240,  270,  300,
        330,  360,  390,  420,  450,  480,  510,  540,  570,  600,  630,
        660,  690,  720,  750,  780,  810,  840,  870,  900,  930,  960,
        990, 1020, 1050, 1080, 1110, 1140, 1170, 1200, 1230, 1260, 1290,
       1320, 1350, 1380, 1410, 1440, 1470, 1500, 1530, 1560, 1590, 1620,
       1650, 1680, 1710])

Test with first 5 pages:

In [7]:
# Dry run: request only the first five result pages and print the offset
# together with the URL that was actually fetched.
for offset in urltails[:5]:
    offset_text = str(offset)  # convert once, reuse for printing and for the URL
    print(offset_text)
    page_request = requests.get(
        f"https://www.bac-lac.gc.ca/eng/discover/immigration/immigration-records/immigrants-ukraine-1891-1930/Pages/list.aspx?YearOfImmigration=191*&p_ID={offset_text}"
    )
    print(page_request.url)

0
https://www.bac-lac.gc.ca/eng/discover/immigration/immigration-records/immigrants-ukraine-1891-1930/Pages/list.aspx?YearOfImmigration=191*&p_ID=0
30
https://www.bac-lac.gc.ca/eng/discover/immigration/immigration-records/immigrants-ukraine-1891-1930/Pages/list.aspx?YearOfImmigration=191*&p_ID=30
60
https://www.bac-lac.gc.ca/eng/discover/immigration/immigration-records/immigrants-ukraine-1891-1930/Pages/list.aspx?YearOfImmigration=191*&p_ID=60
90
https://www.bac-lac.gc.ca/eng/discover/immigration/immigration-records/immigrants-ukraine-1891-1930/Pages/list.aspx?YearOfImmigration=191*&p_ID=90
120
https://www.bac-lac.gc.ca/eng/discover/immigration/immigration-records/immigrants-ukraine-1891-1930/Pages/list.aspx?YearOfImmigration=191*&p_ID=120


Looking good.

## Initialize empty containers:

In [12]:
# One independent, empty accumulator list per column of the result table.
item_number, surname, given_name, age, ship, year_of_immigration = (
    [] for _ in range(6)
)

## Scrape the content:

In [None]:
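#del(out_migrants) # deletes the dataframe; use when debugging, for example after running the script several times, which can result in duplicate entries
# (the scraping loop appends to the container lists, so re-run the "Initialize empty containers" cell as well before scraping again)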

In [32]:
# Scrape every result page and append one value per table column to the
# container lists (item_number, surname, given_name, age, ship,
# year_of_immigration).
#
# Changes compared with the first version of this cell:
# - every cell is checked by its own position; before, each fallback tested
#   tabledata[0], so a row with fewer than six <td> cells raised IndexError
#   and the '-' placeholder could never be used;
# - rows without any <td> cell are skipped;
# - the request has a timeout, so a stalled connection cannot block the run.

# Base URL of the result list; the p_ID offset is appended for every page.
list_url = "https://www.bac-lac.gc.ca/eng/discover/immigration/immigration-records/immigrants-ukraine-1891-1930/Pages/list.aspx?YearOfImmigration=191*&p_ID="

# Target lists in the order of the table columns (<td> positions 0 to 5).
list_columns = [item_number, surname, given_name, age, ship, year_of_immigration]

for item in urltails:
    item = str(item) # do the string conversion once
    page_request = requests.get(list_url + item, timeout=60)
    if not page_request.ok:
        # An error page contributes no rows; report it instead of failing silently.
        print("Warning: HTTP status", page_request.status_code, "for", page_request.url)
    soup = BeautifulSoup(page_request.text, features="lxml")
    tables = soup.find_all('tbody')

    sleep(10) # pause between requests to keep the request rate low

    for container in tables:
        for row in container.find_all("tr"):
            tabledata = row.find_all("td")
            if not tabledata:
                continue # no data cells in this row, nothing to record

            # Append exactly one value to every list so that all lists keep
            # the same length; missing cells get the '-' placeholder.
            for position, target in enumerate(list_columns):
                if position < len(tabledata):
                    target.append(tabledata[position].get_text(strip=True))
                else:
                    target.append('-')

## Create a dataframe from scraped contents:

In [33]:
# Combine the scraped lists into one table, one column per list.
# Note: the `age` list filled by the scraping loop is not part of this frame.
out_migrants = pd.DataFrame(dict(
    item_number=item_number,
    surname=surname,
    given_name=given_name,
    ship=ship,
    year_of_immigration=year_of_immigration,
))

## Take a look at the output: 

In [34]:
# Display the dataframe built from the result pages.
out_migrants

Unnamed: 0,item_number,surname,given_name,ship,year_of_immigration
0,252,Zajonczkowski,Mikal,SICILIAN PRINCE,1910
1,253,Kasienczuk,Semen,TUNISIAN,1910
2,254,Zajonczkowski,Wasyl,URANIUM,1910
3,255,Zajonczkowski,Michal,URANIUM,1910
4,256,Zajonczkowski,Paraska,URANIUM,1910
...,...,...,...,...,...
1708,5813,Nowsielskyj,Petro,PRESIDENT LINCOLN,1910
1709,5814,Nowosielskyj,Fedor,PRESIDENT LINCOLN,1910
1710,5908,Laluk,Dmytro,NOORDAM,1914
1711,5909,Laluk,Szymon,BARBAROSSA,1914


In [36]:
# Replace line breaks inside any cell with a single space (regex replacement on the whole frame).
ukr_migrants = out_migrants.replace(to_replace='\n', value=' ', regex=True)

### Take a look at some of the values in the dataframe. 

Using a simple value counts function on the 'year_of_immigration' column in the dataframe gives an overview of the number of migrants for each year. 

In [41]:
ukr_migrants['year_of_immigration'].value_counts() # count occurrences of each value in the column

1912    502
1913    415
1910    346
1914    244
1911    206
Name: year_of_immigration, dtype: int64

## Export as CSV file:

In [38]:
# Write the list-page table to disk without the row index.
ukr_migrants.to_csv(path_or_buf='ukr_migrants.csv', index=False)

## Next scraping run with new url-page structure for individual entries:

url sample:

In [None]:
https://www.bac-lac.gc.ca/eng/discover/immigration/immigration-records/immigrants-ukraine-1891-1930/Pages/item.aspx?IdNumber=252

In [None]:
https://www.bac-lac.gc.ca/eng/discover/immigration/immigration-records/immigrants-ukraine-1891-1930/Pages/item.aspx?IdNumber=253

This time, the only changing part from page to page is the **IdNumber**. We can take those numbers from the 'item_number' column in the dataframe we already created in the previous step (ukr_migrants). We put all these numbers into another numpy array calling it 'url_tails' (with underscore character) this time. If we use the same variable as above ('urltails') those previous numbers would be overwritten. 

In [42]:
# The item numbers from the list pages are the IdNumber values of the record pages.
url_endings = ukr_migrants.loc[:, 'item_number']

In [47]:
# Turn the column of item numbers into a NumPy array and print it as a quick check.
url_tails = np.asarray(url_endings)
print(url_tails)

['252' '253' '254' ... '5908' '5909' '5910']


In [56]:
# Display the array itself; the repr also shows its dtype.
url_tails

array(['252', '253', '254', ..., '5908', '5909', '5910'], dtype=object)

Do a testrun with the first 5 url pages:

In [49]:
# Dry run: request the first five record pages and print each item number,
# its running number and the URL that was fetched.
for counter, page_name in enumerate(url_tails[:5], start=1):
    page_name = str(page_name)  # make sure the id is a string before concatenating
    print(page_name, counter)
    page_response = requests.get(
        "https://www.bac-lac.gc.ca/eng/discover/immigration/immigration-records/immigrants-ukraine-1891-1930/Pages/item.aspx?IdNumber="
        + page_name
    )
    print(page_response.url)

252 1
https://www.bac-lac.gc.ca/eng/discover/immigration/immigration-records/immigrants-ukraine-1891-1930/Pages/item.aspx?IdNumber=252
253 2
https://www.bac-lac.gc.ca/eng/discover/immigration/immigration-records/immigrants-ukraine-1891-1930/Pages/item.aspx?IdNumber=253
254 3
https://www.bac-lac.gc.ca/eng/discover/immigration/immigration-records/immigrants-ukraine-1891-1930/Pages/item.aspx?IdNumber=254
255 4
https://www.bac-lac.gc.ca/eng/discover/immigration/immigration-records/immigrants-ukraine-1891-1930/Pages/item.aspx?IdNumber=255
256 5
https://www.bac-lac.gc.ca/eng/discover/immigration/immigration-records/immigrants-ukraine-1891-1930/Pages/item.aspx?IdNumber=256


That seems to work. Now the scraping code for the new table. First initialize new containers:

In [128]:
# One independent, empty accumulator list per field of the record page.
# Left out (not collected): page_number, book_number, file_number,
# volume_number, microfilm, MIKAN_number.
(
    given_name, surname, age, ship, year_of_immigration,
    departure_port, departure_date, arrival_port, arrival_date, destination,
    marital_status, able_to_read_and_write, occupation, nationality, ethnic_group,
    religion, note, reference, item_number,
) = ([] for _ in range(19))

This is the scraping code for the more detailed table.

In [None]:
# Scrape the record page of every item number in `url_tails` and append one
# value per field to the container lists.
#
# Changes compared with the first version of this cell:
# - the lookup "find the <dt> label, take the text of the next <dd>" lives in
#   one helper instead of being copied for every field, and each label is
#   searched once instead of twice;
# - a label without a following <dd> yields the placeholder instead of raising
#   AttributeError;
# - the request has a timeout, so a stalled connection cannot block the run.

# Base URL of a record page; the item number is appended.
item_url = "https://www.bac-lac.gc.ca/eng/discover/immigration/immigration-records/immigrants-ukraine-1891-1930/Pages/item.aspx?IdNumber="


def _dd_text(container, label, default='-'):
    """Return the stripped text of the <dd> that follows a matching <dt>.

    Parameters:
        container: parsed element (here a <dl> definition list) to search in.
        label: regular expression searched in the text of the <dt> elements.
        default: value returned when no <dt> matches or no <dd> follows it.

    Returns:
        The text of the first <dd> after the first matching <dt>, or `default`.
    """
    term = container.find('dt', string=re.compile(label))
    if term is None:
        return default
    definition = term.find_next('dd')
    if definition is None:
        return default
    return definition.get_text(strip=True)


# (label pattern, target list, placeholder for a missing field)
detail_fields = [
    ('Given Name', given_name, '-'),
    ('Surname:', surname, '-'),
    ('Age:', age, '-'),
    ('Ship:', ship, '-'),
    ('Year of Immigration:', year_of_immigration, '9999'),
    ('Departure Port:', departure_port, '-'),
    ('Departure Date:', departure_date, '9999-99-99'),
    ('Arrival Port:', arrival_port, '-'),
    ('Arrival Date:', arrival_date, '9999-99-99'),
    ('Destination:', destination, '-'),
    ('Marital Status:', marital_status, '-'),
    ('Able to Read and Write:', able_to_read_and_write, '-'),
    ('Nationality:', nationality, '-'),
    ('Occupation:', occupation, '-'),
    ('Religion:', religion, '-'),
    ('Reference:', reference, '-'),
    ('Item Number:', item_number, '-'),
    ('Ethnic Group:', ethnic_group, '-'),
    ('Notes:', note, '-'),
]

counter = 1
for page_name in url_tails:
    page_name = str(page_name) # do the string conversion once
    print(page_name, counter) # progress: item number and running count
    counter = counter + 1
    page_response = requests.get(item_url + page_name, timeout=60)
    if not page_response.ok:
        # An error page contributes no row; report it instead of failing silently.
        print("Warning: HTTP status", page_response.status_code, "for", page_response.url)
    soup = BeautifulSoup(page_response.text, features="lxml")
    ind_tables = soup.find_all('dl', class_='genapp-definitionlist row')

    sleep(10) # pause between requests to keep the request rate low

    for container in ind_tables:
        # Append exactly one value to every list so that all lists keep the same length.
        for label, target, placeholder in detail_fields:
            target.append(_dd_text(container, label, placeholder))

## Fill our new dataframe with the scraped content:

In [130]:
# Combine the scraped record fields into one table, one column per list.
# Not included (never collected): page_number, book_number, file_number,
# volume_number, microfilm, MIKAN_number.
ind_migrants = pd.DataFrame(dict(
    given_name=given_name,
    surname=surname,
    age=age,
    ship=ship,
    year_of_immigration=year_of_immigration,
    departure_port=departure_port,
    departure_date=departure_date,
    arrival_port=arrival_port,
    arrival_date=arrival_date,
    destination=destination,
    marital_status=marital_status,
    able_to_read_and_write=able_to_read_and_write,
    occupation=occupation,
    nationality=nationality,
    ethnic_group=ethnic_group,
    religion=religion,
    note=note,
    reference=reference,
    item_number=item_number,
))

In [1]:
#del(ind_migrants)

## Take a look at the new output:

In [176]:
# Display the dataframe built from the individual record pages.
ind_migrants

Unnamed: 0,given_name,surname,age,ship,year_of_immigration,departure_port,departure_date,arrival_port,arrival_date,destination,marital_status,able_to_read_and_write,occupation,nationality,ethnic_group,religion,note,reference,item_number
0,Mikal,Zajonczkowski,44,SICILIAN PRINCE,1910,Rotterdam,1910-03-01,"Halifax, Nova Scotia",1910-03-13,"Montreal, Quebec",Married,No,General Labourer,Austrian (Galician),-,Roman Catholic,-,"RG76 C1b, Passengers Lists, Halifax, Nova Scotia",252
1,Semen,Kasienczuk,32,TUNISIAN,1910,Liverpool,1910-04-07,"Halifax, Nova Scotia",1910-04-15,"Montreal, Quebec",Married,Yes,General Labourer,Austrian (Galician),Ruthenian,-,-,"RG76 C1b, Passengers Lists, Halifax, Nova Scotia",253
2,Wasyl,Zajonczkowski,37,URANIUM,1910,Rotterdam,1910-04-09,"Halifax, Nova Scotia",1910-04-20,"Duck Lake, Saskatchewan",Married,No,"Labourer, Farm Labourer",Austrian,Ruthenian,Roman Catholic,-,"RG76 C1b, Passengers Lists, Halifax, Nova Scotia",254
3,Michal,Zajonczkowski,50,URANIUM,1910,Rotterdam,1910-04-09,"Halifax, Nova Scotia",1910-04-20,"Duck Lake, Saskatchewan",Married,No,Labourer,Austrian,Ruthenian,Roman Catholic,-,"RG76 C1b, Passengers Lists, Halifax, Nova Scotia",255
4,Paraska,Zajonczkowski,45,URANIUM,1910,Rotterdam,1910-04-09,"Halifax, Nova Scotia",1910-04-20,"Duck Lake, Saskatchewan",Married,-,-,Austrian,Ruthenian,-,-,"RG76 C1b, Passengers Lists, Halifax, Nova Scotia",256
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1708,Petro,Nowsielskyj,18,PRESIDENT LINCOLN,1910,Hamburg,1910-04-02,New York,1910-04-14,"Montreal, Quebec",Single,Yes,Farm labourer,Austrian,Polish,-,-,"RG76, Passengers Lists, New York, United State...",5813
1709,Fedor,Nowosielskyj,18,PRESIDENT LINCOLN,1910,Hamburg,1910-04-02,New York,1910-04-14,"Montreal, Quebec",Single,No,Farm labourer,Austrian,Polish,-,-,"RG76, Passengers Lists, New York, United State...",5814
1710,Dmytro,Laluk,17,NOORDAM,1914,Rotterdam,1914-02-14,"Halifax, Nova Scotia",1914-02-24,"Montreal, Quebec",Single,Yes,Farm Hand,Austrian,Polish,Catholic,p 7 line 18,"RG76 C1b, Passengers Lists, Halifax, Nova Scotia",5908
1711,Szymon,Laluk,27,BARBAROSSA,1914,Bremerhaven,1914-03-14,"Halifax, Nova Scotia",1914-03-26,"Montreal, Quebec",Married,Yes,General Labourer,Austrian,Polish,Greek Catholic,p 10 line 27 Orest Reshitnyks maternal grandpa...,"RG76 C1b, Passengers Lists, Halifax, Nova Scotia",5909


## Disambiguate values (remove typos etc.)

### Ethnic group column

In [177]:
# Frequency of every distinct ethnic-group value; rare spellings point to typos.
ind_migrants['ethnic_group'].value_counts()

Ruthenian              1144
Polish                  172
-                       154
Russian                 136
Bukowinian               38
Galician                 26
Roumanian                15
Austrian                 12
Austrian, Ruthenian       5
German                    3
Austrian, Roumanian       2
Scandinavian              1
Rutherian                 1
Rudinki                   1
Austrian, Polish          1
Austrian, Russian         1
Bulgarian                 1
Name: ethnic_group, dtype: int64

In [196]:
# Correct the misspelling 'Rutherian' to 'Ruthenian' (the replacement is applied to every column).
# NOTE(review): `pro_migrants` is first assigned in the "Occupation column" section below
# (from `ind_migrants`). On a fresh top-to-bottom run this cell therefore raises NameError;
# it has to be executed after that assignment.
pro_migrants = pro_migrants.replace(['Rutherian'], ['Ruthenian'])

### Occupation column

In [242]:
# Frequency of every distinct occupation value before any clean-up.
ind_migrants['occupation'].value_counts()

Farm Labourer                      379
General Labourer                   260
-                                  252
Farm labourer                      200
Labourer                            97
Railroad Labourer                   87
Farm Hand                           84
Rail Road Labourer                  76
Railroad labourer                   72
Domestic                            50
Housewife                           28
Farmer                              25
Servant                             14
farmer                               8
Miner                                7
Railway Labourer                     5
railroad labourer                    5
Maid Servant                         4
Housemaid                            3
Labourer, Farm Labourer              3
Farming                              3
Farm Maid                            3
Farm Labourer, General Labourer      3
Railway Man                          2
miner                                2
City Labourer            

In [180]:
# Start the cleaned frame from the scraped one and unify the spelling of 'Farm labourer'.
pro_migrants = ind_migrants.replace(to_replace='Farm labourer', value='Farm Labourer')

In [198]:
# Unify the capitalisation of 'Railroad labourer'.
pro_migrants = pro_migrants.replace(to_replace='Railroad labourer', value='Railroad Labourer')

In [192]:
# Fold 'Farming' into 'Farmer'.
pro_migrants = pro_migrants.replace(to_replace='Farming', value='Farmer')

In [186]:
# Unify the capitalisation of 'miner'.
pro_migrants = pro_migrants.replace(to_replace='miner', value='Miner')

In [187]:
# Use the spelling 'Labourer' instead of 'Laborer'.
pro_migrants = pro_migrants.replace(to_replace='Laborer', value='Labourer')

In [188]:
# Unify the capitalisation of 'servant'.
pro_migrants = pro_migrants.replace(to_replace='servant', value='Servant')

In [194]:
# Fold 'Factory Worker' into 'Factory Labourer'.
pro_migrants = pro_migrants.replace(to_replace='Factory Worker', value='Factory Labourer')

In [245]:
# Frequency of every distinct occupation value after the replacements.
pro_migrants['occupation'].value_counts()

Farm Labourer                      580
General Labourer                   260
-                                  252
Railroad Labourer                  247
Labourer                            98
Farm Hand                           84
Domestic                            50
Farmer                              38
Housewife                           28
Servant                             15
Miner                                9
Factory Labourer                     4
Maid Servant                         4
Farm Labourer, General Labourer      3
Labourer, Farm Labourer              3
Farm Maid                            3
Housemaid                            3
infant                               2
Railway Man                          2
City Labourer                        2
Farmwork                             2
Maid servant                         2
Bootmaker                            1
Teacher                              1
Locksmith                            1
housemaid                

## Deal with dates

In [244]:
# Frequency of every raw departure-date string.
# NOTE(review): `cai_migrants` is assigned only in the export section at the end of the
# notebook (`cai_migrants = pro_migrants`). On a fresh top-to-bottom run this cell raises
# NameError; that assignment has to be executed first.
cai_migrants['departure_date'].value_counts()

1914-05-15, 16, 19       76
1912-04-24               55
1914-05-29, 30, 06-01    47
1912-06-14, 15, 17       36
1913-06-07               32
                         ..
1910-02-20                1
1912-04-26                1
1910-05-21                1
1910-01-13                1
1910-08-11                1
Name: departure_date, Length: 323, dtype: int64

In [234]:
# Keep only the first ten characters (YYYY-MM-DD) of each departure date, so that entries
# such as '1914-05-15, 16, 19' are reduced to their first day. The vectorised .str accessor
# replaces the per-row lambda and passes missing values through instead of raising.
cai_migrants['corr_departure_date'] = cai_migrants['departure_date'].str[:10]

In [233]:
# Keep only the first ten characters (YYYY-MM-DD) of each arrival date. The vectorised
# .str accessor replaces the per-row lambda and passes missing values through instead of raising.
cai_migrants['corr_arrival_date'] = cai_migrants['arrival_date'].str[:10]

In [227]:
# Repair the mistyped, truncated date '19112-03-0' (applied to every column).
cai_migrants = cai_migrants.replace(to_replace='19112-03-0', value='1912-03-08')

In [236]:
# Repair the mistyped, truncated date '19010-05-1' (applied to every column).
cai_migrants = cai_migrants.replace(to_replace='19010-05-1', value='1910-05-11')

In [232]:
# Repair the mistyped date '1911-04126' (applied to every column).
cai_migrants = cai_migrants.replace(to_replace='1911-04126', value='1911-04-26')

In [238]:
# Repair the mistyped date '1911-04-126' (applied to every column).
# NOTE(review): this value has 11 characters, so it cannot occur in the 10-character
# corr_* columns; presumably it targets the raw date columns - confirm.
cai_migrants = cai_migrants.replace(to_replace='1911-04-126', value='1911-04-26')

In [240]:
# Repair the mistyped, truncated date '19112-03-0' (applied to every column).
# This is the same replacement as in the first cell of this group of date fixes.
cai_migrants = cai_migrants.replace(to_replace='19112-03-0', value='1912-03-08')

In [247]:
# Frequency of every corrected (10-character) departure date.
cai_migrants['corr_departure_date'].value_counts()

1914-05-15    77
1912-04-24    55
1914-05-29    47
1912-06-14    36
1913-06-07    32
              ..
1909-12-29     1
1913-08-07     1
1913-07-22     1
1913-06-09     1
1911-06-14     1
Name: corr_departure_date, Length: 299, dtype: int64

## Export as CSV file

In [205]:
# Work on an independent copy: the date-cleaning cells assign new columns to
# `cai_migrants`, and with a plain assignment both names would refer to the same
# dataframe, so those columns would appear in `pro_migrants` as well.
cai_migrants = pro_migrants.copy()

In [203]:
# Write the frame with the cleaned category values to disk without the row index.
pro_migrants.to_csv(path_or_buf='pro_migrants.csv', index=False)

In [178]:
# Write the unmodified scraped record table to disk without the row index.
ind_migrants.to_csv(path_or_buf='ind_migrants.csv', index=False)

In [241]:
# Write the frame that carries the corrected date columns to disk without the row index.
cai_migrants.to_csv(path_or_buf='cai_migrants.csv', index=False)