# Import packages, open original dataframe, get only relevant data

In [11]:
import requests
import urllib
import pandas as pd
from bs4 import BeautifulSoup
import urllib.parse
import time
import re
from tqdm import tqdm

import spacy
# Load spaCy's small English pipeline; its named-entity recognizer is used
# further down to pick NORP/LANGUAGE entities out of scraped text.
nlp = spacy.load("en_core_web_sm")

In [3]:
# Load the corpus metadata table; the file is tab-separated.
CORPUS_PATH = "./finalcorpus.tsv"
df_final_corpus = pd.read_csv(CORPUS_PATH, sep="\t")

In [9]:
# Preview the first rows of the corpus metadata.
df_final_corpus.head()

Unnamed: 0,docid,hathi_author,hathi_title,authordate,birthyear,deathyear,chi_date,ukw_date,copyright_date,firstpub,...,distances,copyright_corpus,manual_corpus,us_national,pubdate_known,authof3ormore,tokens,pagesinchunk,tokensperpage,omnibus
0,loc.ark+=13960=t49p3qv8g,"Thwing, Edward P[ayson]","Outdoor life in Europe, sketches of men and ma...",,,,,,,1880,...,,False,False,False,False,False,45865,54,849.352,False
1,loc.ark+=13960=t7wm1fd5j,"Jones, Joseph",Major Jones's travels,1812-1882.,1812.0,,,,,1880,...,,False,False,False,False,False,42135,204,206.544,False
2,nyp.33433081882650,"Allan-Olney, Mary",The new Virginians,,,,,,,1880,...,,False,False,False,False,False,32052,244,131.361,False
3,uva.x030742146,,"The Growing world; or, Progress of civilizatio...",,,,,,,1880,...,,False,False,False,False,False,374115,420,890.75,False
4,nyp.33433074386792,of Samosata. Lucian,A traveller's true tale,,,,,,,1880,...,,False,False,False,False,False,11848,128,92.562,False


In [7]:
# Keep each distinct author string once. Missing authors survive as a single
# NaN entry here and are skipped later by the scraping loop.
unique_authors = df_final_corpus["hathi_author"].unique()
authors = pd.DataFrame({"hathi_author": unique_authors})
authors.head()

Unnamed: 0,hathi_author
0,"Thwing, Edward P[ayson]"
1,"Jones, Joseph"
2,"Allan-Olney, Mary"
3,
4,of Samosata. Lucian


# Google and Wiki scraping, the creation of the preliminary dataframe

In [24]:
authors = pd.DataFrame(unique_authors, columns=["hathi_author"])

# Scrape Google, and the Wikipedia article Google links to, for each author's
# nationality. The results are collected in six parallel lists (one entry per
# author with a usable name) that the next cell turns into a dataframe.

MAX_AUTHORS = 1000  # only the first 1000 unique authors are processed for now
REQUEST_TIMEOUT = 30  # seconds; without a timeout a stalled request hangs forever
PAUSE_BETWEEN_AUTHORS = 3  # seconds; Google throttles automated queries
GOOGLE_SUMMARY_CLASS = "BNeawe tAd8D AP7Wnd"
WIKI_INFOBOX_CLASS = "infobox-data category"
NATIONALITY_LABELS = ("NORP", "LANGUAGE")


def _clean_author_name(raw_name):
    """Turn a catalogue name into a search-friendly one.

    "Surname, I. (Given Names)" becomes "Given Names Surname". Any other
    shape just loses its comma and square brackets, e.g.
    "Thwing, Edward P[ayson]" -> "Thwing Edward Payson".
    """
    parts = raw_name.split(",")
    if len(parts) > 1:
        given = re.sub(r"\[|\]", "", parts[1])
        expanded = re.search(r"\(.+\)", given)
        # No match means there is no complete "(...)" group (e.g. an unclosed
        # parenthesis), so fall through to the plain clean-up below.
        if expanded:
            return re.sub(r"\(|\)", "", expanded.group(0)) + " " + parts[0]
    return re.sub(r"\[|\]", "", "".join(parts))


def _first_nationality(text):
    """Return the first NORP/LANGUAGE entity spaCy finds in text, else None."""
    if not text:
        return None
    for ent in nlp(text).ents:
        if ent.label_ in NATIONALITY_LABELS:
            return str(ent)
    return None


def _fetch_soup(url):
    """Download url and parse it; return None (with a note) on failure."""
    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as err:
        tqdm.write(f"Request failed for {url}: {err}")
        return None
    if not response.ok:
        # An error page (for instance when rate-limited) holds no usable data.
        tqdm.write(f"HTTP {response.status_code} for {url}")
        return None
    response.encoding = "utf8"
    return BeautifulSoup(response.text, "html.parser")


def _extract_wikipedia_url(href):
    """Pull the target URL out of a Google result href, or return None.

    The target starts at "https://" and ends before the "&sa" parameter
    when that parameter is present.
    """
    start = href.find("https://")
    if start == -1:
        return None
    end = href.find("&sa", start)
    target = href[start:] if end == -1 else href[start:end]
    # Unquoted twice, as in the original code, to undo double percent-encoding.
    return urllib.parse.unquote(urllib.parse.unquote(target))


def _lookup_author(name):
    """Search for name and return the nationality clues found.

    Returns a tuple (google, wiki_paragraphs, wiki_infobox, wiki_url); each
    item is None when nothing was found.
    """
    query = urllib.parse.quote_plus(f"{name} writer").lower()
    soup = _fetch_soup("https://www.google.com/search?q=" + query)
    if soup is None:
        return None, None, None, None

    # The short summary box of the results page, when there is one.
    summary = soup.find("div", class_=GOOGLE_SUMMARY_CLASS)
    google_guess = _first_nationality(summary.text) if summary is not None else None

    wiki_guess = infobox_guess = link = None
    for anchor in soup.find_all("a"):
        if "Wikipedia" not in anchor.text:
            continue
        link = _extract_wikipedia_url(anchor.get("href", ""))
        if link is None:
            continue  # unusable href, try the next Wikipedia-looking link

        wiki_soup = _fetch_soup(link)
        if wiki_soup is not None:
            # Some articles carry a dedicated infobox cell.
            infobox = wiki_soup.find("td", class_=WIKI_INFOBOX_CLASS)
            if infobox is not None:
                infobox_guess = _first_nationality(infobox.text)
            # The first three paragraphs hold the basic facts.
            intro = "".join(p.get_text() for p in wiki_soup.find_all("p")[:3])
            wiki_guess = _first_nationality(intro)
        break  # only the first usable Wikipedia link is followed

    return google_guess, wiki_guess, infobox_guess, link


author = []
clean_name = []
google = []
wiki = []
wiki_natio = []
wiki_link = []

selected_authors = authors[:MAX_AUTHORS]
for index, row in tqdm(selected_authors.iterrows(), total=len(selected_authors)):
    raw_name = row["hathi_author"]
    if not isinstance(raw_name, str):
        continue  # missing author (NaN)

    cleaned = _clean_author_name(raw_name)
    google_guess, wiki_guess, infobox_guess, link = _lookup_author(cleaned)

    # Append to all lists together so they always stay the same length.
    author.append(raw_name)
    clean_name.append(cleaned)
    google.append(google_guess)
    wiki.append(wiki_guess)
    wiki_natio.append(infobox_guess)
    wiki_link.append(link)

    time.sleep(PAUSE_BETWEEN_AUTHORS)

194it [13:15,  4.10s/it]


In [27]:
# Assemble the scraped lists into one table, one row per author.
scraped_columns = {
    "Author": author,
    "Clean_name": clean_name,
    "Google": google,
    "Wiki_par": wiki,
    "Wiki_natio": wiki_natio,
    "Wiki_link": wiki_link,
}
df_nationalities = pd.DataFrame(scraped_columns)

# Placeholder for the final verdict, filled in by the next cell.
df_nationalities.loc[:, 'Nationality'] = None

# Fill in the "Nationality" column by combining the results from the Google and Wikipedia columns

In [28]:
# Fill in the final "Nationality" column from the three scraped guesses.


def _resolve_nationality(google_guess, wiki_paragraph, wiki_infobox):
    """Pick one nationality from the Google, paragraph and infobox guesses.

    Without a Google guess the infobox value wins, falling back to the
    paragraph value. With a Google guess the infobox value wins only when
    both Wikipedia values are present; otherwise the Google guess is used.
    """
    if google_guess is None:
        return wiki_paragraph if wiki_infobox is None else wiki_infobox
    if wiki_paragraph is not None and wiki_infobox is not None:
        return wiki_infobox
    return google_guess


for index, row in df_nationalities.iterrows():
    df_nationalities.at[index, "Nationality"] = _resolve_nationality(
        row['Google'], row['Wiki_par'], row["Wiki_natio"]
    )

In [11]:
# Display the resulting table.
df_nationalities

Unnamed: 0,Author,Clean_name,Google,Wiki_par,Wiki_natio,Wiki_link,Nationality
0,"Thwing, Edward P[ayson]",Thwing Edward Payson,,,,,
1,"Jones, Joseph",Jones Joseph,American,American,,https://en.wikipedia.org/wiki/Joseph_R._Jones,American
2,"Allan-Olney, Mary",Allan-Olney Mary,,,,,
3,of Samosata. Lucian,of Samosata. Lucian,,Greek,,https://en.wikipedia.org/wiki/Lucian,Greek
4,"Arnold, Matthew",Arnold Matthew,English,English,British,https://en.wikipedia.org/wiki/Matthew_Arnold,British
...,...,...,...,...,...,...,...
800,"Lynch, Lawrence L",Lynch Lawrence L,,,,,
801,"Halstead, Ada L",Halstead Ada L,American,American,,https://en.wikipedia.org/wiki/Halsted_Sullivan,American
802,"Libbey, Laura Jean",Libbey Laura Jean,American,American,,https://en.wikipedia.org/wiki/Laura_Jean_Libbey,American
803,"Shand, Alexander Innes",Shand Alexander Innes,,,,https://en.wikipedia.org/wiki/Alexander_Shand,


In [30]:
# Count the rows for which no nationality could be determined.
df_nationalities["Nationality"].isna().sum()

65

In [31]:
# Save the table as CSV. No index argument is passed, so pandas also writes
# the row index as the first, unnamed column.
df_nationalities.to_csv("./nationalities.csv")