In [66]:
# import required libraries
from urllib.parse import quote

import pandas as pd
import pycountry_convert as pc
import requests
import wikipediaapi

In [None]:
# read the city table from csv and discard, in one step, the columns that
# definitely won't be used
cities_pop = pd.read_csv("worldcities.csv").drop(
    columns=['lat', 'lng', 'iso3', 'admin_name', 'id']
)

# show the first 5 rows of the dataframe as a sanity check
cities_pop.head(5)

In [None]:
# add continent column

# Continent codes for dataset country names that get a manual value before any
# pycountry_convert lookup (the names it cannot resolve, per the original
# "hard fixes"). Keys are compared after stripping surrounding whitespace.
CONTINENT_BY_COUNTRY_NAME = {
    'Gaza Strip': 'AS',
    'West Bank': 'AS',
    'Falkland Islands': 'SA',
    'Kosovo': 'EU',            # Kosovo is in Europe, not North America
    'Saint Barthelemy': 'NA',  # Caribbean territories count as North America
}

# Continent codes for alpha-2 country codes that are assigned manually instead
# of going through pc.country_alpha2_to_continent_code.
CONTINENT_BY_ALPHA2 = {
    'TL': 'AS',  # Timor-Leste
    'VA': 'EU',  # Vatican City
    'SX': 'NA',  # Sint Maarten (Caribbean)
    'PN': 'OC',  # Pitcairn Islands (South Pacific)
}


def continent_code_for(country_name):
    """Return the two-letter continent code for a country name from the dataset.

    Surrounding whitespace in ``country_name`` is ignored. Names listed in
    CONTINENT_BY_COUNTRY_NAME and alpha-2 codes listed in CONTINENT_BY_ALPHA2
    get their manual value; every other name is resolved by pycountry_convert.
    Any exception pycountry_convert raises for a name it does not recognise
    propagates to the caller.
    """
    name = country_name.strip()
    if name in CONTINENT_BY_COUNTRY_NAME:
        return CONTINENT_BY_COUNTRY_NAME[name]

    country_code = pc.country_name_to_country_alpha2(name, cn_name_format="default")
    if country_code in CONTINENT_BY_ALPHA2:
        return CONTINENT_BY_ALPHA2[country_code]
    return pc.country_alpha2_to_continent_code(country_code)


# Resolve each distinct country once, then broadcast the result to every row.
# Mapping by value (rather than writing with .at[i] while reading with
# .iloc[i]) does not depend on the dataframe having a default 0..n-1 index.
continent_by_country = {
    country: continent_code_for(country)
    for country in cities_pop['country'].unique()
}
cities_pop['continent'] = cities_pop['country'].map(continent_by_country)

# show the first 5 rows of the dataframe as a sanity check
cities_pop.head(5)

In [258]:
# Total sample size used by both sampling designs, and a fixed seed so that
# re-running the notebook draws the same cities.
n = 100
SEED = 2022

# SRS sample of size n
cities_srs = cities_pop.sample(n=n, random_state=SEED)

# Population size overall and per continent (stratum)
N = len(cities_pop)
N_NA = int((cities_pop['continent'] == 'NA').sum())
N_SA = int((cities_pop['continent'] == 'SA').sum())
N_EU = int((cities_pop['continent'] == 'EU').sum())
N_AS = int((cities_pop['continent'] == 'AS').sum())
N_AF = int((cities_pop['continent'] == 'AF').sum())
N_OC = int((cities_pop['continent'] == 'OC').sum())


def proportional_allocation(stratum_sizes, total):
    """Split ``total`` across strata in proportion to their sizes.

    Uses the largest-remainder method: every stratum first receives the floor
    of its exact proportional share, then the units still missing go one each
    to the strata with the largest fractional remainders. Unlike rounding each
    share independently, the result always sums to exactly ``total``.

    stratum_sizes: dict mapping stratum label -> number of units (int).
    total: total number of units to allocate (int).
    Returns a dict mapping stratum label -> allocated sample size. If all
    strata are empty, every stratum is allocated 0.
    """
    population = sum(stratum_sizes.values())
    if population == 0:
        return {stratum: 0 for stratum in stratum_sizes}

    allocation = {}
    remainders = {}
    for stratum, size in stratum_sizes.items():
        # integer arithmetic keeps the floor/remainder split exact
        allocation[stratum], remainders[stratum] = divmod(total * size, population)

    shortfall = total - sum(allocation.values())
    for stratum in sorted(remainders, key=remainders.get, reverse=True)[:shortfall]:
        allocation[stratum] += 1
    return allocation


# Proportional sizes for the stratified sample. Shares are taken relative to
# the six strata listed here, so the sizes total n even if some rows carry a
# continent code outside this list.
stratum_n = proportional_allocation(
    {'NA': N_NA, 'SA': N_SA, 'EU': N_EU, 'AS': N_AS, 'AF': N_AF, 'OC': N_OC}, n
)
n_NA = stratum_n['NA']
n_SA = stratum_n['SA']
n_EU = stratum_n['EU']
n_AS = stratum_n['AS']
n_AF = stratum_n['AF']
n_OC = stratum_n['OC']

# Stratified samples totalling in sample size = n; each stratum gets its own
# seed so the draws are reproducible without reusing the SRS seed.
cities_NA = cities_pop[cities_pop['continent'] == "NA"].sample(n_NA, random_state=SEED + 1)
cities_SA = cities_pop[cities_pop['continent'] == "SA"].sample(n_SA, random_state=SEED + 2)
cities_EU = cities_pop[cities_pop['continent'] == "EU"].sample(n_EU, random_state=SEED + 3)
cities_AS = cities_pop[cities_pop['continent'] == "AS"].sample(n_AS, random_state=SEED + 4)
cities_AF = cities_pop[cities_pop['continent'] == "AF"].sample(n_AF, random_state=SEED + 5)
cities_OC = cities_pop[cities_pop['continent'] == "OC"].sample(n_OC, random_state=SEED + 6)

cities_strat = pd.concat([cities_NA, cities_SA, cities_EU, cities_AS, cities_AF, cities_OC], axis=0)

assert len(cities_strat) == n, "stratified sample does not total n"

In [259]:
# Wikipedia client for the English-language site
wiki_wiki = wikipediaapi.Wikipedia('en')


def page_word_counts(titles):
    """Return, for each title in order, the number of whitespace-separated
    words in the text of the Wikipedia page looked up under that title."""
    return [len(wiki_wiki.page(title).text.split()) for title in titles]


# word counts, in row order, for the SRS and for the stratified sample
word_count_srs = page_word_counts(cities_srs['city_ascii'])
word_count_strat = page_word_counts(cities_strat['city_ascii'])

# attach the counts as a new column on each sample
cities_srs['word_count'] = word_count_srs
cities_strat['word_count'] = word_count_strat

# NOTE: a notebook cell renders only its last expression, so of these two
# previews only the stratified one is displayed
cities_srs.head(5)
cities_strat.head(5)



Unnamed: 0,city,city_ascii,country,iso2,capital,population,continent,word_count
13378,Euclid,Euclid,United States,US,,46550.0,,2133
14119,Linton Hall,Linton Hall,United States,US,,41386.0,,705
36241,Springs,Springs,United States,US,,7036.0,,780
18727,Waukee,Waukee,United States,US,,24089.0,,948
36726,Tabernacle,Tabernacle,United States,US,,6851.0,,2198


In [263]:
# Wikimedia page-views endpoint: daily view counts of one en.wikipedia article
# (all access methods, all agents) between two YYYYMMDDHH timestamps.
PAGEVIEWS_URL = (
    'https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/'
    'en.wikipedia/all-access/all-agents/{article}/daily/{start}/{end}'
)
PAGEVIEWS_START = '2022010100'  # 2022-01-01
PAGEVIEWS_END = '2022110700'    # 2022-11-07
PAGEVIEWS_HEADERS = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36'}


def fetch_total_views(city_name):
    """Return the total number of views of a city's Wikipedia article over the
    PAGEVIEWS_START..PAGEVIEWS_END window.

    The article title is the city name with spaces replaced by underscores,
    percent-encoded so that characters such as '/' cannot alter the URL path.
    The endpoint returns one item per day; their 'views' values are summed.

    Returns 0 when the API answers 404 (no page-view data under that title).
    Raises requests.HTTPError for any other unsuccessful response, instead of
    failing later on a response body without 'items'.
    """
    article = quote(city_name.replace(" ", "_"), safe='')
    url = PAGEVIEWS_URL.format(article=article, start=PAGEVIEWS_START, end=PAGEVIEWS_END)
    # timeout so a stalled connection cannot hang the notebook indefinitely
    resp = requests.get(url, headers=PAGEVIEWS_HEADERS, timeout=30)
    if resp.status_code == 404:
        return 0
    resp.raise_for_status()
    return sum(item['views'] for item in resp.json()['items'])


# View totals, in row order, that will be added as columns to the dataframes.
# Each list is built from the city names of its own sample.
views_list = [fetch_total_views(name) for name in cities_srs['city_ascii']]
views_list_s = [fetch_total_views(name) for name in cities_strat['city_ascii']]

cities_srs['views'] = views_list
cities_strat['views'] = views_list_s

In [264]:
# Save each sample to its own csv file, keeping the row index as the first
# column.
# NOTE: to_csv returns None when it is given a file path, so srs_data and
# strat_data are always None; the files on disk are the actual output.
# NOTE: pd.read_csv treats the string 'NA' as a missing value by default, so
# the continent code 'NA' will load back as NaN unless these files are read
# with keep_default_na=False (or a custom na_values list).
srs_path = 'cities_srs.csv'
strat_path = 'cities_strat.csv'
srs_data = cities_srs.to_csv(srs_path, index=True)
strat_data = cities_strat.to_csv(strat_path, index=True)