Using the Capitals.txt dataset:

Find the two capital cities that are farthest away from each other, and the
distance between them.

In [None]:
!apt-get update
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
!update-alternatives --set java /usr/lib/jvm/java-8-openjdk-amd64/jre/bin/java
!java -version
!pip install pyspark

In [7]:
from pyspark import SparkContext

# Load the raw dataset: reuse the active SparkContext (or create one) and read
# Capitals.txt as an RDD of text lines. The bare file name is presumably
# resolved against the notebook's working directory; verify for your setup.
sc = SparkContext.getOrCreate()
capitalRDD = sc.textFile("Capitals.txt")

# Split each line on tabs, strip whitespace, convert decimal commas to points, parse the coordinates as floats, and swap latitude/longitude when they appear in the wrong order.
def split_data(line):
    """Turn one tab-separated line of the capitals file into a keyed record.

    Fields 1 and 2 (zero-based) hold the country and capital names; fields 3
    and 4 hold the two coordinates, which may be written with a decimal comma.

    Returns:
        A ``("<country>, <capital>", (latitude, longitude))`` tuple. Any
        apostrophe in the capital name is replaced by a space. When the first
        coordinate lies outside [-90, 90] it cannot be a latitude, so the two
        coordinates are returned in swapped order.

    Raises:
        IndexError: if the line has fewer than five tab-separated fields.
        ValueError: if a coordinate cannot be parsed as a float.
    """
    fields = line.split("\t")
    label = "{}, {}".format(
        fields[1].strip(),
        fields[2].strip().replace("'", " "),
    )
    # Parse both coordinates the same way: trim, decimal comma -> point, float.
    first, second = [
        float(fields[idx].strip().replace(",", ".")) for idx in (3, 4)
    ]
    # A value beyond +/-90 degrees in the latitude slot means the columns are flipped.
    looks_swapped = first > 90 or first < -90
    coords = (second, first) if looks_swapped else (first, second)
    return (label, coords)


# Parse every raw line into a (label, (latitude, longitude)) record, then
# preview the first ten parsed records as the cell output.
capitalRDD = capitalRDD.map(split_data)
capitalRDD.take(10)

[('Afghanistan, Kabul', (34.53, 69.17)),
 ('Albania, Tirana', (41.33, 19.82)),
 ('Algeria, Algiers', (36.75, 3.04)),
 ('American Samoa, Pago Pago', (-14.28, -170.7)),
 ('Andorra, Andorra la Vella', (42.51, 1.52)),
 ('Angola, Luanda', (-8.84, 13.23)),
 ('Anguilla, The Valley', (18.22, -63.06)),
 ('Antigua and Barbuda, St. John s', (17.12, -61.85)),
 ('Argentina, Buenos Aires', (-34.61, -58.38)),
 ('Armenia, Yerevan', (40.18, 44.51))]

In [13]:
import geopy.distance

def calculate_distance(coords1, coords2):
    """Return the geodesic distance between two points in km, rounded to 2 decimals.

    Both arguments are handed unchanged to ``geopy.distance.geodesic``; in this
    notebook they are the coordinate tuples produced by ``split_data``.
    """
    return round(geopy.distance.geodesic(coords1, coords2).km, 2)

# Build every ordered pair of capitals with cartesian().
combinationRDD = capitalRDD.cartesian(capitalRDD)
# Keep each unordered pair exactly once (first label sorts before the second).
# This removes self-pairs and mirrored duplicates *before* the expensive
# geodesic computation, so roughly half as many distances are calculated.
uniquePairsRDD = combinationRDD.filter(lambda pair: pair[0][0] < pair[1][0])
# One (capital_a, capital_b, distance_km) record per remaining pair.
distancesRDD = uniquePairsRDD.map(
    lambda pair: (pair[0][0], pair[1][0], calculate_distance(pair[0][1], pair[1][1]))
)
# top() returns the largest record by distance without sorting the whole RDD;
# the result is a list holding at most one record.
farthest_combination = distancesRDD.top(1, key=lambda record: record[2])
if not farthest_combination:
    # Fewer than two distinct capitals were loaded, so there is no pair to report.
    raise ValueError("Need at least two distinct capitals to find the farthest pair")

print("The capital cities farthest from each other in the world: ", farthest_combination[0][0], " and ", farthest_combination[0][1], ". Distance (km):", farthest_combination[0][2])


The capital cities farthest from each other in the world:  Nigeria, Abuja  and  Tokelau, Nukunonu . Distance (km): 19957.32
