## XML World Fact
Data source: https://gitlab.com/opstar/share20/-/raw/master/worldfact.xml

See [Reading XML data in Spark](https://gitlab.com/opstar/share20/-/raw/master/Reading_XML_data_in_Spark.pdf) if you need help installing the XML library.


In [None]:
from pyspark.sql.functions import explode, regexp_replace, trim


In [None]:
# Read the same XML file twice with different row tags, so that
# `wcountry` is built from the <country> elements and `wcity` from the
# <city> elements.
wcountry = (
    spark.read.format('xml')
    .options(rowTag='country')
    .load('/FileStore/sample/worldfact.xml')
)

wcity = (
    spark.read.format('xml')
    .options(rowTag='city')
    .load('/FileStore/sample/worldfact.xml')
)


In [None]:
# Keep only the id, name and population of each country, renaming the
# XML attributes and casting the population to an integer.
country = wcountry.selectExpr(
    '_id AS country',
    '_name AS countryname',
    'int(_population) AS countrypop',
)

country.show()

+-------+--------------------+----------+
|country|         countryname|countrypop|
+-------+--------------------+----------+
| f0_136|             Albania|   3249136|
| f0_144|             Andorra|     72766|
| f0_149|             Austria|   8023244|
| f0_157|             Belarus|  10415973|
| f0_162|             Belgium|  10170241|
| f0_169|Bosnia and Herzeg...|   2656240|
| f0_174|            Bulgaria|   8612757|
| f0_179|             Croatia|   5004112|
| f0_184|      Czech Republic|  10321120|
| f0_193|             Denmark|   5249632|
| f0_198|             Estonia|   1459428|
| f0_203|       Faroe Islands|     43857|
| f0_208|             Finland|   5105230|
| f0_213|              France|  58317448|
| f0_220|             Germany|  83536112|
| f0_227|           Gibraltar|     28765|
| f0_232|              Greece|  10538594|
| f0_239|            Guernsey|     62920|
| f0_244|            Holy See|       840|
| f0_251|             Hungary|  10002541|
+-------+--------------------+----

In [None]:
# `name` and `population._VALUE` are indexed with [0], i.e. only the first
# entry of each is kept per city. Newlines are stripped from the name and the
# result is trimmed before the final projection.
city = (
    wcity
    .withColumn('citypop', wcity.population._VALUE[0])
    .withColumn('cityname', trim(regexp_replace(wcity.name[0], '[\n]+', '')))
    .selectExpr(
        '_country AS country',
        'cityname',
        'int(citypop) AS citypop',
    )
)

city.show()

+-------+----------------+-------+
|country|        cityname|citypop|
+-------+----------------+-------+
| f0_136|          Tirane| 192000|
| f0_136|         Shkoder|  62000|
| f0_136|          Durres|  60000|
| f0_136|           Vlore|  56000|
| f0_136|         Elbasan|  53000|
| f0_136|           Korce|  52000|
| f0_144|Andorra la Vella|  15600|
| f0_149|      Eisenstadt|  10102|
| f0_149|      Klagenfurt|  87321|
| f0_149|         Bregenz|   NULL|
| f0_149|          Vienna|1583000|
| f0_149|            Linz| 203000|
| f0_149|       Innsbruck| 118000|
| f0_149|            Graz| 238000|
| f0_149|        Salzburg| 144000|
| f0_149|      St. Polten|  51102|
| f0_157|           Minsk|1540000|
| f0_162|         Antwerp| 459072|
| f0_162|        Brussels| 951580|
| f0_162|           Ghent| 227483|
+-------+----------------+-------+
only showing top 20 rows



In [None]:
# Attach each city to its country (joined on the shared `country` id) and
# compute the city's share of the national population as a whole-number
# percentage.
allcities = (
    country
    .join(city, 'country')
    .selectExpr(
        'countryname',
        'countrypop',
        'cityname',
        'citypop',
        'int(citypop/countrypop*100) AS citypct',
    )
)

# Largest share first.
allcities.orderBy(allcities['citypct'].desc()).show()



+--------------------+----------+-------------+-------+-------+
|         countryname|countrypop|     cityname|citypop|citypct|
+--------------------+----------+-------------+-------+-------+
|       Liechtenstein|     31122|        Vaduz|  27714|     89|
|           Singapore|   3396924|    Singapore|2558000|     75|
| Antigua and Barbuda|     65647|  Saint Johns|  36000|     54|
|             Bahamas|    259367|       Nassau| 140000|     53|
|            Holy See|       840| Vatican City|    392|     46|
|               Palau|     16952|        Koror|   7685|     45|
|               Qatar|    547761|         Doha| 217294|     39|
|             Uruguay|   3238952|   Montevideo|1247000|     38|
|              Latvia|   2468982|         Riga| 900000|     36|
|Saint Kitts and N...|     41369|   Basseterre|  14700|     35|
|             Armenia|   3463574|      Yerevan|1200000|     34|
|            Suriname|    436418|   Paramaribo| 150000|     34|
|             Estonia|   1459428|      T

# The hard way, engineered by Kush

In [None]:
# Load the file using the <worldfact> element as the row tag and inspect the
# resulting schema.
c = (
    spark.read.format('xml')
    .options(rowTag='worldfact')
    .load('/FileStore/sample/worldfact.xml')
)

c.printSchema()

In [None]:
root
 |-- continent: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- _VALUE: string (nullable = true)
 |    |    |-- _id: string (nullable = true)
 |    |    |-- _name: string (nullable = true)
 |-- country: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- _capital: string (nullable = true)
 |    |    |-- _car_code: string (nullable = true)
 |    |    |-- _datacode: string (nullable = true)
 |    |    |-- _gdp_agri: double (nullable = true)
 |    |    |-- _gdp_ind: double (nullable = true)
 |    |    |-- _gdp_serv: double (nullable = true)
 |    |    |-- _gdp_total: double (nullable = true)
 |    |    |-- _government: string (nullable = true)
 |    |    |-- _id: string (nullable = true)
 |    |    |-- _indep_date: string (nullable = true)
 |    |    |-- _infant_mortality: double (nullable = true)
 |    |    |-- _inflation: double (nullable = true)
 |    |    |-- _name: string (nullable = true)
 |    |    |-- _population: string (nullable = true)
 |    |    |-- _population_growth: double (nullable = true)
 |    |    |-- _total_area: double (nullable = true)
 |    |    |-- border: array (nullable = true)
...
 |    |    |    |    |-- _VALUE: string (nullable = true)
 |    |    |    |    |-- _country: string (nullable = true)
 |    |    |    |    |-- _province: string (nullable = true)

In [None]:
# Turn the `country` array into one row per country, then promote the fields
# of each country struct to top-level columns.
c2 = (
    c
    .select(explode('country').alias('country_detail'))
    .select('country_detail.*')
)
c2.printSchema()

In [None]:
root
 |-- _capital: string (nullable = true)
 |-- _car_code: string (nullable = true)
 |-- _datacode: string (nullable = true)
 |-- _gdp_agri: double (nullable = true)
 |-- _gdp_ind: double (nullable = true)
 |-- _gdp_serv: double (nullable = true)
 |-- _gdp_total: double (nullable = true)
 |-- _government: string (nullable = true)
 |-- _id: string (nullable = true)
 |-- _indep_date: string (nullable = true)
 |-- _infant_mortality: double (nullable = true)
 |-- _inflation: double (nullable = true)
 |-- _name: string (nullable = true)
 |-- _population: string (nullable = true)
 |-- _population_growth: double (nullable = true)
 |-- _total_area: double (nullable = true)
 |-- border: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- _VALUE: string (nullable = true)
 |    |    |-- _country: string (nullable = true)
 |    |    |-- _length: double (nullable = true)
 |-- city: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- _country: string (nullable = true)
...
 |    |-- element: struct (containsNull = true)
 |    |    |-- _VALUE: string (nullable = true)
 |    |    |-- _percentage: double (nullable = true)

In [None]:
# Project the country-level attributes from the exploded frame.
# `_population` is a string in the schema printed above, so it is cast to an
# integer here, the same way the earlier `country` DataFrame casts it.
# Left as a string, sorting would be lexicographic and any population
# arithmetic would rely on implicit casting.
country_df = c2.selectExpr(
    '_name AS country_name',
    'int(_population) AS country_pop',
    '_id',
)
country_df.show()

In [None]:
This produces the expected country-level output, which can then be joined with a corresponding city-level output. **TODO:** the city extraction and the join have not been implemented yet.