In [1]:
import pyspark.sql.types as T
from pyspark.sql.functions import when, col, lit, regexp_extract
from pyspark.sql import SparkSession

In [2]:
people_schema = T.StructType([ \
    T.StructField("rank", T.IntegerType(), True), \
    T.StructField("name", T.StringType(), True), \
    T.StructField("net_worth", T.StringType(), True), \
    T.StructField("bday", T.StringType(), True), \
    T.StructField("age", T.IntegerType(), True), \
    T.StructField("nationality", T.StringType(), True)
])

In [5]:
country_schema = T.StructType([ \
    T.StructField("country", T.StringType(), True), \
    T.StructField("num_billionares", T.StringType(), True), \
    T.StructField("billionaire_per_million", T.StringType(), True)
])

In [6]:
spark = SparkSession\
            .builder\
            .appName('spark_basic')\
            .getOrCreate()
            


In [16]:
df_people = spark.read.option('header', 'true').schema(people_schema).csv('data/top_100_richest.csv')
df_people.show(df_people.count(), truncate=False)

+----+----------------------------------+-------------+---------+----+------------------------+
|rank|name                              |net_worth    |bday     |age |nationality             |
+----+----------------------------------+-------------+---------+----+------------------------+
|1   |Elon Musk                         |$240 Billion |28-Jun-71|51  |South Africa            |
|2   |Jeff Bezos                        |$150 Billion |12-Jan-64|58  |United States of America|
|3   |Gautam Adani                      |$138 Billion |24-Jun-62|60  |India                   |
|4   |Bernard Arnault                   |$135 Billion |5-Mar-49 |73  |France                  |
|5   |Bill Gates                        |$118 Billion |28-Oct-55|66  |United States of America|
|6   |Warren Buffett                    |$101 Billion |30-Aug-30|92  |United States of America|
|7   |Larry Page                        |$100 Billion |26-Mar-73|49  |United States of America|
|8   |Sergey Brin                       

In [17]:
df_country = spark.read.option('header', 'true').schema(country_schema).csv('data/wiki_number_of_billionaires.csv')
df_country.show(df_country.count(), truncate=False)

+--------------------+---------------+-----------------------+
|country             |num_billionares|billionaire_per_million|
+--------------------+---------------+-----------------------+
|"World,""2          |668""          |0.35"                  |
|United States       |735            |1.853                  |
|Mainland China      |539            |0.276                  |
|India               |166            |0.101                  |
|Germany             |134            |1.553                  |
|Russia              |83             |0.677                  |
|Hong Kong           |67             |8.830                  |
|Canada              |64             |1.147                  |
|Brazil              |62             |0.211                  |
|Italy               |52             |0.607                  |
|Taiwan              |51             |1.529                  |
|United Kingdom      |49             |0.674                  |
|Australia           |46             |1.203            

In [15]:
df_people_filtered = df_people.withColumn('net_worth', regexp_extract(col('net_worth'), '^\$(\\d+).*$', 1).cast('int')).filter(col('net_worth') > 60)
df_people_filtered.show(df_people_filtered.count(), truncate=False)

+----+----------------------------+---------+---------+----+------------------------+
|rank|name                        |net_worth|bday     |age |nationality             |
+----+----------------------------+---------+---------+----+------------------------+
|1   |Elon Musk                   |240      |28-Jun-71|51  |South Africa            |
|2   |Jeff Bezos                  |150      |12-Jan-64|58  |United States of America|
|3   |Gautam Adani                |138      |24-Jun-62|60  |India                   |
|4   |Bernard Arnault             |135      |5-Mar-49 |73  |France                  |
|5   |Bill Gates                  |118      |28-Oct-55|66  |United States of America|
|6   |Warren Buffett              |101      |30-Aug-30|92  |United States of America|
|7   |Larry Page                  |100      |26-Mar-73|49  |United States of America|
|8   |Sergey Brin                 |96       |21-Aug-73|48  |United States of America|
|9   |Steve Ballmer               |94       |24-Mar-56

In [20]:
df_people_new = df_people_filtered.withColumn('nationality', when(col('nationality') == 'United States of America',  'United States').\
                                                                            when(col('nationality') == 'French',  'France').\
                                                                            when(col('nationality') == 'England', 'United Kingdom').\
                                                                            otherwise(col('nationality')))
df_people_new.show(df_people_new.count(), truncate=False)

+----+----------------------------+---------+---------+----+-------------+
|rank|name                        |net_worth|bday     |age |nationality  |
+----+----------------------------+---------+---------+----+-------------+
|1   |Elon Musk                   |240      |28-Jun-71|51  |South Africa |
|2   |Jeff Bezos                  |150      |12-Jan-64|58  |United States|
|3   |Gautam Adani                |138      |24-Jun-62|60  |India        |
|4   |Bernard Arnault             |135      |5-Mar-49 |73  |France       |
|5   |Bill Gates                  |118      |28-Oct-55|66  |United States|
|6   |Warren Buffett              |101      |30-Aug-30|92  |United States|
|7   |Larry Page                  |100      |26-Mar-73|49  |United States|
|8   |Sergey Brin                 |96       |21-Aug-73|48  |United States|
|9   |Steve Ballmer               |94       |24-Mar-56|66  |United States|
|10  |Larry Ellison               |93       |17-Aug-44|78  |United States|
|11  |Alain & Gerard Wert

In [21]:
df_prelast = df_people_new.join(df_country, df_people_new['nationality'] == df_country['country'], 'inner')
df_prelast.show(df_prelast.count(), truncate=False)

+----+----------------------------+---------+---------+----+-------------+-------------+---------------+-----------------------+
|rank|name                        |net_worth|bday     |age |nationality  |country      |num_billionares|billionaire_per_million|
+----+----------------------------+---------+---------+----+-------------+-------------+---------------+-----------------------+
|1   |Elon Musk                   |240      |28-Jun-71|51  |South Africa |South Africa |4              |0.067                  |
|2   |Jeff Bezos                  |150      |12-Jan-64|58  |United States|United States|735            |1.853                  |
|3   |Gautam Adani                |138      |24-Jun-62|60  |India        |India        |166            |0.101                  |
|4   |Bernard Arnault             |135      |5-Mar-49 |73  |France       |France       |43             |0.579                  |
|5   |Bill Gates                  |118      |28-Oct-55|66  |United States|United States|735      

In [22]:
df_last = df_prelast.filter(col('age').isNotNull()).select(col('rank'), col('name'), col('net_worth'), col('bday'), col('age'), col('nationality'))
df_last.show(df_last.count(), truncate=False)

+----+----------------------------+---------+---------+---+-------------+
|rank|name                        |net_worth|bday     |age|nationality  |
+----+----------------------------+---------+---------+---+-------------+
|1   |Elon Musk                   |240      |28-Jun-71|51 |South Africa |
|2   |Jeff Bezos                  |150      |12-Jan-64|58 |United States|
|3   |Gautam Adani                |138      |24-Jun-62|60 |India        |
|4   |Bernard Arnault             |135      |5-Mar-49 |73 |France       |
|5   |Bill Gates                  |118      |28-Oct-55|66 |United States|
|6   |Warren Buffett              |101      |30-Aug-30|92 |United States|
|7   |Larry Page                  |100      |26-Mar-73|49 |United States|
|8   |Sergey Brin                 |96       |21-Aug-73|48 |United States|
|9   |Steve Ballmer               |94       |24-Mar-56|66 |United States|
|10  |Larry Ellison               |93       |17-Aug-44|78 |United States|
|12  |Mukesh Ambani               |89 

In [23]:
df_parquet = spark.read.schema(people_schema).parquet('data_parquet')

In [25]:
df_parquet.show(df_parquet.count(), truncate=False)

+----+----------------------------------+-------------+---------+----+------------------------+
|rank|name                              |net_worth    |bday     |age |nationality             |
+----+----------------------------------+-------------+---------+----+------------------------+
|2   |Jeff Bezos                        |$150 Billion |12-Jan-64|58  |United States of America|
|5   |Bill Gates                        |$118 Billion |28-Oct-55|66  |United States of America|
|6   |Warren Buffett                    |$101 Billion |30-Aug-30|92  |United States of America|
|7   |Larry Page                        |$100 Billion |26-Mar-73|49  |United States of America|
|8   |Sergey Brin                       |$96 Billion  |21-Aug-73|48  |United States of America|
|9   |Steve Ballmer                     |$94 Billion  |24-Mar-56|66  |United States of America|
|10  |Larry Ellison                     |$93 Billion  |17-Aug-44|78  |United States of America|
|17  |Michael Bloomberg                 

In [26]:
df_parquet_filtered = df_parquet.filter(col('nationality') == 'Russia')
df_parquet_filtered.show(df_parquet_filtered.count(), truncate=False)

+----+------------------+-----------+---------+---+-----------+
|rank|name              |net_worth  |bday     |age|nationality|
+----+------------------+-----------+---------+---+-----------+
|14  |Vladimir Putin    |$70 Billion|7-Oct-52 |69 |Russia     |
|36  |Vladimir Potanin  |$35 Billion|3-Jan-61 |61 |Russia     |
|72  |Andrey Melnichenko|$21 Billion|8-Mar-72 |50 |Russia     |
|75  |Alexey Mordashov  |$20 Billion|26-Sep-65|56 |Russia     |
|80  |Leonid Mikhelson  |$20 Billion|11-Aug-55|67 |Russia     |
|84  |Vladimir Lisin    |$19 Billion|7-May-56 |66 |Russia     |
|90  |Alisher Usmanov   |$18 Billion|9-Sep-53 |68 |Russia     |
|93  |Pavel Durov       |$17 Billion|10-Oct-84|37 |Russia     |
|98  |Viktor Vekselberg |$17 Billion|14-Apr-57|65 |Russia     |
+----+------------------+-----------+---------+---+-----------+



In [27]:
df_last = df_last.union(df_parquet_filtered)
df_last.show(df_last.count(), truncate=False)

+----+----------------------------+-----------+---------+---+-------------+
|rank|name                        |net_worth  |bday     |age|nationality  |
+----+----------------------------+-----------+---------+---+-------------+
|1   |Elon Musk                   |240        |28-Jun-71|51 |South Africa |
|2   |Jeff Bezos                  |150        |12-Jan-64|58 |United States|
|3   |Gautam Adani                |138        |24-Jun-62|60 |India        |
|4   |Bernard Arnault             |135        |5-Mar-49 |73 |France       |
|5   |Bill Gates                  |118        |28-Oct-55|66 |United States|
|6   |Warren Buffett              |101        |30-Aug-30|92 |United States|
|7   |Larry Page                  |100        |26-Mar-73|49 |United States|
|8   |Sergey Brin                 |96         |21-Aug-73|48 |United States|
|9   |Steve Ballmer               |94         |24-Mar-56|66 |United States|
|10  |Larry Ellison               |93         |17-Aug-44|78 |United States|
|12  |Mukesh

In [30]:
df_last.write.option("header", 'true').mode('overwrite').csv("1.csv")


In [31]:
plans = df_last._jdf.queryExecution().toString()

In [33]:
plans

"== Parsed Logical Plan ==\n'Union false, false\n:- Project [rank#44, name#45, net_worth#195, bday#47, age#48, nationality#381]\n:  +- Filter isnotnull(age#48)\n:     +- Join Inner, (nationality#381 = country#300)\n:        :- Project [rank#44, name#45, net_worth#195, bday#47, age#48, CASE WHEN (nationality#49 = United States of America) THEN United States WHEN (nationality#49 = French) THEN France WHEN (nationality#49 = England) THEN United Kingdom ELSE nationality#49 END AS nationality#381]\n:        :  +- Filter (net_worth#195 > 60)\n:        :     +- Project [rank#44, name#45, cast(regexp_extract(net_worth#46, ^\\$(\\d+).*$, 1) as int) AS net_worth#195, bday#47, age#48, nationality#49]\n:        :        +- Relation [rank#44,name#45,net_worth#46,bday#47,age#48,nationality#49] csv\n:        +- Relation [country#300,num_billionares#301,billionaire_per_million#302] csv\n+- Filter (nationality#570 = Russia)\n   +- Relation [rank#565,name#566,net_worth#567,bday#568,age#569,nationality#5

In [34]:
plans_1 = df_last._sc._jvm.PythonSQLUtils.explainString(df_last._jdf.queryExecution(), 'EXTENDED')


In [35]:
plans_1

"== Parsed Logical Plan ==\n'Union false, false\n:- Project [rank#44, name#45, net_worth#195, bday#47, age#48, nationality#381]\n:  +- Filter isnotnull(age#48)\n:     +- Join Inner, (nationality#381 = country#300)\n:        :- Project [rank#44, name#45, net_worth#195, bday#47, age#48, CASE WHEN (nationality#49 = United States of America) THEN United States WHEN (nationality#49 = French) THEN France WHEN (nationality#49 = England) THEN United Kingdom ELSE nationality#49 END AS nationality#381]\n:        :  +- Filter (net_worth#195 > 60)\n:        :     +- Project [rank#44, name#45, cast(regexp_extract(net_worth#46, ^\\$(\\d+).*$, 1) as int) AS net_worth#195, bday#47, age#48, nationality#49]\n:        :        +- Relation [rank#44,name#45,net_worth#46,bday#47,age#48,nationality#49] csv\n:        +- Relation [country#300,num_billionares#301,billionaire_per_million#302] csv\n+- Filter (nationality#570 = Russia)\n   +- Relation [rank#565,name#566,net_worth#567,bday#568,age#569,nationality#5

In [39]:
with open('plans.txt', 'w') as file:
    file.write(plans)

In [38]:
with open('plans1.txt', 'w') as file:
    file.write(plans_1)