In [1]:
from pyspark.sql import SparkSession

In [27]:
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql import Column as cols

In [3]:
spark = SparkSession.builder.master("local[4]").appName("FlagAnalysis").getOrCreate()

25/07/06 04:31:02 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/07/06 04:31:03 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
25/07/06 04:31:03 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.


In [14]:
country_flag_sch = StructType([\
    StructField("name", StringType(), True),\
    StructField("landmass", IntegerType(), True),\
    StructField("zone", IntegerType(), True),\
    StructField("area", IntegerType(), True),\
    StructField("population", IntegerType(), True),\
    StructField("language", IntegerType(), True),\
    StructField("religion", IntegerType(), True),\
    StructField("bars", IntegerType(), True),\
    StructField("stripes", IntegerType(), True),\
    StructField("colours", IntegerType(), True),\
    StructField("red", IntegerType(), True),\
    StructField("green", IntegerType(), True),\
    StructField("blue", IntegerType(), True),\
    StructField("gold", IntegerType(), True),\
    StructField("white", IntegerType(), True),\
    StructField("black", IntegerType(), True),\
    StructField("orange", IntegerType(), True),\
    StructField("mainhue", StringType(), True),\
    StructField("circles", IntegerType(), True),\
    StructField("crosses", IntegerType(), True),\
    StructField("saltires", IntegerType(), True),\
    StructField("quaters", IntegerType(), True),\
    StructField("sunstars", IntegerType(), True),\
    StructField("cresent", IntegerType(), True),\
    StructField("triange", IntegerType(), True),\
    StructField("icon", IntegerType(), True),\
    StructField("animate", IntegerType(), True),\
    StructField("text", IntegerType(), True),\
    StructField("topleft", StringType(), True),\
    StructField("botright", StringType(), True)\
])

In [15]:
country = spark.read.csv("certification/country/country.csv",sep=",",schema=country_flag_sch)

In [17]:
country.show(4)

+--------------+--------+----+----+----------+--------+--------+----+-------+-------+---+-----+----+----+-----+-----+------+-------+-------+-------+--------+-------+--------+-------+-------+----+-------+----+-------+--------+
|          name|landmass|zone|area|population|language|religion|bars|stripes|colours|red|green|blue|gold|white|black|orange|mainhue|circles|crosses|saltires|quaters|sunstars|cresent|triange|icon|animate|text|topleft|botright|
+--------------+--------+----+----+----------+--------+--------+----+-------+-------+---+-----+----+----+-----+-----+------+-------+-------+-------+--------+-------+--------+-------+-------+----+-------+----+-------+--------+
|   Afghanistan|       5|   1| 648|        16|      10|       2|   0|      3|      5|  1|    1|   0|   1|    1|    1|     0|  green|      0|      0|       0|      0|       1|      0|      0|   1|      0|   0|  black|   green|
|       Albania|       3|   1|  29|         3|       6|       6|   0|      0|      3|  1|    0| 

### A. Count number of countries based on landmass

In [22]:
country_count_df = country.groupBy("landmass").agg(count("landmass").alias("Country_Count")).orderBy(col("landmass"))

In [34]:
continents = ["N. America", "S. America", "Europe", "Africa", "Asia", "Oceania"]

In [45]:
continent_df = spark.createDataFrame([(x,y) for x,y in zip(range(1,7),continents)], ["landmass", "continents"])

In [46]:
continent_df.show()

+--------+----------+
|landmass|continents|
+--------+----------+
|       1|N. America|
|       2|S. America|
|       3|    Europe|
|       4|    Africa|
|       5|      Asia|
|       6|   Oceania|
+--------+----------+



                                                                                

In [48]:
continent_df.join(country_count_df, on="landmass", how="inner").select(["continents", "Country_Count"]).show()

                                                                                

+----------+-------------+
|continents|Country_Count|
+----------+-------------+
|N. America|           31|
|S. America|           17|
|    Europe|           35|
|    Africa|           52|
|      Asia|           39|
|   Oceania|           20|
+----------+-------------+



### B. Find out top 5 country with Sum of bars and strips in a flag

In [53]:
country.select(["name", "bars", "stripes"]).withColumn("bars+stripes", country.bars+country.stripes).orderBy(col("bars+stripes").desc()).show(5)

+--------+----+-------+------------+
|    name|bars|stripes|bars+stripes|
+--------+----+-------+------------+
|Malaysia|   0|     14|          14|
|     USA|   0|     13|          13|
| Liberia|   0|     11|          11|
| Uruguay|   0|      9|           9|
|  Greece|   0|      9|           9|
+--------+----+-------+------------+
only showing top 5 rows



### C. Count of countries with icon.

In [56]:
country.select(["name", "icon"]).where(country.icon==1).count()

49

### D. Count of countries which have same top left and bottom right color in flag.

In [60]:
country.select(["name", "topleft", "botright"]).where(country.topleft==country.botright).count()

76

### E. Count number of countries based on zone.

In [79]:
zone_count = country.groupBy("zone").agg(count("zone").alias("Country_Count"))

In [80]:
zones = ["NE", "SE", "SW", "NW"]

In [81]:
zone_name = spark.createDataFrame([(x,y) for x,y in zip(range(1,5),zones)],["zone", "zone_name"])

In [84]:
zone_count.join(zone_name, on="zone", how="inner").select(["zone_name", "Country_Count"]).orderBy(col("Country_Count").desc()).show()

+---------+-------------+
|zone_name|Country_Count|
+---------+-------------+
|       NE|           91|
|       NW|           58|
|       SE|           29|
|       SW|           16|
+---------+-------------+



                                                                                

### F. Find out largest county in terms of area in NE zone.

In [92]:
country.where(country.zone==1).orderBy(col("area").desc()).select(["name", "area"]).show(1)

+----+-----+
|name| area|
+----+-----+
|USSR|22402|
+----+-----+
only showing top 1 row



### G. Find out least populated country in S.America landmass

In [96]:
country.where(country.landmass==2).select(["name", "population"]).orderBy("population").show(1)

+-------------+----------+
|         name|population|
+-------------+----------+
|French-Guiana|         0|
+-------------+----------+
only showing top 1 row



French-Guiana has less than million populations

### H. Find out largest speaking language among all countries.

In [102]:
language_count = country.groupBy("language").agg(count("language").alias("Country_Count")).orderBy(col("Country_Count").desc())

In [98]:
languages = ["English", "Spanish", "French", "German", "Slavic", "Other Indo-European", "Chinese", "Arabic", "Japanese/Turkish/Finnish/Magyar", "Others"]

In [99]:
language_df = spark.createDataFrame([(x, y) for x, y in zip(range(1,11), languages)],["language", "language_name"])

In [103]:
language_df.join(language_count, on="language", how="inner").select(["language_name", "Country_Count"]).orderBy(col("Country_Count").desc()).show()

                                                                                

+--------------------+-------------+
|       language_name|Country_Count|
+--------------------+-------------+
|              Others|           46|
|             English|           43|
| Other Indo-European|           30|
|             Spanish|           21|
|              Arabic|           19|
|              French|           17|
|              German|            6|
|             Chinese|            4|
|              Slavic|            4|
|Japanese/Turkish/...|            4|
+--------------------+-------------+



### I. Find most common colour among flags from all countries

In [118]:
color_sum = country.select(sum(country.blue).alias("blue"), sum(country.red).alias("red"), sum(country.green).alias("green"), sum(country.gold).alias("gold"), sum(country.orange).alias("orange"), sum(country.black).alias("black"), sum(country.white).alias("white"))

In [120]:
color_df = color_sum.toPandas().transpose()

In [130]:
color_df1 = color_df.reset_index()

In [131]:
color_df1.columns = ["Colour", "Count"]

In [133]:
color_df1.sort_values(by="Count", ascending=False)

Unnamed: 0,Colour,Count
1,red,153
6,white,146
0,blue,99
2,green,91
3,gold,91
5,black,52
4,orange,26


### J. Sum of all circles present in all country flags

In [140]:
country.select(sum(country.circles)).show()

+------------+
|sum(circles)|
+------------+
|          33|
+------------+



### K. Count of countries which have both icon and text in flag.

In [142]:
country.where((country.icon==1)&(country.text==1)).count()

13