In [None]:
from pyspark import SparkContext
from pyspark.sql import Row
from pyspark.sql import SparkSession
from pyspark.sql import Row
from pyspark.sql.functions import col
from pyspark.sql.functions import round,avg

# Build (or reuse) the local Spark session for this notebook.
# getOrCreate() keeps the cell idempotent: constructing SparkContext(...) directly
# raises ValueError when the cell is re-run while a context is still active.
spark = (
    SparkSession.builder
    .master("local")
    .appName("Revenue Data Analysis")
    .getOrCreate()
)
# Expose the underlying SparkContext under the name the later cells use.
sc = spark.sparkContext

In [27]:
def _parse_store_file(path_and_content):
    """Turn one (path, content) pair from wholeTextFiles into revenue records.

    The file name without its extension is the store id (e.g. "paris_2"); the
    part before the first underscore is the city. Each data line is expected to
    hold "<MONTH> <REVENUE>" separated by whitespace.

    Yields:
        (city, store_id, month, revenue) tuples, with revenue still a string.
    """
    path, content = path_and_content
    store_id = path.split("/")[-1].split(".")[0]
    city = store_id.split("_")[0]
    for line in content.split("\n"):
        # split() with no argument also strips the "\r" left by Windows line endings.
        fields = line.split()
        if len(fields) < 2:
            # Blank or malformed line (e.g. after a trailing newline): skip it
            # instead of failing the whole job with an IndexError.
            continue
        yield (city, store_id, fields[0], fields[1])


# One record per (store, month) line found in the files under "input".
rdd = sc.wholeTextFiles("input").flatMap(_parse_store_file)
df = rdd.map(lambda rec: Row(city=rec[0], id=rec[1], month=rec[2], revenue=rec[3])).toDF()
# Revenue is parsed as text; cast it so the numeric aggregations below work.
df = df.withColumn("revenue", col("revenue").cast("int"))

In [28]:
# Sanity check of the parsed data: column types first, then the first 20 rows.
df.printSchema()
df.show(n=20, truncate=True)

root
 |-- city: string (nullable = true)
 |-- id: string (nullable = true)
 |-- month: string (nullable = true)
 |-- revenue: integer (nullable = true)

+----------+------------+-----+-------+
|      city|          id|month|revenue|
+----------+------------+-----+-------+
|    troyes|      troyes|  JAN|     21|
|    troyes|      troyes|  FEB|     21|
|    troyes|      troyes|  MAR|     11|
|    troyes|      troyes|  APR|     17|
|    troyes|      troyes|  MAY|     15|
|    troyes|      troyes|  JUN|     25|
|    troyes|      troyes|  JUL|     11|
|    troyes|      troyes|  AUG|     22|
|    troyes|      troyes|  SEP|     21|
|    troyes|      troyes|  OCT|     28|
|    troyes|      troyes|  NOV|     11|
|    troyes|      troyes|  DEC|     11|
|marseilles|marseilles_2|  JAN|     11|
|marseilles|marseilles_2|  FEB|     11|
|marseilles|marseilles_2|  MAR|     11|
|marseilles|marseilles_2|  APR|     17|
|marseilles|marseilles_2|  MAY|     12|
|marseilles|marseilles_2|  JUN|     25|
|marsei

- Average monthly revenue per store across all of France

In [29]:
# Mean of the revenue column over every (store, month) row.
df.select(avg("revenue")).show()

+-----------------+
|     avg(revenue)|
+-----------------+
|23.19871794871795|
+-----------------+



- Average monthly revenue per store in each city

In [49]:
# Mean revenue per city, rounded to two decimals for display.
# Note: `round` here is pyspark.sql.functions.round, not the Python builtin.
(
    df.groupBy("city")
    .agg(round(avg(col("revenue")), 2).alias("avg_revenue"))
    .show()
)

+----------+-----------+
|      city|avg_revenue|
+----------+-----------+
|    nantes|      17.25|
|    troyes|      17.83|
|     paris|      43.56|
|      lyon|      16.08|
|     anger|      13.83|
|marseilles|      21.46|
|      nice|      16.92|
|    orlean|      16.33|
|    rennes|       15.0|
|  toulouse|      14.75|
+----------+-----------+



- Total revenue per city per year

In [31]:
# Yearly revenue total per city, highest first.
(
    df.groupBy("city")
    .sum("revenue")
    .orderBy(col("sum(revenue)").desc())
    .show()
)

+----------+------------+
|      city|sum(revenue)|
+----------+------------+
|     paris|        1568|
|marseilles|         515|
|    troyes|         214|
|    nantes|         207|
|      nice|         203|
|    orlean|         196|
|      lyon|         193|
|    rennes|         180|
|  toulouse|         177|
|     anger|         166|
+----------+------------+



- Total revenue per store per year

In [32]:
# Yearly revenue total per store id, highest first.
(
    df.groupBy("id")
    .sum("revenue")
    .orderBy(col("sum(revenue)").desc())
    .show()
)

+------------+------------+
|          id|sum(revenue)|
+------------+------------+
|     paris_2|         642|
|     paris_1|         596|
|     paris_3|         330|
|marseilles_1|         284|
|marseilles_2|         231|
|      troyes|         214|
|      nantes|         207|
|        nice|         203|
|      orlean|         196|
|        lyon|         193|
|      rennes|         180|
|    toulouse|         177|
|       anger|         166|
+------------+------------+



- The store that achieves the best performance in each month

In [118]:
# Highest single-store revenue observed in each month.
max_revenue_by_months = df.groupBy("month").agg({"revenue": "max"})

# Keep the store rows whose revenue equals that monthly maximum.
# The join is expressed by column NAME rather than df["month"] == agg["month"]:
# both frames derive from `df`, so those two column objects point at the same
# attribute and the comparison only works through Spark's self-join
# auto-resolution. A name-based equi-join has no such ambiguity.
best_store_df = (
    df.join(
        max_revenue_by_months.withColumnRenamed("max(revenue)", "revenue"),
        on=["month", "revenue"],
        how="inner",
    )
    .withColumnRenamed("revenue", "max(revenue)")
    # Same column set and order the previous version of this frame exposed.
    .select("city", "id", "month", "max(revenue)")
)
# Stores tied for a month's maximum each appear as their own row.
best_store_df.select('month','id','max(revenue)').orderBy("max(revenue)",ascending=False).show()

+-----+-------+------------+
|month|     id|max(revenue)|
+-----+-------+------------+
|  JUN|paris_2|          85|
|  MAY|paris_2|          72|
|  DEC|paris_1|          71|
|  OCT|paris_1|          68|
|  NOV|paris_2|          64|
|  SEP|paris_2|          63|
|  JUL|paris_1|          61|
|  APR|paris_1|          57|
|  JAN|paris_1|          51|
|  AUG|paris_2|          45|
|  MAR|paris_2|          44|
|  FEB|paris_2|          42|
+-----+-------+------------+

