In [54]:
from email import header
from struct import Struct
from typing import List

from pyspark.sql.types import *
from pyspark.sql import SparkSession


# Session handle used by every cell below; the application is registered
# under the name "IT_comp".
spark = SparkSession.builder.appName("IT_comp").getOrCreate()





In [55]:
# Explicit column layout for ./redo.csv. Every field is declared with
# nullable=False; `skills` is loaded as one plain string at this stage.
schema = (
    StructType()
    .add("id", IntegerType(), False)
    .add("name", StringType(), False)
    .add("surname", StringType(), False)
    .add("departement", StringType(), False)
    .add("salary", IntegerType(), False)
    .add("skills", StringType(), False)
)

# The first line of the file is read as data, not as a header row.
df = spark.read.schema(schema).option("header", False).csv("./redo.csv")

# Preview the first 10 rows without truncating long cell values.
df.show(10, False)

+---+---------+---------+-----------+------+--------------------------+
|id |name     |surname  |departement|salary|skills                    |
+---+---------+---------+-----------+------+--------------------------+
|0  |Carri    |Stogginb |C          |439   |C#  Java  C/C++           |
|1  |Trethaw  |Adestic  |C          |843   |C#  Java  Python          |
|2  |Upigniona|Sleniator|B          |256   |Java                      |
|3  |Psater   |Easephif |D          |697   |Storm  R  Python          |
|4  |Tentestes|Ieutimena|D          |251   |Java  Python              |
|5  |Furron   |Asmahed  |D          |2745  |C/C++  Java  Spark  Python|
|6  |Anca     |Ardas    |A          |318   |C/C++                     |
|7  |Arym     |Atinquis |C          |2786  |Spark  Python             |
|8  |Halarstar|Cloap    |C          |538   |C#  R                     |
|9  |Molotonla|Strou    |B          |607   |C#  C/C++  Python         |
+---+---------+---------+-----------+------+--------------------

In [56]:
from pyspark.sql.functions import col, split

# Convert the double-space-delimited `skills` string into an array column so
# it can be exploded by later cells.
# The dtype guard makes this cell safe to re-run: `df` is overwritten in
# place, so on a second execution `skills` is already an array and calling
# split() on it again would fail instead of being a no-op.
if dict(df.dtypes)["skills"] == "string":
    df = df.withColumn("skills", split(col("skills"), "  "))
df.show()

+---+---------+---------+-----------+------+--------------------+
| id|     name|  surname|departement|salary|              skills|
+---+---------+---------+-----------+------+--------------------+
|  0|    Carri| Stogginb|          C|   439|   [C#, Java, C/C++]|
|  1|  Trethaw|  Adestic|          C|   843|  [C#, Java, Python]|
|  2|Upigniona|Sleniator|          B|   256|              [Java]|
|  3|   Psater| Easephif|          D|   697|  [Storm, R, Python]|
|  4|Tentestes|Ieutimena|          D|   251|      [Java, Python]|
|  5|   Furron|  Asmahed|          D|  2745|[C/C++, Java, Spa...|
|  6|     Anca|    Ardas|          A|   318|             [C/C++]|
|  7|     Arym| Atinquis|          C|  2786|     [Spark, Python]|
|  8|Halarstar|    Cloap|          C|   538|             [C#, R]|
|  9|Molotonla|    Strou|          B|   607| [C#, C/C++, Python]|
| 10| Andaltim|     Tors|          D|    98|              [Java]|
| 11|    Accep|    Pille|          A|  5400|   [C#, Java, Spark]|
| 12|   Ck

In [38]:
# NOTE(review): `groupby` from itertools is not referenced in this cell —
# looks like a stray auto-import; confirm nothing else uses it before removing.
from itertools import groupby
import pyspark.sql.functions as F


# Mean salary for each department, listed from the highest average down.
(
    df.groupBy("departement")
    .agg(F.avg("salary").alias("avg_salary"))
    .orderBy("avg_salary", ascending=False)
    .show()
)

+-----------+------------------+
|departement|        avg_salary|
+-----------+------------------+
|          A| 1959.696881091618|
|          B|1458.1549576482312|
|          C|1074.5340296495956|
|          D| 720.4279105628373|
+-----------+------------------+



In [57]:
# Mean salary for each individual skill, highest average first.
# explode() emits one row per element of `skills`, so an employee's salary
# is counted once for every skill they list.
(
    df.withColumn("skills", F.explode(col("skills")))
    .groupBy("skills")
    .agg(F.avg("salary").alias("avg_salary"))
    .sort(F.desc("avg_salary"))
    .show()
)

+----------+------------------+
|    skills|        avg_salary|
+----------+------------------+
|     Spark|  2854.00245398773|
|Javascript| 1217.235418875928|
|         R|   1202.1435546875|
|        C#|1182.9353876739563|
|       PHP|1178.2782426778242|
|    Python| 1169.735187760779|
|     C/C++|  1161.35306628855|
|      Java|1148.8350316717158|
|      Hive| 893.5743243243244|
|    Hadoop| 889.2778702163062|
|     Storm| 878.0658857979503|
|       MPI| 873.6853582554517|
+----------+------------------+



In [58]:
# Mean salary for every (departement, skill) combination, highest first.
# `skills` is exploded first so each listed skill becomes its own row.
(
    df.withColumn("skills", F.explode(col("skills")))
    .groupBy(["departement", "skills"])
    .agg(F.avg("salary").alias("avg_salary"))
    .orderBy("avg_salary", ascending=False)
    .show()
)

+-----------+----------+------------------+
|departement|    skills|        avg_salary|
+-----------+----------+------------------+
|          A|     Spark|5121.3591836734695|
|          B|     Spark| 3798.280163599182|
|          C|     Spark|2776.1414965986396|
|          A|         R|2245.5757575757575|
|          A|        C#|2197.8039215686276|
|          A|Javascript|2158.6534653465346|
|          A|     C/C++|2067.7665615141955|
|          A|      Java|  1987.60162601626|
|          A|       PHP|1984.7894736842106|
|          A|    Python|1974.9005376344087|
|          D|     Spark|1870.3688524590164|
|          B|         R|1689.3830845771145|
|          B|       PHP| 1689.313315926893|
|          A|    Hadoop|         1677.8125|
|          B|Javascript|1618.2702702702702|
|          A|     Storm|1594.0133333333333|
|          A|       MPI| 1582.295081967213|
|          A|      Hive|       1581.796875|
|          B|     C/C++|1562.5747747747748|
|          B|        C#|1524.017

In [65]:
# Highest salary for every (skill, departement) combination, largest first.
# The aggregate is left un-aliased, so the result column is "max(salary)".
(
    df.withColumn("skills", F.explode(col("skills")))
    .groupBy(["skills", "departement"])
    .agg(F.max("salary"))
    .orderBy("max(salary)", ascending=False)
    .show()
)

+----------+-----------+-----------+
|    skills|departement|max(salary)|
+----------+-----------+-----------+
|     Spark|          A|       8487|
|        C#|          A|       8487|
|       PHP|          A|       8487|
|    Python|          A|       8487|
|     C/C++|          A|       8376|
|         R|          A|       8376|
|      Java|          A|       8326|
|Javascript|          A|       8082|
|      Java|          B|       6487|
|     Spark|          B|       6487|
|       PHP|          B|       6487|
|    Python|          B|       6487|
|     C/C++|          B|       6472|
|         R|          B|       6472|
|Javascript|          B|       6367|
|        C#|          B|       6238|
|Javascript|          C|       4641|
|     Spark|          C|       4641|
|      Java|          C|       4641|
|    Python|          C|       4641|
+----------+-----------+-----------+
only showing top 20 rows



In [53]:
def _flatten_skills(line):
    """Flatten the bracketed skill list of one raw CSV line.

    The text before the first "[" is kept as-is. In the segment that follows
    it (up to the next "[", if any) every "," becomes a space and every "]"
    is removed. The two pieces are concatenated and surrounding whitespace
    is stripped.

    Raises:
        ValueError: if `line` contains no "[".
    """
    parts = line.split("[")
    if len(parts) < 2:
        raise ValueError("no '[' found in line: %r" % line)
    skills = parts[1].replace(",", " ").replace("]", "")
    return (parts[0] + skills).strip()


# NOTE: this cell writes ./redo.csv, which the spark.read.csv("./redo.csv")
# cell loads — on a fresh kernel it has to be executed before that cell.
# `with` guarantees both files are closed (and the output flushed to disk)
# even if a line fails to convert.
with open("./ITCompany_10000.csv") as src, open("./redo.csv", "w") as dst:
    for raw_line in src:
        if not raw_line.strip():
            # Blank lines (e.g. a trailing newline at end of file) carry no record.
            continue
        dst.write(_flatten_skills(raw_line) + "\n")
    


