In [1]:
pip install pyspark==3.4.1

Note: you may need to restart the kernel to use updated packages.


In [1]:
pip install pandas




In [2]:
pip install findspark

Note: you may need to restart the kernel to use updated packages.


In [4]:
import findspark
findspark.init()

# Example: build a small DataFrame and display it.
# findspark.init() is called before pyspark is imported; keep that order.
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName('HandsOnPertemuan6')
    .getOrCreate()
)

# Column names first, then one tuple per employee in the same column order.
columns = ['EmployeeName', 'Department', 'Salary']
data = [
    ('James', 'Sales', 3000),
    ('Michael', 'Sales', 4600),
    ('Robert', 'Sales', 4100),
    ('Maria', 'Finance', 3000),
]

df = spark.createDataFrame(data, columns)
df.show()

+------------+----------+------+
|EmployeeName|Department|Salary|
+------------+----------+------+
|       James|     Sales|  3000|
|     Michael|     Sales|  4600|
|      Robert|     Sales|  4100|
|       Maria|   Finance|  3000|
+------------+----------+------+



In [6]:
# Example DataFrame transformations: column selection, row filtering,
# a per-department average, and a multi-aggregate summary.
#
# Spark's aggregate functions are accessed through the `F` namespace instead of
# being imported by name: `from pyspark.sql.functions import max, sum, min`
# would replace Python's built-in max/sum/min for every cell that runs later.
from pyspark.sql import functions as F

print ("Data Tabel Pegawai dan Salary")
df_select = df.select('EmployeeName', 'Salary')
df_select.show()

# Keep only the rows whose salary is greater than 3000.
print("Data Pegawai dengan Salary lebih dari 3000")
df_filter = df.filter(df['Salary'] > 3000)
df_filter.show()

# Average salary for each department.
print("Rata-rata Salary Untuk Tiap Departement")
df_groupBy = df.groupBy('Department').avg('Salary')
df_groupBy.show()

# Per-department summary: mean, highest, lowest and total salary.
# NOTE(review): the alias "Salarry Tertinggi" is misspelt ("Salary"); it is left
# unchanged here so the column label stays the same as in existing output.
print("Ringkasan Data")
ringkasan_df = df.groupBy('Department').agg(
    F.mean("Salary").alias("Rata-rata salary"),
    F.max("Salary").alias("Salarry Tertinggi"),
    F.min("Salary").alias("Salary Terendah"),
    F.sum("Salary").alias("Total Salary")
)
ringkasan_df.show()

Data Tabel Pegawai dan Salary
+------------+------+
|EmployeeName|Salary|
+------------+------+
|       James|  3000|
|     Michael|  4600|
|      Robert|  4100|
|       Maria|  3000|
+------------+------+

Data Pegawai dengan Salary lebih dari 3000
+------------+----------+------+
|EmployeeName|Department|Salary|
+------------+----------+------+
|     Michael|     Sales|  4600|
|      Robert|     Sales|  4100|
+------------+----------+------+

Rata-rata Salary Untuk Tiap Departement
+----------+-----------+
|Department|avg(Salary)|
+----------+-----------+
|     Sales|     3900.0|
|   Finance|     3000.0|
+----------+-----------+

Ringkasan Data
+----------+----------------+-----------------+---------------+------------+
|Department|Rata-rata salary|Salarry Tertinggi|Salary Terendah|Total Salary|
+----------+----------------+-----------------+---------------+------------+
|     Sales|          3900.0|             4600|           3000|       11700|
|   Finance|          3000.0|        

In [5]:
# Example: deriving new columns from existing ones.
# The bonus is 0.1 times the salary.
bonus_expr = df['Salary'] * 0.1
df = df.withColumn('SalaryBonus', bonus_expr)

# The total is built in a second step because it reads the SalaryBonus column,
# which is only present on the DataFrame returned by the first withColumn call.
total_expr = df['Salary'] + df['SalaryBonus']
df = df.withColumn('TotalCompensation', total_expr)

df.show()

+------------+----------+------+-----------+-----------------+
|EmployeeName|Department|Salary|SalaryBonus|TotalCompensation|
+------------+----------+------+-----------+-----------------+
|       James|     Sales|  3000|      300.0|           3300.0|
|     Michael|     Sales|  4600|      460.0|           5060.0|
|      Robert|     Sales|  4100|      410.0|           4510.0|
|       Maria|   Finance|  3000|      300.0|           3300.0|
+------------+----------+------+-----------+-----------------+



In [6]:
# Example: using window functions.
from pyspark.sql.window import Window
from pyspark.sql import functions as F

# One window per department, rows ordered by Salary (ascending by default).
windowSpec = Window.partitionBy('Department').orderBy('Salary')

# Rank of each row within its department window, shown as an extra column.
rank_in_department = F.rank().over(windowSpec)
df.withColumn('Rank', rank_in_department).show()

+------------+----------+------+-----------+-----------------+----+
|EmployeeName|Department|Salary|SalaryBonus|TotalCompensation|Rank|
+------------+----------+------+-----------+-----------------+----+
|       Maria|   Finance|  3000|      300.0|           3300.0|   1|
|       James|     Sales|  3000|      300.0|           3300.0|   1|
|      Robert|     Sales|  4100|      410.0|           4510.0|   2|
|     Michael|     Sales|  4600|      460.0|           5060.0|   3|
+------------+----------+------+-----------+-----------------+----+



In [2]:
import os

from pyspark.sql import SparkSession
from pyspark.sql.functions import col

# NOTE(review): getOrCreate() hands back an already-running session when one
# exists, so the "DataKaggle" app name may not take effect in a kernel that has
# already created a session - confirm if the name matters.
spark = SparkSession.builder.appName("DataKaggle").getOrCreate()

# Read the file.
# The location can be overridden with the SALARY_DATA_CSV environment variable;
# when it is not set, the original hard-coded path is used unchanged.
file_path = os.environ.get("SALARY_DATA_CSV", "/C:/Users/zhari/Downloads/Salary_Data.csv")
df = spark.read.csv(file_path, header = True, inferSchema = True)
# Display the data.
df.show()
# Inspect the schema / structure of the data.
df.printSchema()

# Number of rows in the data set.
hitung_baris = df.count()
print("Jumlah Baris dalam data : ", hitung_baris)

# Sum of Salary for each Job Title, with one output column per Gender value.
print("\nSalary berdasarkan gender : ")
df.groupBy("Job Title").pivot("Gender").sum("Salary").show()

# Salary divided by years of experience, added as a new column.
# NOTE(review): rows whose "Years of Experience" is 0 or missing have no
# meaningful quotient - confirm how those rows should be handled.
df_GajiPerTahun = df.withColumn("Gaji Pertahun", col("Salary") / col("Years of Experience"))
df_GajiPerTahun.show()

+---+------+---------------+--------------------+-------------------+------+
|Age|Gender|Education Level|           Job Title|Years of Experience|Salary|
+---+------+---------------+--------------------+-------------------+------+
| 32|  Male|     Bachelor's|   Software Engineer|                5.0| 90000|
| 28|Female|       Master's|        Data Analyst|                3.0| 65000|
| 45|  Male|            PhD|      Senior Manager|               15.0|150000|
| 36|Female|     Bachelor's|     Sales Associate|                7.0| 60000|
| 52|  Male|       Master's|            Director|               20.0|200000|
| 29|  Male|     Bachelor's|   Marketing Analyst|                2.0| 55000|
| 42|Female|       Master's|     Product Manager|               12.0|120000|
| 31|  Male|     Bachelor's|       Sales Manager|                4.0| 80000|
| 26|Female|     Bachelor's|Marketing Coordin...|                1.0| 45000|
| 38|  Male|            PhD|    Senior Scientist|               10.0|110000|