## 1. Libraries

In [23]:
import os
import pandas as pd
import pyspark

from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, FloatType, StringType, TimestampType, DoubleType, LongType
from pyspark.sql.functions import col, from_unixtime, floor

from pyspark.conf import SparkConf
from pyspark.context import SparkContext

In [27]:
# Locations of the service-account key and the GCS connector jar.
# Both can be overridden through environment variables so the notebook is not
# tied to one machine; the fallbacks are the local paths used so far.
credentials_location = os.environ.get(
    "GOOGLE_APPLICATION_CREDENTIALS",
    "/home/xiangivyli/data-science-portfolio/part_a_job_posting_linkedin_pipeline/airflow/include/.gc/airflow-gcp-bigquery.json",
)

jars_location = os.environ.get(
    "GCS_CONNECTOR_JAR",
    "/home/xiangivyli/lib/gcs-connector-hadoop3-2.2.5.jar",
)


# First, stop anything left over from a previous run of this cell.
# SparkContext(conf=...) below refuses to start while another context is
# active, so a stale `sc` (e.g. from a run that failed before `spark` was
# created) has to be stopped as well as `spark`.
if 'spark' in globals():
    spark.stop()
if 'sc' in globals():
    sc.stop()

# Configure the connection to gcs
conf = SparkConf() \
    .setMaster('local[*]') \
    .setAppName('RepartitionParquetApp') \
    .set("spark.jars", jars_location) \
    .set("spark.hadoop.google.cloud.auth.service.account.enable", "true") \
    .set("spark.hadoop.google.cloud.auth.service.account.json.keyfile", credentials_location)

sc = SparkContext(conf=conf)

# Register the GCS connector classes for the gs:// scheme and point the
# connector at the service-account key file.
hadoop_conf = sc._jsc.hadoopConfiguration()

hadoop_conf.set("fs.AbstractFileSystem.gs.impl",  "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS")
hadoop_conf.set("fs.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem")
hadoop_conf.set("fs.gs.auth.service.account.json.keyfile", credentials_location)
hadoop_conf.set("fs.gs.auth.service.account.enable", "true")

# Create or get a Spark session on top of the context configured above
spark = SparkSession.builder \
    .config(conf=sc.getConf()) \
    .config("spark.sql.parquet.int96RebaseModeInWrite", "LEGACY") \
    .getOrCreate()

## 2. Read data and check schema

### job_postings.csv fact table

In [21]:
# Spark schema for job_postings.csv, written as (column name, type) pairs in
# file order and expanded into nullable StructFields below.
job_postings_columns = [
    ("job_id", StringType()),
    ("company_id", StringType()),
    ("title", StringType()),
    ("description", StringType()),
    ("max_salary", FloatType()),
    ("med_salary", FloatType()),
    ("min_salary", FloatType()),
    ("pay_period", StringType()),
    ("formatted_work_type", StringType()),
    ("location", StringType()),
    ("applies", IntegerType()),
    ("original_listed_time", LongType()),
    ("remote_allowed", StringType()),
    ("views", IntegerType()),
    ("job_posting_url", StringType()),
    ("application_url", StringType()),
    ("application_type", StringType()),
    ("expiry", LongType()),
    ("closed_time", LongType()),
    ("formatted_experience_level", StringType()),
    ("skills_desc", StringType()),
    ("listed_time", LongType()),
    ("posting_domain", StringType()),
    ("sponsored", IntegerType()),
    ("work_type", StringType()),
    ("currency", StringType()),
    ("compensation_type", StringType()),
    ("scraped", IntegerType()),
]

job_postings_schema = StructType(
    [StructField(field_name, field_type, True) for field_name, field_type in job_postings_columns]
)

In [28]:
# Reader for the raw job postings file: explicit schema, header row, a double
# quote as the escape character, and multiline records enabled.
posting_reader = (
    spark.read
    .schema(job_postings_schema)
    .options(header="true", escape="\"", multiline="true")
)

df_posting = posting_reader.csv("gs://de-zoomcamp-xiangivyli/final_project/raw/job_postings.csv")

24/03/30 08:39:33 WARN FileStreamSink: Assume no metadata directory. Error while looking for metadata directory in the path: gs://de-zoomcamp-xiangivyli/final_project/raw/job_postings.csv.
java.io.IOException: Error accessing gs://de-zoomcamp-xiangivyli/final_project/raw/job_postings.csv
	at com.google.cloud.hadoop.repackaged.gcs.com.google.cloud.hadoop.gcsio.GoogleCloudStorageImpl.getObject(GoogleCloudStorageImpl.java:2155)
	at com.google.cloud.hadoop.repackaged.gcs.com.google.cloud.hadoop.gcsio.GoogleCloudStorageImpl.getItemInfo(GoogleCloudStorageImpl.java:2043)
	at com.google.cloud.hadoop.repackaged.gcs.com.google.cloud.hadoop.gcsio.GoogleCloudStorageFileSystem.getFileInfoInternal(GoogleCloudStorageFileSystem.java:1091)
	at com.google.cloud.hadoop.repackaged.gcs.com.google.cloud.hadoop.gcsio.GoogleCloudStorageFileSystem.getFileInfo(GoogleCloudStorageFileSystem.java:1065)
	at com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystemBase.getFileStatus(GoogleHadoopFileSystemBase.java:955)


Py4JJavaError: An error occurred while calling o819.csv.
: java.io.IOException: Error accessing gs://de-zoomcamp-xiangivyli/final_project/raw/job_postings.csv
	at com.google.cloud.hadoop.repackaged.gcs.com.google.cloud.hadoop.gcsio.GoogleCloudStorageImpl.getObject(GoogleCloudStorageImpl.java:2155)
	at com.google.cloud.hadoop.repackaged.gcs.com.google.cloud.hadoop.gcsio.GoogleCloudStorageImpl.getItemInfo(GoogleCloudStorageImpl.java:2043)
	at com.google.cloud.hadoop.repackaged.gcs.com.google.cloud.hadoop.gcsio.GoogleCloudStorageFileSystem.getFileInfoInternal(GoogleCloudStorageFileSystem.java:1091)
	at com.google.cloud.hadoop.repackaged.gcs.com.google.cloud.hadoop.gcsio.GoogleCloudStorageFileSystem.getFileInfo(GoogleCloudStorageFileSystem.java:1065)
	at com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystemBase.getFileStatus(GoogleHadoopFileSystemBase.java:955)
	at org.apache.hadoop.fs.FileSystem.exists(FileSystem.java:1760)
	at org.apache.spark.sql.execution.datasources.DataSource$.$anonfun$checkAndGlobPathIfNecessary$4(DataSource.scala:784)
	at org.apache.spark.sql.execution.datasources.DataSource$.$anonfun$checkAndGlobPathIfNecessary$4$adapted(DataSource.scala:782)
	at org.apache.spark.util.ThreadUtils$.$anonfun$parmap$2(ThreadUtils.scala:372)
	at scala.concurrent.Future$.$anonfun$apply$1(Future.scala:659)
	at scala.util.Success.$anonfun$map$1(Try.scala:255)
	at scala.util.Success.map(Try.scala:213)
	at scala.concurrent.Future.$anonfun$map$1(Future.scala:292)
	at scala.concurrent.impl.Promise.liftedTree1$1(Promise.scala:33)
	at scala.concurrent.impl.Promise.$anonfun$transform$1(Promise.scala:33)
	at scala.concurrent.impl.CallbackRunnable.run(Promise.scala:64)
	at java.base/java.util.concurrent.ForkJoinTask$RunnableExecuteAction.exec(ForkJoinTask.java:1426)
	at java.base/java.util.concurrent.ForkJoinTask.doExec(ForkJoinTask.java:290)
	at java.base/java.util.concurrent.ForkJoinPool$WorkQueue.topLevelExec(ForkJoinPool.java:1020)
	at java.base/java.util.concurrent.ForkJoinPool.scan(ForkJoinPool.java:1656)
	at java.base/java.util.concurrent.ForkJoinPool.runWorker(ForkJoinPool.java:1594)
	at java.base/java.util.concurrent.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:177)
Caused by: com.google.cloud.hadoop.repackaged.gcs.com.google.api.client.googleapis.json.GoogleJsonResponseException: 403 Forbidden
GET https://storage.googleapis.com/storage/v1/b/de-zoomcamp-xiangivyli/o/final_project%2Fraw%2Fjob_postings.csv?fields=bucket,name,timeCreated,updated,generation,metageneration,size,contentType,contentEncoding,md5Hash,crc32c,metadata
{
  "code" : 403,
  "errors" : [ {
    "domain" : "global",
    "message" : "airflow-gcs-bigquery@cedar-style-412618.iam.gserviceaccount.com does not have storage.objects.get access to the Google Cloud Storage object. Permission 'storage.objects.get' denied on resource (or it may not exist).",
    "reason" : "forbidden"
  } ],
  "message" : "airflow-gcs-bigquery@cedar-style-412618.iam.gserviceaccount.com does not have storage.objects.get access to the Google Cloud Storage object. Permission 'storage.objects.get' denied on resource (or it may not exist)."
}
	at com.google.cloud.hadoop.repackaged.gcs.com.google.api.client.googleapis.json.GoogleJsonResponseException.from(GoogleJsonResponseException.java:146)
	at com.google.cloud.hadoop.repackaged.gcs.com.google.api.client.googleapis.services.json.AbstractGoogleJsonClientRequest.newExceptionOnError(AbstractGoogleJsonClientRequest.java:118)
	at com.google.cloud.hadoop.repackaged.gcs.com.google.api.client.googleapis.services.json.AbstractGoogleJsonClientRequest.newExceptionOnError(AbstractGoogleJsonClientRequest.java:37)
	at com.google.cloud.hadoop.repackaged.gcs.com.google.api.client.googleapis.services.AbstractGoogleClientRequest$1.interceptResponse(AbstractGoogleClientRequest.java:428)
	at com.google.cloud.hadoop.repackaged.gcs.com.google.api.client.http.HttpRequest.execute(HttpRequest.java:1111)
	at com.google.cloud.hadoop.repackaged.gcs.com.google.api.client.googleapis.services.AbstractGoogleClientRequest.executeUnparsed(AbstractGoogleClientRequest.java:514)
	at com.google.cloud.hadoop.repackaged.gcs.com.google.api.client.googleapis.services.AbstractGoogleClientRequest.executeUnparsed(AbstractGoogleClientRequest.java:455)
	at com.google.cloud.hadoop.repackaged.gcs.com.google.api.client.googleapis.services.AbstractGoogleClientRequest.execute(AbstractGoogleClientRequest.java:565)
	at com.google.cloud.hadoop.repackaged.gcs.com.google.cloud.hadoop.gcsio.GoogleCloudStorageImpl.getObject(GoogleCloudStorageImpl.java:2149)
	... 21 more


In [16]:
# Preview the first 10 postings, one field per line, values cut at 50 characters
df_posting.show(n=10, truncate=50, vertical=True)

24/03/30 08:18:03 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


[Stage 0:>                                                          (0 + 1) / 1]

-RECORD 0------------------------------------------------------------------------
 job_id                     | 3757940104                                         
 company_id                 | 553718                                             
 title                      | Hearing Care Provider                              
 description                | Overview\n\nHearingLife is a national hearing c... 
 max_salary                 | null                                               
 med_salary                 | 5250.0                                             
 min_salary                 | null                                               
 pay_period                 | MONTHLY                                            
 formatted_work_type        | Full-time                                          
 location                   | Little River, SC                                   
 applies                    | null                                               
 original_listed

                                                                                

In [27]:
# Columns that hold Unix time in milliseconds
timestamp_columns = ["original_listed_time", "expiry", "closed_time", "listed_time"]

# Current column types, looked up once so the conversion below is safe to re-run
posting_dtypes = dict(df_posting.dtypes)

# Convert from Unix time in milliseconds to a proper timestamp.
# df_posting is overwritten in place, so a column that an earlier run of this
# cell already converted is skipped instead of being divided by 1000 again.
for column_name in timestamp_columns:
    if posting_dtypes.get(column_name) == "timestamp":
        continue
    df_posting = df_posting.withColumn(
        column_name,
        # milliseconds -> seconds, then interpret the number as epoch seconds
        (col(column_name) / 1000).cast("timestamp")
    )

# Show the dataframe
df_posting.show(10, truncate=50, vertical=True)

-RECORD 0------------------------------------------------------------------------
 job_id                     | 3757940104                                         
 company_id                 | 553718                                             
 title                      | Hearing Care Provider                              
 description                | Overview\n\nHearingLife is a national hearing c... 
 max_salary                 | null                                               
 med_salary                 | 5250.0                                             
 min_salary                 | null                                               
 pay_period                 | MONTHLY                                            
 formatted_work_type        | Full-time                                          
 location                   | Little River, SC                                   
 applies                    | null                                               
 original_listed

In [28]:
# Redistribute the postings across 10 partitions before writing
df_posting = df_posting.repartition(numPartitions=10)

In [29]:
# Persist the postings as parquet, replacing any previous output in the folder
df_posting.write.mode("overwrite").parquet("./data/pq-linkedin-job-postings/pq_job_postings/")

                                                                                

### company_details/companies.csv

In [45]:
# Column layout of companies.csv; every field is nullable.
companies_schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in [
        ("company_id", StringType()),
        ("name", StringType()),
        ("description", StringType()),
        ("company_size", IntegerType()),
        ("state", StringType()),
        ("country", StringType()),
        ("city", StringType()),
        ("zip_code", StringType()),
        ("address", StringType()),
        ("url", StringType()),
    ]
])

# Read with a header row, a double quote as the escape character, and
# multiline records enabled.
companies_reader = (
    spark.read
    .schema(companies_schema)
    .options(header="true", escape="\"", multiline="true")
)
df_companies = companies_reader.csv("./data/linkedin-job-postings/company_details/companies.csv")

# Vertical preview of the first 10 companies, values cut at 10 characters
df_companies.show(n=10, truncate=10, vertical=True)

-RECORD 0------------------
 company_id   | 1009       
 name         | IBM        
 description  | At IBM,... 
 company_size | 7          
 state        | NY         
 country      | US         
 city         | Armonk,... 
 zip_code     | 10504      
 address      | Interna... 
 url          | https:/... 
-RECORD 1------------------
 company_id   | 1016       
 name         | GE Heal... 
 description  | Every d... 
 company_size | 7          
 state        | 0          
 country      | US         
 city         | Chicago    
 zip_code     | 0          
 address      | -          
 url          | https:/... 
-RECORD 2------------------
 company_id   | 1021       
 name         | GE Power   
 description  | GE Powe... 
 company_size | 7          
 state        | NY         
 country      | US         
 city         | Schenec... 
 zip_code     | 12345      
 address      | 1 River... 
 url          | https:/... 
-RECORD 3------------------
 company_id   | 1025       
 name         | Hewl

In [46]:
# Write companies as parquet from 5 partitions, overwriting earlier output
df_companies.repartition(5).write.mode("overwrite").parquet("./data/pq-linkedin-job-postings/pq_companies/")

### company_details/company_industries.csv

In [47]:
# Two nullable string columns: the company key and its industry label
company_industry_schema = (
    StructType()
    .add("company_id", StringType(), True)
    .add("industry", StringType(), True)
)

company_industry_reader = spark.read.option("header", "true").schema(company_industry_schema)
df_company_industry = company_industry_reader.csv(
    "./data/linkedin-job-postings/company_details/company_industries.csv"
)

# Preview the first 10 rows
df_company_industry.show(n=10)

+----------+--------------------+
|company_id|            industry|
+----------+--------------------+
|  81149246|    Higher Education|
|  10033339|Information Techn...|
|   6049228|          Accounting|
|   2641066|Electrical & Elec...|
|  96649998|Marketing & Adver...|
|  82684341|Hospital & Health...|
|  82296828|Information Techn...|
|  86746333|Logistics & Suppl...|
|    718651|    Medical Practice|
|   4781041|  Mental Health Care|
+----------+--------------------+
only showing top 10 rows



In [48]:
# Write company industries as parquet from 2 partitions, overwriting earlier output
df_company_industry.repartition(2).write.mode("overwrite").parquet("./data/pq-linkedin-job-postings/pq_company_industries/")

### company_details/employee_counts.csv

In [49]:
# employee_counts.csv layout; time_recorded is read as a double here and is
# converted in the following cells.
employee_schema = (
    StructType()
    .add("company_id", StringType(), True)
    .add("employee_count", IntegerType(), True)
    .add("follower_count", IntegerType(), True)
    .add("time_recorded", DoubleType(), True)
)

df_employee_counts = spark.read.csv(
    "./data/linkedin-job-postings/company_details/employee_counts.csv",
    schema=employee_schema,
    header=True,
)

# Preview the first 10 rows
df_employee_counts.show(n=10)

+----------+--------------+--------------+--------------------+
|company_id|employee_count|follower_count|       time_recorded|
+----------+--------------+--------------+--------------------+
|  81149246|             6|            91|1.6926446442779734E9|
|  10033339|             3|           187|1.6926446442779734E9|
|   6049228|            20|            82|1.6926446451013184E9|
|   2641066|            45|          2336|1.6926446459232166E9|
|  96649998|             0|             2|1.6926446459242187E9|
|  82684341|             3|           128|1.6926446463704655E9|
|  82296828|             0|            64|1.6926446463704655E9|
|  86746333|            11|           478|1.6926446468182244E9|
|    718651|             5|            22|1.6926446471765628E9|
|   4781041|            14|            17|  1.69264464762722E9|
+----------+--------------+--------------+--------------------+
only showing top 10 rows



In [50]:
# Drop the fractional part of 'time_recorded' and store the result as a long
whole_seconds = floor(col("time_recorded")).cast("long")
df_employee_counts = df_employee_counts.withColumn("time_recorded", whole_seconds)

df_employee_counts.show(n=10)

+----------+--------------+--------------+-------------+
|company_id|employee_count|follower_count|time_recorded|
+----------+--------------+--------------+-------------+
|  81149246|             6|            91|   1692644644|
|  10033339|             3|           187|   1692644644|
|   6049228|            20|            82|   1692644645|
|   2641066|            45|          2336|   1692644645|
|  96649998|             0|             2|   1692644645|
|  82684341|             3|           128|   1692644646|
|  82296828|             0|            64|   1692644646|
|  86746333|            11|           478|   1692644646|
|    718651|             5|            22|   1692644647|
|   4781041|            14|            17|   1692644647|
+----------+--------------+--------------+-------------+
only showing top 10 rows



In [51]:
# Now convert from Unix time in seconds to a proper timestamp.
# The numeric epoch value is cast directly, the same way the job posting time
# columns are converted, instead of going through from_unixtime(): that
# function produces a formatted string in the session time zone, which would
# then have to be parsed back into a timestamp.
df_employee_counts = df_employee_counts.withColumn(
    "time_recorded",
    col("time_recorded").cast("timestamp")
)

df_employee_counts.show(10)

+----------+--------------+--------------+-------------------+
|company_id|employee_count|follower_count|      time_recorded|
+----------+--------------+--------------+-------------------+
|  81149246|             6|            91|2023-08-21 19:04:04|
|  10033339|             3|           187|2023-08-21 19:04:04|
|   6049228|            20|            82|2023-08-21 19:04:05|
|   2641066|            45|          2336|2023-08-21 19:04:05|
|  96649998|             0|             2|2023-08-21 19:04:05|
|  82684341|             3|           128|2023-08-21 19:04:06|
|  82296828|             0|            64|2023-08-21 19:04:06|
|  86746333|            11|           478|2023-08-21 19:04:06|
|    718651|             5|            22|2023-08-21 19:04:07|
|   4781041|            14|            17|2023-08-21 19:04:07|
+----------+--------------+--------------+-------------------+
only showing top 10 rows



In [52]:
# Confirm the column types after the time_recorded conversion
df_employee_counts.printSchema()

root
 |-- company_id: string (nullable = true)
 |-- employee_count: integer (nullable = true)
 |-- follower_count: integer (nullable = true)
 |-- time_recorded: timestamp (nullable = true)



In [53]:
# Write employee counts as parquet from 5 partitions, overwriting earlier output
df_employee_counts.repartition(5).write.mode("overwrite").parquet("./data/pq-linkedin-job-postings/pq_employee/")

### company_details/company_specialities.csv

In [54]:
# Two nullable string columns: the company key and one speciality per row
company_specialities_schema = (
    StructType()
    .add("company_id", StringType(), True)
    .add("speciality", StringType(), True)
)

df_company_specialities = spark.read.csv(
    "./data/linkedin-job-postings/company_details/company_specialities.csv",
    schema=company_specialities_schema,
    header=True,
)

# Preview the first 10 rows
df_company_specialities.show(n=10)

+----------+--------------------+
|company_id|          speciality|
+----------+--------------------+
|  81149246|Childrens Music E...|
|  81149246|Foundational Musi...|
|  81149246| Child Music Lessons|
|  81149246|social emotional ...|
|  81149246|social emotional ...|
|  81149246|           education|
|  81149246|formative assessment|
|  81149246|   expanded learning|
|  81149246| enrichment programs|
|  10033339|          SharePoint|
+----------+--------------------+
only showing top 10 rows



In [55]:
# Write company specialities as parquet from 5 partitions, overwriting earlier output
df_company_specialities.repartition(5).write.mode("overwrite").parquet("./data/pq-linkedin-job-postings/pq_company_specialities/")

### job_details/job_industries.csv

In [3]:
# Read job_industries.csv with its header row; no explicit schema is supplied
job_industries_path = "./data/linkedin-job-postings/job_details/job_industries.csv"
df_job_industries = spark.read.csv(job_industries_path, header=True)

# Preview the first 10 rows
df_job_industries.show(n=10)


[Stage 0:>                                                          (0 + 1) / 1]

                                                                                

+----------+-----------+
|    job_id|industry_id|
+----------+-----------+
|3378133231|         68|
|3497509795|         96|
|3690843087|         47|
|3691775263|        112|
|3691779379|         80|
|3691786992|         14|
|3691789797|         96|
|3691792844|        116|
|3691793575|         13|
|3691794313|         87|
+----------+-----------+
only showing top 10 rows



In [56]:
# No schema was supplied on read, so check which column types Spark assigned
df_job_industries.printSchema()

root
 |-- job_id: string (nullable = true)
 |-- industry_id: string (nullable = true)



In [58]:
# Write job industries as parquet from 5 partitions, overwriting earlier output
df_job_industries.repartition(5).write.mode("overwrite").parquet("./data/pq-linkedin-job-postings/pq_job_industries/")

### job_details/job_skills.csv

In [59]:
# Read job_skills.csv with its header row; no explicit schema is supplied
job_skills_path = "./data/linkedin-job-postings/job_details/job_skills.csv"
df_job_skills = spark.read.csv(job_skills_path, header=True)

# Preview the first 10 rows
df_job_skills.show(n=10)

+----------+---------+
|    job_id|skill_abr|
+----------+---------+
|3690843087|     ACCT|
|3690843087|      FIN|
|3691763971|     MGMT|
|3691763971|     MNFC|
|3691775263|     MGMT|
|3691775263|     MNFC|
|3691786992|     HCPR|
|3691789797|     MGMT|
|3691789797|     MNFC|
|3691789919|     HCPR|
+----------+---------+
only showing top 10 rows



In [60]:
# No schema was supplied on read, so check which column types Spark assigned
df_job_skills.printSchema()

root
 |-- job_id: string (nullable = true)
 |-- skill_abr: string (nullable = true)



In [61]:
# Write job skills as parquet from 5 partitions, overwriting earlier output
df_job_skills.repartition(5).write.mode("overwrite").parquet("./data/pq-linkedin-job-postings/pq_job_skills/")

### job_details/salaries.csv

In [22]:
# Load salaries.csv with pandas to look at its columns before defining a Spark schema
df_salaries = pd.read_csv(
    filepath_or_buffer="./data/linkedin-job-postings/job_details/salaries.csv"
)
df_salaries.head(n=5)

Unnamed: 0,salary_id,job_id,max_salary,med_salary,min_salary,pay_period,currency,compensation_type
0,1,3378133231,30.0,,22.0,HOURLY,USD,BASE_SALARY
1,2,3690843087,65000.0,,55000.0,YEARLY,USD,BASE_SALARY
2,3,3691794313,22.0,,19.0,HOURLY,USD,BASE_SALARY
3,4,3691795389,70000.0,,68000.0,YEARLY,USD,BASE_SALARY
4,5,3691797089,22.0,,18.0,HOURLY,USD,BASE_SALARY


In [62]:
# Spark schema for salaries.csv, as (column name, type) pairs; all nullable
salary_columns = [
    ("salary_id", StringType()),
    ("job_id", StringType()),
    ("max_salary", FloatType()),
    ("med_salary", FloatType()),
    ("min_salary", FloatType()),
    ("pay_period", StringType()),
    ("currency", StringType()),
    ("compensation_type", StringType()),
]
salary_schema = StructType(
    [StructField(field_name, field_type, True) for field_name, field_type in salary_columns]
)

# Re-read the file with Spark; this replaces the pandas frame of the same name
df_salaries = spark.read.csv(
    "./data/linkedin-job-postings/job_details/salaries.csv",
    schema=salary_schema,
    header=True,
)

df_salaries.show(n=10)

+---------+----------+----------+----------+----------+----------+--------+-----------------+
|salary_id|    job_id|max_salary|med_salary|min_salary|pay_period|currency|compensation_type|
+---------+----------+----------+----------+----------+----------+--------+-----------------+
|        1|3378133231|      30.0|      null|      22.0|    HOURLY|     USD|      BASE_SALARY|
|        2|3690843087|   65000.0|      null|   55000.0|    YEARLY|     USD|      BASE_SALARY|
|        3|3691794313|      22.0|      null|      19.0|    HOURLY|     USD|      BASE_SALARY|
|        4|3691795389|   70000.0|      null|   68000.0|    YEARLY|     USD|      BASE_SALARY|
|        5|3691797089|      22.0|      null|      18.0|    HOURLY|     USD|      BASE_SALARY|
|        6|3691797249|      26.0|      null|      21.0|    HOURLY|     USD|      BASE_SALARY|
|        7|3691797979|  120250.0|      null|   98924.0|    YEARLY|     USD|      BASE_SALARY|
|        8|3691798879|   85000.0|      null|   75000.0|    Y

In [64]:
# Write salaries as parquet from 5 partitions, overwriting earlier output
df_salaries.repartition(5).write.mode("overwrite").parquet("./data/pq-linkedin-job-postings/pq_salaries/")

### job_details/benefits.csv

In [65]:
# Load benefits.csv with pandas to look at its columns before defining a Spark schema
df_benefits = pd.read_csv(
    filepath_or_buffer="./data/linkedin-job-postings/job_details/benefits.csv"
)
df_benefits.head(n=5)

Unnamed: 0,job_id,inferred,type
0,3690843087,0,Medical insurance
1,3690843087,0,Dental insurance
2,3690843087,0,401(k)
3,3690843087,0,Paid maternity leave
4,3690843087,0,Disability insurance


In [67]:
# benefits.csv layout: three nullable string columns
benefit_schema = (
    StructType()
    .add("job_id", StringType(), True)
    .add("inferred", StringType(), True)
    .add("type", StringType(), True)
)

df_benefit = spark.read.csv(
    "./data/linkedin-job-postings/job_details/benefits.csv",
    schema=benefit_schema,
    header=True,
)

# Preview the first 10 rows
df_benefit.show(n=10)

+----------+--------+--------------------+
|    job_id|inferred|                type|
+----------+--------+--------------------+
|3690843087|       0|   Medical insurance|
|3690843087|       0|    Dental insurance|
|3690843087|       0|              401(k)|
|3690843087|       0|Paid maternity leave|
|3690843087|       0|Disability insurance|
|3690843087|       0|    Vision insurance|
|3691763971|       1|    Dental insurance|
|3691763971|       1|Disability insurance|
|3691763971|       1|              401(k)|
|3691775263|       0|   Medical insurance|
+----------+--------+--------------------+
only showing top 10 rows



In [68]:
# Write benefits as parquet from 5 partitions, overwriting earlier output
df_benefit.repartition(5).write.mode("overwrite").parquet("./data/pq-linkedin-job-postings/pq_benefits/")

### maps/industries.csv

In [3]:
# industries.csv lookup table: industry id and its name, both nullable strings
industry_schema = (
    StructType()
    .add("industry_id", StringType(), True)
    .add("industry_name", StringType(), True)
)

df_industry = spark.read.csv(
    "./data/linkedin-job-postings/maps/industries.csv",
    schema=industry_schema,
    header=True,
)

# Preview the first 10 rows
df_industry.show(n=10)


[Stage 0:>                                                          (0 + 1) / 1]

+-----------+--------------------+
|industry_id|       industry_name|
+-----------+--------------------+
|          1|Defense and Space...|
|          3|Computer Hardware...|
|          4|Software Development|
|          5|Computer Networki...|
|          6|Technology, Infor...|
|          7|Semiconductor Man...|
|          8|  Telecommunications|
|          9|        Law Practice|
|         10|      Legal Services|
|         11|Business Consulti...|
+-----------+--------------------+
only showing top 10 rows




                                                                                

In [4]:
# Write the industries lookup as parquet, overwriting earlier output
df_industry.write.mode("overwrite").parquet("./data/pq-linkedin-job-postings/pq_industries/")

                                                                                

### maps/skills.csv

In [5]:
# skills.csv lookup table: skill abbreviation and its full name, both nullable strings
skills_schema = (
    StructType()
    .add("skill_abr", StringType(), True)
    .add("skill_name", StringType(), True)
)

df_skills = spark.read.csv(
    "./data/linkedin-job-postings/maps/skills.csv",
    schema=skills_schema,
    header=True,
)

# Preview the first 10 rows
df_skills.show(n=10)

+---------+------------------+
|skill_abr|        skill_name|
+---------+------------------+
|     PRCH|        Purchasing|
|     SUPL|      Supply Chain|
|       PR|  Public Relations|
|      SCI|           Science|
|     STRA| Strategy/Planning|
|      WRT|   Writing/Editing|
|       QA| Quality Assurance|
|     DIST|      Distribution|
|     PROD|        Production|
|     PRJM|Project Management|
+---------+------------------+
only showing top 10 rows



In [6]:
# Write the skills lookup as parquet, overwriting earlier output
df_skills.write.mode("overwrite").parquet("./data/pq-linkedin-job-postings/pq_skills/")