In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import lit

# Create (or reuse) the Spark session for this notebook.
spark = (
    SparkSession.builder
    .appName("JoinCSV")
    .getOrCreate()
)

# First input CSV; its first row is treated as the header.
df1 = (
    spark.read
    .option("header", "true")
    .csv("./linkedin_job_postings_name_normalize.csv")
)

# Second input CSV; its first row is treated as the header.
df2 = (
    spark.read
    .option("header", "true")
    .csv("./archive/job_skills.csv")
)

# Left join on job_link keeps every row of df1, then a constant 'branch'
# column is appended. Note the value is the literal string "None", not a null.
joined_df = (
    df1
    .join(df2, on="job_link", how="left")
    .withColumn("branch", lit("None"))
)

# Columns kept in the final frame, in output order.
output_columns = [
    "job_title",
    "company",
    "job_location",
    "search_city",
    "search_country",
    "job_level",
    "branch",
    "job_skills",
    "last_processed_time",
]
linkdin_df = joined_df.select(*output_columns)

# (Optional) Preview the first few rows.
linkdin_df.show(5)

+--------------------+--------------------+--------------------+--------------+--------------+----------+------+--------------------+--------------------+
|           job_title|             company|        job_location|   search_city|search_country| job_level|branch|          job_skills| last_processed_time|
+--------------------+--------------------+--------------------+--------------+--------------+----------+------+--------------------+--------------------+
|Retail and Wholes...|            Pattern®|London Area, Unit...|Greater London|United Kingdom|Mid senior|  None|Account managemen...|2024-01-19 09:45:...|
|   Building Finisher|               iHire|          Woburn, MA|        Nashua| United States|Mid senior|  None|Concrete Construc...|2024-01-21 05:24:...|
| Industrial Engineer|Recruiting from S...|     Albuquerque, NM|   Albuquerque| United States|Mid senior|  None|Software Developm...|2024-01-19 09:45:...|
|Health Informatio...|Goldendale School...|          Middle, IN|      

In [2]:
from pyspark.sql import SparkSession

# getOrCreate() returns the already-running session when one exists,
# otherwise it creates a new one.
spark = SparkSession.builder.appName("CombineCSV").getOrCreate()

# Read ./analysis/skills_standard_output.csv (first row is the header).
# NOTE: this rebinds df2, which In [1] used for ./archive/job_skills.csv.
df2 = spark.read.option("header", "true").csv("./analysis/skills_standard_output.csv")

# Append df2's rows to linkdin_df, matching columns by NAME.
# DataFrame.union() matches columns purely by position, so a different
# column order in the CSV would silently put values under the wrong headers.
# unionByName() aligns on column names and raises if the names do not match,
# turning a silent data-corruption risk into an explicit error.
combined_df = linkdin_df.unionByName(df2)

# (Optional) Preview the first few rows.
combined_df.show(5)

# Redistribute the rows into 30 partitions before writing.
combined_df = combined_df.repartition(30)

# Spark's CSV writer treats this path as an output directory (despite the
# ".csv" suffix); mode("overwrite") replaces any existing output there.
combined_df.write.mode("overwrite").option("header", "true").csv("output/combined_csv_final.csv")

+--------------------+--------------------+--------------------+--------------+--------------+----------+------+--------------------+--------------------+
|           job_title|             company|        job_location|   search_city|search_country| job_level|branch|          job_skills| last_processed_time|
+--------------------+--------------------+--------------------+--------------+--------------+----------+------+--------------------+--------------------+
|Retail and Wholes...|            Pattern®|London Area, Unit...|Greater London|United Kingdom|Mid senior|  None|Account managemen...|2024-01-19 09:45:...|
|   Building Finisher|               iHire|          Woburn, MA|        Nashua| United States|Mid senior|  None|Concrete Construc...|2024-01-21 05:24:...|
| Industrial Engineer|Recruiting from S...|     Albuquerque, NM|   Albuquerque| United States|Mid senior|  None|Software Developm...|2024-01-19 09:45:...|
|Health Informatio...|Goldendale School...|          Middle, IN|      

In [3]:
# Total number of rows in the combined frame (count() triggers a Spark job).
combined_row_count = combined_df.count()
print(combined_row_count)

1391804


In [4]:
# Row count of df2. When the cells are run in order, df2 here is the frame
# read from ./analysis/skills_standard_output.csv in In [2], not the
# ./archive/job_skills.csv frame that In [1] originally bound to this name.
df2_row_count = df2.count()
print(df2_row_count)

114287
