# Wyzwanie adwentowe - Dane są wszędzie

In [0]:
%run /Users/wojciech.zdziebkowski@gmail.com/Santa_DB_connection_param


## Dzień 1

In [0]:
# importy
from pyspark.sql.functions import col, when, lit, desc, sum

In [0]:
# Connection: assemble the SQL Server JDBC URL from the host, port and database
# name, which must already be defined when this cell runs.
jdbc_url = "".join(
    [
        f"jdbc:sqlserver://{jdbc_hostname}:{jdbc_port};",
        f"database={jdbc_database};",
        # Encrypted connection, server certificate not blindly trusted, login timeout of 30.
        "encrypt=true;trustServerCertificate=false;loginTimeout=30;",
    ]
)



In [0]:
# Properties handed to every JDBC read below: credentials plus the SQL Server
# driver class name.
connection_properties = dict(
    user=jdbc_username,
    password=jdbc_password,
    driver="com.microsoft.sqlserver.jdbc.SQLServerDriver",
)


In [0]:
# Read the INFORMATION_SCHEMA.TABLES view over JDBC to see which tables the
# database exposes.
tables_df = spark.read.jdbc(
    url=jdbc_url,
    table="INFORMATION_SCHEMA.TABLES",
    properties=connection_properties,
)

display(tables_df)


In [0]:

# Row counts per table taken from SQL Server catalog views (sys.partitions)
# rather than by scanning each table.
#
# The schema name is part of the GROUP BY, so it is also selected: without it,
# tables that share a name across schemas would show up as indistinguishable
# rows in the result.
# index_id < 2 keeps only the heap (0) or clustered index (1) partitions, so
# rows are not counted once per additional index.
query_counts = """
(
    SELECT
        s.name AS NazwaSchematu,
        t.name AS NazwaTabeli,
        SUM(p.rows) AS LiczbaWierszy
    FROM sys.tables t
    JOIN sys.schemas s ON t.schema_id = s.schema_id
    JOIN sys.partitions p ON t.object_id = p.object_id
    WHERE p.index_id < 2
    GROUP BY s.name, t.name
) as counts
"""

# The parenthesised, aliased subquery is passed where a table name is expected.
df_fast_counts = spark.read.jdbc(url=jdbc_url, table=query_counts, properties=connection_properties)

# Largest tables first.
df_sorted = df_fast_counts.orderBy(col("LiczbaWierszy").desc())

display(df_sorted)

## Dzień 2

In [0]:
# Holiday rows for Poland. The country filter lives inside the subquery that is
# sent over JDBC in place of a table name.
query_holidays_poland = """
(
    SELECT 
        *
    FROM edw.HolidayCountry
    WHERE Country = 'Poland'
) as holiday_pl
"""

df_hc_pol = (
    spark.read
    .format("jdbc")
    .option("url", jdbc_url)
    .option("dbtable", query_holidays_poland)
    .options(**connection_properties)
    .load()
)

display(df_hc_pol)

In [0]:
# Holiday rows for Japan. The country filter lives inside the subquery that is
# sent over JDBC in place of a table name.
query_holidays_japan = """
(
    SELECT 
        *
    FROM edw.HolidayCountry
    WHERE Country = 'Japan'
) as holiday_jp
"""

df_hc_jap = (
    spark.read
    .format("jdbc")
    .option("url", jdbc_url)
    .option("dbtable", query_holidays_japan)
    .options(**connection_properties)
    .load()
)

display(df_hc_jap)

In [0]:
# Full date dimension (every column of edw.DimDate), wrapped as an aliased
# subquery like the other reads.
query_date = """
(
    SELECT 
        *
    FROM edw.DimDate
) as date
"""

df_date = (
    spark.read
    .format("jdbc")
    .option("url", jdbc_url)
    .option("dbtable", query_date)
    .options(**connection_properties)
    .load()
)

display(df_date)

In [0]:
# Give each country's "Country" column its own name so both can sit side by
# side after the joins onto the date dimension.
df_pol_gotowy = df_hc_pol.withColumnRenamed(existing="Country", new="CountryPL")
df_jap_gotowy = df_hc_jap.withColumnRenamed(existing="Country", new="CountryJP")

In [0]:
# Left-join both holiday sets onto the calendar so every date is kept, whether
# or not it is a holiday in either country.
# NOTE(review): both holiday frames come from SELECT * on the same table and only
# "Country" is renamed, so any other shared non-key columns would appear twice
# after the second join — confirm this is acceptable.
df_date_pol = df_date.join(df_pol_gotowy, ["DateKey"], "left")
df_date_both = df_date_pol.join(df_jap_gotowy, ["DateKey"], "left")

In [0]:
# Turn "did the left join find a match" into 0/1 flags: a null country column
# means no holiday row matched that date.
df_finalny = (
    df_date_both
    .withColumn(
        "PL_Holiday",
        when(col("CountryPL").isNull(), lit(0)).otherwise(lit(1)),
    )
    .withColumn(
        "JP_Holiday",
        when(col("CountryJP").isNull(), lit(0)).otherwise(lit(1)),
    )
)

display(df_finalny)

In [0]:
# Sanity check: compare the row count before and after the joins; a difference
# means the joins multiplied calendar rows.
count_start, count_end = df_date.count(), df_finalny.count()

print(f"Start: {count_start}, Koniec: {count_end}")

print("Nie ma duplikatów" if count_start == count_end else "Są duplikaty.")

print(count_end)

In [0]:
# Rows flagged as Japanese holidays in 2024 minus rows flagged as Polish
# holidays in 2026.
hd_jap_2024 = (
    df_finalny
    .where(col("Year") == 2024)
    .where(col("JP_Holiday") == 1)
    .count()
)
hd_pol_2026 = (
    df_finalny
    .where(col("Year") == 2026)
    .where(col("PL_Holiday") == 1)
    .count()
)
wynik = hd_jap_2024 - hd_pol_2026
print(wynik)

## Dzień 3

In [0]:
# Load the whole edw.DimElf_wDummyRow table over JDBC.
df_dim_dummy_elf = (
    spark.read
    .format("jdbc")
    .option("url", jdbc_url)
    .option("dbtable", "edw.DimElf_wDummyRow")
    .options(**connection_properties)
    .load()
)

display(df_dim_dummy_elf)

In [0]:
# HireDateKey of the row with ElfKey = -1 (presumably the dummy row the table
# name refers to — confirm).
df_dummy_hiredate = (
    df_dim_dummy_elf
    .where(col("ElfKey") == lit(-1))
    .select(col("HireDateKey"))
)
display(df_dummy_hiredate)

In [0]:
# Load the whole dbo.FactElfTime table over JDBC.
df_elf_time = (
    spark.read
    .format("jdbc")
    .option("url", jdbc_url)
    .option("dbtable", "dbo.FactElfTime")
    .options(**connection_properties)
    .load()
)

In [0]:
# Number of FactElfTime rows recorded against ElfKey = -1.
# count() returns a plain Python int rather than a DataFrame (despite the df_
# prefix, kept so later cells can still refer to this name), so the value is
# printed like the other scalar results in this notebook instead of being
# passed to display().
df_elf_dummy_count = df_elf_time.filter(col("ElfKey") == -1).count()

print(df_elf_dummy_count)

In [0]:
# Total WorkedHours over the FactElfTime rows recorded against ElfKey = -1,
# returned as a one-row DataFrame with a single column "sum_WorkedHours".
df_elf_dummy_time = (
    df_elf_time
    .where(col("ElfKey") == lit(-1))
    .agg(sum(col("WorkedHours")).alias("sum_WorkedHours"))
)

display(df_elf_dummy_time)