In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, avg, lit, rank, desc
from pyspark.sql.window import Window
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

# Inicjalizacja sesji
spark = SparkSession.builder.appName("Lab1_Fixed").getOrCreate()

base_path = "/Workspace/Users/13815@eans-nt.edu.pl"
path_pracownicy = f"{base_path}/pracownicy.csv"
path_projekty = f"{base_path}/projekty.csv"
path_tekst = f"{base_path}/tekst_do_liczenia.txt"


# ============================================
# 1 & 2. WCZYTANIE DANYCH (Pracownicy)
# ============================================
print(f"--- Wczytuję plik: {path_pracownicy} ---")

df = spark.read.csv(
    path_pracownicy, 
    header=True, 
    inferSchema=True, 
    encoding="UTF-16"  
)

display(df) 

# ============================================
# 3. FILTROWANIE I WYBÓR KOLUMN
# ============================================
print("\n--- Filtrowanie: Osoby powyżej 30 lat ---")
df_filtered = df.select("name", "age").filter(col("age") > 30)
display(df_filtered)

# ============================================
# 4. SORTOWANIE I AGREGACJE
# ============================================
print("\n--- Sortowanie malejąco po wieku ---")
df_sorted = df.orderBy(col("age").desc())
display(df_sorted)

print("\n--- Agregacja: Średni wiek w dziale ---")
df_grouped = df.groupBy("department").agg(avg("age").alias("avg_age"))
display(df_grouped)

# ============================================
# 5. DODANIE I USUNIĘCIE KOLUMN
# ============================================
print("\n--- Dodanie kolumny 'wiek za 5 lat' ---")
df_new = df.withColumn("age_in_5_years", col("age") + 5).drop("department")
display(df_new)

# ============================================
# 6. ŁĄCZENIE DWÓCH DATAFRAME (JOIN) - NAPRAWIONA WERSJA
# ============================================
print(f"\n--- Wczytuję plik projektów: {path_projekty} ---")
df_projekty = spark.read.csv(
    path_projekty, 
    header=True, 
    inferSchema=True,
    encoding="UTF-8" 
)
display(df_projekty)

print("\n--- Łączenie (Join) pracowników z projektami ---")
df_joined = df.join(df_projekty, on="name", how="left")
display(df_joined)

# ============================================
# 7. OBSŁUGA NULL
# ============================================
print("\n--- Obsługa brakujących danych (NULL) ---")

schema_null_row = StructType([
    StructField("name", StringType(), True),
    StructField("age", IntegerType(), True),
    StructField("department", StringType(), True),
    StructField("projects", IntegerType(), True)
])

null_row = spark.createDataFrame(
    [("Duch", None, "Unknown", None)], 
    schema=schema_null_row
)

df_with_null = df_joined.unionByName(null_row, allowMissingColumns=True)

df_filled = df_with_null.na.fill({"age": 0, "projects": 0})
display(df_filled)

# ============================================
# 8. OKNA (RANKING)
# ============================================
print("\n--- Ranking pracowników wg liczby projektów ---")

window_spec_fixed = Window.partitionBy("department").orderBy(col("projects").desc())
df_ranked = df_filled.withColumn("rank_by_dept", rank().over(window_spec_fixed))
display(df_ranked.select("name", "department", "projects", "rank_by_dept"))

# ============================================
# 9. LICZENIE SŁÓW Z PLIKU TEKSTOWEGO
# ============================================
from pyspark.sql.functions import explode, split, lower, regexp_replace, col

print(f"\n--- Analiza tekstu z pliku: {path_tekst}  ---")

df_text = spark.read.text(path_tekst)

df_word_counts = df_text.withColumn(
    "cleaned_text", 
    regexp_replace(lower(col("value")), "[^a-z\\s]", " ")
).withColumn(
    "word",
    explode(split(col("cleaned_text"), "\\s+"))
).filter(col("word") != "")
top_words_df = df_word_counts.groupBy("word").count().orderBy(col("count").desc()).limit(10)

print("TOP 10 słów (DataFrame):")
display(top_words_df)



--- Wczytuję plik: /Workspace/Users/13815@eans-nt.edu.pl/pracownicy.csv (z poprawką kodowania) ---


name,age,department
Jan,28,IT
Anna,34,HR
Piotr,22,Finance
Ewa,45,IT
Marek,31,HR
Joanna,29,Finance



--- Filtrowanie: Osoby powyżej 30 lat ---


name,age
Anna,34
Ewa,45
Marek,31



--- Sortowanie malejąco po wieku ---


name,age,department
Ewa,45,IT
Anna,34,HR
Marek,31,HR
Joanna,29,Finance
Jan,28,IT
Piotr,22,Finance



--- Agregacja: Średni wiek w dziale ---


department,avg_age
HR,32.5
IT,36.5
Finance,25.5



--- Dodanie kolumny 'wiek za 5 lat' ---


name,age,age_in_5_years
Jan,28,33
Anna,34,39
Piotr,22,27
Ewa,45,50
Marek,31,36
Joanna,29,34



--- Wczytuję plik projektów: /Workspace/Users/13815@eans-nt.edu.pl/projekty.csv (z poprawką kodowania) ---


name,projects
Jan,5
Anna,3
Piotr,4
Ewa,7
Marek,2
Joanna,6



--- Łączenie (Join) pracowników z projektami ---


name,age,department,projects
Jan,28,IT,5
Anna,34,HR,3
Piotr,22,Finance,4
Ewa,45,IT,7
Marek,31,HR,2
Joanna,29,Finance,6



--- Obsługa brakujących danych (NULL) ---


name,age,department,projects
Jan,28,IT,5
Anna,34,HR,3
Piotr,22,Finance,4
Ewa,45,IT,7
Marek,31,HR,2
Joanna,29,Finance,6
Duch,0,Unknown,0



--- Ranking pracowników wg liczby projektów ---


name,department,projects,rank_by_dept
Joanna,Finance,6,1
Piotr,Finance,4,2
Anna,HR,3,1
Marek,HR,2,2
Ewa,IT,7,1
Jan,IT,5,2
Duch,Unknown,0,1



--- Analiza tekstu z pliku: /Workspace/Users/13815@eans-nt.edu.pl/tekst_do_liczenia.txt (Metoda DataFrame) ---
TOP 10 słów (DataFrame):


word,count
tekst,3
to,3
s,2
pyspark,2
jak,2
do,2
jest,2
si,1
wiczenie,1
w,1
