In [2]:
from pyspark.sql import SparkSession

In [4]:
# Create (or reuse) the Spark session for this notebook, registered under the
# application name "lab3".
spark = SparkSession.builder.appName("lab3").getOrCreate()

In [19]:
# Load every CSV file under ./donation, descending into nested directories.
# The first line of each file is treated as the header and column types are
# inferred. The option key must be spelled "recursiveFileLookup": Spark does
# not reject unknown option keys, so a misspelling silently disables it.
df = (
    spark.read
    .option("recursiveFileLookup", True)
    .csv("./donation", header=True, inferSchema=True)
)

In [23]:
# Preview the first 10 rows of the loaded data.
df.show(10)

+-----+-----+------------+------------+------------+------------+-------+------+------+------+-------+--------+
| id_1| id_2|cmp_fname_c1|cmp_fname_c2|cmp_lname_c1|cmp_lname_c2|cmp_sex|cmp_bd|cmp_bm|cmp_by|cmp_plz|is_match|
+-----+-----+------------+------------+------------+------------+-------+------+------+------+-------+--------+
|53719|60579|           1|           ?|           1|           ?|      1|     1|     1|     1|      1|    TRUE|
|58967|58973|           1|           ?|           1|           ?|      1|     1|     1|     1|      1|    TRUE|
| 1499|23331|           1|           ?|           1|           ?|      1|     1|     1|     1|      1|    TRUE|
|18441|36183|           1|           1|           1|           ?|      1|     1|     1|     1|      1|    TRUE|
| 8902|11508|           1|           ?|           1|           ?|      1|     1|     1|     1|      1|    TRUE|
|17704|21348|           1|           ?|           1|           ?|      1|     1|     1|     1|      1|  

In [24]:
from pyspark.ml.feature import Tokenizer, StopWordsRemover
from pyspark.sql.functions import col, lower, regexp_replace

# Read one block of the donation data: the first line is the header, "?" cells
# are loaded as nulls, and column types are inferred from the values.
parsed = (
    spark.read
    .option("header", "true")
    .option("nullValue", "?")
    .option("inferSchema", "true")
    .csv("donation/block_1.csv")
)
# Assuming `parsed` is the DataFrame after loading data
def preprocess_data(df, text_col="cmp_bd"):
    """Clean, tokenize and stop-word-filter one column of a DataFrame.

    Three columns are appended to the input; existing columns are untouched:

    * ``clean_text`` -- ``text_col`` rendered as a string, lower-cased, with
      every character other than letters, digits and whitespace removed.
    * ``tokens`` -- ``clean_text`` split into tokens by ``Tokenizer``.
    * ``filtered_tokens`` -- ``tokens`` with the default ``StopWordsRemover``
      stop words dropped.

    Args:
        df: Input Spark DataFrame containing ``text_col``.
        text_col: Name of the column to preprocess. Defaults to ``"cmp_bd"``,
            the column this function originally hard-coded.

    Returns:
        A new DataFrame with the three columns above added.
    """
    # Cast explicitly so lower()/regexp_replace() always receive a string
    # instead of relying on implicit type coercion of a non-string column.
    as_text = col(text_col).cast("string")

    # Lower-case first, then strip special characters and punctuation.
    cleaned = df.withColumn(
        "clean_text",
        regexp_replace(lower(as_text), "[^a-zA-Z0-9\\s]", ""),
    )

    # NOTE(review): Tokenizer may fail on null inputs -- confirm whether
    # `text_col` can contain nulls in the data passed here.
    tokenizer = Tokenizer(inputCol="clean_text", outputCol="tokens")
    remover = StopWordsRemover(inputCol="tokens", outputCol="filtered_tokens")

    # Tokenize, then remove stop words from the produced tokens.
    return remover.transform(tokenizer.transform(cleaned))

# Run the preprocessing pipeline on the parsed block, then print the first
# 20 rows without truncating cell contents.
preprocessed_data = preprocess_data(df=parsed)
preprocessed_data.show(n=20, truncate=False)

+-----+-----+-----------------+------------+------------+------------+-------+------+------+------+-------+--------+----------+------+---------------+
|id_1 |id_2 |cmp_fname_c1     |cmp_fname_c2|cmp_lname_c1|cmp_lname_c2|cmp_sex|cmp_bd|cmp_bm|cmp_by|cmp_plz|is_match|clean_text|tokens|filtered_tokens|
+-----+-----+-----------------+------------+------------+------------+-------+------+------+------+-------+--------+----------+------+---------------+
|37291|53113|0.833333333333333|null        |1.0         |null        |1      |1     |1     |1     |0      |true    |1         |[1]   |[1]            |
|39086|47614|1.0              |null        |1.0         |null        |1      |1     |1     |1     |1      |true    |1         |[1]   |[1]            |
|70031|70237|1.0              |null        |1.0         |null        |1      |1     |1     |1     |1      |true    |1         |[1]   |[1]            |
|84795|97439|1.0              |null        |1.0         |null        |1      |1     |1     |1 