In [78]:
import pyspark
import sys 
from pyspark import SparkContext
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import col 
from pyspark.sql.functions import expr




## Sample Reading of Text File 

In [79]:
# Reuse the active Spark session, or start one if none exists yet.
spark = SparkSession.builder.getOrCreate()

# Each line of the text file becomes one row in a single column named `value`.
df = spark.read.text(paths="wordcount.txt")
df.show()


+--------------------+
|               value|
+--------------------+
|Hello world world...|
|sample text word ...|
|Word PySpark PySp...|
|Processing text d...|
|example example e...|
+--------------------+



## Clean and Preprocess Data 

In [80]:
spark = SparkSession.builder.getOrCreate()

# Load every CSV under the directory into one DataFrame:
#   header      -> first line of each file supplies the column names
#   nullValue   -> "?" entries are read as null
#   inferSchema -> column types are inferred from the data
# NOTE(review): the path is an absolute, machine-specific location; it must
# exist on the machine running this notebook.
df = (
    spark.read
    .option("header", "true")
    .option("nullValue", "?")
    .option("inferSchema", "true")
    .csv("/home/lplab/Desktop/210962021/Lab3/files/")
)
df.show()

# count() triggers a full pass over the loaded files.
print(
    "Count of Dataframe with all 10 .csv files loaded = ",
    df.count(),
)


+-----+-----+------------+------------+------------+------------+-------+------+------+------+-------+--------+
| id_1| id_2|cmp_fname_c1|cmp_fname_c2|cmp_lname_c1|cmp_lname_c2|cmp_sex|cmp_bd|cmp_bm|cmp_by|cmp_plz|is_match|
+-----+-----+------------+------------+------------+------------+-------+------+------+------+-------+--------+
|53719|60579|         1.0|        null|         1.0|        null|      1|     1|     1|     1|      1|    true|
|58967|58973|         1.0|        null|         1.0|        null|      1|     1|     1|     1|      1|    true|
| 1499|23331|         1.0|        null|         1.0|        null|      1|     1|     1|     1|      1|    true|
|18441|36183|         1.0|         1.0|         1.0|        null|      1|     1|     1|     1|      1|    true|
| 8902|11508|         1.0|        null|         1.0|        null|      1|     1|     1|     1|      1|    true|
|17704|21348|         1.0|        null|         1.0|        null|      1|     1|     1|     1|      1|  

In [81]:
# Show the inferred column types before filtering.
df.printSchema()

# Keep only the rows whose `is_match` label is false.
df1 = df.filter(col('is_match') == 'false')
df1.show()


root
 |-- id_1: integer (nullable = true)
 |-- id_2: integer (nullable = true)
 |-- cmp_fname_c1: double (nullable = true)
 |-- cmp_fname_c2: double (nullable = true)
 |-- cmp_lname_c1: double (nullable = true)
 |-- cmp_lname_c2: double (nullable = true)
 |-- cmp_sex: integer (nullable = true)
 |-- cmp_bd: integer (nullable = true)
 |-- cmp_bm: integer (nullable = true)
 |-- cmp_by: integer (nullable = true)
 |-- cmp_plz: integer (nullable = true)
 |-- is_match: boolean (nullable = true)

+-----+-----+-----------------+-----------------+-----------------+------------+-------+------+------+------+-------+--------+
| id_1| id_2|     cmp_fname_c1|     cmp_fname_c2|     cmp_lname_c1|cmp_lname_c2|cmp_sex|cmp_bd|cmp_bm|cmp_by|cmp_plz|is_match|
+-----+-----+-----------------+-----------------+-----------------+------------+-------+------+------+------+-------+--------+
|84014|88799|            0.125|             null|             0.25|        null|      0|     1|     1|     1|      0|   false

In [82]:
# Number of rows per `is_match` label, largest group first.
(
    df.groupBy("is_match")
    .count()
    .orderBy(col("count").desc())
    .show()
)

# describe() statistics for cmp_fname_c1 / cmp_fname_c2 over all rows.
summary = df.describe()
summary.select("summary", "cmp_fname_c1", "cmp_fname_c2").show()

# The same statistics restricted to rows labelled as matches.
matches = df.filter("is_match = true")
match_summary = matches.describe()
match_summary.select("summary", "cmp_fname_c1", "cmp_fname_c2").show()

# The same statistics restricted to rows labelled as non-matches.
misses = df.filter("is_match = false")
misses_summary = misses.describe()
misses_summary.select("summary", "cmp_fname_c1", "cmp_fname_c2").show()

+--------+-------+
|is_match|  count|
+--------+-------+
|   false|5728201|
|    true|  20931|
+--------+-------+

+-------+-------------------+-------------------+
|summary|       cmp_fname_c1|       cmp_fname_c2|
+-------+-------------------+-------------------+
|  count|            5748125|             103698|
|   mean| 0.7129024704429502| 0.9000176718903219|
| stddev|0.38875835961627936|0.27131761057823345|
|    min|                0.0|                0.0|
|    max|                1.0|                1.0|
+-------+-------------------+-------------------+

+-------+-------------------+-------------------+
|summary|       cmp_fname_c1|       cmp_fname_c2|
+-------+-------------------+-------------------+
|  count|              20922|               1333|
|   mean| 0.9973163859635039| 0.9898900320318174|
| stddev|0.03650667584833679|0.08251973727615235|
|    min|                0.0|                0.0|
|    max|                1.0|                1.0|
+-------+-------------------+-----

## Computing Similarity Score 

In [83]:
# Columns whose values are added together to form the score.
good_features = [
    "cmp_lname_c1",
    "cmp_plz",
    "cmp_by",
    "cmp_bd",
    "cmp_bm",
]

# SQL expression "a + b + ..." built from the feature column names.
sum_expression = " + ".join(good_features)

# Nulls in the feature columns are replaced with 0 before the columns are
# summed; only the resulting score and the label are kept.
scored = (
    df.fillna(0, subset=good_features)
    .withColumn('score', expr(sum_expression))
    .select('score', 'is_match')
)

scored.show()

+-----+--------+
|score|is_match|
+-----+--------+
|  5.0|    true|
|  5.0|    true|
|  5.0|    true|
|  5.0|    true|
|  5.0|    true|
|  5.0|    true|
|  5.0|    true|
|  5.0|    true|
|  4.0|    true|
|  5.0|    true|
|  4.0|    true|
|  5.0|    true|
|  5.0|    true|
|  5.0|    true|
|  5.0|    true|
|  5.0|    true|
|  5.0|    true|
|  5.0|    true|
|  5.0|    true|
|  5.0|    true|
+-----+--------+
only showing top 20 rows



## Calculating Precision, Recall, and F1-Score

In [84]:
def calculate_metrics(scored, threshold):
    """Compute precision, recall and F1 score for a given score cut-off.

    A row counts as a predicted match when its ``score`` is at least
    ``threshold``; the ``is_match`` column supplies the actual label.

    Args:
        scored: Spark DataFrame with a ``score`` column and an ``is_match``
            label column.
        threshold: Minimum score for a row to be predicted as a match.

    Returns:
        Tuple ``(precision, recall, f1_score)`` of floats. Each value is
        ``0.0`` when its denominator is zero.
    """
    # Local import so this cell only relies on `col` from the notebook's
    # top-level imports.
    from pyspark.sql import functions as F

    predicted_match = col('score') >= threshold
    predicted_miss = col('score') < threshold
    labelled_match = col('is_match') == 'true'
    labelled_miss = col('is_match') == 'false'

    # Single aggregation job instead of three separate filter().count() scans.
    # when() without otherwise() yields NULL for rows that fail the condition,
    # and count() skips NULLs, so each aggregate counts only qualifying rows
    # (and is 0, not NULL, on an empty DataFrame).
    counts = scored.agg(
        F.count(F.when(predicted_match & labelled_match, True)).alias('tp'),
        F.count(F.when(predicted_match & labelled_miss, True)).alias('fp'),
        F.count(F.when(predicted_miss & labelled_match, True)).alias('fn'),
    ).first()
    tp, fp, fn = counts['tp'], counts['fp'], counts['fn']

    # Guard every ratio against a zero denominator; 0.0 keeps the return
    # type consistently float.
    precision = tp / (tp + fp) if (tp + fp) != 0 else 0.0
    recall = tp / (tp + fn) if (tp + fn) != 0 else 0.0
    f1_score = (
        2 * (precision * recall) / (precision + recall)
        if (precision + recall) != 0
        else 0.0
    )

    return precision, recall, f1_score

# Rows scoring at least this value are treated as predicted matches.
threshold = 4.0

precision, recall, f1_score = calculate_metrics(scored, threshold)

# One metric per line, in the order precision, recall, F1.
print(
    f"Precision: {precision}",
    f"Recall: {recall}",
    f"F1 Score: {f1_score}",
    sep="\n",
)

Precision: 0.9703831132601822
Recall: 0.9971334384405905
F1 Score: 0.9835764273427743
