### Importing Libs

In [None]:
from pyspark.sql.session import SparkSession
from pyspark.sql.functions import regexp_extract, col, avg,round, regexp_replace,max,min,dense_rank,col,split
from pyspark.sql.window import Window

### Create spark session

In [None]:
# Build the Spark session used by every cell below; getOrCreate() returns the
# already-running session if there is one instead of starting a second.
spark = (
    SparkSession.builder
    .appName("CompetitiveProgramming")
    .getOrCreate()
)

### Read dataset

In [None]:
# Load the doctors CSV (comma separated, first row is the header).
# NOTE: no schema is supplied and schema inference is not enabled, so the
# columns arrive as strings and are converted by the cleaning helpers below.
data = (
    spark.read
    .format("csv")
    .option("header", "true")
    .option("delimiter", ",")
    .load("/Users/sahilnagpal/Desktop/coding/competitive-programming/Projects/RAISE/clicnical-doctors-dataset.csv")
)

In [None]:
# Sanity check: print the Python type of the object returned by the reader.
print(type(data))

### Data Cleansing

In [None]:
from pyspark.sql import functions as F

def clean_doctor_id(dataset, column_name):
    """Derive an integer ``Doctor_Id`` column from the first token of a column.

    The text in ``column_name`` is split on single spaces; the first piece is
    cast to ``int`` and incremented by one, and the result is stored in the
    ``Doctor_Id`` column.

    Args:
        dataset: DataFrame to transform.
        column_name: Name of the column holding the raw identifier text.

    Returns:
        A new DataFrame with the ``Doctor_Id`` column added (or replaced).

    Raises:
        ValueError: If ``column_name`` is not a column of ``dataset``.
    """
    column_missing = column_name not in dataset.columns
    if column_missing:
        raise ValueError(f"Column {column_name} does not exist in the dataset")

    first_token = F.split(dataset[column_name], ' ').getItem(0)
    # NOTE(review): the + 1 presumably shifts a 0-based source id to 1-based
    # -- confirm against the dataset.
    doctor_id = first_token.cast("int") + 1
    return dataset.withColumn("Doctor_Id", doctor_id)

def clean_fees(dataset, column_name):
    """Extract the numeric fee from a free-text fee column into ``fees_value``.

    The text in ``column_name`` is split on single spaces and the first piece
    is taken as the amount. Thousands separators (``,``) are removed and the
    result is cast to ``double``.

    Args:
        dataset: DataFrame to transform.
        column_name: Name of the column holding the raw fee text.

    Returns:
        A new DataFrame with a numeric ``fees_value`` column added.

    Raises:
        ValueError: If ``column_name`` is not a column of ``dataset``.
    """
    if column_name not in dataset.columns:
        raise ValueError(f"Column {column_name} does not exist in the dataset")

    amount_text = F.split(dataset[column_name], ' ').getItem(0)
    # Store the fee as a number, not a string: string columns are ordered
    # lexicographically, so max()/min() over them would rank "900" above
    # "1000" in the later analysis queries.
    amount = F.regexp_replace(amount_text, ",", "").cast("double")
    return dataset.withColumn("fees_value", amount)

def clean_waiting_times(dataset, column_name):
    """Copy the text after the first ``:`` of a column into ``waiting_period``.

    The value in ``column_name`` is split on ``:`` and the second piece
    (index 1) becomes the new ``waiting_period`` column.

    Args:
        dataset: DataFrame to transform.
        column_name: Name of the column holding the raw waiting-time text.

    Returns:
        A new DataFrame with the ``waiting_period`` column added.

    Raises:
        ValueError: If ``column_name`` is not a column of ``dataset``.
    """
    if column_name in dataset.columns:
        pieces = F.split(dataset[column_name], ':')
        return dataset.withColumn("waiting_period", pieces.getItem(1))

    raise ValueError(f"Column {column_name} does not exist in the dataset")

def clean_timings(dataset, column_name):
    """Convert an "N Hour(s) M Minute(s)" text column into ``total_minutes``.

    The digits directly before ``Hour`` and before ``Minute`` are extracted
    from ``column_name``; a part that is absent counts as 0. The total is
    stored as ``hours * 60 + minutes`` in the new ``total_minutes`` column.

    Args:
        dataset: DataFrame to transform.
        column_name: Name of the column holding the duration text.

    Returns:
        A new DataFrame with the ``total_minutes`` column added. The helper
        columns ``hours`` and ``minutes`` are dropped before returning.

    Raises:
        ValueError: If ``column_name`` is not a column of ``dataset``.
    """
    # Same up-front validation as the other cleaning helpers, so a wrong
    # column name fails with a clear message instead of a Spark analysis error.
    if column_name not in dataset.columns:
        raise ValueError(f"Column {column_name} does not exist in the dataset")

    duration_text = col(column_name)
    dataset = dataset.withColumn("hours", regexp_extract(duration_text, r'(\d+)\s*Hour', 1).cast("int"))\
       .withColumn("minutes", regexp_extract(duration_text, r'(\d+)\s*Minute', 1).cast("int"))

    # Rows that had no hour or no minute part contribute 0 for that part.
    dataset = dataset.fillna({"hours": 0, "minutes": 0})
    dataset = dataset.withColumn("total_minutes", col("hours") * 60 + col("minutes"))
    return dataset.drop("hours", "minutes")

def clean_doctor_views(dataset, column_name):
    """Parse a view-count text column into the integer ``number_without_commas``.

    The value in ``column_name`` is split on single spaces and the first piece
    is taken as the count. Commas are removed from it and the result is cast
    to ``int``.

    Args:
        dataset: DataFrame to transform.
        column_name: Name of the column holding the raw view-count text.

    Returns:
        A new DataFrame with the ``number_without_commas`` column added. The
        intermediate ``doctor_actual_views`` column is dropped again.

    Raises:
        ValueError: If ``column_name`` is not a column of ``dataset``.
    """
    if column_name not in dataset.columns:
        raise ValueError(f"Column {column_name} does not exist in the dataset")

    leading_token = F.split(dataset[column_name], ' ').getItem(0)
    views_as_int = regexp_replace(col("doctor_actual_views"), ",", "").cast("int")
    return (
        dataset
        .withColumn("doctor_actual_views", leading_token)
        .withColumn("number_without_commas", views_as_int)
        .drop("doctor_actual_views")
    )
    
    
    
    
# Apply the cleaning helpers one after another. Order matters:
# clean_timings reads the "waiting_period" column that clean_waiting_times
# creates, so it has to run after it.
cleaning_steps = (
    (clean_doctor_id, "Source"),
    (clean_fees, "fees"),
    (clean_waiting_times, "waiting_time"),
    (clean_timings, "waiting_period"),
    (clean_doctor_views, "doctor_views"),
)
for clean_step, source_column in cleaning_steps:
    data = clean_step(dataset=data, column_name=source_column)


In [None]:
# Print the cleaned DataFrame with full (untruncated) cell contents.
data.show(truncate=False)

### Analysis

#### Deliverable: SQL query as query_1.sql and results in output_1.csv containing columns such as specialization, average_fees, average_waiting_time, and average_rating.

In [None]:
# Query 1: per-specialization averages of fees, waiting time (minutes) and
# rating, rounded to two decimals. Rows without a specialization are skipped.
# Column names and order follow the deliverable (specialization, average_fees,
# average_waiting_time, average_rating), and the result is displayed with
# show() like the other analysis cells -- without it the cell only builds the
# DataFrame and never runs the aggregation.
(
    data
    .filter(col("specialization").isNotNull())
    .groupBy("specialization")
    .agg(
        round(avg("fees_value"), 2).alias("average_fees"),
        round(avg("total_minutes"), 2).alias("average_waiting_time"),
        round(avg("avg_rate"), 2).alias("average_rating"),
    )
    .show()
)

#### Deliverable: SQL query as query_2.sql and a CSV output as output_2.csv with columns doctor_id, specialization, clinic_location, doctor_views.

In [None]:
# Query 2: per-doctor listing of id, fee, specialization, clinic location and
# the parsed view count.
query_2_columns = [
    "Doctor_Id",
    "fees_value",
    "specialization",
    "clinic_location",
    "number_without_commas",
]
data.select(*query_2_columns).show()

#### Deliverable: SQL script as query_3.sql and a detailed analysis report as output_3.csv with columns clinic_location, average_fees, and average_waiting_time.

In [None]:
# Query 3a: clinic location(s) with the longest maximum waiting time, ties
# broken by the highest maximum fee. dense_rank keeps every location that
# shares first place.
# The fee is cast to double inside the aggregation: on a string column max()
# compares lexicographically (so "900" would beat "1000"). The cast is a no-op
# if fees_value is already numeric.
(
    data
    .select("clinic_location", "total_minutes", "fees_value")
    .groupby(col("clinic_location"))
    .agg(
        max("total_minutes").alias("max_minutes"),
        max(col("fees_value").cast("double")).alias("max_fees"),
    )
    .withColumn(
        "rnk",
        dense_rank().over(
            Window.orderBy(col("max_minutes").desc(), col("max_fees").desc())
        ),
    )
    .filter(col("rnk") == 1)
    .drop("rnk")
    .show()
)


# Query 3b: clinic location(s) with the shortest minimum waiting time, ties
# broken by the lowest minimum fee. dense_rank keeps every location that
# shares first place.
# The fee is cast to double inside the aggregation: on a string column min()
# compares lexicographically (so "1000" would rank below "900"). The cast is a
# no-op if fees_value is already numeric.
(
    data
    .select("clinic_location", "total_minutes", "fees_value")
    .groupby(col("clinic_location"))
    .agg(
        min("total_minutes").alias("min_minutes"),
        min(col("fees_value").cast("double")).alias("min_fees"),
    )
    .withColumn(
        "rnk",
        dense_rank().over(Window.orderBy(col("min_minutes"), col("min_fees"))),
    )
    .filter(col("rnk") == 1)
    .drop("rnk")
    .show()
)