**First, grab our filtered dataset according to the mappings**

In [1]:
# need to grab mean and std for each column so that we can normalize the df and apply clustering
import pyspark.sql.functions as F
import pyspark.sql.types as T

# Load the pre-processed dataset and standardise every feature column
# (zero mean, unit population standard deviation) ahead of clustering.
joined = spark.read.parquet('./FINAL_processed_data')

columns = joined.columns
#print(joined.printSchema())

# 'Season' is used later as a filter and 'record_id' as the row key, so both
# are passed through without scaling.
unscaled_columns = ('Season', 'record_id')

# below is for mean, standard deviation based scaling

# A single aggregation produces one Row holding '<col>_mean' and
# '<col>_stddev' for every column of the dataframe.
stats = joined.select(
    [F.mean(c).alias(c + '_mean') for c in columns]
    + [F.stddev_pop(c).alias(c + '_stddev') for c in columns]
).collect()[0]

# scaling join column
# All scaled expressions are collected and applied in ONE select instead of
# chaining withColumn in a loop, which adds a projection per call and bloats
# the query plan. Column order is kept exactly as in `columns`.
scaled_columns = []
for column in columns:
    if column in unscaled_columns:
        scaled_columns.append(joined[column])
        continue
    # A constant column has stddev 0 (and an all-null one has stddev None);
    # dividing by that would turn the whole column into nulls, so such a
    # column is only centred.
    spread = stats[column + '_stddev'] or 1.0
    scaled_columns.append(
        ((joined[column] - stats[column + '_mean']) / spread).alias(column)
    )
joined = joined.select(scaled_columns)

# below is for max, min based scaling (alternative, currently disabled)
# stats = joined.select(*[[F.max(c).alias(c+'_max') for c in joined.columns] + [F.min(c).alias(c+'_min') for c in joined.columns]]).collect()[0]
# # scaling join column
# for column in columns:
#     if column != 'Season':
#         joined = joined.withColumn(column,(joined[column]-stats[column+'_min'])/(stats[column+'_max'] - stats[column+'_min']))

print(stats)


print(joined.take(1))



Row(ArrDelay_mean=7.159954113241544, DepDelay_mean=8.155785553305384, ActualElapsedTime_mean=122.11903171401364, Distance_mean=715.1781956065518, origin_lat_mean=37.099659857406145, origin_long_mean=-92.27161607308254, dest_lat_mean=37.09915724198568, dest_long_mean=-92.26712071709692, Season_mean=1.4983448205364176, origin_prcp_mean=25.696228836739326, origin_tmax_mean=205.3709193033514, origin_tmin_mean=94.5765910747429, origin_awnd_mean=38.636158741871185, dest_prcp_mean=25.612486723460716, dest_tmax_mean=205.36587148443542, dest_tmin_mean=94.56188964755093, dest_awnd_mean=38.63819560488218, record_id_mean=327715972788.13043, ArrDelay_stddev=30.939682127010055, DepDelay_stddev=28.437824166810305, ActualElapsedTime_stddev=70.31234179026457, Distance_stddev=569.0274209367315, origin_lat_stddev=5.5991075699082264, origin_long_stddev=16.70378425786922, dest_lat_stddev=5.5997072673137644, dest_long_stddev=16.697214735116198, Season_stddev=1.113054367950728, origin_prcp_stddev=83.85304742

In [None]:

from math import sqrt
from pyspark.sql import Row
import numpy as np


# for cure we only choose the distance to the closest representative to x
def cure_tag(clusters, x):
    """Return the index of the cluster owning the representative nearest to x.

    Args:
        clusters: iterable of clusters; each cluster is an iterable of
            representative points that can be subtracted from a NumPy array.
        x: iterable of numeric feature values describing one record.

    Returns:
        The position (within ``clusters``) of the cluster whose representative
        has the smallest Euclidean distance to ``x``. The earliest cluster
        wins on ties. ``None`` when there is no representative at all.
    """
    point = np.array(list(x))
    best_index = None
    best_gap = None
    for index, representatives in enumerate(clusters):
        for rep in representatives:
            gap = np.linalg.norm(rep - point)
            # Keep the current best unless this representative is strictly closer.
            if best_gap is not None and not (gap < best_gap):
                continue
            best_gap = gap
            best_index = index
    return best_index

def wrapped_cure_tag(clusters):
    """Build a Spark UDF that tags a row with its closest CURE cluster.

    ``clusters`` is captured in the closure so the returned UDF only needs
    the row's feature values; it calls ``cure_tag(clusters, row)`` and is
    declared as returning an integer.
    """
    result_type = T.IntegerType()
    tag_row = lambda row: cure_tag(clusters, row)
    return F.udf(tag_row, result_type)
    
# num features
num_features = len(joined.columns)

# printSchema() prints the schema itself and returns None, so wrapping it in
# print() only emitted a stray "None" line.
joined.printSchema()

# `fraction` is part of the directory name the clusters are read from.
fraction = 0.2
# Season value 4 does not filter on 'Season': it tags the whole dataset and
# maps to the 'EVERYTHING' directories.
ALL_SEASONS = 4

# Tag every record with its closest CURE cluster for each stored
# (num_reps, season, k) configuration and write the record_id -> tag mapping.
for num_reps in range(4, 8):
    for season in range(5):
        # The season subset does not depend on k, so it is built once here.
        if season != ALL_SEASONS:
            season_label = str(season)
            season_dat = joined.where(joined.Season == season).drop('Season')
        else:
            season_label = 'EVERYTHING'
            season_dat = joined.drop('Season')

        # Every column except the row key is a feature. Selecting by name
        # (rather than columns[:-1]) keeps this correct even if 'record_id'
        # is not the last column.
        feature_columns = [c for c in season_dat.columns if c != 'record_id']

        for k in range(3, 10):
            # getting clusters as a list of lists of representatives
            clusters = spark.read\
                            .parquet('./CURE_Clusters/k={}_season={}_num_reps={}_fraction={}'
                                     .format(k, season_label, num_reps, fraction))\
                            .toPandas().to_numpy()
            #print(clusters)

            # we select record_id to tag and we are good
            final_dat = season_dat\
                .withColumn('tag', wrapped_cure_tag(clusters)(F.array(*feature_columns)))\
                .select('record_id', 'tag')
            print('tagged configuration for cure: k='+str(k)+' num_reps='+str(num_reps) + ' season='+str(season))

            # writing mappings to disk
            final_dat.coalesce(20).write.parquet(
                './CURE_Mapping/k={}_season={}_num_reps={}'.format(k, season_label, num_reps))
            
   


root
 |-- ArrDelay: double (nullable = true)
 |-- DepDelay: double (nullable = true)
 |-- ActualElapsedTime: double (nullable = true)
 |-- Distance: double (nullable = true)
 |-- origin_lat: double (nullable = true)
 |-- origin_long: double (nullable = true)
 |-- dest_lat: double (nullable = true)
 |-- dest_long: double (nullable = true)
 |-- Season: long (nullable = true)
 |-- origin_prcp: double (nullable = true)
 |-- origin_tmax: double (nullable = true)
 |-- origin_tmin: double (nullable = true)
 |-- origin_awnd: double (nullable = true)
 |-- dest_prcp: double (nullable = true)
 |-- dest_tmax: double (nullable = true)
 |-- dest_tmin: double (nullable = true)
 |-- dest_awnd: double (nullable = true)
 |-- record_id: long (nullable = true)

None
tagged configuration for cure: k=3 num_reps=4 season=0
tagged configuration for cure: k=4 num_reps=4 season=0
tagged configuration for cure: k=5 num_reps=4 season=0
tagged configuration for cure: k=6 num_reps=4 season=0
tagged configuration fo