In [1]:
# need to grab mean and std for each column so that we can normalize the df and apply clustering
import pyspark.sql.functions as F
import pyspark.sql.types as T

joined = spark.read.parquet('./FINAL_processed_data')
#print(joined.count())

columns = joined.columns
#print(joined.printSchema())

# below is for mean, standard deviation based scaling

# one aggregate expression per column and statistic; all of them are evaluated in a single pass
mean_exprs = [F.mean(c).alias(c+'_mean') for c in joined.columns]
stddev_exprs = [F.stddev_pop(c).alias(c+'_stddev') for c in joined.columns]
stats = joined.select(mean_exprs + stddev_exprs).collect()[0]

# scaling join column: every column except Season and record_id becomes (value - mean) / stddev
for column in columns:
    if column == 'Season' or column == 'record_id':
        continue
    centered = joined[column] - stats[column+'_mean']
    joined = joined.withColumn(column, centered/(stats[column+'_stddev']))

# below is for max, min based scaling
'''stats = joined.select(*[[F.max(c).alias(c+'_max') for c in joined.columns] + [F.min(c).alias(c+'_min') for c in joined.columns]]).collect()[0]
# scaling join column
for column in columns:
    if column != 'Season':
        joined = joined.withColumn(column,(joined[column]-stats[column+'_min'])/(stats[column+'_max'] - stats[column+'_min']))'''

print(stats)


print(joined.take(1))




Row(ArrDelay_mean=7.159954113241544, DepDelay_mean=8.155785553305384, ActualElapsedTime_mean=122.11903171401364, Distance_mean=715.1781956065518, origin_lat_mean=37.099659857406145, origin_long_mean=-92.27161607308254, dest_lat_mean=37.09915724198568, dest_long_mean=-92.26712071709692, Season_mean=1.4983448205364176, origin_prcp_mean=25.696228836739326, origin_tmax_mean=205.3709193033514, origin_tmin_mean=94.5765910747429, origin_awnd_mean=38.636158741871185, dest_prcp_mean=25.612486723460716, dest_tmax_mean=205.36587148443542, dest_tmin_mean=94.56188964755093, dest_awnd_mean=38.63819560488218, record_id_mean=327715972788.13043, ArrDelay_stddev=30.939682127010055, DepDelay_stddev=28.437824166810305, ActualElapsedTime_stddev=70.31234179026457, Distance_stddev=569.0274209367315, origin_lat_stddev=5.5991075699082264, origin_long_stddev=16.70378425786922, dest_lat_stddev=5.5997072673137644, dest_long_stddev=16.697214735116198, Season_stddev=1.113054367950728, origin_prcp_stddev=83.85304742

In [2]:
# bfr loop
from BFR_Logic import BFR
from math import sqrt
from pyspark.sql import Row
from pyspark.sql.functions import struct, array
import numpy as np



# function for getting stddev and centroid numpy arrays from the centroid dataframes
# adding standard deviation for centroids
def get_centroids_and_stddevs(centroids):
    """Split stored cluster summaries into centroid and standard-deviation lists.

    Each entry of ``centroids`` is read positionally: index 0 is the centroid
    coordinates, index 1 the number of points in the cluster, index 2 the
    per-dimension sum of the points and index 3 the per-dimension sum of
    squares.

    Returns a tuple ``(use_centroids, use_stddevs)``; both are lists holding one
    plain Python list per cluster. The input summaries are not modified.
    """
    use_centroids = []
    use_stddevs = []
    for centroid in centroids:
        use_centroids.append(np.array(centroid[0]).tolist())
        num = centroid[1]
        # np.array copies, so the corrections below never touch the caller's data
        sum_val = np.array(centroid[2], dtype=float)
        sum_squared = np.array(centroid[3], dtype=float)
        # a zero sum of squares means every value was zero, so a non-zero sum
        # in that dimension is inconsistent and is reset to zero
        sum_val[(sum_squared == 0) & (sum_val != 0)] = 0
        variance = (sum_squared / num) - ((sum_val / num) ** 2)
        # floating-point cancellation can leave a tiny negative variance, which
        # would turn into NaN under sqrt; clamp it to zero first
        stddev = np.sqrt(np.maximum(variance, 0))
        use_stddevs.append(stddev.tolist())
    return use_centroids, use_stddevs

# squared error of record based on euclidean distance
def record_error(x,use_centroids):
    """Squared Euclidean distance between a record and its assigned centroid.

    ``x`` must support slicing and lookup by the key ``'centroid_id'``; every
    element except the last one is used as the feature vector (the last column
    is expected to be the record_id). ``use_centroids`` is indexed by that
    centroid id.

    Returns a plain Python ``float`` rather than a numpy scalar, because the
    result is handed to a Spark UDF declared with a 'double' return type.
    """
    # last column is the record_id so we do not take it for purposes of distance
    features = np.array(x[:-1])
    #tagged_centroid = id_mapping_broadcast.value[x['record_id']]
    assigned = use_centroids[x['centroid_id']]
    # float() returns a python number instead of a numpy datatype
    return float(np.square(np.linalg.norm(features - assigned)))
    

def record_error_wrapped(centroids):
    """Build a Spark UDF (return type 'double') that applies record_error with ``centroids`` bound."""
    # kept as a lambda so the function handed to F.udf is unchanged
    error_for_record = lambda c: record_error(c,centroids)
    return F.udf(error_for_record, returnType='double')


def squared_error(centers, x):
    """Squared Euclidean distance between row ``x`` and the center it is tagged with.

    ``x`` must support lookup by the key ``'tag'`` and provide ``asDict()``;
    every field other than ``'tag'`` is treated as a coordinate, taken in the
    order ``asDict()`` yields them. ``centers`` is indexed by the tag value.
    """
    centroid = np.array(centers[x['tag']])
    # getting values of x which are not the tag
    vals = np.array([value for key, value in x.asDict().items() if key != 'tag'])
    return np.square(np.linalg.norm(centroid-vals))
    
# num features
num_features = len(joined.columns)

# printSchema() writes the schema itself and returns None, so wrapping it in
# print() only added a stray "None" line to the output
joined.printSchema()

# cluster counts for which BFR results are read from disk
K_VALUES = range(3, 10)
# season index that stands for "all seasons together"
ALL_SEASONS = 4

# num_clusters
for season in range(5):
    # season==4 means we take EVERYTHING; the per-season runs are currently skipped
    if season != ALL_SEASONS:
        continue

    # suffix shared by the cluster, mapping and error paths of this run
    season_label = 'EVERYTHING' if season == ALL_SEASONS else str(season)

    errors = np.zeros(len(K_VALUES))
    for idx, k in enumerate(K_VALUES):
        run_name = 'k='+str(k)+'_season='+season_label

        # getting centroids
        centroids = spark.read.parquet('./New_BFR_Clusters/'+run_name).toPandas().to_numpy()
        # getting id_mapping (record_id -> cluster_id) that was generated during clustering
        id_mapping = spark.read.parquet('./New_BFR_Mapping/'+run_name)\
                          .rdd.map(lambda x: (x['record_id'],x['cluster_id']))\
                          .collectAsMap()

        use_centroids,_ = get_centroids_and_stddevs(centroids)

        print('got centroids')
        print(use_centroids)

        if season != ALL_SEASONS:
            final_dat = joined.where(joined.Season == season).drop('Season')
        else:
            final_dat = joined.drop('Season')

        # looping through by collecting in batches (since ilab has too many open files limit)
        error = 0
        final_dat = final_dat.withColumn('batch_id',F.spark_partition_id())
        final_dat.printSchema()
        unique_partitions = final_dat.select('batch_id').distinct().rdd.map(lambda x: x['batch_id']).collect()

        # convert each centroid once instead of once per record
        centroid_arrays = [np.array(c) for c in use_centroids]

        for i in unique_partitions:
            print('on partition: ' + str(i))
            batch = final_dat.where(final_dat.batch_id == i).drop('batch_id').collect()
            for record in batch:
                # record_id is the last remaining column, so everything before it is the feature vector
                actual_record = np.array(record[:-1])
                tagged_centroid = id_mapping[record['record_id']]
                error += np.square(np.linalg.norm(actual_record-centroid_arrays[tagged_centroid]))

        # the mapping is held in driver memory; release it before loading the next k
        del id_mapping

        errors[idx] = error
        print('Season = '+str(season)+', K = '+str(k)+', Error = '+str(errors[idx]))

    # writing errors to disk as (k, error) rows with plain python numbers
    spark_errors = list(zip(K_VALUES, errors.tolist()))
    sc.parallelize(spark_errors).toDF(['k','error']).coalesce(1).write.option('header',True)\
                                .csv('./New_BFR_Errors/season='+season_label)
    
   


root
 |-- ArrDelay: double (nullable = true)
 |-- DepDelay: double (nullable = true)
 |-- ActualElapsedTime: double (nullable = true)
 |-- Distance: double (nullable = true)
 |-- origin_lat: double (nullable = true)
 |-- origin_long: double (nullable = true)
 |-- dest_lat: double (nullable = true)
 |-- dest_long: double (nullable = true)
 |-- Season: long (nullable = true)
 |-- origin_prcp: double (nullable = true)
 |-- origin_tmax: double (nullable = true)
 |-- origin_tmin: double (nullable = true)
 |-- origin_awnd: double (nullable = true)
 |-- dest_prcp: double (nullable = true)
 |-- dest_tmax: double (nullable = true)
 |-- dest_tmin: double (nullable = true)
 |-- dest_awnd: double (nullable = true)
 |-- record_id: long (nullable = true)

None
got centroids
[[-0.012146903829759479, -0.014144457528313573, -0.0006079829804583737, -0.00033503296649035246, 0.00023663578282912627, -0.0002730693648188614, 0.0002421773419912244, -0.0001877857550575201, -0.005119784567767242, -0.00025790868