# Reading in our dataset and normalizing before BFR

In [1]:
import pyspark.sql.functions as F
import pyspark.sql.types as T
# need to grab mean and std for each column so that we can normalize the df and apply clustering

joined = spark.read.parquet('./FINAL_processed_data')
print(joined.printSchema())

columns = joined.columns



# below is for mean, standard deviation based scaling

stats = joined.select(*[[F.mean(c).alias(c+'_mean') for c in joined.columns] + [F.stddev_pop(c).alias(c+'_stddev') for c in joined.columns]]).collect()[0]
# scaling join column
for column in columns:
    if column != 'Season' and column!= 'record_id':
        joined = joined.withColumn(column,(joined[column]-stats[column+'_mean'])/(stats[column+'_stddev']))

# below is for max, min based scaling
'''stats = joined.select(*[[F.max(c).alias(c+'_max') for c in joined.columns] + [F.min(c).alias(c+'_min') for c in joined.columns]]).collect()[0]
# scaling join column
for column in columns:
    if column != 'Season' and column!='record_id':
        joined = joined.withColumn(column,(joined[column]-stats[column+'_min'])/(stats[column+'_max'] - stats[column+'_min']))'''

print(stats)


print(joined.take(1))




root
 |-- ArrDelay: integer (nullable = true)
 |-- DepDelay: integer (nullable = true)
 |-- ActualElapsedTime: integer (nullable = true)
 |-- Distance: integer (nullable = true)
 |-- origin_lat: double (nullable = true)
 |-- origin_long: double (nullable = true)
 |-- dest_lat: double (nullable = true)
 |-- dest_long: double (nullable = true)
 |-- Season: long (nullable = true)
 |-- origin_prcp: long (nullable = true)
 |-- origin_tmax: long (nullable = true)
 |-- origin_tmin: long (nullable = true)
 |-- origin_awnd: long (nullable = true)
 |-- dest_prcp: long (nullable = true)
 |-- dest_tmax: long (nullable = true)
 |-- dest_tmin: long (nullable = true)
 |-- dest_awnd: long (nullable = true)
 |-- record_id: long (nullable = true)

None
Row(ArrDelay_mean=7.159954113241544, DepDelay_mean=8.155785553305384, ActualElapsedTime_mean=122.11903171401364, Distance_mean=715.1781956065518, origin_lat_mean=37.099659857406145, origin_long_mean=-92.27161607308254, dest_lat_mean=37.09915724198568, des

In [2]:
# bfr loop
from BFR_Logic import BFR
from math import sqrt
from pyspark.sql import Row
import numpy as np


# num features
num_features = len(joined.columns)

print(joined.printSchema())

# num_clusters
for season in range(5):
    # if season is 4, then we do not filter by season at all! we take collective results
    
    for k in range(3,10):

        model=BFR.BFR_Algorithm(k,4*sqrt(num_features),4*sqrt(num_features))
        
        # this is a mapping from record id to cluster number (we can save this in parquet format as well after creating)
        id_mapping = []
        # getting a sample of our data to initialize bfr
        #approx 96 million records, so 0.1% of these is 96000 records
        if season!=4:
            final_dat = joined.where(joined.Season == season).drop('Season')
        else:
            final_dat = joined.drop('Season')
        
        #dont need record id when sampling
        sample = final_dat.drop('record_id').sample(fraction=0.0001)
        #print(sample.printSchema())
        test = sample.toPandas().to_numpy()
        #print(test)

        model.process_initial(test)
        del(test)


        # looping over chunks and performing steps
        # we have approx 96 million records, I will use the default partitions to get our batched data
        #joined = joined.repartition(30)
        final_dat = final_dat.withColumn('batch_id',F.spark_partition_id())
        unique_partitions = final_dat.select('batch_id').distinct().rdd.map(lambda x: x['batch_id']).collect()
        for i in unique_partitions:
            print('on batch partition: ' + str(i))
            batch = final_dat.filter(final_dat.batch_id == i).drop('batch_id').collect()

            ids = [record[-1] for record in batch]
            records = [np.array(record[:-1]) for record in batch]
            
            print(len(batch))
           
            # performing BFR step
            ids_categorized = model.process_batch(records,ids)
            id_mapping.extend(ids_categorized)
            del(batch)





        # finalizing bfr
        ids_categorized = model.finalize()
        id_mapping.extend(ids_categorized)



        # saving our centers to disk and the associated id mappings
        cluster_centers = [(cluster.centroid.tolist(),cluster.num_examples,cluster.sum_examples.tolist(),cluster.sum_squared_examples.tolist()) for cluster in model.clusters]
        print(cluster_centers)
        if season!=4:
            sc.parallelize(cluster_centers).toDF(['centroid','num','sum','sum_squared'])\
                        .coalesce(1).write.parquet('./New_BFR_Clusters/k='+str(k)+'_season='+str(season))
            sc.parallelize(id_mapping).toDF(['record_id','cluster_id'])\
                        .coalesce(1).write.parquet('./New_BFR_Mapping/k='+str(k)+'_season='+str(season))
            print('wrote clusters of k=' + str(k) + ' and season: ' + str(season) + ' to disk!')
        else:
            sc.parallelize(cluster_centers).toDF(['centroid','num','sum','sum_squared'])\
                        .coalesce(1).write.parquet('./New_BFR_Clusters/k='+str(k)+'_season=EVERYTHING')
            sc.parallelize(id_mapping).toDF(['record_id','cluster_id'])\
                        .coalesce(1).write.parquet('./New_BFR_Mapping/k='+str(k)+'_season=EVERYTHING')
            print('wrote clusters of k=' + str(k) + ' and season: EVERYTHING ' + ' to disk!')
        


root
 |-- ArrDelay: double (nullable = true)
 |-- DepDelay: double (nullable = true)
 |-- ActualElapsedTime: double (nullable = true)
 |-- Distance: double (nullable = true)
 |-- origin_lat: double (nullable = true)
 |-- origin_long: double (nullable = true)
 |-- dest_lat: double (nullable = true)
 |-- dest_long: double (nullable = true)
 |-- Season: long (nullable = true)
 |-- origin_prcp: double (nullable = true)
 |-- origin_tmax: double (nullable = true)
 |-- origin_tmin: double (nullable = true)
 |-- origin_awnd: double (nullable = true)
 |-- dest_prcp: double (nullable = true)
 |-- dest_tmax: double (nullable = true)
 |-- dest_tmin: double (nullable = true)
 |-- dest_awnd: double (nullable = true)
 |-- record_id: long (nullable = true)

None
on batch partition: 0
595852
on batch partition: 1
660083
on batch partition: 2
654871
on batch partition: 4
613506
on batch partition: 6
631935
on batch partition: 8
665864
on batch partition: 12
613740
on batch partition: 19
941203
on batch 

In [3]:
test = spark.read.parquet('./New_BFR_Clusters/k=6_season=0')

In [4]:
print(test.printSchema())
print(test.take(1))
print(test.count())

root
 |-- centroid: array (nullable = true)
 |    |-- element: double (containsNull = true)
 |-- num: long (nullable = true)
 |-- sum: array (nullable = true)
 |    |-- element: double (containsNull = true)
 |-- sum_squared: array (nullable = true)
 |    |-- element: double (containsNull = true)

None
[Row(centroid=[0.04324155713877842, 0.03979374946371594, 0.008185494380741381, -0.007687764678705816, -0.01621026988829133, 0.006570885942134952, -0.016091546237603102, 0.007253052857454054, -0.03444646046763022, -0.9498703215125259, -0.9305413807368075, 0.07281703923584763, -0.03385186784508647, -0.9499992335378172, -0.9306071021934933, 0.07358192961673748], num=23626244, sum=[1021635.5799007207, 940176.8345046219, 193392.48750002476, -181633.00411368522, -382987.7916866237, 155245.35456505025, -380182.7977468929, 171362.3965551067, -813840.4799445856, -22441867.984413385, -21985197.713384714, 1720393.1363437097, -799792.4895637671, -22444913.691377454, -21986750.464556407, 1738464.62311