In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, IntegerType, DateType

spark = SparkSession.builder.appName("Flight analysis").getOrCreate()
# caricare poi lo json schema
flights_df = spark.read.csv("../../data.nosync/cleaned/cleaned_flights.csv", inferSchema=True, header=True)
#flights_df.show(10)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/01/07 12:18:06 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
23/01/07 12:18:07 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


                                                                                

In [2]:
flights_df.dtypes

[('Year', 'int'),
 ('Quarter', 'int'),
 ('Month', 'int'),
 ('DayofMonth', 'int'),
 ('DayOfWeek', 'int'),
 ('FlightDate', 'timestamp'),
 ('Reporting_Airline', 'string'),
 ('Tail_Number', 'string'),
 ('Flight_Number_Reporting_Airline', 'int'),
 ('Origin', 'string'),
 ('Dest', 'string'),
 ('CRSDepTime', 'int'),
 ('DepTime', 'int'),
 ('DepDelay', 'double'),
 ('DepDelayMinutes', 'double'),
 ('DepDel15', 'double'),
 ('DepartureDelayGroups', 'int'),
 ('DepTimeBlk', 'string'),
 ('TaxiOut', 'double'),
 ('WheelsOff', 'int'),
 ('WheelsOn', 'int'),
 ('TaxiIn', 'double'),
 ('CRSArrTime', 'int'),
 ('ArrTime', 'int'),
 ('ArrDelay', 'double'),
 ('ArrDelayMinutes', 'double'),
 ('ArrDel15', 'double'),
 ('ArrivalDelayGroups', 'int'),
 ('ArrTimeBlk', 'string'),
 ('Cancelled', 'double'),
 ('Diverted', 'double'),
 ('CRSElapsedTime', 'double'),
 ('ActualElapsedTime', 'double'),
 ('AirTime', 'double'),
 ('Distance', 'double'),
 ('DistanceGroup', 'int'),
 ('DivAirportLandings', 'int'),
 ('ORIGIN_AIRPORT_FULL_N

In [4]:
flights_df.select("Dest", "DEST_AIRPORT_FULL_NAME").show(10)

+----+----------------------+
|Dest|DEST_AIRPORT_FULL_NAME|
+----+----------------------+
| OMA|       Eppley Airfield|
| MSP|  Minneapolis-St Pa...|
| CLT|  Charlotte Douglas...|
| ATL|  William B Hartsfi...|
| ATL|  William B Hartsfi...|
| ATL|  William B Hartsfi...|
| ATL|  William B Hartsfi...|
| ATL|  William B Hartsfi...|
| ATL|  William B Hartsfi...|
| ATL|  William B Hartsfi...|
+----+----------------------+
only showing top 10 rows



In [12]:
column_aliases = {"DEST_STATE": "Destination state", "ORIGIN_STATE": "Origin state", "Dest": "Destination airport", 
                    "Origin": "Origin airport",}

In [47]:
print(flights_df.count())



6246739


                                                                                

In [48]:
flights_df.select("Year","Month", "DayofMonth").summary("min","max").show()

                                                                                

+-------+----+-----+----------+
|summary|Year|Month|DayofMonth|
+-------+----+-----+----------+
|    min|2013|    1|         1|
|    max|2013|   12|        31|
+-------+----+-----+----------+



In [5]:
STARTING_MONTH = 1
ENDING_MONTH = 9
STARTING_DAY_OF_MONTH = 1

In [6]:
from pyspark.sql.functions import col
import numpy as np
import plotly.express as px

# plot della classifica dei primi x migliori in base allo stato di destinazione o aereporto di destinazione. 
def plot_top_x_places_by_interval(flights_df, x, start_month, end_month, start_day, end_day, place_attribute, sort_by): 
    place_attributes = ["Dest", "Origin", "DEST_STATE", "ORIGIN_STATE"]
    
    if place_attribute not in place_attributes:
        raise Exception("place_attribute must be one of the following : ", place_attribute)

    if x < 0 or x > flights_df.count():
        raise Exception("X cannot be negative or bigger than dataframe size!")

    if start_month > ENDING_MONTH or end_month < STARTING_MONTH or start_month < 0 or end_month < 0:
        raise Exception("Please specificy a well formed month interval")

    if (start_day > end_day) or (end_day < start_day) or (start_day < 0) or (end_day < 0):
        raise Exception("Please specificy a well formed day interval")

    # selezione delle top destinations (magari da scrivere meglio) 
    flights_per_states = flights_df.filter((start_month <= flights_df.Month) \
                                            & (flights_df.Month <= end_month) \
                                            & (start_day <= flights_df.DayofMonth) \
                                            & (flights_df.DayofMonth <= end_day)).\
                                    select(col(place_attribute)).\
                                    groupBy(col(place_attribute)).\
                                    count()

    if sort_by == "Top":
        flights_per_states = flights_per_states.sort(col("count").desc())
    else:
        flights_per_states = flights_per_states.sort(col("count").asc())
                                    
    # prendo le prime top x destinazioni 
    destinations = flights_per_states.limit(x)
    # Rinomino le colonne per rendere il grafico più comprensibile
    destinations = destinations.withColumnRenamed("count", "Count")
    # plotting della classifica con un istogramma
    place_column_alias = column_aliases[place_attribute]
    title = sort_by + " " + str(x) + " " + place_column_alias
    top_x_hist_plot = px.histogram(destinations.toPandas(), x=place_attribute, y="Count", color=px.colors.qualitative.Vivid[0:x], title=title,
                                   labels = {
                                            place_attribute: place_column_alias,
                                            "sum of Count": "Count"})
    top_x_hist_plot.update_layout(showlegend=False) 
    top_x_hist_plot.show()

In [7]:
plot_top_x_places_by_interval(flights_df, 10, 1, 9, 1, 31, "Dest", "Bottom")

                                                                                

In [8]:
def pie_plot_top_x_places(flights_df, x, start_month, end_month, start_day, end_day, place_attribute, sort_by):
    place_attributes = ["Dest", "Origin", "DEST_STATE", "ORIGIN_STATE"]
    
    if place_attribute not in place_attributes:
        raise Exception("place_attribute must be one of the following : ", place_attribute)

    if x < 0 :
        raise Exception("x cannot be negative!")
        
    if start_month > ENDING_MONTH or end_month < STARTING_MONTH:
        raise Exception("Please specificy a well formed month interval")

    if (start_day > end_day) or (end_day < start_day) or (start_day < 0) or (end_day < 0):
        raise Exception("Please specificy a well formed day interval")

    flights_per_states = flights_df.filter((start_month <= flights_df.Month) \
                            & (flights_df.Month <= end_month) \
                            & (start_day <= flights_df.DayofMonth)  
                            & (flights_df.DayofMonth <= end_day)).\
                            select(col(place_attribute)).\
                            groupBy(col(place_attribute)).\
                            count()

    if sort_by == "Top":
        flights_per_states = flights_per_states.sort(col("count").desc())
    else:
        flights_per_states = flights_per_states.sort(col("count").asc())
        
    destinations = flights_per_states.limit(x)
    destinations = destinations.withColumnRenamed("count", "Count")

    place_column_alias = column_aliases[place_attribute]

    title = sort_by + " " + str(x) + " " + place_column_alias 
    flights_pie_plot = px.pie(destinations.toPandas(), values='Count', names=place_attribute, title=title,
                              labels = { 
                                        place_attribute: place_column_alias,
                                        "sum of Count": "Count"})
    flights_pie_plot.show()

In [9]:
pie_plot_top_x_places(flights_df, 10, 1, 9, 1, 31, "DEST_STATE", "Top")

                                                                                

In [13]:
from pyspark.sql.functions import *
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import datetime
import pandas as pd

def facet_plot_top_x_over_interval(flights_df, x, start_month, end_month, start_day, end_day, sort_by, place_attribute):
    place_attributes = ["Dest", "Origin", "DEST_STATE", "ORIGIN_STATE"]

    if place_attribute not in place_attributes:
        raise Exception("place_attribute must be one of the following : ", place_attribute)

    if x < 0 or x > flights_df.count():
        raise Exception("X cannot be negative or bigger than dataframe size!")

    if start_month > ENDING_MONTH or end_month < STARTING_MONTH or start_month < 0 or end_month < 0:
        raise Exception("Please specificy a well formed month interval")

    if (start_day > end_day) or (end_day < start_day) or (start_day < 0) or (end_day < 0):
        raise Exception("Please specificy a well formed day interval")
        
    flights_per_place = flights_df.filter((start_month <= flights_df.Month) \
                                            & (flights_df.Month <= end_month) \
                                            & (start_day <= flights_df.DayofMonth) \
                                            & (flights_df.DayofMonth <= end_day)).\
                                    select(col(place_attribute), "FlightDate").\
                                    groupBy(col(place_attribute), "FlightDate").\
                                    count().\
                                    orderBy(col("FlightDate"))
    
    flights_per_place_aggr = flights_df.filter((start_month <= flights_df.Month) \
                                            & (flights_df.Month <= end_month) \
                                            & (start_day <= flights_df.DayofMonth) \
                                            & (flights_df.DayofMonth <= end_day)).\
                                            select(place_attribute).\
                                            groupBy(place_attribute).\
                                            count()
    if sort_by == "Top":
        flights_per_place_aggr = flights_per_place_aggr.sort(col("count").desc()).limit(x)
    else:
        flights_per_place_aggr = flights_per_place_aggr.sort(col("count").asc()).limit(x)

    places = flights_per_place_aggr.select(place_attribute).toPandas().values.reshape(-1)
    
    fig = make_subplots(rows=x, cols=1) 

    for i in range(len(places)):
        place = places[i]
        flights_per_place_i = flights_per_place.filter(flights_per_place[place_attribute] == place)
        flights_per_place_i_pd = flights_per_place_i.toPandas()
        place_column_alias = column_aliases[place_attribute]

        fig.add_trace(
            go.Scatter(x=flights_per_place_i_pd['FlightDate'], y=flights_per_place_i_pd['count'], 
                        name=place),
            row=i+1, col=1
        )
        
    title = sort_by + " " + str(x) + " "  + place_column_alias 
    fig.update_layout(height=1100, width=1200, title_text=title)
    fig.show()
    

In [14]:
facet_plot_top_x_over_interval(flights_df, 5, 1, 9, 1, 31, "Top", "Origin")

                                                                                

In [15]:
from pyspark.sql.functions import *
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import datetime
import pandas as pd


def facet_plot_over_interval(flights_df, start_month, end_month, start_day, end_day, places, place_attribute):
    place_attributes = ["Dest", "Origin", "DEST_STATE", "ORIGIN_STATE"]
    
    if place_attribute not in place_attributes:
        raise Exception("place_attribute must be one of the following : ", place_attribute)

    if start_month > ENDING_MONTH or end_month < STARTING_MONTH or start_month < 0 or end_month < 0:
        raise Exception("Please specificy a well formed month interval")

    if (start_day > end_day) or (end_day < start_day) or (start_day < 0) or (end_day < 0):
        raise Exception("Please specificy a well formed day interval")


    flights_df = flights_df.filter((start_month <= flights_df.Month) \
                                            & (flights_df.Month <= end_month) \
                                            & (start_day <= flights_df.DayofMonth) \
                                            & (flights_df.DayofMonth <= end_day)\
                                            & (flights_df[place_attribute].isin(places))).\
                                            select(col(place_attribute), "FlightDate").\
                                            groupBy(col(place_attribute), "FlightDate").\
                                            count().\
                                            orderBy(col("FlightDate"))
    
    fig = make_subplots(rows=len(places), cols=1) 

    for i in range(len(places)):
        flights_per_state_pd = flights_df.filter(flights_df[place_attribute] == places[i]).toPandas()
        fig.add_trace(
            go.Scatter(x=flights_per_state_pd['FlightDate'], y=flights_per_state_pd['count'], name=places[i]),
            row=i+1, col=1
        )
        
    word = "from" if ((place_attribute == "Origin") or (place_attribute == "ORIGIN_STATE")) else "to"   
    place_column_alias = column_aliases[place_attribute]
    title = "Daily number of flighs " + word + " " + place_column_alias
    fig.update_layout(height=1100, width=1200, title_text="Daily number of flights to states")
    fig.show()
    

In [16]:
dest_airports = ['ATL', 'ORD']
facet_plot_over_interval(flights_df, 1, 9, 1, 31, dest_airports, "Dest")

                                                                                

In [115]:
column_per_aggregation_level = {"Daily": "FlightDate", "Weekly": "WeekofMonth", "Monthly": "Month"}

In [126]:
def plot_mean_arr_delay_per_dest(flights_df, destinations, dest_attribute, aggregation_level):
    if not ((dest_attribute == "Dest") or (dest_attribute == "DEST_STATE")):
        raise Exception("Place attribute must be Dest or DEST_STATE")
        
    period = column_per_aggregation_level[aggregation_level]

    mean_arr_delay_per_dest = flights_df.filter(flights_df[dest_attribute].isin(destinations)).\
                                    select(col(dest_attribute), period, "ArrDelayMinutes").\
                                    groupBy(col(dest_attribute), period).\
                                    agg({"ArrDelayMinutes": "avg"}).\
                                    orderBy(col(period))

    y = "avg(" + "ArrDelayMinutes" + ")"
    dest_column_alias = column_aliases[dest_attribute]
    mean_arr_delay_plot = px.line(mean_arr_delay_per_dest.toPandas(), x=period, y=y, color=dest_attribute,
                                        labels = {dest_attribute: dest_column_alias,
                                                  "avg(ArrDelayMinutes)": "Average arrival delay (Minutes)"})
    mean_arr_delay_plot.show()

In [127]:
def plot_mean_dep_delay_per_origin(flights_df, origins, origin_attribute, aggregation_level):
    if not ((origin_attribute == "Origin") or (origin_attribute == "ORIGIN_STATE")):
        raise Exception("Place attribute must be Origin or ORIGIN_STATE")
        
    period = column_per_aggregation_level[aggregation_level]

    mean_dep_delay_per_origin = flights_df.filter(flights_df[origin_attribute].isin(origins)).\
                                    select(col(origin_attribute), period, "DepDelayMinutes").\
                                    groupBy(col(origin_attribute), period).\
                                    agg({"DepDelayMinutes": "avg"}).\
                                    orderBy(col(period))

    y = "avg(" + "DepDelayMinutes" + ")"
    origin_column_alias = column_aliases[origin_attribute]
    mean_dep_delay_plot = px.line(mean_dep_delay_per_origin.toPandas(), x=period, y=y, color=origin_attribute,
                                        labels = {origin_attribute: origin_column_alias,
                                                  "avg(DepDelayMinutes)": "Average departure delay (Minutes)"})
    mean_dep_delay_plot.show()

In [129]:
states = ['VA', 'CA', 'WA']
plot_mean_dep_delay_per_origin(flights_df, states, "ORIGIN_STATE", "Weekly")

                                                                                

+------------+-------------------+--------------------+
|ORIGIN_STATE|        WeekofMonth|avg(DepDelayMinutes)|
+------------+-------------------+--------------------+
|          CA|2013-01-01 00:00:00|   8.069711296650237|
|          WA|2013-01-01 00:00:00|   5.786348122866894|
|          VA|2013-01-01 00:00:00|  10.282978723404256|
|          VA|2013-01-02 00:00:00|   6.001131861912847|
|          WA|2013-01-02 00:00:00|   7.388978930307942|
|          CA|2013-01-02 00:00:00|   7.735620209837041|
|          VA|2013-01-03 00:00:00|    9.20160137260509|
|          WA|2013-01-03 00:00:00|   7.192943770672547|
|          CA|2013-01-03 00:00:00|   6.870873129277281|
|          CA|2013-01-04 00:00:00|   8.013745185408958|
+------------+-------------------+--------------------+
only showing top 10 rows



                                                                                

In [25]:
states = ['VA', 'CA', 'WA']
plot_mean_dep_delay_per_origin(flights_df, states, "ORIGIN_STATE", "Daily")

                                                                                

In [138]:
def plot_delay_groups(flights_df, destination, dest_attribute, aggregation_level):
    if not ((dest_attribute == "Dest") or (dest_attribute == "DEST_STATE")):
        raise Exception("Dest attribute must be either Dest or Dest state")

    period = column_per_aggregation_level[aggregation_level]

    delay_groups = flights_df.filter(flights_df[dest_attribute] == destination).\
                                    select(col(dest_attribute), period, "DepDel15").\
                                    groupBy(col(dest_attribute), period, "DepDel15").\
                                    count().\
                                    orderBy(col(period))

    delay_groups = delay_groups.withColumnRenamed("count", "Count")
    delay_groups_plot = px.bar(delay_groups.toPandas(), x=period, y="Count", color='DepDel15')
    delay_groups_plot.show()

In [142]:
plot_delay_groups(flights_df, "CA", "DEST_STATE", "Daily")

                                                                                

In [47]:
def plot_scatter_for_delays(flights_df, airport_dest):
    matrix = flights_df.filter(flights_df.Dest.isin(airport_dest)).\
                        select("Dest", "ArrDelayMinutes", "DepDelayMinutes", "Month", "DayofWeek").\
                        groupBy("DayofWeek","Month", "Dest").\
                        agg({"ArrDelayMinutes": "avg", "DepDelayMinutes": "avg"})
                        
    matrix = matrix.withColumnRenamed("avg(ArrDelayMinutes)", "Avg arrival delay").\
                    withColumnRenamed("avg(DepDelayMinutes)", "Avg departure delay").\
                    withColumnRenamed("Dest", "Destination airport")            

    matrix_pd = matrix.toPandas()
    matrix_scatter_plot = px.scatter_matrix(matrix_pd, dimensions=["Avg arrival delay", "Avg departure delay"], color="Destination airport")
    matrix_scatter_plot.show()

In [48]:
airport_dest = ['ATL']
plot_scatter_for_delays(flights_df,  airport_dest)

                                                                                

In [7]:
from pyspark.ml.feature import VectorAssembler
from pyspark.sql.functions import sin, cos

# evitare di hardcodare
def sample(flights_df):
    columns = ['DepDelayMinutes', 'DepartureDelayGroups', 'TaxiOut', 'WheelsOff', 'WheelsOn', 'TaxiIn', 'ArrDelay', 
                'Cancelled', 'CRSElapsedTime', 'Distance', 'DistanceGroup', 'CRSDepTime', 'DepTime', 'CRSArrTime']
    regression_flights_df = flights_df.select(columns)
    regression_flights_df = regression_flights_df.sample(0.09, 42)
    regression_flights_df = regression_flights_df.withColumnRenamed("ArrDelay", "label")
    return regression_flights_df

def add_cyclic_variables(regression_flights_df):
    hours_regressors = ['CRSDepTime', 'DepTime','CRSArrTime']
    for hour_regressor in hours_regressors:
        regression_flights_df = regression_flights_df.withColumn(hour_regressor + "_sin", (2*3.14*sin(regression_flights_df[hour_regressor]))/24)\
                                                    .withColumn(hour_regressor + "_cos", (2*3.14*cos(regression_flights_df[hour_regressor]))/24)
    return regression_flights_df

# evitare di hardcodare
def get_train_test(sampled_regression_flights_df):
    vectorAssembler = VectorAssembler(inputCols = ['DepDelayMinutes', 'DepartureDelayGroups', 'TaxiOut', 'WheelsOff', 'WheelsOn', 'TaxiIn', 
                'Cancelled', 'CRSElapsedTime',  'Distance', 'DistanceGroup', 'CRSDepTime_sin', 'CRSDepTime_cos', 'DepTime_sin', 
                'DepTime_cos', 'CRSArrTime_sin', 'CRSArrTime_cos'], outputCol = 'regressors')

    flights_df_transformed = vectorAssembler.transform(sampled_regression_flights_df)
    flights_df_transformed = flights_df_transformed.select(['regressors', 'label'])
    dataframes = flights_df_transformed.randomSplit([0.9, 0.1], seed=26)
    train_set, test_set = dataframes[0], dataframes[1]
    return train_set, test_set    

In [25]:
import plotly.express as px

sampled_regression_flights_df = sample(flights_df)

regression_df = sampled_regression_flights_df.toPandas()
corr_matrix = regression_df.corr()
fig = px.imshow(corr_matrix,
                x  = corr_matrix.columns,
                y = corr_matrix.columns
               )
fig.update_xaxes(side="top")
fig.show()

                                                                                

In [8]:
from pyspark.ml.regression import LinearRegression, RandomForestRegressor
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder, CrossValidatorModel
from pyspark.ml.evaluation import RegressionEvaluator
import numpy as np

def compute_optimal_model(model, train_set, param_grid):
    regression_evaluator = RegressionEvaluator()
    cv = CrossValidator(estimator=model, estimatorParamMaps=param_grid, evaluator=regression_evaluator, parallelism=2, numFolds=5)
    cv_model = cv.fit(train_set)
    best_model = cv_model.bestModel
    return best_model

def compute_optimal_linear_regression(elastic_net_param_values, lambdas, train_set):
    linear_regression = LinearRegression(maxIter=1000, featuresCol='regressors', labelCol='label')
    regression_evaluator = RegressionEvaluator()
    param_grid = ParamGridBuilder().addGrid(linear_regression.regParam, lambdas).\
                                    addGrid(linear_regression.fitIntercept, [False, True]).\
                                    addGrid(linear_regression.elasticNetParam, elastic_net_param_values).build()
    return compute_optimal_model(linear_regression, train_set, param_grid)

def compute_optimal_random_regressor(train_set):
    random_forest_regressor = RandomForestRegressor(featuresCol='regressors', labelCol='label')
    regression_evaluator = RegressionEvaluator()
    param_grid = ParamGridBuilder().addGrid(random_forest_regressor.maxDepth, [int(x) for x in np.linspace(start = 4, stop = 30, num = 10)]).\
                                    addGrid(random_forest_regressor.numTrees, [int(x) for x in np.linspace(start = 10, stop = 100, num = 20)]).\
                                    addGrid(random_forest_regressor.featureSubsetStrategy, ['log2', 'sqrt', 'auto']).build()
    return compute_optimal_model(random_forest_regressor, train_set, param_grid)

def compute_optimal_ridge_regression(train_set, lambdas, elastic_net_param_values=[0]):
    return compute_optimal_linear_regression(elastic_net_param_values, lambdas, train_set)

def compute_optimal_lasso_regression(train_set, lambdas, elastic_net_param_values=[1]):
    return compute_optimal_linear_regression(elastic_net_param_values, lambdas, train_set)

def compute_optimal_elastic_net_regression(train_set, lambdas, elastic_net_param_values):
    return compute_optimal_linear_regression(elastic_net_param_values, lambdas, train_set)


In [9]:
sampled_regression_flights_df = sample(flights_df)
sampled_regression_flights_df = add_cyclic_variables(sampled_regression_flights_df)
train_set, test_set = get_train_test(sampled_regression_flights_df)

In [10]:
lambdas = np.linspace(0.002, 4, 40)
best_elastic_net_model = compute_optimal_lasso_regression(train_set, lambdas)

22/12/31 15:15:42 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


[Stage 4:>                 (0 + 8) / 18][Stage 6:>                 (0 + 0) / 18]

22/12/31 15:15:51 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
22/12/31 15:15:51 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.ForeignLinkerBLAS


                                                                                

In [44]:
best_elastic_net_model

LinearRegressionModel: uid=LinearRegression_956f18b9833b, numFeatures=16

In [11]:
best_lambda = best_elastic_net_model._java_obj.parent().getRegParam()
best_lambda

0.002

In [12]:
lasso_regression = LinearRegression(featuresCol = 'regressors', labelCol='label', maxIter=1000, standardization=True, regParam=best_lambda)
lasso_model = lasso_regression.fit(train_set)
print("Coefficients: " + str(lasso_model.coefficients))
print("Intercept: " + str(lasso_model.intercept))

                                                                                

22/12/31 15:21:38 WARN InstanceBuilder$NativeLAPACK: Failed to load implementation from:dev.ludovic.netlib.lapack.JNILAPACK


                                                                                

Coefficients: [0.9203597154327462,1.9323484515057654,0.8937002071739824,-0.0008493668279007551,0.0009050583753266614,0.8367117458134044,0.0,-0.3138218353334317,0.033536411804866786,0.2690708371341901,-0.12017315229319005,0.049179835545120086,-0.018908443999625026,0.10054674894959363,-0.2350559799453143,0.12809144675917775]
Intercept: -10.145751114624316


In [13]:
from pyspark.ml.evaluation import RegressionEvaluator

lasso_predictions = lasso_model.transform(test_set)

lasso_evaluator = RegressionEvaluator(predictionCol="prediction", \
                 labelCol="label",metricName="r2")
print("R Squared (R2) on test data = %g" % lasso_evaluator.evaluate(lasso_predictions))



R Squared (R2) on test data = 0.938214


                                                                                

In [18]:
lasso_predictions.select("label", "prediction").filter(lasso_predictions.prediction > 0).show(10)

[Stage 2416:>                                                       (0 + 1) / 1]

+-----+-------------------+
|label|         prediction|
+-----+-------------------+
|-13.0| 0.2768435078228553|
|  9.0|  7.734734145375024|
|  5.0| 10.325830039213495|
|  1.0|  2.642429330580276|
| 27.0| 10.948354712081214|
|  0.0| 0.7508415523534833|
| -3.0| 0.1928202275976556|
| -2.0| 1.5254886689359015|
|  3.0|  1.769716270900263|
|-11.0|0.46214577478959207|
+-----+-------------------+
only showing top 10 rows



                                                                                

22/12/31 17:49:59 WARN HeartbeatReceiver: Removing executor driver with no recent heartbeats: 1039066 ms exceeds timeout 120000 ms
22/12/31 17:49:59 WARN SparkContext: Killing executors is not supported by current scheduler.


In [14]:
lasso_predictions.select("predictions", "label").show(10)

AnalysisException: Column 'predictions' does not exist. Did you mean one of the following? [prediction, regressors, label];
'Project ['predictions, label#225]
+- Project [regressors#356, label#225, UDF(regressors#356) AS prediction#112406]
   +- Sample 0.9, 1.0, false, 26
      +- Sort [regressors#356 ASC NULLS FIRST, label#225 ASC NULLS FIRST], false
         +- Project [regressors#356, label#225]
            +- Project [DepDelayMinutes#136, DepartureDelayGroups#138, TaxiOut#140, WheelsOff#141, WheelsOn#142, TaxiIn#143, label#225, Cancelled#151, CRSElapsedTime#153, Distance#156, DistanceGroup#157, CRSDepTime#133, DepTime#134, CRSArrTime#144, CRSDepTime_sin#240, CRSDepTime_cos#256, DepTime_sin#273, DepTime_cos#291, CRSArrTime_sin#310, CRSArrTime_cos#330, UDF(struct(DepDelayMinutes, DepDelayMinutes#136, DepartureDelayGroups_double_VectorAssembler_ebfb71a1eb75, cast(DepartureDelayGroups#138 as double), TaxiOut, TaxiOut#140, WheelsOff_double_VectorAssembler_ebfb71a1eb75, cast(WheelsOff#141 as double), WheelsOn_double_VectorAssembler_ebfb71a1eb75, cast(WheelsOn#142 as double), TaxiIn, TaxiIn#143, Cancelled, Cancelled#151, CRSElapsedTime, CRSElapsedTime#153, Distance, Distance#156, DistanceGroup_double_VectorAssembler_ebfb71a1eb75, cast(DistanceGroup#157 as double), CRSDepTime_sin, CRSDepTime_sin#240, CRSDepTime_cos, CRSDepTime_cos#256, ... 8 more fields)) AS regressors#356]
               +- Project [DepDelayMinutes#136, DepartureDelayGroups#138, TaxiOut#140, WheelsOff#141, WheelsOn#142, TaxiIn#143, label#225, Cancelled#151, CRSElapsedTime#153, Distance#156, DistanceGroup#157, CRSDepTime#133, DepTime#134, CRSArrTime#144, CRSDepTime_sin#240, CRSDepTime_cos#256, DepTime_sin#273, DepTime_cos#291, CRSArrTime_sin#310, ((COS(cast(CRSArrTime#144 as double)) * 6.28) / cast(24 as double)) AS CRSArrTime_cos#330]
                  +- Project [DepDelayMinutes#136, DepartureDelayGroups#138, TaxiOut#140, WheelsOff#141, WheelsOn#142, TaxiIn#143, label#225, Cancelled#151, CRSElapsedTime#153, Distance#156, DistanceGroup#157, CRSDepTime#133, DepTime#134, CRSArrTime#144, CRSDepTime_sin#240, CRSDepTime_cos#256, DepTime_sin#273, DepTime_cos#291, ((SIN(cast(CRSArrTime#144 as double)) * 6.28) / cast(24 as double)) AS CRSArrTime_sin#310]
                     +- Project [DepDelayMinutes#136, DepartureDelayGroups#138, TaxiOut#140, WheelsOff#141, WheelsOn#142, TaxiIn#143, label#225, Cancelled#151, CRSElapsedTime#153, Distance#156, DistanceGroup#157, CRSDepTime#133, DepTime#134, CRSArrTime#144, CRSDepTime_sin#240, CRSDepTime_cos#256, DepTime_sin#273, ((COS(cast(DepTime#134 as double)) * 6.28) / cast(24 as double)) AS DepTime_cos#291]
                        +- Project [DepDelayMinutes#136, DepartureDelayGroups#138, TaxiOut#140, WheelsOff#141, WheelsOn#142, TaxiIn#143, label#225, Cancelled#151, CRSElapsedTime#153, Distance#156, DistanceGroup#157, CRSDepTime#133, DepTime#134, CRSArrTime#144, CRSDepTime_sin#240, CRSDepTime_cos#256, ((SIN(cast(DepTime#134 as double)) * 6.28) / cast(24 as double)) AS DepTime_sin#273]
                           +- Project [DepDelayMinutes#136, DepartureDelayGroups#138, TaxiOut#140, WheelsOff#141, WheelsOn#142, TaxiIn#143, label#225, Cancelled#151, CRSElapsedTime#153, Distance#156, DistanceGroup#157, CRSDepTime#133, DepTime#134, CRSArrTime#144, CRSDepTime_sin#240, ((COS(cast(CRSDepTime#133 as double)) * 6.28) / cast(24 as double)) AS CRSDepTime_cos#256]
                              +- Project [DepDelayMinutes#136, DepartureDelayGroups#138, TaxiOut#140, WheelsOff#141, WheelsOn#142, TaxiIn#143, label#225, Cancelled#151, CRSElapsedTime#153, Distance#156, DistanceGroup#157, CRSDepTime#133, DepTime#134, CRSArrTime#144, ((SIN(cast(CRSDepTime#133 as double)) * 6.28) / cast(24 as double)) AS CRSDepTime_sin#240]
                                 +- Project [DepDelayMinutes#136, DepartureDelayGroups#138, TaxiOut#140, WheelsOff#141, WheelsOn#142, TaxiIn#143, ArrDelay#146 AS label#225, Cancelled#151, CRSElapsedTime#153, Distance#156, DistanceGroup#157, CRSDepTime#133, DepTime#134, CRSArrTime#144]
                                    +- Sample 0.0, 0.09, false, 42
                                       +- Project [DepDelayMinutes#136, DepartureDelayGroups#138, TaxiOut#140, WheelsOff#141, WheelsOn#142, TaxiIn#143, ArrDelay#146, Cancelled#151, CRSElapsedTime#153, Distance#156, DistanceGroup#157, CRSDepTime#133, DepTime#134, CRSArrTime#144]
                                          +- Relation [Year#122,Quarter#123,Month#124,DayofMonth#125,DayOfWeek#126,FlightDate#127,Reporting_Airline#128,Tail_Number#129,Flight_Number_Reporting_Airline#130,Origin#131,Dest#132,CRSDepTime#133,DepTime#134,DepDelay#135,DepDelayMinutes#136,DepDel15#137,DepartureDelayGroups#138,DepTimeBlk#139,TaxiOut#140,WheelsOff#141,WheelsOn#142,TaxiIn#143,CRSArrTime#144,ArrTime#145,... 20 more fields] csv


In [None]:
predictions.select("prediction", "label").show(10)

In [49]:
from pyspark.ml.evaluation import RegressionEvaluator

test_result = lasso_model.evaluate(test_set)
mse = test_result.rootMeanSquaredError * test_result.rootMeanSquaredError
print("Mean Squared Error (MSE) on test data = %g" % mse)



Mean Squared Error (MSE) on test data = 85.8549


                                                                                

# Feature utili per la regressione 

FlightDate                           0.000000       ## Da considerare successivamnete (o forse no)

Origin                               0.000000       VAR CATEGORICA, da consideare in seguito
Dest                                 0.000000       VAR CATEGORICA, da considerare in seguito

CRSDepTime                           0.000000       ## DATA Da non considerare SI (scheduled)

DepTime                              2.753034       ## DATA DA NON considerare (SI) (attuale)
DepDelayMinutes                      2.753034
DepartureDelayGroups                 2.753034

DepTimeBlk                           0.000000       # DATA (DA NON CONSIDERARE)

TaxiOut                              2.824056
WheelsOff                            2.824056
WheelsOn                             2.910754
TaxiIn                               2.910754
CRSArrTime                           0.000000       ## DATA (SI)

ArrDelayMinutes                      3.077569       ### VAR DIPENDENTE
Cancelled                            0.000000
CRSElapsedTime                       0.000000


Distance                             0.000000
DistanceGroup                        0.000000


