In [25]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, IntegerType, DateType

# Create (or reuse) the Spark session used by the whole notebook.
spark = (
    SparkSession.builder
    .appName("Flight analysis")
    .getOrCreate()
)

# TODO: load the JSON schema later
# Read the cleaned flights CSV; the first row is the header and column types are inferred.
flights_df = spark.read.csv(
    "../../data.nosync/cleaned/cleaned_flights.csv",
    header=True,
    inferSchema=True,
)


                                                                                

In [26]:
from pyspark.ml.feature import VectorAssembler
from pyspark.sql.functions import sin, cos

def sample(flights_df, columns, percentage, seed=42):
    """Project, randomly subsample and relabel the flights DataFrame.

    Args:
        flights_df: DataFrame to sample from.
        columns: names of the columns to keep (passed to ``select``).
        percentage: fraction of rows to keep; coerced to ``float`` so that an
            integer such as ``1`` is accepted as well.
        seed: seed of the random sampler. Defaults to 42, the value that was
            previously hard-coded.

    Returns:
        The sampled DataFrame restricted to ``columns``, with the
        ``ArrDelay`` column renamed to ``label``.
    """
    selected_df = flights_df.select(columns)
    # Keyword arguments make the fraction/seed roles explicit instead of
    # relying on how positional arguments are interpreted.
    sampled_df = selected_df.sample(fraction=float(percentage), seed=seed)
    return sampled_df.withColumnRenamed("ArrDelay", "label")

def add_cyclic_variables(regression_flights_df, period=24):
    """Append sine/cosine (cyclic) encodings of the time-of-day columns.

    For every column ``c`` in the fixed list below, two columns are added:
    ``c_sin = sin(2*pi*c / period)`` and ``c_cos = cos(2*pi*c / period)``.
    The scaling is applied to the argument of the trigonometric function, so
    values that are one ``period`` apart map to the same point. (The earlier
    version computed ``2*3.14*sin(c)/24``, which only rescales the sine of the
    raw value and is not a cyclic encoding.)

    Args:
        regression_flights_df: DataFrame containing the columns ``CRSDepTime``,
            ``DepTime``, ``CRSArrTime``, ``WheelsOff`` and ``WheelsOn``.
        period: length of one cycle in the unit of those columns. Defaults
            to 24.

    Returns:
        The DataFrame with the ten ``*_sin`` / ``*_cos`` columns added.

    Raises:
        ValueError: if ``period`` is not strictly positive.

    NOTE(review): ``period=24`` assumes the columns hold hours; if they are
    HHMM-encoded (e.g. 1435), convert them to hours first or pass a matching
    period - TODO confirm against the cleaning step.
    """
    import math

    if period <= 0:
        raise ValueError("period must be strictly positive")

    hours_regressors = ['CRSDepTime', 'DepTime', 'CRSArrTime', 'WheelsOff', 'WheelsOn']
    angular_step = 2 * math.pi / period
    for hour_regressor in hours_regressors:
        # Angle on the unit circle corresponding to the time value.
        angle = regression_flights_df[hour_regressor] * angular_step
        regression_flights_df = (
            regression_flights_df
            .withColumn(hour_regressor + "_sin", sin(angle))
            .withColumn(hour_regressor + "_cos", cos(angle))
        )
    return regression_flights_df

def get_train_test(sampled_regression_flights_df, columns):
    """Assemble the feature vector and split the data into train and test sets.

    Args:
        sampled_regression_flights_df: DataFrame holding the ``columns`` and a
            ``label`` column.
        columns: names of the input columns packed into the ``regressors``
            vector column.

    Returns:
        ``(train_set, test_set)``: a 90% / 10% random split (seed 26) of a
        DataFrame that contains only ``regressors`` and ``label``.
    """
    assembler = VectorAssembler(inputCols=columns, outputCol='regressors')
    assembled_df = assembler.transform(sampled_regression_flights_df)
    # Only the assembled vector and the target are needed downstream.
    model_input_df = assembled_df.select(['regressors', 'label'])
    train_set, test_set = model_input_df.randomSplit([0.9, 0.1], seed=26)
    return train_set, test_set

In [27]:
import plotly.express as px
# Columns kept for the regression study; 'ArrDelay' is the target and is
# renamed to 'label' by sample().
selected_regressors = [
    'DepDelayMinutes', 'DepartureDelayGroups', 'TaxiOut', 'WheelsOff',
    'WheelsOn', 'TaxiIn', 'ArrDelay', 'Cancelled', 'CRSElapsedTime',
    'Distance', 'DistanceGroup', 'CRSDepTime', 'DepTime', 'CRSArrTime',
]

# Work on a 9% sample and bring it to pandas to compute the correlations.
sampled_regression_flights_df = sample(flights_df, selected_regressors, 0.09)
regression_df = sampled_regression_flights_df.toPandas()
corr_matrix = regression_df.corr()

# Heatmap of the correlation matrix, with the column names on top.
fig = px.imshow(corr_matrix, x=corr_matrix.columns, y=corr_matrix.columns)
fig.update_xaxes(side="top")
fig.show()

                                                                                

In [28]:
# Model inputs: the time-of-day columns are replaced by their *_sin / *_cos
# encodings; the remaining regressors are used as they are.
input_cols = [
    'DepDelayMinutes', 'DepartureDelayGroups', 'TaxiOut',
    'WheelsOff_sin', 'WheelsOff_cos',
    'WheelsOn_sin', 'WheelsOn_cos',
    'TaxiIn', 'Cancelled', 'CRSElapsedTime', 'Distance', 'DistanceGroup',
    'CRSDepTime_sin', 'CRSDepTime_cos',
    'DepTime_sin', 'DepTime_cos',
    'CRSArrTime_sin', 'CRSArrTime_cos',
]

# Add the encoded columns to the sample, then build the train/test split.
sampled_regression_flights_df = add_cyclic_variables(sampled_regression_flights_df)
train_set, test_set = get_train_test(
    sampled_regression_flights_df=sampled_regression_flights_df,
    columns=input_cols,
)

In [29]:
from pyspark.ml.regression import LinearRegression, RandomForestRegressor
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder, CrossValidatorModel
from pyspark.ml.evaluation import RegressionEvaluator
import numpy as np

def compute_optimal_model(model, train_set, param_grid):
    """Select the best model over ``param_grid`` with 5-fold cross-validation.

    Args:
        model: estimator to tune.
        train_set: DataFrame the cross-validator is fitted on.
        param_grid: parameter maps to evaluate.

    Returns:
        The ``bestModel`` of the fitted cross-validator, scored with a
        default-configured ``RegressionEvaluator``.
    """
    cross_validator = CrossValidator(
        estimator=model,
        estimatorParamMaps=param_grid,
        evaluator=RegressionEvaluator(),
        numFolds=5,
        parallelism=2,
    )
    fitted_cross_validator = cross_validator.fit(train_set)
    return fitted_cross_validator.bestModel

def compute_optimal_linear_regression(elastic_net_param_values, lambdas, train_set):
    """Tune a ``LinearRegression`` by cross-validated grid search.

    The grid is the Cartesian product of ``lambdas`` (``regParam``),
    ``[False, True]`` (``fitIntercept``) and ``elastic_net_param_values``
    (``elasticNetParam``).

    Args:
        elastic_net_param_values: candidate values for ``elasticNetParam``.
        lambdas: candidate values for ``regParam``.
        train_set: DataFrame with ``regressors`` and ``label`` columns.

    Returns:
        The best model returned by ``compute_optimal_model``.
    """
    linear_regression = LinearRegression(maxIter=1000, featuresCol='regressors', labelCol='label')
    # No evaluator is built here: compute_optimal_model creates its own.
    param_grid = (
        ParamGridBuilder()
        .addGrid(linear_regression.regParam, lambdas)
        .addGrid(linear_regression.fitIntercept, [False, True])
        .addGrid(linear_regression.elasticNetParam, elastic_net_param_values)
        .build()
    )
    return compute_optimal_model(linear_regression, train_set, param_grid)

def compute_optimal_random_regressor(train_set):
    """Tune a ``RandomForestRegressor`` by cross-validated grid search.

    The grid combines 10 ``maxDepth`` values (4 to 30), 20 ``numTrees`` values
    (10 to 100) and 3 ``featureSubsetStrategy`` choices, i.e. 600 parameter
    maps, each of which is cross-validated by ``compute_optimal_model``.

    Args:
        train_set: DataFrame with ``regressors`` and ``label`` columns.

    Returns:
        The best model returned by ``compute_optimal_model``.
    """
    random_forest_regressor = RandomForestRegressor(featuresCol='regressors', labelCol='label')
    # linspace yields floats; both parameters are integer-valued, hence int().
    max_depths = [int(x) for x in np.linspace(start=4, stop=30, num=10)]
    tree_counts = [int(x) for x in np.linspace(start=10, stop=100, num=20)]
    # No evaluator is built here: compute_optimal_model creates its own.
    param_grid = (
        ParamGridBuilder()
        .addGrid(random_forest_regressor.maxDepth, max_depths)
        .addGrid(random_forest_regressor.numTrees, tree_counts)
        .addGrid(random_forest_regressor.featureSubsetStrategy, ['log2', 'sqrt', 'auto'])
        .build()
    )
    return compute_optimal_model(random_forest_regressor, train_set, param_grid)

def compute_optimal_ridge_regression(train_set, lambdas, elastic_net_param_values=[0]):
    """Tune a ridge regression: linear-regression grid search with
    ``elasticNetParam`` fixed to 0 by default.

    Args:
        train_set: DataFrame with ``regressors`` and ``label`` columns.
        lambdas: candidate values for ``regParam``.
        elastic_net_param_values: candidate ``elasticNetParam`` values.

    Returns:
        The best model found by ``compute_optimal_linear_regression``.
    """
    return compute_optimal_linear_regression(
        elastic_net_param_values=elastic_net_param_values,
        lambdas=lambdas,
        train_set=train_set,
    )

def compute_optimal_lasso_regression(train_set, lambdas, elastic_net_param_values=[1]):
    """Tune a lasso regression: linear-regression grid search with
    ``elasticNetParam`` fixed to 1 by default.

    Args:
        train_set: DataFrame with ``regressors`` and ``label`` columns.
        lambdas: candidate values for ``regParam``.
        elastic_net_param_values: candidate ``elasticNetParam`` values.

    Returns:
        The best model found by ``compute_optimal_linear_regression``.
    """
    return compute_optimal_linear_regression(
        elastic_net_param_values=elastic_net_param_values,
        lambdas=lambdas,
        train_set=train_set,
    )

def compute_optimal_elastic_net_regression(train_set, lambdas, elastic_net_param_values):
    """Tune an elastic-net regression over the given ``elasticNetParam`` values.

    Args:
        train_set: DataFrame with ``regressors`` and ``label`` columns.
        lambdas: candidate values for ``regParam``.
        elastic_net_param_values: candidate ``elasticNetParam`` values.

    Returns:
        The best model found by ``compute_optimal_linear_regression``.
    """
    return compute_optimal_linear_regression(
        elastic_net_param_values=elastic_net_param_values,
        lambdas=lambdas,
        train_set=train_set,
    )


In [30]:
# Regularisation strength shared by the three linear models.
best_lambda = 0.001

# Ridge: elasticNetParam = 0.
ridge_regression = LinearRegression(featuresCol='regressors', labelCol='label', maxIter=1000, standardization=True, regParam=best_lambda, elasticNetParam=0)
ridge_model = ridge_regression.fit(train_set)

# Lasso: elasticNetParam = 1.
lasso_regression = LinearRegression(featuresCol='regressors', labelCol='label', maxIter=1000, standardization=True, regParam=best_lambda, elasticNetParam=1)
lasso_model = lasso_regression.fit(train_set)

# Elastic net: elasticNetParam = 0.43. labelCol is passed explicitly, like
# for the other models, instead of relying on the default.
elastic_net_regression = LinearRegression(featuresCol='regressors', labelCol='label', predictionCol='prediction', maxIter=1000, standardization=True, regParam=best_lambda, elasticNetParam=0.43)
elastic_net_model = elastic_net_regression.fit(train_set)

# Random forest: seeded so that re-running the notebook fits the same forest
# (the sampling and the train/test split are already seeded).
random_forest_regression = RandomForestRegressor(featuresCol='regressors', labelCol='label', maxDepth=9, numTrees=50, featureSubsetStrategy='sqrt', seed=42)
random_forest_model = random_forest_regression.fit(train_set)

# Parallel containers: models[i] is named models_names[i]; its metrics are
# stored in models_mse[i] and models_r_squared[i].
models = [ridge_model, lasso_model, elastic_net_model, random_forest_model]
models_names = ["Ridge", "Lasso", "Elastic net", "Random forest"]
models_mse = np.zeros(len(models))
models_r_squared = np.zeros(len(models))

                                                                                

23/01/07 18:42:01 WARN DAGScheduler: Broadcasting large task binary with size 1062.7 KiB


                                                                                

23/01/07 18:42:04 WARN DAGScheduler: Broadcasting large task binary with size 2017.5 KiB


                                                                                

23/01/07 18:42:08 WARN DAGScheduler: Broadcasting large task binary with size 3.7 MiB




23/01/07 18:42:16 WARN DAGScheduler: Broadcasting large task binary with size 1022.9 KiB


                                                                                

In [31]:
# Test-set MSE of every model, computed the same way for all of them: the
# evaluator returns the MSE directly, so there is no need to square an RMSE
# nor to treat the random forest separately from the linear models.
evaluator = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="mse")
for i, model in enumerate(models):
    predictions = model.transform(test_set)
    models_mse[i] = evaluator.evaluate(predictions)

                                                                                

In [32]:
import pandas as pd

# One row per model: its name and its test-set MSE.
mse_per_model = pd.DataFrame({'Models': models_names, 'MSE': models_mse})

# Bar chart of the MSE, one coloured bar per model.
mse_per_model_plot = px.bar(mse_per_model, x='Models', y='MSE', color='Models')
mse_per_model_plot.show()

In [33]:
# R² evaluator comparing the 'prediction' column with the 'label' column.
model_evaluator = RegressionEvaluator(
    metricName="r2",
    labelCol="label",
    predictionCol="prediction",
)

# Score every model on the test set (models and models_mse have the same length).
for i, model in enumerate(models):
    predictions = model.transform(test_set)
    models_r_squared[i] = model_evaluator.evaluate(predictions)

                                                                                

In [34]:
# One row per model: its name and its test-set R².
r_squared_per_model = pd.DataFrame({'Models': models_names, 'R2': models_r_squared})

# Bar chart of the R², one coloured bar per model.
r_squared_per_model_plot = px.bar(r_squared_per_model, x='Models', y='R2', color='Models')
r_squared_per_model_plot.show()

In [35]:
# Repeat the sampling step on (almost) the whole dataset: 99% of the rows.
regression_flights_df = sample(
    flights_df,
    columns=selected_regressors,
    percentage=0.99,
)

In [36]:
# Add the cyclic time encodings, then rebuild the train/test split from the
# larger sample (this rebinds train_set and test_set).
regression_flights_df = add_cyclic_variables(regression_flights_df)
train_set, test_set = get_train_test(
    sampled_regression_flights_df=regression_flights_df,
    columns=input_cols,
)

In [37]:
# Regularisation strength shared by the three linear models.
best_lambda = 0.001

# Refit the three linear models on the current (larger) training set.
ridge_regression = LinearRegression(featuresCol='regressors', labelCol='label', maxIter=1000, standardization=True, regParam=best_lambda, elasticNetParam=0)
ridge_model = ridge_regression.fit(train_set)

lasso_regression = LinearRegression(featuresCol='regressors', labelCol='label', maxIter=1000, standardization=True, regParam=best_lambda, elasticNetParam=1)
lasso_model = lasso_regression.fit(train_set)

# labelCol is passed explicitly, like for the other models, instead of
# relying on the default.
elastic_net_regression = LinearRegression(featuresCol='regressors', labelCol='label', predictionCol='prediction', maxIter=1000, standardization=True, regParam=best_lambda, elasticNetParam=0.43)
elastic_net_model = elastic_net_regression.fit(train_set)

# Rebuild the list read by the evaluation cells: without this, `models` keeps
# pointing at the models fitted earlier on the small sample and the metrics
# below would describe those stale models. The random forest is not refit
# here; its slot is kept so that the `len(models) - 1` loops still address
# exactly the three linear models.
models = [ridge_model, lasso_model, elastic_net_model, random_forest_model]

                                                                                

In [38]:
# Test-set MSE (squared RMSE) of every model except the last one in `models`.
for i, model in enumerate(models[:-1]):
    test_result = model.evaluate(test_set)
    models_mse[i] = test_result.rootMeanSquaredError * test_result.rootMeanSquaredError

                                                                                

In [39]:
import pandas as pd

# Names and MSE of every model except the last entry of the lists.
mse_per_model_whole_dataset = pd.DataFrame({
    'Models': models_names[:-1],
    'MSE': models_mse[:-1],
})

# Bar chart of the MSE, one coloured bar per model.
mse_per_model_whole_dataset_plot = px.bar(mse_per_model_whole_dataset, x='Models', y='MSE', color='Models')
mse_per_model_whole_dataset_plot.show()

In [40]:
# R² evaluator comparing the 'prediction' column with the 'label' column.
model_evaluator = RegressionEvaluator(
    metricName="r2",
    labelCol="label",
    predictionCol="prediction",
)

# Score every model except the last slot of the metric arrays.
for i, model in enumerate(models[:len(models_mse) - 1]):
    predictions = model.transform(test_set)
    models_r_squared[i] = model_evaluator.evaluate(predictions)

                                                                                

In [41]:
# Names and R² of every model except the last entry of the lists.
r_squared_per_model_whole_dataset = pd.DataFrame({
    'Models': models_names[:-1],
    'R2': models_r_squared[:len(models_mse) - 1],
})

# Bar chart of the R², one coloured bar per model.
r_squared_per_model_whole_dataset_plot = px.bar(r_squared_per_model_whole_dataset, x='Models', y='R2', color='Models')
r_squared_per_model_whole_dataset_plot.show()