# Common Feature Selection Techniques

####At this stage, we examine how complete the dataset is:

In [3]:
missing_df = df.isnull().sum(axis=0).reset_index()
missing_df.columns = ['variable', 'missing values']
missing_df['filling factor (%)']=(df.shape[0]-missing_df['missing values'])/df.shape[0]*100
missing_df.sort_values('filling factor (%)').reset_index(drop = True)

In [4]:
df = df.dropna()

In [5]:
missing_df = df.isnull().sum(axis=0).reset_index()
missing_df.columns = ['variable', 'missing values']
missing_df['filling factor (%)']=(df.shape[0]-missing_df['missing values'])/df.shape[0]*100
missing_df.sort_values('filling factor (%)').reset_index(drop = True)

In [6]:
df

In [7]:
df['DepDel15'] = pd.to_numeric(df['DepDel15'], downcast='integer')

In [8]:
df

In [9]:
# Exclude columns that are possible target leakers.
# DepDelay,DepDel15,ArrDelay,Cancelled,Year
columns_to_drop = ['ArrDelay', 'DepDelay', 'Cancelled', 'Year']
df.drop(columns_to_drop, axis = 1, inplace = True)

In [10]:
df

##Optimizing Conversion between Apache Spark and pandas DataFrames

In [13]:
# Convert pandas dataframe to spark dataframe
# Enable Arrow-based columnar data transfers
spark.conf.set("spark.sql.execution.arrow.enabled", "true")

# Create a Spark DataFrame from a pandas DataFrame using Arrow
sdf = spark.createDataFrame(df)

##One hot encoding and feature scaling

For simplicity's sake, we will use One-Hot Encoding to convert all categorical variables into binary vectors. It is possible here to improve prediction accuracy by converting each categorical column with an appropriate method.

Here, we will use a combination of StringIndexer and OneHotEncoderEstimator to convert the categorical variables. The OneHotEncoderEstimator will return a SparseVector.

Since we will have more than 1 stage of feature transformations, we use a Pipeline to tie the stages together. This simplifies our code.

In [16]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import OneHotEncoderEstimator, StringIndexer, VectorAssembler

categoricalColumns = ["Carrier", "OriginAirportID", "DestAirportID"]
stages = [] # stages in our Pipeline

In [17]:
for categoricalCol in categoricalColumns:
    # Category Indexing with StringIndexer
    stringIndexer = StringIndexer(inputCol=categoricalCol, outputCol=categoricalCol + "Index")
    # Use OneHotEncoder to convert categorical variables into binary SparseVectors
    # encoder = OneHotEncoderEstimator(inputCol=categoricalCol + "Index", outputCol=categoricalCol + "classVec")
    encoder = OneHotEncoderEstimator(inputCols=[stringIndexer.getOutputCol()], outputCols=[categoricalCol + "classVec"])
    # Add stages.  These are not run here, but will run all at once later on.
    stages += [stringIndexer, encoder]

The above code basically indexes each categorical column using the StringIndexer, and then converts the indexed categories into one-hot encoded variables. The resulting output has the binary vectors appended to the end of each row.

We use the StringIndexer again to encode our labels to label indices.

In [19]:
# Convert label into label indices using the StringIndexer
label_stringIdx = StringIndexer(inputCol="DepDel15", outputCol="label")
stages += [label_stringIdx]

Use a VectorAssembler to combine all the feature columns into a single vector column. This includes both the numeric columns and the one-hot encoded binary vector columns in our dataset.

In [21]:
# Transform all features into a vector using VectorAssembler
numericCols = ["Month", "DayofMonth", "DayOfWeek", "CRSDepTime", "CRSArrTime", "ArrDel15"]
assemblerInputs = [c + "classVec" for c in categoricalColumns] + numericCols
assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")
stages += [assembler] 

Run the stages as a Pipeline. This puts the data through all of the feature transformations we described in a single call.

In [23]:
partialPipeline = Pipeline().setStages(stages)
pipelineModel = partialPipeline.fit(sdf)
preppedDataDF = pipelineModel.transform(sdf)

In [24]:
display(preppedDataDF)

Month,DayofMonth,DayOfWeek,Carrier,OriginAirportID,DestAirportID,CRSDepTime,DepDel15,CRSArrTime,ArrDel15,CarrierIndex,CarrierclassVec,OriginAirportIDIndex,OriginAirportIDclassVec,DestAirportIDIndex,DestAirportIDclassVec,label,features
4,19,5,DL,11433,13303,837,0,1138,0.0,1.0,"List(0, 15, List(1), List(1.0))",12.0,"List(0, 69, List(12), List(1.0))",24.0,"List(0, 69, List(24), List(1.0))",0.0,"List(0, 159, List(1, 27, 108, 153, 154, 155, 156, 157), List(1.0, 1.0, 1.0, 4.0, 19.0, 5.0, 837.0, 1138.0))"
4,19,5,DL,14869,12478,1705,0,2336,0.0,1.0,"List(0, 15, List(1), List(1.0))",20.0,"List(0, 69, List(20), List(1.0))",15.0,"List(0, 69, List(15), List(1.0))",0.0,"List(0, 159, List(1, 35, 99, 153, 154, 155, 156, 157), List(1.0, 1.0, 1.0, 4.0, 19.0, 5.0, 1705.0, 2336.0))"
4,19,5,DL,14057,14869,600,0,851,0.0,1.0,"List(0, 15, List(1), List(1.0))",30.0,"List(0, 69, List(30), List(1.0))",20.0,"List(0, 69, List(20), List(1.0))",0.0,"List(0, 159, List(1, 45, 104, 153, 154, 155, 156, 157), List(1.0, 1.0, 1.0, 4.0, 19.0, 5.0, 600.0, 851.0))"
4,19,5,DL,15016,11433,1630,1,1903,1.0,1.0,"List(0, 15, List(1), List(1.0))",29.0,"List(0, 69, List(29), List(1.0))",12.0,"List(0, 69, List(12), List(1.0))",1.0,"List(0, 159, List(1, 44, 96, 153, 154, 155, 156, 157, 158), List(1.0, 1.0, 1.0, 4.0, 19.0, 5.0, 1630.0, 1903.0, 1.0))"
4,19,5,DL,11193,12892,1615,0,1805,0.0,1.0,"List(0, 15, List(1), List(1.0))",40.0,"List(0, 69, List(40), List(1.0))",2.0,"List(0, 69, List(2), List(1.0))",0.0,"List(0, 159, List(1, 55, 86, 153, 154, 155, 156, 157), List(1.0, 1.0, 1.0, 4.0, 19.0, 5.0, 1615.0, 1805.0))"
4,19,5,DL,10397,15016,1726,0,1818,0.0,1.0,"List(0, 15, List(1), List(1.0))",0.0,"List(0, 69, List(0), List(1.0))",29.0,"List(0, 69, List(29), List(1.0))",0.0,"List(0, 159, List(1, 15, 113, 153, 154, 155, 156, 157), List(1.0, 1.0, 1.0, 4.0, 19.0, 5.0, 1726.0, 1818.0))"
4,19,5,DL,15016,10397,1900,0,2133,0.0,1.0,"List(0, 15, List(1), List(1.0))",29.0,"List(0, 69, List(29), List(1.0))",0.0,"List(0, 69, List(0), List(1.0))",0.0,"List(0, 159, List(1, 44, 84, 153, 154, 155, 156, 157), List(1.0, 1.0, 1.0, 4.0, 19.0, 5.0, 1900.0, 2133.0))"
4,19,5,DL,10397,14869,2145,1,2356,1.0,1.0,"List(0, 15, List(1), List(1.0))",0.0,"List(0, 69, List(0), List(1.0))",20.0,"List(0, 69, List(20), List(1.0))",1.0,"List(0, 159, List(1, 15, 104, 153, 154, 155, 156, 157, 158), List(1.0, 1.0, 1.0, 4.0, 19.0, 5.0, 2145.0, 2356.0, 1.0))"
4,19,5,DL,10397,10423,2157,1,2333,1.0,1.0,"List(0, 15, List(1), List(1.0))",0.0,"List(0, 69, List(0), List(1.0))",34.0,"List(0, 69, List(34), List(1.0))",1.0,"List(0, 159, List(1, 15, 118, 153, 154, 155, 156, 157, 158), List(1.0, 1.0, 1.0, 4.0, 19.0, 5.0, 2157.0, 2333.0, 1.0))"
4,19,5,DL,11278,10397,1900,1,2055,1.0,1.0,"List(0, 15, List(1), List(1.0))",23.0,"List(0, 69, List(23), List(1.0))",0.0,"List(0, 69, List(0), List(1.0))",1.0,"List(0, 159, List(1, 38, 84, 153, 154, 155, 156, 157, 158), List(1.0, 1.0, 1.0, 4.0, 19.0, 5.0, 1900.0, 2055.0, 1.0))"


In [25]:
# Keep relevant columns
col = ["Month"]
selectedcols = ["label", "features"] + col
dataset = preppedDataDF.select(selectedcols)
display(dataset)

label,features,Month
0.0,"List(0, 159, List(1, 27, 108, 153, 154, 155, 156, 157), List(1.0, 1.0, 1.0, 4.0, 19.0, 5.0, 837.0, 1138.0))",4
0.0,"List(0, 159, List(1, 35, 99, 153, 154, 155, 156, 157), List(1.0, 1.0, 1.0, 4.0, 19.0, 5.0, 1705.0, 2336.0))",4
0.0,"List(0, 159, List(1, 45, 104, 153, 154, 155, 156, 157), List(1.0, 1.0, 1.0, 4.0, 19.0, 5.0, 600.0, 851.0))",4
1.0,"List(0, 159, List(1, 44, 96, 153, 154, 155, 156, 157, 158), List(1.0, 1.0, 1.0, 4.0, 19.0, 5.0, 1630.0, 1903.0, 1.0))",4
0.0,"List(0, 159, List(1, 55, 86, 153, 154, 155, 156, 157), List(1.0, 1.0, 1.0, 4.0, 19.0, 5.0, 1615.0, 1805.0))",4
0.0,"List(0, 159, List(1, 15, 113, 153, 154, 155, 156, 157), List(1.0, 1.0, 1.0, 4.0, 19.0, 5.0, 1726.0, 1818.0))",4
0.0,"List(0, 159, List(1, 44, 84, 153, 154, 155, 156, 157), List(1.0, 1.0, 1.0, 4.0, 19.0, 5.0, 1900.0, 2133.0))",4
1.0,"List(0, 159, List(1, 15, 104, 153, 154, 155, 156, 157, 158), List(1.0, 1.0, 1.0, 4.0, 19.0, 5.0, 2145.0, 2356.0, 1.0))",4
1.0,"List(0, 159, List(1, 15, 118, 153, 154, 155, 156, 157, 158), List(1.0, 1.0, 1.0, 4.0, 19.0, 5.0, 2157.0, 2333.0, 1.0))",4
1.0,"List(0, 159, List(1, 38, 84, 153, 154, 155, 156, 157, 158), List(1.0, 1.0, 1.0, 4.0, 19.0, 5.0, 1900.0, 2055.0, 1.0))",4
