## Preprocessed & Cleaned Data

In [1]:
import pandas as pd
import numpy as np

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer

from sklearn.model_selection import train_test_split

### Task 1: Find and Clean a New Dataset

Dataset Link from kaggle: https://www.kaggle.com/datasets/teejmahal20/airline-passenger-satisfaction?resource=download

In [3]:
import kagglehub

path = kagglehub.dataset_download("teejmahal20/airline-passenger-satisfaction")

data_train = pd.read_csv(path + "/train.csv")
data_test = pd.read_csv(path + "/test.csv")

# Will perform my own train, validation, and test samples, will be shuffled later.
data = pd.concat([data_train, data_test])

data.head(5)

  from .autonotebook import tqdm as notebook_tqdm


Downloading from https://www.kaggle.com/api/v1/datasets/download/teejmahal20/airline-passenger-satisfaction?dataset_version_number=1...


100%|██████████| 2.71M/2.71M [00:00<00:00, 17.4MB/s]

Extracting files...





Unnamed: 0.1,Unnamed: 0,id,Gender,Customer Type,Age,Type of Travel,Class,Flight Distance,Inflight wifi service,Departure/Arrival time convenient,...,Inflight entertainment,On-board service,Leg room service,Baggage handling,Checkin service,Inflight service,Cleanliness,Departure Delay in Minutes,Arrival Delay in Minutes,satisfaction
0,0,70172,Male,Loyal Customer,13,Personal Travel,Eco Plus,460,3,4,...,5,4,3,4,4,5,5,25,18.0,neutral or dissatisfied
1,1,5047,Male,disloyal Customer,25,Business travel,Business,235,3,2,...,1,1,5,3,1,4,1,1,6.0,neutral or dissatisfied
2,2,110028,Female,Loyal Customer,26,Business travel,Business,1142,2,2,...,5,4,3,4,4,4,5,0,0.0,satisfied
3,3,24026,Female,Loyal Customer,25,Business travel,Business,562,2,5,...,2,2,5,3,1,4,2,11,9.0,neutral or dissatisfied
4,4,119299,Male,Loyal Customer,61,Business travel,Business,214,3,3,...,3,3,4,4,3,3,3,0,0.0,satisfied


In [4]:
print(f"Shape: {data.shape}")
print(f"Data Column Types: {data.dtypes}")

Shape: (129880, 25)
Data Column Types: Unnamed: 0                             int64
id                                     int64
Gender                                object
Customer Type                         object
Age                                    int64
Type of Travel                        object
Class                                 object
Flight Distance                        int64
Inflight wifi service                  int64
Departure/Arrival time convenient      int64
Ease of Online booking                 int64
Gate location                          int64
Food and drink                         int64
Online boarding                        int64
Seat comfort                           int64
Inflight entertainment                 int64
On-board service                       int64
Leg room service                       int64
Baggage handling                       int64
Checkin service                        int64
Inflight service                       int64
Cleanliness     

In [None]:
def clean_data(train_data, test_data, validation_data):
								
		# Dropping unnecessary columns
		train_data.drop(columns=["Unnamed: 0", "id"], axis=1, inplace=True)
		test_data.drop(columns=["Unnamed: 0", "id"], axis=1, inplace=True)
		validation_data.drop(columns=["Unnamed: 0", "id"], axis=1, inplace=True)
		
		# If missing categorical columns exist, replace by the mode
		cols_cat = train_data.select_dtypes(include="object").columns
		cat_imputer = SimpleImputer(missing_values=np.nan, strategy="most_frequent") 
		
		# Fit on the train, and transform for the test missing data in the features
		train_data.loc[:, cols_cat] = cat_imputer.fit_transform(train_data[cols_cat])
		test_data.loc[:, cols_cat] = cat_imputer.transform(test_data[cols_cat])
		validation_data.loc[:, cols_cat] = cat_imputer.transform(validation_data[cols_cat])

		# If missing numerical columns exist, replace by the mean	in the features
		cols_num = train_data.select_dtypes(include=["int64", "float64"]).columns
		imputer_num = SimpleImputer(missing_values=np.nan, strategy="mean") 

		train_data.loc[:, cols_num] = imputer_num.fit_transform(train_data[cols_num])
		test_data.loc[:, cols_num] = imputer_num.transform(test_data[cols_num])
		validation_data.loc[:, cols_num] = imputer_num.transform(validation_data[cols_num])
		
		# Convert label to binary
		label_encoder = LabelEncoder()

		# 1 Male, 0 female
		gen = "Gender"

		train_data.loc[:, gen] = label_encoder.fit_transform(train_data[gen])
		test_data.loc[:, gen] = label_encoder.transform(test_data[gen])
		validation_data.loc[:, gen] = label_encoder.transform(validation_data[gen])	

		# Encoding target
		label = train_data.columns[-1]
		
		train_data.loc[:, label] = label_encoder.fit_transform(train_data[label])
		test_data.loc[:, label] = label_encoder.transform(test_data[label])
		validation_data.loc[:, label] = label_encoder.transform(validation_data[label])		

		return train_data, test_data, validation_data

In [None]:
def preprocess_data(train_data, test_data, validation_data):
		
		# Encode categorical features using OneHotEncoder, excluding last column
		categorical_features = train_data.iloc[:, :-1].select_dtypes(include="object").columns
		
		# Standardize numerical features using StandardScaler()
		numerical_features = train_data.iloc[:, :-1].select_dtypes(include=['int64', 'float64']).columns

	# Transformation from encoding categorical and numerical values
		t = [
				("encoder", OneHotEncoder(drop="first", sparse_output=False, handle_unknown="ignore"), categorical_features),
				("scaler", StandardScaler(), numerical_features)]

		# Applying the transformation
		col_transform = ColumnTransformer(transformers=t)
		col_transform.set_output(transform="pandas") 

		# Fit and transform on the training data 
		processed_X_train = col_transform.fit_transform(train_data)
		processed_X_test = col_transform.transform(test_data)
		processed_X_val = col_transform.transform(validation_data)

		# Adding back the last column
		processed_X_train["satisfaction"] = train_data.iloc[:, -1]
		processed_X_test["satisfaction"] = test_data.iloc[:, -1]
		processed_X_val["satisfaction"] = validation_data.iloc[:, -1]
	
		# Return processed training and test sets
		return processed_X_train, processed_X_test, processed_X_val

In [8]:
def into_csv_files(train_data, test_data, validation_data):

	train_data.to_csv("clean_processed_train_data.csv", index=False)
	test_data.to_csv("clean_processed_test_data.csv", index=False)
	validation_data.to_csv("clean_processed_validation_data.csv", index=False)

In [9]:
X_temp, X_test = train_test_split(data, test_size=0.7, shuffle=True)
X_train, X_val = train_test_split(X_temp, test_size=0.25, shuffle=True)

In [10]:
train_data, test_data, validation_data = clean_data(X_train, X_test, X_val)

In [28]:
print(f"Size of train set is: {train_data.shape}")
print(f"Size of validation set is: {validation_data.shape}")
print(f"Size of test set is: {test_data.shape}")

Size of train set is: (29223, 23)
Size of validation set is: (9741, 23)
Size of test set is: (90916, 23)


In [11]:
display(train_data.head(5), test_data.head(5), validation_data.head(5))

Unnamed: 0,Gender,Customer Type,Age,Type of Travel,Class,Flight Distance,Inflight wifi service,Departure/Arrival time convenient,Ease of Online booking,Gate location,...,Inflight entertainment,On-board service,Leg room service,Baggage handling,Checkin service,Inflight service,Cleanliness,Departure Delay in Minutes,Arrival Delay in Minutes,satisfaction
48562,1,Loyal Customer,32,Business travel,Business,3771,1,1,1,1,...,2,4,2,2,5,2,2,0,0.0,1
9201,1,Loyal Customer,19,Business travel,Business,2475,3,1,1,1,...,2,2,4,4,3,2,3,4,0.0,0
68808,0,Loyal Customer,49,Business travel,Business,3843,5,5,5,5,...,5,5,5,5,3,5,5,6,0.0,1
6226,1,Loyal Customer,40,Business travel,Business,1683,0,4,0,4,...,5,5,5,5,3,5,5,0,9.0,1
45891,1,Loyal Customer,50,Business travel,Business,1428,3,5,3,3,...,5,5,5,5,2,5,2,0,0.0,1


Unnamed: 0,Gender,Customer Type,Age,Type of Travel,Class,Flight Distance,Inflight wifi service,Departure/Arrival time convenient,Ease of Online booking,Gate location,...,Inflight entertainment,On-board service,Leg room service,Baggage handling,Checkin service,Inflight service,Cleanliness,Departure Delay in Minutes,Arrival Delay in Minutes,satisfaction
1372,0,Loyal Customer,40,Business travel,Business,537,2,2,2,2,...,4,4,4,4,2,4,1,0,0.0,1
43421,0,Loyal Customer,54,Personal Travel,Eco,453,2,4,2,3,...,5,5,2,2,4,5,5,6,0.0,0
75411,0,Loyal Customer,35,Personal Travel,Business,646,2,1,2,2,...,2,2,2,2,1,2,2,0,0.0,0
101581,0,Loyal Customer,57,Business travel,Business,107,2,2,2,2,...,4,4,4,3,3,4,2,0,0.0,1
22046,0,Loyal Customer,56,Personal Travel,Eco,1136,2,5,2,1,...,5,5,2,5,1,5,3,8,0.0,0


Unnamed: 0,Gender,Customer Type,Age,Type of Travel,Class,Flight Distance,Inflight wifi service,Departure/Arrival time convenient,Ease of Online booking,Gate location,...,Inflight entertainment,On-board service,Leg room service,Baggage handling,Checkin service,Inflight service,Cleanliness,Departure Delay in Minutes,Arrival Delay in Minutes,satisfaction
49538,0,Loyal Customer,50,Business travel,Business,134,4,4,1,4,...,4,4,4,5,3,4,3,4,8.0,1
92873,0,Loyal Customer,42,Business travel,Business,190,4,4,4,4,...,4,4,4,5,5,4,3,1,0.0,1
14774,0,Loyal Customer,21,Personal Travel,Eco,590,3,1,3,4,...,4,1,3,3,4,3,4,0,0.0,0
39889,1,Loyal Customer,63,Personal Travel,Eco,577,2,1,2,2,...,4,2,5,2,4,2,4,19,2.0,0
70818,0,Loyal Customer,61,Business travel,Business,3850,5,5,3,5,...,5,5,5,5,4,5,3,3,0.0,1


In [12]:
processed_X_train, processed_X_test, processed_X_val = preprocess_data(train_data, test_data, validation_data)

In [13]:
print(f"Size of processed train set is: {processed_X_train.shape}")
print(f"Size of processed validation set is: {processed_X_test.shape}")
print(f"Size of processed test set is: {processed_X_val.shape}")

Size of processed train set is: (29223, 24)
Size of processed validation set is: (90916, 24)
Size of processed test set is: (9741, 24)


In [14]:
display(processed_X_train.head(5), processed_X_test.head(5), processed_X_val.head(5))

Unnamed: 0,encoder__Gender_1,encoder__Customer Type_disloyal Customer,encoder__Type of Travel_Personal Travel,encoder__Class_Eco,encoder__Class_Eco Plus,scaler__Age,scaler__Flight Distance,scaler__Inflight wifi service,scaler__Departure/Arrival time convenient,scaler__Ease of Online booking,...,scaler__Inflight entertainment,scaler__On-board service,scaler__Leg room service,scaler__Baggage handling,scaler__Checkin service,scaler__Inflight service,scaler__Cleanliness,scaler__Departure Delay in Minutes,scaler__Arrival Delay in Minutes,satisfaction
48562,1.0,0.0,0.0,0.0,0.0,-0.494616,2.580539,-1.303864,-1.343537,-1.253359,...,-1.0206,0.486018,-1.024811,-1.374168,1.339189,-1.387566,-0.993811,-0.393171,-0.399071,1
9201,1.0,0.0,0.0,0.0,0.0,-1.352478,1.284432,0.197547,-1.343537,-1.253359,...,-1.0206,-1.061054,0.495009,0.315563,-0.249667,-1.387566,-0.229403,-0.285805,-0.399071,0
68808,0.0,0.0,0.0,0.0,0.0,0.627202,2.652545,1.698959,1.26876,1.587864,...,1.228589,1.259554,1.254918,1.160429,-0.249667,1.154416,1.299412,-0.232123,-0.399071,1
6226,1.0,0.0,0.0,0.0,0.0,0.033298,0.492367,-2.054569,0.615686,-1.963665,...,1.228589,1.259554,1.254918,1.160429,-0.249667,1.154416,1.299412,-0.393171,-0.160439,1
45891,1.0,0.0,0.0,0.0,0.0,0.693192,0.237346,0.197547,1.26876,0.167252,...,1.228589,1.259554,1.254918,1.160429,-1.044095,1.154416,-0.993811,-0.393171,-0.399071,1


Unnamed: 0,encoder__Gender_1,encoder__Customer Type_disloyal Customer,encoder__Type of Travel_Personal Travel,encoder__Class_Eco,encoder__Class_Eco Plus,scaler__Age,scaler__Flight Distance,scaler__Inflight wifi service,scaler__Departure/Arrival time convenient,scaler__Ease of Online booking,...,scaler__Inflight entertainment,scaler__On-board service,scaler__Leg room service,scaler__Baggage handling,scaler__Checkin service,scaler__Inflight service,scaler__Cleanliness,scaler__Departure Delay in Minutes,scaler__Arrival Delay in Minutes,satisfaction
1372,0.0,0.0,0.0,0.0,0.0,0.033298,-0.653727,-0.553158,-0.690462,-0.543053,...,0.478859,0.486018,0.495009,0.315563,-1.044095,0.307088,-1.758219,-0.393171,-0.399071,1
43421,0.0,0.0,1.0,1.0,0.0,0.957149,-0.737734,-0.553158,0.615686,-0.543053,...,1.228589,1.259554,-1.024811,-1.374168,0.544761,1.154416,1.299412,-0.232123,-0.399071,0
75411,0.0,0.0,1.0,0.0,0.0,-0.296648,-0.544718,-0.553158,-1.343537,-0.543053,...,-1.0206,-1.061054,-1.024811,-1.374168,-1.838523,-1.387566,-0.993811,-0.393171,-0.399071,0
101581,0.0,0.0,0.0,0.0,0.0,1.155117,-1.083763,-0.553158,-0.690462,-0.543053,...,0.478859,0.486018,0.495009,-0.529302,-0.249667,0.307088,-0.993811,-0.393171,-0.399071,1
22046,0.0,0.0,1.0,1.0,0.0,1.089128,-0.054678,-0.553158,1.26876,-0.543053,...,1.228589,1.259554,-1.024811,1.160429,-1.838523,1.154416,-0.229403,-0.17844,-0.399071,0


Unnamed: 0,encoder__Gender_1,encoder__Customer Type_disloyal Customer,encoder__Type of Travel_Personal Travel,encoder__Class_Eco,encoder__Class_Eco Plus,scaler__Age,scaler__Flight Distance,scaler__Inflight wifi service,scaler__Departure/Arrival time convenient,scaler__Ease of Online booking,...,scaler__Inflight entertainment,scaler__On-board service,scaler__Leg room service,scaler__Baggage handling,scaler__Checkin service,scaler__Inflight service,scaler__Cleanliness,scaler__Departure Delay in Minutes,scaler__Arrival Delay in Minutes,satisfaction
49538,0.0,0.0,0.0,0.0,0.0,0.693192,-1.056761,0.948253,0.615686,-1.253359,...,0.478859,0.486018,0.495009,1.160429,-0.249667,0.307088,-0.229403,-0.285805,-0.186954,1
92873,0.0,0.0,0.0,0.0,0.0,0.165277,-1.000756,0.948253,0.615686,0.877558,...,0.478859,0.486018,0.495009,1.160429,1.339189,0.307088,-0.229403,-0.366329,-0.399071,1
14774,0.0,0.0,1.0,1.0,0.0,-1.220499,-0.600723,0.197547,-1.343537,0.167252,...,0.478859,-1.83459,-0.264901,-0.529302,0.544761,-0.540239,0.535004,-0.393171,-0.399071,0
39889,1.0,0.0,1.0,1.0,0.0,1.551053,-0.613724,-0.553158,-1.343537,-0.543053,...,0.478859,-1.061054,1.254918,-1.374168,0.544761,-1.387566,0.535004,0.116815,-0.346042,0
70818,0.0,0.0,0.0,0.0,0.0,1.419074,2.659546,1.698959,1.26876,0.167252,...,1.228589,1.259554,1.254918,1.160429,0.544761,1.154416,-0.229403,-0.312647,-0.399071,1


In [15]:
# Preprocessed data for machine learning
into_csv_files(processed_X_train, processed_X_test, processed_X_val)