**1. Loading the Dataset**

In [1]:
import pandas as pd

# Read the raw flight records from the CSV file in the working directory.
flights = pd.read_csv(filepath_or_buffer='flights.csv')


**2. Data Preprocessing**

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
import numpy as np

# Column groups that make up the model's design matrix.
CATEGORICAL_COLUMNS = ['from', 'to', 'flightType', 'agency']
NUMERICAL_COLUMNS = ['time', 'distance']

# Split the rows *before* fitting any preprocessing step, so the encoder and
# scaler only ever see training data. Fitting them on the full dataset leaks
# test-set statistics into the features and makes the validation scores
# optimistic.
flights_train, flights_test = train_test_split(flights, test_size=0.2, random_state=42)

# Handle categorical variables.
# handle_unknown='ignore' encodes a category that was not seen during fit as an
# all-zero group instead of raising, which keeps the fitted encoder usable on
# the held-out rows and on new data.
encoder = OneHotEncoder(handle_unknown='ignore')
encoder.fit(flights_train[CATEGORICAL_COLUMNS])

# Scale numerical features using the training mean / standard deviation only.
scaler = StandardScaler()
scaler.fit(flights_train[NUMERICAL_COLUMNS])


def _build_features(frame):
    """Return the dense feature matrix for ``frame``.

    Applies the already-fitted ``encoder`` and ``scaler`` (no refitting) and
    concatenates the one-hot columns followed by the scaled numeric columns.
    """
    categorical_part = encoder.transform(frame[CATEGORICAL_COLUMNS]).toarray()
    numerical_part = scaler.transform(frame[NUMERICAL_COLUMNS])
    return np.concatenate([categorical_part, numerical_part], axis=1)


# Train / test feature matrices and targets.
X_train = _build_features(flights_train)
X_test = _build_features(flights_test)
y_train = flights_train['price'].values
y_test = flights_test['price'].values

# Full-dataset features and target, kept for any later cell that uses them.
X = _build_features(flights)
y = flights['price'].values


**3. Feature Selection**

In [4]:
from sklearn.feature_selection import SelectKBest, f_regression

# Univariate feature scoring with f_regression.
# k='all' keeps every column, so this step currently leaves the feature
# matrices unchanged; lower k to actually prune features.
selector = SelectKBest(score_func=f_regression, k='all')
selector.fit(X_train, y_train)

# Apply the fitted selector to both splits.
X_train_selected = selector.transform(X_train)
X_test_selected = selector.transform(X_test)


**4. Model Training**

In [5]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Fit an ordinary least-squares model on the selected training features.
regressor = LinearRegression().fit(X_train_selected, y_train)

# Price predictions for the held-out rows.
y_pred = regressor.predict(X_test_selected)


**5. Model Validation**

In [6]:
from sklearn.metrics import r2_score

# Calculate R^2 and RMSE on the held-out set.
# RMSE is computed as the square root of the MSE explicitly: the
# `squared=False` keyword of mean_squared_error was deprecated in
# scikit-learn 1.4 and removed in 1.6, so relying on it breaks on current
# releases. This form works on every version.
r2 = r2_score(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred) ** 0.5

print(f'R^2: {r2}')
print(f'RMSE: {rmse}')


R^2: 0.9192493441024164
RMSE: 103.15255513142583


**6. Saving the Trained Model**

In [7]:
import joblib

# Serialize the fitted regressor to disk; the cell output is the list of
# file names joblib wrote.
joblib.dump(value=regressor, filename='flight_price_model.pkl')


['flight_price_model.pkl']

**7. Saving the Encoder and Scaler**

In [8]:
# Persist the fitted preprocessing objects so the same transformations can be
# re-applied to new inputs before calling the saved model.
# NOTE(review): the fitted `selector` is not persisted here; that is harmless
# while it is configured with k='all', but it must be saved too if k changes.
joblib.dump(value=encoder, filename='encoder.pkl')
joblib.dump(value=scaler, filename='scaler.pkl')


['scaler.pkl']