In [None]:
import pandas as pd
import numpy as np
#import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim

from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import MinMaxScaler, RobustScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from pycaret.regression import *
#from sklearn.ensemble import RandomForestRegressor
#from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import mean_absolute_error,mean_squared_error

In [None]:
import os

# Location of the flights workbook. The original absolute path is kept as the
# default so existing runs are unaffected; set FLIGHTS_XLSX_PATH to point the
# notebook at a copy of the file on another machine.
DATA_PATH = os.environ.get(
    'FLIGHTS_XLSX_PATH',
    '/Users/gim-yeon-u/Desktop/SejongUniv/2024-1/창의학기제2/flights_yeon.xlsx',
)
data = pd.read_excel(DATA_PATH)

In [None]:
import random

# Seed every random number generator used in this notebook (Python, NumPy,
# PyTorch CPU and PyTorch CUDA) with the same value so runs are repeatable.
seed = 42
for _seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
    _seed_fn(seed)

In [None]:
# Show the frame exactly as it was read from the workbook (cell output).
data

In [None]:
# Columns whose header starts with "Unnamed" are discarded; every other
# column is kept in its original order.
unnamed_mask = data.columns.str.contains('^Unnamed')
data = data.loc[:, ~unnamed_mask]
data

In [None]:
# Rows without a value in the label column cannot be used for training or
# evaluation, so they are removed here.
required_columns = ['Price']
data = data.dropna(subset=required_columns)
data

In [None]:
# Pass the filtered result through the DataFrame constructor and display it.
# NOTE(review): `data` already appears to be a DataFrame at this point, so this
# looks like a no-op — confirm before removing.
data = pd.DataFrame(data)
data

In [None]:
target = 'Price'

# Every column except the airline name and the label is used as-is.
feature_frame = data.drop(columns=['Airline', 'Price'])
X_rest = feature_frame.to_numpy()
y = data[target].values.reshape(-1, 1)

# The airline name is turned into an integer code and shaped as one column.
le = LabelEncoder()
airlines_encoded = le.fit_transform(data['Airline']).reshape(-1, 1)

# Final feature matrix: the remaining columns first, the airline code last.
X = np.hstack([X_rest, airlines_encoded])

In [None]:
# Split BEFORE fitting the scalers. Fitting on the full dataset lets the
# median/IQR of the held-out rows influence the training inputs (data
# leakage); fitting on the training rows only keeps the test set unseen.
# The same random_state and row count give the same train/test partition as
# splitting the pre-scaled arrays did.
# NOTE(review): train_test_split shuffles rows by default, and the sliding
# windows built later are taken over these shuffled rows — if the windows are
# meant to follow the original row order, pass shuffle=False here.
X_train_raw, X_test_raw, y_train_raw, y_test_raw = train_test_split(
    X, y, test_size=0.2, random_state=42
)

scaler_X = RobustScaler()
scaler_y = RobustScaler()
X_train = scaler_X.fit_transform(X_train_raw)
y_train = scaler_y.fit_transform(y_train_raw)
X_test = scaler_X.transform(X_test_raw)
y_test = scaler_y.transform(y_test_raw)

# Full-dataset views in scaled space, kept so any code that still refers to
# X_scaled / y_scaled continues to work (now using the train-fitted scalers).
X_scaled = scaler_X.transform(X)
y_scaled = scaler_y.transform(y)

def create_sequences(data, targets, seq_len):
    """Build overlapping sliding windows and the target of each window's last row.

    Window ``i`` is ``data[i:i + seq_len]`` and is paired with
    ``targets[i + seq_len - 1]``.

    Args:
        data: Sliceable sequence of rows (e.g. a 2-D array of features).
        targets: Sequence indexable by row position, aligned with ``data``.
        seq_len: Number of consecutive rows per window; must be >= 1.

    Returns:
        Tuple ``(sequences, target_list)`` of NumPy arrays. With ``n`` windows
        the first has shape ``(n, seq_len, *row_shape)`` and the second
        ``(n, *target_shape)``. When ``data`` has fewer than ``seq_len`` rows,
        ``n`` is 0 and the trailing dimensions are still preserved so that
        downstream shape lookups keep working.

    Raises:
        ValueError: If ``seq_len`` is smaller than 1.
    """
    if seq_len < 1:
        # A non-positive length would index targets[-1] and silently pair
        # empty windows with the wrong label.
        raise ValueError(f"seq_len must be a positive integer, got {seq_len}")

    n_windows = len(data) - seq_len + 1
    if n_windows <= 0:
        data_arr = np.asarray(data)
        targets_arr = np.asarray(targets)
        empty_sequences = np.empty((0, seq_len) + data_arr.shape[1:], dtype=data_arr.dtype)
        empty_targets = np.empty((0,) + targets_arr.shape[1:], dtype=targets_arr.dtype)
        return empty_sequences, empty_targets

    sequences = [data[start:start + seq_len] for start in range(n_windows)]
    target_list = [targets[start + seq_len - 1] for start in range(n_windows)]
    return np.array(sequences), np.array(target_list)

# Number of consecutive rows that make up one model input window.
seq_len = 5

# Slide a window over each split; the label of a window is the label of its
# last row.
X_train_seq, y_train_seq = create_sequences(X_train, y_train, seq_len)
X_test_seq, y_test_seq = create_sequences(X_test, y_test, seq_len)

# Convert all four NumPy arrays to float32 tensors in one pass.
X_train_tensor, y_train_tensor, X_test_tensor, y_test_tensor = (
    torch.FloatTensor(windowed)
    for windowed in (X_train_seq, y_train_seq, X_test_seq, y_test_seq)
)

# Training batches are reshuffled every epoch; test batches keep their order.
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(dataset=train_dataset, batch_size=32, shuffle=True)

test_dataset = TensorDataset(X_test_tensor, y_test_tensor)
test_loader = DataLoader(dataset=test_dataset, batch_size=10, shuffle=False)

print(X_train_tensor.shape)
print(y_train_tensor.shape)

In [None]:
# Prefer Apple's Metal backend (the original target), then CUDA, and fall back
# to the CPU so the notebook also runs on machines without an accelerator.
_mps_backend = getattr(torch.backends, "mps", None)
if _mps_backend is not None and _mps_backend.is_available():
    device = torch.device("mps")
elif torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

In [None]:
class LSTMModel(nn.Module):
    """Stacked LSTM whose last time-step output feeds a linear layer.

    Args:
        input_dim: Number of features per time step.
        hidden_dim: Size of the LSTM hidden state.
        num_layers: Number of stacked LSTM layers.
        output_dim: Size of the linear layer's output.
        dropout_prob: Dropout applied between stacked LSTM layers. It is
            ignored when ``num_layers`` is 1, because there is no layer
            boundary to apply it to.
    """

    def __init__(self, input_dim, hidden_dim, num_layers, output_dim, dropout_prob=0.3):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        # nn.LSTM only uses dropout between layers and warns when a non-zero
        # value is passed for a single-layer network.
        inter_layer_dropout = dropout_prob if num_layers > 1 else 0.0
        self.lstm = nn.LSTM(
            input_dim, hidden_dim, num_layers, batch_first=True, dropout=inter_layer_dropout
        )
        self.fc = nn.Linear(hidden_dim, output_dim)

        # Xavier-initialise the LSTM weight matrices; biases keep their defaults.
        for name, param in self.lstm.named_parameters():
            if "weight" in name:
                nn.init.xavier_uniform_(param)

    def forward(self, x):
        """Return the linear layer's output for the last time step of ``x``.

        ``x`` is indexed as (batch, time, feature) because the LSTM is built
        with ``batch_first=True``.
        """
        # Zero initial states are allocated directly on the input's device and
        # in the input's dtype, so non-float32 inputs and accelerators work
        # without an extra host-to-device copy.
        state_shape = (self.num_layers, x.size(0), self.hidden_dim)
        h0 = torch.zeros(state_shape, device=x.device, dtype=x.dtype)
        c0 = torch.zeros(state_shape, device=x.device, dtype=x.dtype)
        out, _ = self.lstm(x, (h0, c0))
        output = self.fc(out[:, -1, :])
        return output

# Network size. input_dim is read from the last axis of the windowed training
# tensor so it always matches the feature matrix.
input_dim = X_train_tensor.shape[2]
hidden_dim = 128
num_layers = 3
output_dim = 1

model = LSTMModel(input_dim, hidden_dim, num_layers, output_dim).to(device)
optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.SmoothL1Loss()
# Halve the learning rate once the monitored loss has failed to improve for
# more than 3 consecutive epochs. The scheduler only acts when its
# step(metric) method is called. The `verbose` argument is deprecated in
# recent PyTorch releases and is therefore no longer passed.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=3)

In [None]:
num_epochs = 50

for epoch in range(num_epochs):
    # ---- training pass ----
    model.train()
    total_loss = 0
    for batch_x, batch_y in train_loader:
        batch_x, batch_y = batch_x.to(device), batch_y.to(device)
        optimizer.zero_grad()
        predictions = model(batch_x)
        # Compare tensors of identical shape. Squeezing the prediction to
        # (batch,) while the target stays (batch, 1) makes the loss broadcast
        # to a (batch, batch) matrix, i.e. every prediction is compared with
        # every target in the batch.
        loss = criterion(predictions, batch_y.reshape(predictions.shape))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    avg_loss = total_loss / len(train_loader)

    # ---- evaluation pass ----
    # NOTE(review): the loss reported as "Validation Loss" is computed on
    # test_loader; there is no separate validation split in this notebook.
    model.eval()
    total_val_loss = 0

    with torch.no_grad():
        for batch_x, batch_y in test_loader:
            batch_x, batch_y = batch_x.to(device), batch_y.to(device)
            predictions = model(batch_x)
            val_loss = criterion(predictions, batch_y.reshape(predictions.shape))
            total_val_loss += val_loss.item()

    avg_val_loss = total_val_loss / len(test_loader)

    # ReduceLROnPlateau only adjusts the learning rate when it is stepped with
    # the monitored metric; without this call the scheduler has no effect.
    previous_lr = optimizer.param_groups[0]['lr']
    scheduler.step(avg_val_loss)
    current_lr = optimizer.param_groups[0]['lr']
    if current_lr != previous_lr:
        print(f"Learning rate reduced to {current_lr:.6g}")

    print(f"Epoch {epoch+1}/{num_epochs}, Training Loss: {avg_loss:.4f}, Validation Loss: {avg_val_loss:.4f}")

# Final evaluation on the whole test tensor in a single forward pass.
# MSE / RMSE are reported in the scaled target space, not in price units.
model.eval()
with torch.no_grad():
    predictions = model(X_test_tensor.to(device)).cpu().numpy()
    y_test_np = y_test_tensor.cpu().numpy()

mse = mean_squared_error(y_test_np, predictions)
rmse = np.sqrt(mse)
print(f"Test MSE: {mse:.4f}, RMSE: {rmse:.4f}")

In [None]:
# Re-run the test-set evaluation: one forward pass over the full test tensor
# with gradients disabled, then MSE / RMSE on the (scaled) targets.
model.eval()
with torch.no_grad():
    predictions = model(X_test_tensor.to(device)).cpu().numpy()
y_test_np = y_test_tensor.cpu().numpy()

mse = mean_squared_error(y_true=y_test_np, y_pred=predictions)
rmse = np.sqrt(mse)
print(f"Test MSE: {mse:.4f}, RMSE: {rmse:.4f}")

In [None]:
def calculate_mape(y_true, y_pred):
    """Return the mean absolute percentage error between two arrays, in percent.

    Args:
        y_true: Ground-truth values (array-like).
        y_pred: Predicted values (array-like). A prediction array with the
            same number of elements but a different number of dimensions than
            ``y_true`` (e.g. ``(n,)`` against ``(n, 1)``) is reshaped to match
            instead of being broadcast into an ``(n, n)`` comparison.

    Returns:
        ``mean(|y_true - y_pred| / |y_true|) * 100``. Denominators are clamped
        to machine epsilon so zero targets do not divide by zero.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    if y_true.ndim != y_pred.ndim and y_true.size == y_pred.size:
        y_pred = y_pred.reshape(y_true.shape)

    # Guard only the denominator: shifting y_true itself would also change the
    # numerator and report an error for a perfect prediction of a zero target.
    denominator = np.maximum(np.abs(y_true), np.finfo(float).eps)
    mape = np.mean(np.abs(y_true - y_pred) / denominator) * 100
    return mape

model.eval()

with torch.no_grad():
    predictions = model(X_test_tensor.to(device)).cpu().numpy()
    y_test_np = y_test_tensor.cpu().numpy()

# MAPE divides by the true value, so it must be computed in the original price
# units. The scaled targets are centred by RobustScaler and therefore lie
# around zero, which makes a percentage error on them meaningless.
y_test_price = scaler_y.inverse_transform(y_test_np.reshape(-1, 1))
predicted_price = scaler_y.inverse_transform(predictions.reshape(-1, 1))

mape = calculate_mape(y_test_price, predicted_price)
print(f"Mean Absolute Percentage Error (MAPE): {mape:.2f}%")

In [None]:
import shap

# Explain the trained network with gradient-based SHAP values, using the first
# 300 training windows as background and the first 150 test windows as samples.
model.eval()
background_data = X_train_tensor[:300].to(device)
test_data = X_test_tensor[:150].to(device)
test_labels = y_test_tensor[:150].to(device)
explainer = shap.GradientExplainer(model, background_data)
shap_values = explainer.shap_values(test_data)

# Normalise the explainer output to the shape of test_data
# (samples, time steps, features) without assuming one particular return
# layout: unwrap a single-element list and drop a trailing singleton output
# axis only when it is actually present.
if isinstance(shap_values, (list, tuple)) and len(shap_values) == 1:
    shap_values = shap_values[0]
shap_values = np.asarray(shap_values)
if shap_values.ndim == test_data.dim() + 1 and shap_values.shape[-1] == 1:
    shap_values = np.squeeze(shap_values, axis=-1)
if shap_values.shape != tuple(test_data.shape):
    raise ValueError(
        f"Unexpected SHAP output shape {shap_values.shape}; expected {tuple(test_data.shape)}"
    )

# Average attributions and inputs over the time axis to get one value per
# feature and sample.
shap_values = shap_values.mean(axis=1)
test_numpy = test_data.mean(dim=1).cpu().numpy()

# Display names in the column order of the feature matrix (airline code last).
feature_names = [
    'SearchYear', 'SearchMonth', 'SearchDay', 'FlightYear', 'FlightMonth',
    'FlightDay', 'IsFrom', 'Day_left', 'DepartureTime', 'ArrivalTime',
    'AirborneTime', 'Airline_encoded'
]
if len(feature_names) != shap_values.shape[1]:
    raise ValueError(
        f"feature_names has {len(feature_names)} entries but the model uses "
        f"{shap_values.shape[1]} features"
    )

print(f"Adjusted SHAP values shape: {shap_values.shape}")
print(f"Adjusted test data shape: {test_numpy.shape}")
print(f"Feature names count: {len(feature_names)}")

shap.summary_plot(shap_values, features=test_numpy, feature_names=feature_names)
shap.summary_plot(shap_values, features=test_numpy, feature_names=feature_names,plot_type = 'bar')
