In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.impute import SimpleImputer

In [2]:
# Load one Excel workbook per survey year, tag each row with its year,
# and stack everything into a single frame ordered by year.
years = [str(y) for y in range(2015, 2020)]
data_frames = []
for year in years:
    # Replace with actual file names
    df = pd.read_excel(f'TripDataset/Dataset/{year}.xlsx').assign(Year=year)  # Year column for temporal tracking
    data_frames.append(df)

dt = pd.concat(data_frames, ignore_index=True).sort_values(by='Year')

In [3]:
# Bind the combined frame to the name used by the preprocessing cells below.
# This is an alias of dt, not a copy.
data = dt

In [4]:
# Parameter definitions
missing_threshold = 0.2  # Missing-value ratio threshold: columns above it are dropped
outlier_std_threshold = 3  # Number of standard deviations from the mean used as the outlier cutoff

In [5]:
# Fraction of missing values in every column; print it for the two
# columns of interest so their coverage can be checked by eye.
missing_ratios = data.isna().mean()
for target_col in ('Purpose of visit to CITY', 'Number of nights in CITY'):
    print(missing_ratios[target_col])

0.0
0.004366450186541385


In [6]:
# Drop every column whose missing-value ratio exceeds missing_threshold.
too_sparse = (missing_ratios > missing_threshold).to_numpy()
cols_to_drop = missing_ratios.index[too_sparse]
data = data.drop(cols_to_drop, axis=1)

In [7]:
# Column groups (numeric, categorical, ordinal) used by the preprocessing steps.

# Numeric feature columns, excluding the two target columns.
numerical_cols_without_target = [
    'Age',
    'Number of visits to CITY',
    'Tour price',
    'Airfare',
    'Total expenditures',
    'Accommodation expenses',
    'Food and drink expenses',
    'Transportation expenses in CITY',
    'Entertainment expenses',
    'Shopping expenses',
    'Other expenditures',
]

# Same columns plus the two targets, kept in the original order.
numerical_cols = (
    numerical_cols_without_target[:1]
    + ['Number of nights in CITY']
    + numerical_cols_without_target[1:]
    + ['Purpose of visit to CITY']
)

# Nominal columns (one-hot encoded later).
true_categorical_cols = [
    'Nationality',
    'Country of residence',
    'Gender',
    'Immigration airport',
    'Travel type',
    'Having mileage or not',
    'Intention to revisit CITY',
    'Most desired place',
    'Most satisfied place',
]

# Ordered-scale columns (ordinal encoded later).
# NOTE(review): 'Intention to revisit CITY' is listed here and in
# true_categorical_cols, so it is encoded twice downstream - confirm intended.
ordinal_cols = (
    ['Satisfaction level', 'Intention to revisit CITY']
    + [f'Satisfaction level by item {item:02d}' for item in range(1, 12)]
)

In [8]:
# Columns that are actually present in data.
existing_columns = data.columns

# Restrict each column group to the names that still exist in data,
# preserving the order within each group.
numerical_cols, numerical_cols_without_target, true_categorical_cols, ordinal_cols = (
    [name for name in group if name in existing_columns]
    for group in (
        numerical_cols,
        numerical_cols_without_target,
        true_categorical_cols,
        ordinal_cols,
    )
)

In [9]:
# Remove rows whose numeric features are outliers, i.e. further than
# outlier_std_threshold standard deviations from the column mean.
#
# Columns are processed one after another, so each column's mean/std is
# computed on the rows that survived the previous columns' filters.
# numerical_cols = [col for col in data.columns if pd.api.types.is_numeric_dtype(data[col])]

for col in numerical_cols_without_target:
    if col not in data.columns:
        continue
    mean = data[col].mean()
    std = data[col].std()
    if pd.isna(std):
        # Fewer than two observed values: the bounds would be NaN and the
        # filter would discard every row, so leave this column alone.
        continue
    lower_bound = mean - outlier_std_threshold * std
    upper_bound = mean + outlier_std_threshold * std
    # Comparisons against NaN are False, so a plain range test would also
    # delete rows whose value is merely missing. Keep those rows; they are
    # not outliers and are meant to be handled by imputation afterwards.
    within_bounds = data[col].between(lower_bound, upper_bound)
    data = data[within_bounds | data[col].isna()]

In [10]:
# Data processing: impute missing values, then encode the categorical and
# ordinal columns into numeric frames.

# data is the result of boolean row filtering; take an explicit copy so the
# column assignments below write to an independent frame rather than to a
# slice of an earlier one.
data = data.copy()

num_imputer = SimpleImputer(strategy='median')
cat_imputer = SimpleImputer(strategy='most_frequent')
data[numerical_cols_without_target] = num_imputer.fit_transform(data[numerical_cols_without_target])

data[true_categorical_cols] = cat_imputer.fit_transform(data[true_categorical_cols])
# Cast to str so every categorical column has a single, uniform type for the encoder.
data[true_categorical_cols] = data[true_categorical_cols].astype(str)

# The encoders return plain arrays. Rebuild the frames on data's own index:
# rows were filtered above without resetting the index, and the later
# column-wise pd.concat aligns on index labels, so a fresh 0..n-1 index
# would pair encoded rows with the wrong samples and create NaN rows.
ohe = OneHotEncoder(handle_unknown='ignore')
cat_encoded = pd.DataFrame(
    ohe.fit_transform(data[true_categorical_cols]).toarray(),
    columns=ohe.get_feature_names_out(true_categorical_cols),
    index=data.index,
)

ord_encoder = OrdinalEncoder()
ord_encoded = pd.DataFrame(
    ord_encoder.fit_transform(data[ordinal_cols]),
    columns=ordinal_cols,
    index=data.index,
)

In [11]:
# Assemble the final feature table column-wise: imputed numeric features,
# one-hot columns, ordinal codes, and the two target columns.
# NOTE(review): a column-wise concat aligns the pieces on their index labels,
# so cat_encoded / ord_encoded must carry the same index as data - verify.
feature_parts = [
    data[numerical_cols_without_target],
    cat_encoded,
    ord_encoded,
    data['Number of nights in CITY'],
    data['Purpose of visit to CITY'],
]
data_processed = pd.concat(feature_parts, axis=1)

In [12]:
# Disabled experiment (kept for reference): expenditure per night.
# data['Number of nights in CITY'] = data['Number of nights in CITY'].replace(0, 1e-1)
# data_processed['Expenditure_per_night'] = data['Total expenditures'] / data['Number of nights in CITY']

# Remove duplicate rows, then fill any remaining missing value in each
# column with that column's most frequent value (first mode).
deduplicated = data_processed.drop_duplicates()
column_modes = deduplicated.mode().iloc[0]
data_processed = deduplicated.fillna(column_modes)

In [13]:
# Save the processed data; the file name records both preprocessing thresholds.
output_path = f"data/data_processed_augmented_missing_threshold_{int(missing_threshold * 10)}_outlier_std_threshold_{outlier_std_threshold}.pkl"
data_processed.to_pickle(output_path)

# data_processed.to_pickle(
#     f"data/data_processed_augmented_missing_threshold_{int(missing_threshold * 10)}.pkl")