In [7]:
import xgboost as xgb
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline

In [8]:
# Read the training and test CSV files (relative paths, resolved against the
# current working directory) into two DataFrames, in that order.
train, test = (
    pd.read_csv(csv_name)
    for csv_name in ('df_saturday_train.csv', 'df_saturday_test.csv')
)

In [9]:
# Print column dtypes and non-null counts for the training frame. In the
# recorded output, Episode_Length_minutes and Guest_Popularity_percentage
# have fewer non-null entries than rows, i.e. they contain missing values.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 103505 entries, 0 to 103504
Data columns (total 12 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   id                           103505 non-null  int64  
 1   Podcast_Name                 103505 non-null  object 
 2   Episode_Title                103505 non-null  object 
 3   Episode_Length_minutes       91442 non-null   float64
 4   Genre                        103505 non-null  object 
 5   Host_Popularity_percentage   103505 non-null  float64
 6   Publication_Day              103505 non-null  object 
 7   Publication_Time             103505 non-null  object 
 8   Guest_Popularity_percentage  82283 non-null   float64
 9   Number_of_Ads                103505 non-null  float64
 10  Episode_Sentiment            103505 non-null  object 
 11  Listening_Time_minutes       103505 non-null  float64
dtypes: float64(5), int64(1), object(6)
memory usage: 9.5+ MB


In [None]:
class KalmanFilter:
    """Scalar linear Kalman filter with a predict / update cycle.

    Attributes:
        A: Coefficient applied to the state in ``predict``.
        H: Coefficient mapping the state to a measurement in ``update``.
        Q: Amount added to the uncertainty ``P`` on every ``predict``.
        R: Measurement-noise term added to the innovation denominator.
        x: Current state estimate.
        P: Current uncertainty of the estimate.
    """

    def __init__(self, A=1, H=1, Q=1e-5, R=10, initial_state=0, initial_uncertainty=1):
        """Store the model coefficients and the starting estimate/uncertainty."""
        self.A, self.H = A, H
        self.Q, self.R = Q, R
        self.x, self.P = initial_state, initial_uncertainty

    def predict(self):
        """Propagate the estimate and its uncertainty one step; return the new estimate."""
        transition = self.A
        self.x = transition * self.x
        # Uncertainty is scaled by the transition on both sides, then grows by Q.
        self.P = transition * self.P * transition + self.Q
        return self.x

    def update(self, z):
        """Blend measurement ``z`` into the estimate and return the updated estimate."""
        obs, prior_p = self.H, self.P
        # Gain: how much of the measurement residual is applied to the estimate.
        gain = prior_p * obs / (obs * prior_p * obs + self.R)
        residual = z - obs * self.x
        self.x = self.x + gain * residual
        self.P = (1 - gain * obs) * prior_p
        return self.x
    
# ===== Features / target =====
TARGET_COL = 'Listening_Time_minutes'
PREDICTION_COL = 'Predicted_Listening_Time_minutes'

X = train.drop(columns=[TARGET_COL])
y = train[TARGET_COL]
# PREDICTION_COL is written onto `test` at the end of this cell. Dropping it
# here (a no-op on the first run thanks to errors='ignore') keeps X_test
# identical when the cell is re-run, instead of leaking the previous run's
# predictions into the feature frame.
X_test = test.drop(columns=[TARGET_COL, PREDICTION_COL], errors='ignore')


# ===== Preprocessing + model =====
# Every object-dtype column is one-hot encoded; categories unseen during fit
# are ignored at transform time. All other columns pass through unchanged.
# NOTE(review): `id` is numeric, so it passes through as a model feature --
# confirm that is intended.
categorical_cols = X.select_dtypes(include='object').columns.tolist()

preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_cols)
], remainder='passthrough')

pipeline = Pipeline([
    ('preprocess', preprocessor),
    ('model', xgb.XGBRegressor(n_estimators=100))
])


# ===== Train / validate =====
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
pipeline.fit(X_train, y_train)

# Score the held-out 20% so the split is actually used and the model quality
# is visible before a submission file is written.
val_residuals = y_val - pipeline.predict(X_val)
val_rmse = float((val_residuals ** 2).mean() ** 0.5)
print(f"Validation RMSE (unsmoothed): {val_rmse:.4f}")

# Predict for the test set
y_pred = pipeline.predict(X_test)

# ===== Smooth with Kalman Filter =====
# NOTE(review): the filter consumes y_pred as an ordered sequence, so every
# smoothed value depends on the row order of `test` and on all earlier rows.
# With the default Q=1e-5 and R=10 the gain is small, which pulls the outputs
# towards a running average of the raw predictions. Confirm that the test rows
# form a meaningful sequence before relying on the smoothed values.
kf = KalmanFilter(initial_state=y_pred[0])
smoothed_pred = []
for z in y_pred:
    kf.predict()
    smoothed_pred.append(kf.update(z))

# ===== Write results =====
test[PREDICTION_COL] = smoothed_pred
test.to_csv("submission_XGB.csv", index=False)