In [1]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [None]:
train = pd.read_csv('df_saturday_train.csv')
test = pd.read_csv('df_saturday_test.csv')


def _most_frequent(series):
    """Return the most frequent non-null value of ``series``, or None if it has none."""
    modes = series.mode()  # NaN is excluded by default (dropna=True)
    # An all-NaN column yields an empty result; indexing it with [0] would raise.
    return modes.iloc[0] if not modes.empty else None


# Learn the imputation value of every column from the training data only, so
# that train and test are filled with the same statistics.
fill_values = {}
for col in train.columns:
    most_frequent = _most_frequent(train[col])
    if most_frequent is not None:
        fill_values[col] = most_frequent

for col, value in fill_values.items():
    train[col] = train[col].fillna(value)

for col in test.columns:
    # Fall back to the test column's own mode only when train offers no value
    # for it (column missing from train, or entirely null there).
    value = fill_values[col] if col in fill_values else _most_frequent(test[col])
    if value is not None:
        test[col] = test[col].fillna(value)

In [3]:
# Sanity check: number of missing values left in each train column after imputation.
train.isna().sum()

id                             0
Podcast_Name                   0
Episode_Title                  0
Episode_Length_minutes         0
Genre                          0
Host_Popularity_percentage     0
Publication_Day                0
Publication_Time               0
Guest_Popularity_percentage    0
Number_of_Ads                  0
Episode_Sentiment              0
Listening_Time_minutes         0
dtype: int64

In [4]:
# Sanity check: number of missing values left in each test column after imputation.
test.isna().sum()

id                             0
Podcast_Name                   0
Episode_Title                  0
Episode_Length_minutes         0
Genre                          0
Host_Popularity_percentage     0
Publication_Day                0
Publication_Time               0
Guest_Popularity_percentage    0
Number_of_Ads                  0
Episode_Sentiment              0
dtype: int64

In [None]:
class KalmanFilter:
    """Scalar Kalman filter holding a state estimate ``x`` and its uncertainty ``P``.

    Parameters
    ----------
    A : state-transition factor applied in ``predict``.
    H : observation factor relating the state to a measurement.
    Q : noise term added to ``P`` on every ``predict``.
    R : noise term added to the innovation variance in ``update``.
    initial_state : starting value of ``x``.
    initial_uncertainty : starting value of ``P``.
    """

    def __init__(self, A=1, H=1, Q=1e-5, R=10, initial_state=0, initial_uncertainty=1):
        # Model constants.
        self.A, self.H = A, H
        self.Q, self.R = Q, R
        # Running estimate and its uncertainty.
        self.x, self.P = initial_state, initial_uncertainty

    def predict(self):
        """Propagate the estimate one step forward and return the new ``x``."""
        projected_state = self.A * self.x
        projected_uncertainty = self.A * self.P * self.A + self.Q
        self.x = projected_state
        self.P = projected_uncertainty
        return projected_state

    def update(self, z):
        """Fold measurement ``z`` into the estimate and return the corrected ``x``."""
        innovation_variance = self.H * self.P * self.H + self.R
        gain = self.P * self.H / innovation_variance
        residual = z - self.H * self.x
        self.x = self.x + gain * residual
        self.P = (1 - gain * self.H) * self.P
        return self.x
    

target_col = 'Listening_Time_minutes'
# 'id' is a row identifier, not a predictor. Because the preprocessor uses
# remainder='passthrough', leaving it in would feed it to the regression as a
# numeric feature, so it is excluded from the model inputs.
id_col = 'id'
feature_cols = [col for col in train.columns if col not in (target_col, id_col)]

X_train = train[feature_cols]
y_train = train[target_col]
X_test = test[feature_cols]

# One-hot encode the string/categorical columns; every other feature column is
# passed through unchanged. handle_unknown='ignore' keeps prediction from
# failing on categories that appear only in the test data.
categorical_cols = X_train.select_dtypes(include=['object', 'category']).columns.tolist()
preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
], remainder='passthrough')

pipeline = Pipeline([
    ('preprocess', preprocessor),
    ('regressor', LinearRegression())
])

pipeline.fit(X_train, y_train)

linear_preds = pipeline.predict(X_test)

# Smooth the raw predictions with the scalar Kalman filter, walking the test
# rows in file order and seeding the filter with the first prediction.
# NOTE(review): this treats consecutive test rows as a sequence, so each
# smoothed value depends on the rows before it — confirm the row order is
# meaningful for this dataset before relying on the smoothed output.
kf = KalmanFilter(initial_state=linear_preds[0])
smoothed_preds = []

for z in linear_preds:
    kf.predict()
    smoothed_preds.append(kf.update(z))

# The submission keeps every test column (including 'id') plus the prediction.
test['Predicted_Listening_Time_minutes'] = smoothed_preds
test.to_csv('submission_LR.csv', index=False)
# Emit the confirmation shown in the recorded cell output so that a fresh
# "Restart & Run All" reproduces it.
print('✅ Dự đoán đã được lưu vào submission_LR.csv')


✅ Dự đoán đã được lưu vào submission_LR.csv
