In [1]:
pip install xgboost

SyntaxError: invalid syntax (1313025939.py, line 1)

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_extraction.text import TfidfVectorizer

# 1️⃣ Load data from test.csv with appropriate encoding / Tải dữ liệu từ test.csv với encoding thích hợp
def load_data(file_path, encodings=None):
    """Read a CSV file, trying several text encodings in turn.

    Args:
        file_path: Path (or any source accepted by ``pd.read_csv``) of the CSV.
        encodings: Optional iterable of codec names to try, in order. When
            omitted, UTF-8 and EUC-KR are tried before ISO-8859-1.

    Returns:
        The ``DataFrame`` produced by the first encoding that decodes the file.

    Raises:
        ValueError: If every candidate encoding raises ``UnicodeDecodeError``.
    """
    if encodings is None:
        # ISO-8859-1 maps every possible byte to a character, so it never
        # raises UnicodeDecodeError. It must come last; otherwise any codec
        # listed after it is unreachable and UTF-8 files are silently read
        # as mojibake.
        encodings = ("utf-8", "euc-kr", "ISO-8859-1")

    for encoding in encodings:
        try:
            return pd.read_csv(file_path, encoding=encoding)
        except UnicodeDecodeError:
            continue
    raise ValueError("Unable to load CSV file with known encodings.")

# Read the dataset from "test.csv" (relative path) into the module-level frame `df`.
df = load_data("test.csv")

# 2️⃣ Check initial dataset information / Kiểm tra thông tin dữ liệu ban đầu
def display_data_info(df):
    """Print a structural summary and the first rows of ``df``.

    Args:
        df: The ``DataFrame`` to describe.

    Returns:
        None. Everything is written to standard output.
    """
    print("Dataset Information:")
    # DataFrame.info() writes its report itself and returns None, so wrapping
    # it in print() would emit a stray "None" line after the summary.
    df.info()
    print("\nPreview Data:")
    print(df.head())

# Print the summary and a preview of the freshly loaded frame.
display_data_info(df)

# Convert object columns to numeric if applicable / Chuyển đổi các cột object thành kiểu số nếu có thể
def convert_numeric_columns(df, columns):
    """Coerce the named columns of ``df`` to numeric values, in place.

    Names in ``columns`` that are not present in ``df`` are skipped. Values
    that cannot be parsed as numbers become NaN (``errors="coerce"``).

    Args:
        df: The ``DataFrame`` to modify.
        columns: Iterable of column names to convert.
    """
    present = (name for name in columns if name in df.columns)
    for name in present:
        df[name] = pd.to_numeric(df[name], errors="coerce")

# Coerce these columns (where present) to numeric; unparseable values become NaN.
convert_numeric_columns(df, ["Total_Emp", "Actual_Emp", "Avg_Temp", "previous_orders"])

# 3️⃣ Data preprocessing / Tiền xử lý dữ liệu
def preprocess_data(df):
    """Parse the 'Date' column and derive calendar features, in place.

    When ``df`` has a 'Date' column it is converted to datetimes (unparseable
    entries become NaT), 'day_of_week' and 'month' columns are added from it,
    and rows whose date could not be parsed are dropped. A frame without a
    'Date' column is left untouched.

    Args:
        df: The ``DataFrame`` to modify.
    """
    if 'Date' not in df.columns:
        return

    parsed_dates = pd.to_datetime(df['Date'], errors='coerce')
    df['Date'] = parsed_dates
    df['day_of_week'] = parsed_dates.dt.dayofweek
    df['month'] = parsed_dates.dt.month
    # Rows with an unparseable date are removed only after the features are
    # derived, matching the original statement order.
    df.dropna(subset=['Date'], inplace=True)

# Parse 'Date' and add the calendar feature columns; this mutates df in place.
preprocess_data(df)

# 4️⃣ Check and handle missing values / Kiểm tra và xử lý giá trị thiếu
def handle_missing_values(df):
    """Report missing values and mean-impute the numeric columns, in place.

    Prints the per-column count of missing values, then replaces the missing
    entries of every numeric column with that column's mean. Non-numeric
    columns are not modified.

    Args:
        df: The ``DataFrame`` to inspect and modify.
    """
    print("\nMissing Values Count:")
    print(df.isnull().sum())

    numeric_columns = df.select_dtypes(include=[np.number]).columns
    for col in numeric_columns:
        # Assign the filled column back explicitly. The chained form
        # `df[col].fillna(..., inplace=True)` operates on an intermediate
        # Series: it emits a warning on recent pandas and does not update
        # `df` at all when Copy-on-Write is enabled.
        df[col] = df[col].fillna(df[col].mean())

# Print missing-value counts, then fill gaps in df's numeric columns with column means.
handle_missing_values(df)

# 5️⃣ Encode dish data using TF-IDF if column 'Lunch_Menu' exists / Mã hóa dữ liệu món ăn bằng TF-IDF nếu có cột 'Lunch_Menu'
def encode_lunch_menu(df):
    """Build a TF-IDF representation of the 'Lunch_Menu' column.

    Args:
        df: The ``DataFrame`` that may contain a 'Lunch_Menu' column.

    Returns:
        The result of ``TfidfVectorizer().fit_transform`` on the column's
        values cast to ``str``, or ``None`` when the column is absent.
    """
    if 'Lunch_Menu' not in df.columns:
        return None

    menu_text = df["Lunch_Menu"].astype(str)
    tfidf = TfidfVectorizer()
    return tfidf.fit_transform(menu_text)

# TF-IDF matrix of the 'Lunch_Menu' text, or None when that column is absent.
# NOTE(review): X_dish is not referenced again in the visible part of this file.
X_dish = encode_lunch_menu(df)

# 6️⃣ Select relevant features if available / Chọn các đặc trưng phù hợp nếu có
# Keep only the candidate predictors that actually exist in the frame,
# preserving the order in which the candidates are listed.
features = [
    col
    for col in ["day_of_week", "month", "Avg_Temp", "Holiday", "previous_orders"]
    if col in df.columns
]

# Training needs at least one predictor and the 'Lunch_Count' target column.
if not (features and "Lunch_Count" in df.columns):
    raise ValueError("Insufficient data for model training. / Dữ liệu không đủ để huấn luyện mô hình.")

X, y = df[features], df["Lunch_Count"]

# 7️⃣ Split train/test dataset: 20% of the rows are held out, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# 8️⃣ Initialize prediction models / Khởi tạo các mô hình dự đoán
def get_models():
    """Create the unfitted regressors to compare.

    Returns:
        A dict mapping a display name to a new estimator instance: a linear
        regression, a 100-tree random forest, and a 100-estimator XGBoost
        regressor with learning rate 0.1. The two ensemble models use
        ``random_state=42``.
    """
    n_trees = 100
    seed = 42

    models = {}
    models["Linear Regression"] = LinearRegression()
    models["Random Forest"] = RandomForestRegressor(
        n_estimators=n_trees, random_state=seed
    )
    models["XGBoost"] = XGBRegressor(
        n_estimators=n_trees, learning_rate=0.1, random_state=seed
    )
    return models

# Fresh, unfitted regressors keyed by display name.
models = get_models()

# 9️⃣ Train and evaluate models / Huấn luyện và đánh giá các mô hình
def train_and_evaluate(models, X_train, X_test, y_train, y_test):
    """Fit each model on the training split and score it on the test split.

    Args:
        models: Mapping of display name to an estimator exposing ``fit`` and
            ``predict``.
        X_train: Training features.
        X_test: Test features.
        y_train: Training targets.
        y_test: Test targets.

    Returns:
        A dict mapping each model name to ``{"MSE": ..., "MAE": ...}`` computed
        on the test split. One summary line per model is also printed.
    """
    scores = {}
    for label, estimator in models.items():
        estimator.fit(X_train, y_train)
        predictions = estimator.predict(X_test)

        mse = mean_squared_error(y_test, predictions)
        mae = mean_absolute_error(y_test, predictions)
        scores[label] = {"MSE": mse, "MAE": mae}

        print(f"{label} - MSE: {mse:.4f}, MAE: {mae:.4f}")
    return scores

# Fit every model on the training split and collect its MSE/MAE on the test split.
results = train_and_evaluate(models, X_train, X_test, y_train, y_test)

# 🔟 LSTM Model / Mô hình LSTM
def train_lstm(X_train, X_test, y_train, y_test):
    """Train a small LSTM regressor and report its test-set errors.

    Features are min-max scaled (the scaler is fitted on the training split
    only) and reshaped to ``(n_samples, n_features, 1)``, so the feature axis
    is what the LSTM layer iterates over, with one value per step.

    Args:
        X_train: Training features (2-D, samples by features).
        X_test: Test features with the same columns.
        y_train: Training targets.
        y_test: Test targets.

    Returns:
        ``{"MSE": ..., "MAE": ...}`` computed on the test split. A summary
        line is also printed.
    """
    scaler = MinMaxScaler()
    # Append a trailing axis of length 1: (samples, features) -> (samples, features, 1).
    train_seq = scaler.fit_transform(X_train)[..., np.newaxis]
    test_seq = scaler.transform(X_test)[..., np.newaxis]
    n_steps = train_seq.shape[1]

    network = Sequential([
        LSTM(50, activation='relu', input_shape=(n_steps, 1)),
        Dense(1),
    ])
    network.compile(optimizer='adam', loss='mse')
    network.fit(train_seq, y_train, epochs=50, batch_size=16, verbose=0)

    predictions = network.predict(test_seq)
    metrics = {
        "MSE": mean_squared_error(y_test, predictions),
        "MAE": mean_absolute_error(y_test, predictions),
    }

    print(f"LSTM - MSE: {metrics['MSE']:.4f}, MAE: {metrics['MAE']:.4f}")
    return metrics

# Store the LSTM's metrics under the "LSTM" key alongside the other models' scores.
results["LSTM"] = train_lstm(X_train, X_test, y_train, y_test)

# 1️⃣1️⃣ Identify the best model and suggest improvements / Xác định mô hình tốt nhất và đề xuất cải thiện
def identify_best_model(results):
    """Return the name of the model with the lowest MSE and print it.

    Args:
        results: Mapping of model name to a metrics dict containing "MSE".

    Returns:
        The key of ``results`` whose "MSE" is smallest. On a tie, the entry
        that appears first in ``results`` wins.
    """
    best_model, _ = min(results.items(), key=lambda entry: entry[1]["MSE"])
    print(f"Best Model: {best_model}")
    return best_model

# Name of the entry in `results` with the lowest MSE.
best_model = identify_best_model(results)
