# In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report

# Read the spreadsheet of draws into a DataFrame
df = pd.read_excel(io='./toto_cleaned_file_with_trends.xlsx')

# Preview the leading rows (head() returns five rows by default)
display(df.head(n=5))

# Data Cleaning & Feature Engineering
def preprocess_data(df, max_number=49, hot_fraction=0.2):
    """Count how often each number was drawn and tag the most frequent as 'Hot'.

    Parameters
    ----------
    df : pandas.DataFrame
        Draw history. Every column whose (string) name contains
        ``'Winning Number'`` is treated as one drawn-number position.
    max_number : int, optional
        Frequencies are computed for the numbers ``1..max_number``.
        Defaults to 49.
    hot_fraction : float, optional
        Share of the ranked numbers that receive the ``'Hot'`` label; the
        remainder is ``'Cold'``. Defaults to 0.2 (top 20%).

    Returns
    -------
    data : pandas.DataFrame
        A copy of the winning-number columns of ``df``.
    freq_df : pandas.DataFrame
        Columns ``Number``, ``Frequency`` and ``Label``, ordered by
        descending frequency. Ties are ordered by ascending number so the
        Hot/Cold cut-off is reproducible between runs.

    Raises
    ------
    ValueError
        If ``hot_fraction`` is not within ``[0, 1]``.
    """
    if not 0 <= hot_fraction <= 1:
        raise ValueError(f'hot_fraction must be between 0 and 1, got {hot_fraction!r}')

    # Non-string column labels cannot match the name pattern; skip them
    # instead of failing on the substring test.
    winning_cols = [
        col for col in df.columns
        if isinstance(col, str) and 'Winning Number' in col
    ]
    data = df[winning_cols].copy()
    data_matrix = data.to_numpy()

    # Frequency-based features
    all_numbers = np.arange(1, max_number + 1)
    counts = [int(np.sum(data_matrix == num)) for num in all_numbers]
    freq_df = pd.DataFrame({'Number': all_numbers, 'Frequency': counts})

    # Sorting on (Frequency desc, Number asc) gives a total order, so numbers
    # with equal frequency always land on the same side of the threshold.
    freq_df = freq_df.sort_values(by=['Frequency', 'Number'], ascending=[False, True])

    # Hot/Cold labeling: the first `threshold` rows of the ranking are Hot
    threshold = int(len(freq_df) * hot_fraction)
    freq_df['Label'] = np.where(np.arange(len(freq_df)) < threshold, 'Hot', 'Cold')
    return data, freq_df

# Build the winning-number table and the per-number frequency table
data, freq_df = preprocess_data(df)

# Bar chart: one bar per lottery number, height = times it was drawn
plt.figure(figsize=(12, 6))
sns.barplot(data=freq_df, x='Number', y='Frequency', palette='viridis')
plt.title('Frequency Distribution of Winning Numbers')
plt.xlabel('Lottery Numbers')
plt.ylabel('Frequency of Appearance')
plt.xticks(rotation=90)  # vertical tick labels so all numbers stay readable
plt.show()

# Prepare the Data for Predicting Next Set of Numbers (Multi-Output Classification)
def prepare_next_draw_data(df):
    """Pair every draw with the draw stored in the following row.

    The columns at positions 2 through 7 of ``df`` are used as the numbers
    of a draw. Row ``k`` of the returned features lines up with row
    ``k + 1`` of the source as its target, so both arrays have one row
    fewer than ``df``.

    Returns
    -------
    tuple of numpy.ndarray
        ``(X, y)`` where ``X`` holds all rows but the last and ``y`` holds
        all rows but the first.
    """
    draws = df.iloc[:, 2:8].values
    current_draws = draws[:-1]    # features: every draw except the final one
    following_draws = draws[1:]   # targets: the same draws shifted by one row
    return current_draws, following_draws

# Build (draw -> following draw) pairs and hold out 20% of them for scoring
X, y = prepare_next_draw_data(df)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# One estimator per algorithm; each is refitted for every target column
models = {
    'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier(n_estimators=100),
    'KNN': KNeighborsClassifier(n_neighbors=5),
}

# Fit and score each model per winning-number position, summing accuracy per model
next_draw_accuracies = {}
for i, (train_target, test_target) in enumerate(zip(y_train.T, y_test.T)):
    print(f'\nTraining models for Winning Number {i+1}')

    for name, model in models.items():
        y_pred = model.fit(X_train, train_target).predict(X_test)
        accuracy = accuracy_score(test_target, y_pred)
        next_draw_accuracies[name] = next_draw_accuracies.get(name, 0) + accuracy
        print(f'\n{name} Accuracy for Winning Number {i+1}: {accuracy:.4f}')
        print(classification_report(test_target, y_pred))

# Turn the per-model sums into means over the number of target columns
position_count = y_train.shape[1]
next_draw_accuracies = {
    name: total / position_count
    for name, total in next_draw_accuracies.items()
}

# Predicting Hot/Cold Numbers (Binary Classification)
def prepare_hot_cold_data(df, freq_df):
    """Tag every drawn number as 'Hot' or 'Cold' using the frequency table.

    Parameters
    ----------
    df : pandas.DataFrame
        Draw history; the columns at positions 2 through 7 are read as the
        drawn numbers.
    freq_df : pandas.DataFrame
        Table with a ``Number`` column and a ``Label`` column whose value is
        ``'Hot'`` for hot numbers.

    Returns
    -------
    winning_numbers : numpy.ndarray
        The drawn numbers, one row per draw.
    labels : numpy.ndarray
        String array of the same shape as ``winning_numbers`` holding
        ``'Hot'`` where the number is listed as hot in ``freq_df`` and
        ``'Cold'`` everywhere else.
    """
    winning_numbers = df.iloc[:, 2:8].values

    # Look up the hot numbers once; the previous version re-filtered freq_df
    # for every single number of every draw.
    hot_numbers = freq_df.loc[freq_df['Label'] == 'Hot', 'Number'].to_numpy()

    # Element-wise membership test keeps the (draws, positions) shape
    labels = np.where(np.isin(winning_numbers, hot_numbers), 'Hot', 'Cold')

    return winning_numbers, labels

# Features are the drawn numbers; targets are their per-position Hot/Cold tags
X_hotcold, y_hotcold = prepare_hot_cold_data(df, freq_df)

# Same 80/20 split settings as the next-draw task; this rebinds
# X_train / X_test / y_train / y_test
X_train, X_test, y_train, y_test = train_test_split(
    X_hotcold, y_hotcold, test_size=0.2, random_state=42
)

# NOTE(review): label column i is computed from feature column i alone, so
# these scores measure how well a lookup is memorised, not predictive power.
hot_cold_models = {
    'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier(n_estimators=100),
    'KNN': KNeighborsClassifier(n_neighbors=5),
}
hot_cold_accuracies = {}

# For each model, fit and score once per label column and sum the accuracies
for name, model in hot_cold_models.items():
    for i, (train_labels, test_labels) in enumerate(zip(y_train.T, y_test.T)):
        print(f'\nTraining {name} for Hot/Cold classification on Winning Number {i+1}')

        y_pred = model.fit(X_train, train_labels).predict(X_test)

        accuracy = accuracy_score(test_labels, y_pred)
        hot_cold_accuracies[name] = hot_cold_accuracies.get(name, 0) + accuracy
        print(f'\n{name} Hot/Cold Accuracy for Winning Number {i+1}: {accuracy:.4f}')
        print(classification_report(test_labels, y_pred))

# Turn the per-model sums into means over the number of label columns
label_column_count = y_train.shape[1]
hot_cold_accuracies = {
    name: total / label_column_count
    for name, total in hot_cold_accuracies.items()
}

# Draw one average-accuracy bar chart per task, both with the same layout:
# first the next-draw models, then the Hot/Cold classifiers
for scores, xlabel, title in (
    (
        next_draw_accuracies,
        'Machine Learning Models for Next Draw',
        'Model Performance Comparison (Next Draw)',
    ),
    (
        hot_cold_accuracies,
        'Machine Learning Models for Hot/Cold',
        'Model Performance Comparison (Hot/Cold)',
    ),
):
    plt.figure(figsize=(8, 5))
    sns.barplot(x=list(scores), y=list(scores.values()), palette='coolwarm')
    plt.xlabel(xlabel)
    plt.ylabel('Average Accuracy Score')
    plt.title(title)
    plt.show()

# Captured cell output: UnicodeDecodeError: 'utf-8' codec can't decode byte 0xbc in position 10: invalid start byte