In [12]:
import sys
print(sys.executable)


D:\Anaconda\anaconda\python.exe


In [8]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor, ExtraTreesRegressor
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import r2_score
import pandas as pd
import pickle

# Load your dataset
data_articles = pd.read_csv('forarticles.csv')

# Initialize OneHotEncoder
encoder = OneHotEncoder(handle_unknown='ignore')

# Fit and transform 'Day' column with OneHotEncoder
day_encoded = encoder.fit_transform(data_articles[['Day']])

# Convert the array back to a DataFrame
day_encoded_df = pd.DataFrame(day_encoded.toarray(), columns=encoder.get_feature_names_out(['Day']))

# Features for all articles
X = pd.concat([day_encoded_df, data_articles[['Customer Count']]], axis=1)

# Targets for each article, directly using the count
y_A = data_articles['ArticleA']
y_B = data_articles['ArticleB']
y_C = data_articles['ArticleC']

# Splitting the data into training and testing sets for each article
X_train_A, X_test_A, y_train_A, y_test_A = train_test_split(X, y_A, test_size=0.2, random_state=42)
X_train_B, X_test_B, y_train_B, y_test_B = train_test_split(X, y_B, test_size=0.2, random_state=42)
X_train_C, X_test_C, y_train_C, y_test_C = train_test_split(X, y_C, test_size=0.2, random_state=42)

# Initialize models to evaluate
models = {
    'RandomForest': RandomForestRegressor(random_state=42),
    'GradientBoosting': GradientBoostingRegressor(random_state=42),
    'AdaBoost': AdaBoostRegressor(random_state=42),
    'ExtraTrees': ExtraTreesRegressor(random_state=42),
    'DecisionTree': DecisionTreeRegressor(random_state=42),
    'XGBoost': XGBRegressor(random_state=42)
}

# Function to evaluate models and return the best model based on R^2 score
def evaluate_models(X_train, X_test, y_train, y_test):
    best_model = None
    best_r2_score = float('-inf')
    model_accuracies = {}
    for name, model in models.items():
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        r2 = r2_score(y_test, y_pred)
        accuracy_percentage = r2 * 100
        model_accuracies[name] = accuracy_percentage
        print(f"{name} R^2 Score: {r2:.4f} (Accuracy: {accuracy_percentage:.2f}%)")
        if r2 > best_r2_score:
            best_r2_score = r2
            best_model = model
    return best_model, best_r2_score, model_accuracies

# Evaluate and select the best model for each target
best_model_A, best_r2_A, accuracies_A = evaluate_models(X_train_A, X_test_A, y_train_A, y_test_A)
best_model_B, best_r2_B, accuracies_B = evaluate_models(X_train_B, X_test_B, y_train_B, y_test_B)
best_model_C, best_r2_C, accuracies_C = evaluate_models(X_train_C, X_test_C, y_train_C, y_test_C)

# Save the best models and the encoder
pickle.dump(best_model_A, open('best_model_a.pkl', 'wb'))
pickle.dump(best_model_B, open('best_model_b.pkl', 'wb'))
pickle.dump(best_model_C, open('best_model_c.pkl', 'wb'))
pickle.dump(encoder, open('encoder.pkl', 'wb'))

print("Best models saved successfully.")

# Print the accuracy percentage for each model
print("\nAccuracy of models for Article A:")
for name, accuracy in accuracies_A.items():
    print(f"{name}: {accuracy:.2f}%")

print("\nAccuracy of models for Article B:")
for name, accuracy in accuracies_B.items():
    print(f"{name}: {accuracy:.2f}%")

print("\nAccuracy of models for Article C:")
for name, accuracy in accuracies_C.items():
    print(f"{name}: {accuracy:.2f}%")


RandomForest R^2 Score: -0.6965 (Accuracy: -69.65%)
GradientBoosting R^2 Score: -1.1079 (Accuracy: -110.79%)
AdaBoost R^2 Score: -1.0175 (Accuracy: -101.75%)
ExtraTrees R^2 Score: -0.7134 (Accuracy: -71.34%)
DecisionTree R^2 Score: -0.8214 (Accuracy: -82.14%)
XGBoost R^2 Score: -0.5332 (Accuracy: -53.32%)
RandomForest R^2 Score: -0.6843 (Accuracy: -68.43%)
GradientBoosting R^2 Score: -0.3960 (Accuracy: -39.60%)
AdaBoost R^2 Score: -0.7068 (Accuracy: -70.68%)
ExtraTrees R^2 Score: -0.9191 (Accuracy: -91.91%)
DecisionTree R^2 Score: -1.8511 (Accuracy: -185.11%)
XGBoost R^2 Score: -1.6595 (Accuracy: -165.95%)
RandomForest R^2 Score: -0.5429 (Accuracy: -54.29%)
GradientBoosting R^2 Score: -1.6423 (Accuracy: -164.23%)
AdaBoost R^2 Score: -0.2359 (Accuracy: -23.59%)
ExtraTrees R^2 Score: -1.8439 (Accuracy: -184.39%)
DecisionTree R^2 Score: -1.1542 (Accuracy: -115.42%)
XGBoost R^2 Score: -1.0847 (Accuracy: -108.47%)
Best models saved successfully.

Accuracy of models for Article A:
RandomFore

In [10]:
print("Best Model Parameters:")
print(best_model_A.get_params())


Best Model Parameters:
{'objective': 'reg:squarederror', 'base_score': None, 'booster': None, 'callbacks': None, 'colsample_bylevel': None, 'colsample_bynode': None, 'colsample_bytree': None, 'device': None, 'early_stopping_rounds': None, 'enable_categorical': False, 'eval_metric': None, 'feature_types': None, 'gamma': None, 'grow_policy': None, 'importance_type': None, 'interaction_constraints': None, 'learning_rate': None, 'max_bin': None, 'max_cat_threshold': None, 'max_cat_to_onehot': None, 'max_delta_step': None, 'max_depth': None, 'max_leaves': None, 'min_child_weight': None, 'missing': nan, 'monotone_constraints': None, 'multi_strategy': None, 'n_estimators': None, 'n_jobs': None, 'num_parallel_tree': None, 'random_state': 42, 'reg_alpha': None, 'reg_lambda': None, 'sampling_method': None, 'scale_pos_weight': None, 'subsample': None, 'tree_method': None, 'validate_parameters': None, 'verbosity': None}


In [11]:
import pandas as pd
import pickle

# Load the trained model
with open('best_model_a.pkl', 'rb') as f:
    model_a = pickle.load(f)

# Manually create a DataFrame with the features
# Let's assume we are predicting for Monday with 100 customers
# We need to set the correct flags for one-hot encoded days
data = {
    'Day_Friday': [1],
    'Day_Monday': [0],
    'Day_Saturday': [0],
    'Day_Sunday': [0],
    'Day_Thursday': [0],
    'Day_Tuesday': [0],
    'Day_Wednesday': [0],
    'Customer Count': [100]
}

features_df = pd.DataFrame(data)

# Now use the model to predict
prediction = model_a.predict(features_df)
print("Prediction for Article A:", prediction)


Prediction for Article A: [51.463856]
