## Machine Learning Model

In [1]:
# import packages
import numpy as np
import pandas as pd
from scipy.stats import randint as sp_randint
from scipy.stats import uniform
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import VotingRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Ridge
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split


In [2]:
# load the modelling table from the CSV file in the working directory
data = pd.read_csv(filepath_or_buffer='model_df.csv')

In [3]:
# Remove the 'Unnamed: 0' column if it is present. errors='ignore' keeps the
# cell idempotent: re-running it after the column is already gone no longer
# raises KeyError.
data = data.drop(columns=['Unnamed: 0'], errors='ignore')

In [4]:
# separate the rows into train and test frames using the data_id marker
is_train_row = data['data_id'].eq('df1')
is_test_row = data['data_id'].eq('df2')

train = data.loc[is_train_row]
test = data.loc[is_test_row]

In [6]:
# choose features and targets

# 'id' and 'data_id' are bookkeeping columns and 'sales' is the target,
# so none of them may be used as model inputs
X_train = train.drop(['id','data_id','sales'], axis = 1)
y_train = train['sales']

# the test features must come from the `test` frame; building them from
# `train` would make the final predictions a re-scoring of the training rows
X_test = test.drop(['id','data_id','sales'], axis = 1)

In [7]:
# hold out 20% of the training rows; that held-out part (X_val / y_val)
# is the set used for hyperparameter tuning
X_train_val, X_val, y_train_val, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=42,
)

In [15]:
# display the larger (80%) part of the train/validation split as the cell output
X_train_val

Unnamed: 0,city_Babahoyo,city_Cayambe,city_Cuenca,city_Daule,city_El Carmen,city_Esmeraldas,city_Guaranda,city_Guayaquil,city_Ibarra,city_Latacunga,...,sale_cat_leisure,sale_cat_personal_home,sale_cat_specialty,onpromotion,dcoilwtico,cluster,transactions,month,day,year
2902383,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,-0.052204,-0.993453,0.541648,1.776742,-0.066597,0.601969,1.580235
1651514,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,-0.214373,-0.676075,1.186846,2.858887,0.229776,0.260967,0.104597
1948349,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,-0.214373,-1.291525,0.326582,-1.086633,-1.548457,-1.557711,0.842416
2893871,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,-0.133289,-0.904707,-0.103550,0.516900,-0.066597,0.033632,1.580235
2867501,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,-0.214373,-0.788352,-1.178881,-1.029173,-0.066597,-1.557711,1.580235
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1692743,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,-0.214373,-0.896424,-1.178881,-0.822747,0.526148,-0.648372,0.104597
2356330,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,-0.214373,-0.767841,0.111516,-0.798273,0.526148,0.260967,0.842416
2229084,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,-0.214373,-0.682251,0.971780,-0.356690,-0.066597,-0.989374,0.842416
2768307,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.001889,-0.608099,-1.178881,-0.847220,-0.659341,-0.989374,1.580235


### Hyperparameter tuning for Random Forest

In [None]:
# first round of random search

# Define the parameter distributions to sample from
# (sp_randint's upper bound is exclusive)
rf_param1 = {
    'n_estimators': sp_randint(100, 1000),
    'max_depth': sp_randint(2, 8),
    'min_samples_split': sp_randint(2, 20),
}

# instantiate random forest regressor class
rf_reg = RandomForestRegressor(random_state = 42)

# Instantiate RandomizedSearchCV
random_search1 = RandomizedSearchCV(
    estimator=rf_reg,
    param_distributions=rf_param1,
    n_iter=10,
    scoring='neg_mean_squared_error',
    cv=3,
    random_state=42,
    n_jobs=-1  # evaluate candidates in parallel, as the gradient boosting searches do
)

# Fit the RandomizedSearchCV instance on the split set aside for tuning
random_search1.fit(X_val, y_val)

# print best parameters
print("Best Parameters:", random_search1.best_params_)

In [None]:
# second round of random search

# Centre the second search on the best parameters of the first round.
# sp_randint needs explicit (low, high) bounds; calling it with no arguments
# raises TypeError. The upper bound is exclusive, hence the "+ 1" on each one.
rf_best1 = random_search1.best_params_

# Define the narrowed parameter distributions
rf_param2 = {
    # +/- 100 trees around the first-round winner
    'n_estimators': sp_randint(
        max(10, rf_best1['n_estimators'] - 100),
        rf_best1['n_estimators'] + 100 + 1,
    ),
    # +/- 1 level; depth must stay >= 1
    'max_depth': sp_randint(
        max(1, rf_best1['max_depth'] - 1),
        rf_best1['max_depth'] + 1 + 1,
    ),
    # +/- 3 samples; an integer min_samples_split must stay >= 2
    'min_samples_split': sp_randint(
        max(2, rf_best1['min_samples_split'] - 3),
        rf_best1['min_samples_split'] + 3 + 1,
    ),
}


# Instantiate RandomizedSearchCV
random_search2 = RandomizedSearchCV(
    estimator=rf_reg,
    param_distributions=rf_param2,
    n_iter=10,
    scoring='neg_mean_squared_error',
    cv=3,  # Cross-validation folds
    random_state=42,
    n_jobs=-1  # evaluate candidates in parallel, as the gradient boosting searches do
)


# Fit the RandomizedSearchCV instance
random_search2.fit(X_val, y_val)

# print best parameters
print("Best Parameters:", random_search2.best_params_)

### Hyperparameter tuning for Gradient Boosting Regressor

In [None]:
# first round of random search

# base estimator shared by the gradient boosting searches
gb_reg = GradientBoostingRegressor(random_state=42)

# distributions to sample from: integer bounds are [low, high),
# the learning rate is drawn uniformly from [0.01, 0.31]
gb1_params = dict(
    n_estimators=sp_randint(low=50, high=200),
    learning_rate=uniform(loc=0.01, scale=0.3),
    max_depth=sp_randint(low=3, high=8),
    min_samples_split=sp_randint(low=2, high=20),
)

# build the randomized search and fit it in one step (fit returns the search itself)
gb1_random_search = RandomizedSearchCV(
    estimator=gb_reg,
    param_distributions=gb1_params,
    n_iter=10,
    scoring='neg_mean_squared_error',
    cv=3,
    random_state=42,
    n_jobs=-1,
).fit(X_val, y_val)

# report the winning combination
print("Best Parameters:", gb1_random_search.best_params_)

In [None]:
# second round of random search

# Centre the second search on the best parameters of the first round.
# sp_randint needs explicit (low, high) bounds; calling it with no arguments
# raises TypeError, and a bare uniform() would widen the learning rate to
# [0, 1] instead of narrowing it.
gb_best1 = gb1_random_search.best_params_

# select parameter range
gb2_params = {
    # +/- 25 boosting stages (upper bound of sp_randint is exclusive)
    'n_estimators': sp_randint(
        max(10, gb_best1['n_estimators'] - 25),
        gb_best1['n_estimators'] + 25 + 1,
    ),
    # uniform(loc, scale) samples [loc, loc + scale], i.e. 0.5x to 1.5x
    # of the first-round learning rate (always strictly positive)
    'learning_rate': uniform(
        loc=0.5 * gb_best1['learning_rate'],
        scale=gb_best1['learning_rate'],
    ),
    # +/- 1 level; depth must stay >= 1
    'max_depth': sp_randint(
        max(1, gb_best1['max_depth'] - 1),
        gb_best1['max_depth'] + 1 + 1,
    ),
    # +/- 3 samples; an integer min_samples_split must stay >= 2
    'min_samples_split': sp_randint(
        max(2, gb_best1['min_samples_split'] - 3),
        gb_best1['min_samples_split'] + 3 + 1,
    ),
}

# create object for random search
gb2_random_search = RandomizedSearchCV(
    estimator=gb_reg,
    param_distributions=gb2_params,
    n_iter=10,
    scoring='neg_mean_squared_error',
    cv=3,
    random_state=42,
    n_jobs=-1
)

# fit data
gb2_random_search.fit(X_val, y_val)

# print best parameters
print("Best Parameters:", gb2_random_search.best_params_)

### Hyperparameter tuning for the linear model

In [None]:
# random search for the linear member of the ensemble
#
# Every other search in this notebook treats `sales` as a regression target
# (regressors scored with neg_mean_squared_error), and the tuned model is
# meant for a VotingRegressor, which only accepts regressors. A classifier
# such as LogisticRegression scored on accuracy does not fit that setup, so
# the linear model tuned here is Ridge: an L2-penalised linear regressor
# that supports the same 'sag' / 'saga' solvers.

lr1_params = {
    # regularisation strength; plays the role LogisticRegression's C played
    # (larger alpha = stronger penalty). Kept strictly positive.
    'alpha': uniform(loc=0.01, scale=4),
    'solver': ['sag', 'saga']
}

# instantiate ridge regression object (random_state is used by sag/saga)
ridge_reg = Ridge(random_state = 42)

# create object for random search
lr1_random_search = RandomizedSearchCV(
    ridge_reg,
    param_distributions=lr1_params,
    n_iter=10,
    cv=3,
    random_state=42,
    scoring='neg_mean_squared_error',  # same metric as the tree-based searches
    n_jobs=-1
)

# fit data
lr1_random_search.fit(X_val, y_val)

# print best parameters
print("Best Parameters:", lr1_random_search.best_params_)

In [None]:
# model using voting regressor

# Take the best estimator found by each search (each one carries the best
# parameters plus the random_state of its base estimator). VotingRegressor
# fits clones of the estimators it is given, so the search objects are left
# untouched and every member is refit on the data passed to fit() below.
rf_model = random_search2.best_estimator_
gb_model = gb2_random_search.best_estimator_
lr_model = lr1_random_search.best_estimator_

# make voting model
# (VotingRegressor has no random_state parameter; randomness is controlled
# by the member estimators)
voting_model = VotingRegressor(
    estimators=[
        ('rf', rf_model),
        ('gb', gb_model),
        ('lr', lr_model),
    ]
)

# fit model on the full training set
voting_model.fit(X_train, y_train)



In [None]:
# predict with the fitted ensemble on the test feature matrix and show the result
test_predictions = voting_model.predict(X_test)
test_predictions