In [1]:
# Locate the repository root by finding the project folder name inside the
# current working directory, then make that root the working directory.
library(stringr)  # library() errors if the package is missing; require() only warns

project_name <- "ML-MADNESS"

working_dir <- getwd()
# fixed = TRUE: match the folder name literally rather than as a regex.
name_start <- regexpr(project_name, working_dir, fixed = TRUE)[[1]]
if (name_start < 1) {
  # Without this guard a failed match (-1) silently produces the bogus
  # path "/ML-MADNESS".
  stop("Could not find '", project_name, "' in working directory: ",
       working_dir, call. = FALSE)
}

# Keep everything before the separator that precedes the project folder name,
# then re-append the folder name.
repo_path <- file.path(substr(working_dir, 1, name_start - 2), project_name)
setwd(repo_path)

Loading required package: stringr



In [64]:
# Run the project's helper script (load_all_packages is presumably defined
# there -- TODO confirm).
source(file.path(repo_path, "objects", "helper_functions.R"))

# Read each requirements file, one entry per line, and load what it lists.
p <- scan(file.path(repo_path, "features/requirements.txt"),
          what = character(), sep = "\n")
load_all_packages(p)

p <- scan(file.path(repo_path, "sagemaker/requirements.txt"),
          what = character(), sep = "\n")
load_all_packages(p)

# Make the global `slice` and `rename` resolve to the dplyr versions.
slice <- dplyr::slice
rename <- dplyr::rename

also installing the dependency ‘bitops’


Updating HTML index of packages in '.Library'

Making 'packages.html' ...
 done



In [3]:
# SageMaker setup: bind the Python SDK module, open a session, and collect the
# default bucket and execution role used by the cells below.
sagemaker <- import("sagemaker")
session <- sagemaker$Session()
bucket <- session$default_bucket()
role_arn <- sagemaker$get_execution_role()

In [4]:
# Run the features/api_calls.r script from the repository.
source(file.path(repo_path, "features", "api_calls.r"))

In [17]:
# Run features/modeling.r, then load the box score data
# (load_box_score_data is presumably defined by a sourced script -- TODO confirm).
source(file.path(repo_path, "features", "modeling.r"))
boxscores <- load_box_score_data()

In [30]:
# Pass the loaded box scores through the project's initial cleaning step.
all_team_data <- initial_cleaning(boxscores)

In [33]:
# Lookup table: one row per team id / display name pair with the number of
# rows (appearances) it has in the cleaned data, most frequent first.
team_info <- all_team_data %>%
  ungroup() %>%
  select(team_id, team_display_name.x) %>%
  group_by(team_id, team_display_name.x) %>%
  # .groups = "drop" returns an ungrouped result and silences the
  # "summarise() has grouped output" message.
  summarize(APPEARANCES = n(), .groups = "drop") %>%
  rename("team_name" = "team_display_name.x") %>%
  arrange(desc(APPEARANCES))

# Modeling input: the same data without the two display-name columns.
team_data_refined <- all_team_data %>%
  ungroup() %>%
  select(-c(team_display_name.x, team_display_name.y))

[1m[22m`summarise()` has grouped output by 'team_id'. You can override using the
`.groups` argument.


In [48]:
# Run the features/elo_helper.r script from the repository.
source(file.path(repo_path, "features", "elo_helper.r"))

In [53]:
# Build the ELO feature table from the 2022 season rows only. The features are
# the PTS.x / PTS.y columns; rename_fs supplies the labels PTS_SCORED and
# PTS_ALLOWED (seen as column prefixes in the printed result).
df <- BOOSTED_calculate_relative_elo(
  team_data_refined = filter(team_data_refined, SEASON == 2022),
  features = c("PTS.x", "PTS.y"),
  rename_fs = c("PTS_SCORED", "PTS_ALLOWED")
)

In [54]:
# Show the last six rows of the ELO feature table.
tail(df, n = 6L)

game_date,game_id,team_id,opponent_id,APPEARANCE,OPP_APPEARANCE,PTS.x,PTS.y,PTS_SCORED_AVG,PTS_SCORED_SD,⋯,PTS_SCORED_DEF_ELO,PTS_SCORED_ELO_NEXT,PTS_SCORED_DEF_ELO_NEXT,PTS_ALLOWED_AVG,PTS_ALLOWED_SD,PTS_ALLOWED_PERF,PTS_ALLOWED_ELO,PTS_ALLOWED_DEF_ELO,PTS_ALLOWED_ELO_NEXT,PTS_ALLOWED_DEF_ELO_NEXT
<date>,<int>,<chr>,<chr>,<int>,<int>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
2022-04-02,401408634,222,2305,38,39,65,81,67.85714,67.85714,⋯,2303.809,1434.775,2269.215,65.57143,65.57143,0.23529412,1400.181,2303.809,1434.775,2269.215
2022-04-02,401408634,2305,222,39,38,81,65,74.75,74.75,⋯,2097.92,1822.469,2105.573,68.5,68.5,-0.05109489,1830.122,2097.92,1822.469,2105.573
2022-04-03,401408635,153,150,38,39,81,77,67.85714,67.85714,⋯,2181.78,2089.144,2155.643,65.57143,65.57143,0.17429194,2063.007,2181.78,2089.144,2155.643
2022-04-03,401408635,150,153,39,38,77,81,74.75,74.75,⋯,1937.582,1833.482,1910.221,68.5,68.5,0.18248175,1806.121,1937.582,1833.482,1910.221
2022-04-05,401408636,153,2305,39,40,69,72,74.75,74.75,⋯,2105.573,2096.808,2097.909,68.5,68.5,0.05109489,2089.144,2105.573,2096.808,2097.909
2022-04-05,401408636,2305,153,40,39,72,69,72.0,72.0,⋯,2155.643,1818.254,2159.858,71.0,71.0,-0.02816901,1822.469,2155.643,1818.254,2159.858


In [61]:
# Feature columns fed to the model, and identifier columns that are carried
# along but not used as predictors.
predictors <- c("PTS_SCORED_ELO", "PTS_SCORED_DEF_ELO",
                "PTS_ALLOWED_ELO", "PTS_ALLOWED_DEF_ELO")
non_predictors <- c("game_date", "game_id", "team_id", "opponent_id")

# Drop rows where either side is on its first appearance, then derive the
# score margin and the binary WIN response (1 when the margin is positive).
pre_modeling <- df %>%
  filter(APPEARANCE != 1,
         OPP_APPEARANCE != 1) %>%
  # all_of() is the supported way to select with an external character
  # vector; passing the bare vector is deprecated in tidyselect >= 1.1.0.
  select(PTS.x, PTS.y, all_of(non_predictors), all_of(predictors)) %>%
  mutate(MARGIN = PTS.x - PTS.y,
         WIN = ifelse(MARGIN > 0, 1, 0))

# NOTE(review): in the recorded output the PTS_SCORED_* and PTS_ALLOWED_* ELO
# columns are identical row by row -- verify the ELO helper before trusting
# these as four distinct features.
modeling <- pre_modeling %>%
  select(WIN, all_of(non_predictors), all_of(predictors))
tail(modeling)

“[1m[22mUsing an external vector in selections was deprecated in tidyselect 1.1.0.
[36mℹ[39m Please use `all_of()` or `any_of()` instead.
  # Was:
  data %>% select(non_predictors)

  # Now:
  data %>% select(all_of(non_predictors))

See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.”


WIN,game_date,game_id,team_id,opponent_id,PTS_SCORED_ELO,PTS_SCORED_DEF_ELO,PTS_ALLOWED_ELO,PTS_ALLOWED_DEF_ELO
<dbl>,<date>,<int>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>
0,2022-04-02,401408634,222,2305,1400.181,2303.809,1400.181,2303.809
1,2022-04-02,401408634,2305,222,1830.122,2097.92,1830.122,2097.92
1,2022-04-03,401408635,153,150,2063.007,2181.78,2063.007,2181.78
0,2022-04-03,401408635,150,153,1806.121,1937.582,1806.121,1937.582
0,2022-04-05,401408636,153,2305,2089.144,2105.573,2089.144,2105.573
1,2022-04-05,401408636,2305,153,1822.469,2155.643,1822.469,2155.643


In [67]:
# Run the XGBoost helper script, then hand the modeling data to
# train_test_upload_s3 (which reports uploading to S3 when it finishes).
source(file.path(repo_path, "modeling", "xgboost_helper.r"))
project_name <- "ML-MADNESS"
model_name <- "elo-small-1"
pre_model_info <- train_test_upload_s3(
  session = session,
  sagemaker = sagemaker,
  bucket = bucket,
  model_dataset = modeling,
  project_name = project_name,
  model_name = model_name,
  non_predictors = non_predictors,
  response = "WIN",
  # Reuse the repository root computed at the top of the notebook instead of a
  # hard-coded absolute path, so the cell also works outside that one machine.
  repo_path = repo_path
)

[1] "done uploading to S3"
[1] "Getting XGB Docker container for Sagemaker"
XGBoost Container Image URL:  811284229777.dkr.ecr.us-east-1.amazonaws.com/xgboost:latest

In [68]:
# Start the SageMaker XGBoost tuning run via the project helper. The returned
# value is later used as the tuning job name.
xgb <- sagemaker_xgb(
  pre_model_info = pre_model_info,
  session = session,
  bucket = bucket,
  role_arn = role_arn,
  sagemaker = sagemaker,
  model_name = model_name,
  project_name = project_name,
  model_prefix = "models",
  instance_type = "ml.m5.4xlarge",
  models_to_try = 12L
)

[1] "Started tuner for job: tune-xgboost-20230314-05-05-24"


In [70]:
# Check the tuning job status and progress.
job_name <- xgb

# Describe the job once so both printed lines come from the same snapshot,
# rather than issuing a separate API call for every field.
tuning_job_info <- session$describe_tuning_job(job_name)
print(tuning_job_info[["HyperParameterTuningJobStatus"]])

job_counts <- tuning_job_info$TrainingJobStatusCounters
# NOTE(review): the denominator counts only Completed + InProgress jobs, so it
# can be smaller than the number of jobs requested -- confirm this is intended.
print(paste0(job_counts$Completed, "/",
             job_counts$Completed + job_counts$InProgress))


[1] "InProgress"
[1] "0/4"


In [None]:
# Build the analytics object for the tuning job and pull its results into a
# data frame, then display it.
tuning_job_results <- sagemaker$HyperparameterTuningJobAnalytics(job_name)
tuning_results_df <- tuning_job_results$dataframe()
tuning_results_df

In [None]:
# Scatter plot of each training job's final objective value against the time
# the job ended. The y-axis label is the objective metric name taken from the
# tuning job description.
ggplot(tuning_results_df, aes(TrainingEndTime, FinalObjectiveValue)) +
  geom_point() +
  xlab("Time") +
  ylab(tuning_job_results$description()$TrainingJobDefinition$StaticHyperParameters$`_tuning_objective_metric`) +
  # The subtitle reports the actual number of rows in the results instead of a
  # hard-coded job count that can disagree with the run.
  ggtitle("Hyperparameter tuning objective metric",
          paste("Progression over the period of all",
                nrow(tuning_results_df), "training jobs")) +
  theme_minimal()

In [None]:
# Pick the training job with the highest final objective value.
# NOTE(review): assumes the objective metric is one to maximize -- confirm.
best_tuned_model <- tuning_results_df %>%
  # Jobs without a final objective value would make max() NA and leave the
  # equality filter with no rows, so drop them first.
  filter(!is.na(FinalObjectiveValue)) %>%
  # with_ties = FALSE guarantees a single job name even when scores tie.
  slice_max(FinalObjectiveValue, n = 1, with_ties = FALSE) %>%
  pull(TrainingJobName)
best_tuned_model

In [None]:
# Describe the best training job and flatten its final metric list into a
# two-column table (metric name, value).
training_job_stats <- session$describe_training_job(job_name = best_tuned_model)

final_metrics <- map_df(
  training_job_stats$FinalMetricDataList,
  function(metric) {
    tibble(metric_name = metric[["MetricName"]],
           value = metric[["Value"]])
  }
)
final_metrics

In [None]:
# S3 location built from the bucket, project name, model name and the
# "models" prefix.
predictions_path <- paste0(
  "s3://",
  paste(bucket, project_name, model_name, "models", sep = "/")
)

predictions_path