In [5]:
library(tidyverse)
library(progress)
library(readr)
library(dplyr)
library(glue)


In [6]:
# Define the base path
base_path <- "/Users/jimmyhe/Desktop/ML/KaggleCompetitions/NCAA/DataPreparation/CompData/march-machine-learning-mania-2024/"

# Read the data using the base path
result_tourney_mens <- read_csv(glue("{base_path}MNCAATourneyCompactResults.csv"))
result_regular_mens <- read_csv(glue("{base_path}MRegularSeasonCompactResults.csv"))
result_tourney_2nd_mens <- read_csv(glue("{base_path}MSecondaryTourneyCompactResults.csv")) %>% 
  select(-SecondaryTourney)
teams_mens <- read_csv(glue("{base_path}MTeams.csv"))

result_tourney_womens <- read_csv(glue("{base_path}WNCAATourneyCompactResults.csv"))
result_regular_womens <- read_csv(glue("{base_path}WRegularSeasonCompactResults.csv"))
teams_womens <- read_csv(glue("{base_path}WTeams.csv"))


[1mRows: [22m[34m2451[39m [1mColumns: [22m[34m8[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","
[31mchr[39m (1): WLoc
[32mdbl[39m (7): Season, DayNum, WTeamID, WScore, LTeamID, LScore, NumOT

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.
[1mRows: [22m[34m187289[39m [1mColumns: [22m[34m8[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","
[31mchr[39m (1): WLoc
[32mdbl[39m (7): Season, DayNum, WTeamID, WScore, LTeamID, LScore, NumOT

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.
[1mRows: [22m[34m1756[39m [1mColumns: [22m[34m9[39m
[

In [7]:
result_regular_mens <- result_regular_mens %>% mutate(type = "regular")
result_tourney_mens <- result_tourney_mens %>% mutate(type = "tourney")
result_tourney_2nd_mens <- result_tourney_2nd_mens %>% mutate(type = "tourney_2nd")

result_merged_mens <- bind_rows(result_regular_mens, result_tourney_mens, result_tourney_2nd_mens) %>% 
  arrange(Season, DayNum) %>% 
  mutate(
    WRating_before = 50,
    LRating_before = 50,
    WRating_after = 50,
    LRating_after = 50
  )

result_regular_womens <- result_regular_womens %>% mutate(type = "regular")
result_tourney_womens <- result_tourney_womens %>% mutate(type = "tourney")

result_merged_womens <- bind_rows(result_regular_womens, result_tourney_womens) %>% 
  arrange(Season, DayNum) %>% 
  mutate(
    WRating_before = 50,
    LRating_before = 50,
    WRating_after = 50,
    LRating_after = 50
  )

In [8]:
update_elo_rating_mens <- function(df, teams, K) {
  
  n <- nrow(df)
  pb <- progress_bar$new(total = n)
  
  for (i in 1:n) {
    
    WTeamID = df[i,3] %>% as.numeric()
    LTeamID = df[i,5] %>% as.numeric()
    
    WTeamRating_before <- teams %>% filter(TeamID == WTeamID) %>% select(Rating) %>% as.numeric()
    LTeamRating_before <- teams %>% filter(TeamID == LTeamID) %>% select(Rating) %>% as.numeric()
    WTeam_num <- teams %>% filter(TeamID == WTeamID) %>% select(num) %>% as.numeric()
    LTeam_num <- teams %>% filter(TeamID == LTeamID) %>% select(num) %>% as.numeric()
    
    WTeamRating_after <- WTeamRating_before + K*(1/(10^((WTeamRating_before-LTeamRating_before)/10)+1))
    LTeamRating_after <- LTeamRating_before - K*(1/(10^((WTeamRating_before-LTeamRating_before)/10)+1))
    
    df[i,10] <- WTeamRating_before
    df[i,11] <- LTeamRating_before
    df[i,12] <- WTeamRating_after
    df[i,13] <- LTeamRating_after
    
    teams[WTeam_num,5] <- WTeamRating_after
    teams[LTeam_num,5] <- LTeamRating_after
    
    pb$tick()
  }
  
  return(df)
}

update_elo_rating_womens <- function(df, teams, K) {
  
  n <- nrow(df)
  pb <- progress_bar$new(total = n)
  
  for (i in 1:n) {
    
    WTeamID = df[i,3] %>% as.numeric()
    LTeamID = df[i,5] %>% as.numeric()
    
    WTeamRating_before <- teams %>% filter(TeamID == WTeamID) %>% select(Rating) %>% as.numeric()
    LTeamRating_before <- teams %>% filter(TeamID == LTeamID) %>% select(Rating) %>% as.numeric()
    WTeam_num <- teams %>% filter(TeamID == WTeamID) %>% select(num) %>% as.numeric()
    LTeam_num <- teams %>% filter(TeamID == LTeamID) %>% select(num) %>% as.numeric()
    
    WTeamRating_after <- WTeamRating_before + K*(1/(10^((WTeamRating_before-LTeamRating_before)/10)+1))
    LTeamRating_after <- LTeamRating_before - K*(1/(10^((WTeamRating_before-LTeamRating_before)/10)+1))
    
    df[i,10] <- WTeamRating_before
    df[i,11] <- LTeamRating_before
    df[i,12] <- WTeamRating_after
    df[i,13] <- LTeamRating_after
    
    teams[WTeam_num,3] <- WTeamRating_after
    teams[LTeam_num,3] <- LTeamRating_after
    
    pb$tick()
  }
    
  return(df)
}

In [9]:
K <- 1

teams_add_mens <- teams_mens %>% mutate(Rating = 50, num = row_number())
teams_add_womens <- teams_womens %>% mutate(Rating = 50, num = row_number())

res_mens <- update_elo_rating_mens(result_merged_mens, teams_add_mens, K)
res_womens <- update_elo_rating_womens(result_merged_womens, teams_add_womens, K)

In [10]:
res_regular <- res_mens %>% filter(type == "regular")

tmp <- res_regular %>% select(Season, DayNum, WTeamID, WRating_before, WRating_after) %>% rename(TeamID = WTeamID, Rating_before = WRating_before, Rating_after = WRating_after)
tmp2 <- res_regular %>% select(Season, DayNum, LTeamID, LRating_before, LRating_after) %>% rename(TeamID = LTeamID, Rating_before = LRating_before, Rating_after = LRating_after)

tmp3 <- bind_rows(tmp, tmp2)

tmp4 <- tmp3 %>% group_by(Season, TeamID) %>% summarise(DayNum = max(DayNum))

elo_mens <- tmp3 %>% 
  inner_join(tmp4, by = c("Season", "TeamID", "DayNum")) %>% 
  filter(Season >= 2010) %>% 
  select(TeamID, Rating_after, Season) %>% 
  rename(Rating = Rating_after) %>% 
  left_join(teams_mens, by = c("TeamID")) %>% 
  select(TeamID, TeamName, Season, Rating)

[1m[22m`summarise()` has grouped output by 'Season'. You can override using the
`.groups` argument.


In [16]:
res_regular <- res_womens %>% filter(type == "regular")

tmp <- res_regular %>% select(Season, DayNum, WTeamID, WRating_before, WRating_after) %>% rename(TeamID = WTeamID, Rating_before = WRating_before, Rating_after = WRating_after)
tmp2 <- res_regular %>% select(Season, DayNum, LTeamID, LRating_before, LRating_after) %>% rename(TeamID = LTeamID, Rating_before = LRating_before, Rating_after = LRating_after)

tmp3 <- bind_rows(tmp, tmp2)

tmp4 <- tmp3 %>% group_by(Season, TeamID) %>% summarise(DayNum = max(DayNum))

elo_womens <- tmp3 %>% 
  inner_join(tmp4, by = c("Season", "TeamID", "DayNum")) %>% 
  filter(Season >= 2010) %>% 
  select(TeamID, Rating_after, Season) %>% 
  rename(Rating = Rating_after) %>% 
  left_join(teams_womens, by = c("TeamID")) %>% 
  select(TeamID, TeamName,FirstD1Season,LastD1Season, Rating, num, Season, )

[1m[22m`summarise()` has grouped output by 'Season'. You can override using the
`.groups` argument.


ERROR: [1m[33mError[39m in `select()`:[22m
[33m![39m Can't select columns that don't exist.
[31m✖[39m Column `FirstD1Season` doesn't exist.


In [14]:
elo_mens_sorted <- elo_mens %>%
  arrange(desc(Season), TeamName)


elo_womens_sorted <- elo_womens %>%
  arrange(desc(Season), TeamName)


elo_mens_sorted %>% write_csv("Mine_EloRating_mens_10_sorted.csv")
elo_womens_sorted %>% write_csv("Mine_EloRating_womens_10_sorted.csv")

In [15]:
flaty_mens = read_csv('/Users/jimmyhe/Desktop/ML/KaggleCompetitions/NCAA/Feature_Eng/Flaty_Data_Ref/EloRating_mens_10.csv')
flaty_womens = read_csv('/Users/jimmyhe/Desktop/ML/KaggleCompetitions/NCAA/Feature_Eng/Flaty_Data_Ref/EloRating_womens_10.csv')


flaty_mens_sorted <- flaty_mens %>%
  arrange(desc(Season), TeamName)


flaty_womens_sorted <- flaty_womens %>%
  arrange(desc(Season), TeamName)


flaty_mens_sorted %>% write_csv("flaty_mens_sorted.csv")
flaty_womens_sorted %>% write_csv("flaty_womens_sorted.csv")

[1mRows: [22m[34m5670[39m [1mColumns: [22m[34m7[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","
[31mchr[39m (1): TeamName
[32mdbl[39m (6): TeamID, FirstD1Season, LastD1Season, Rating, num, Season

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.
[1mRows: [22m[34m5640[39m [1mColumns: [22m[34m5[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","
[31mchr[39m (1): TeamName
[32mdbl[39m (4): TeamID, Rating, num, Season

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.
