<a href="https://colab.research.google.com/gist/DenizGuelcicek/d9ae1654ab0e30459199c11f7b177c2e/playingaround.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [10]:
# Load packages
library(tidyverse)
library(ggplot2)
library(googledrive)

In [None]:
# Install googledrive only when it is not already available, so re-running
# this cell does not trigger a fresh download and install every time.
if (!requireNamespace("googledrive", quietly = TRUE)) {
  install.packages("googledrive")
}


In [None]:
# Load the data
#
# Downloads one play-by-play CSV and one games CSV per regular season from the
# nflscrapR-data repository, attaches `week` and `season` to every play, and
# stacks all seasons into a single data frame, `pbp_all`.

# Time period 2009 - 2019
first <- 2009 # first season to grab; minimum available is 2009
last <- 2019 # most recent season

pbp_url_prefix <- paste0(
  "https://github.com/ryurko/nflscrapR-data/raw/master/",
  "play_by_play_data/regular_season/reg_pbp_"
)
games_url_prefix <- paste0(
  "https://raw.githubusercontent.com/ryurko/nflscrapR-data/master/",
  "games_data/regular_season/reg_games_"
)

# Columns removed from every season before stacking.
# NOTE(review): presumably dropped because their types differ between seasons
# and would break bind_rows() -- confirm.
pbp_drop_cols <- c(
  "fumble_recovery_2_yards",
  "blocked_player_id",
  "fumble_recovery_2_player_id"
)

# One list slot per season. Indexing by position (not by the year itself)
# avoids creating a 2019-element list that is mostly NULL.
seasons <- first:last
datalist <- vector("list", length(seasons))

for (i in seq_along(seasons)) {
  yr <- seasons[i]

  # read_csv() downloads http(s) paths itself; no url() connection is needed.
  pbp <- read_csv(paste0(pbp_url_prefix, yr, ".csv"))
  games <- read_csv(paste0(games_url_prefix, yr, ".csv"))

  season_info <- games %>% distinct(game_id, week, season)

  # Same keys a natural join would pick (all shared column names), spelled
  # out explicitly so the join is silent and its keys are visible.
  join_keys <- intersect(names(pbp), names(season_info))

  pbp <- pbp %>%
    inner_join(season_info, by = join_keys) %>%
    # any_of() tolerates a season file in which one of these columns is absent.
    select(-any_of(pbp_drop_cols))

  datalist[[i]] <- pbp
}

pbp_all <- dplyr::bind_rows(datalist)

In [None]:
# Check the team name consistency over periods: for each home-team
# abbreviation, count its plays and distinct seasons and show the first and
# last season it appears in. Abbreviations seen in the fewest seasons sort
# to the top.
pbp_all %>%
  group_by(home_team) %>%
  summarize(
    n = n(),
    seasons = n_distinct(season),
    minyr = min(season),
    maxyr = max(season)
  ) %>%
  arrange(seasons)

In [18]:
# Rename inconsistent team names
#
# Maps the abbreviations "JAX", "STL" and "SD" to "JAC", "LA" and "LAC" in all
# four team columns; every other value (including NA) is left unchanged.
# Uses across() with a lambda instead of mutate_at() + funs(), because funs()
# is deprecated in dplyr.
pbp_all <- pbp_all %>%
  mutate(across(
    c(home_team, away_team, posteam, defteam),
    ~ case_when(
      .x %in% "JAX" ~ "JAC",
      .x %in% "STL" ~ "LA",
      .x %in% "SD" ~ "LAC",
      TRUE ~ .x
    )
  ))

In [21]:
# Save the dataframe to "NFLdata.rds" in the working directory, then read it
# straight back into `pbp_all`.
saveRDS(object = pbp_all, file = "NFLdata.rds")
pbp_all <- readRDS(file = "NFLdata.rds")

In [None]:
# Clean up
#
# Builds `pbp_all_rp`: the run and pass plays from `pbp_all`, with pass/rush/
# success indicator columns and with player names recovered from the play
# description for penalty ("no_play") rows.

# Player name as it is matched in `desc`: initial(s), a period, an optional
# space, a capitalised surname, and an optional " II" / " III" / " IV" suffix.
# The suffix alternation is grouped so the leading space applies to "IV" too.
pbp_name_rx <- "[A-Z][a-z]*\\.\\s?[A-Z][A-Za-z]+(\\s(I{2,3}|IV))?"

# Run-direction phrases that identify a rushing play in `desc`. Built without
# any embedded whitespace so every alternative, including "up the middle",
# can actually match.
pbp_run_gap_rx <- paste(
  c(
    "left end", "left tackle", "left guard", "up the middle",
    "right guard", "right tackle", "right end"
  ),
  collapse = "|"
)

pbp_all_rp <- pbp_all %>%
  # Keep plays that have an EPA value and a possession team, and that are
  # pass plays, run plays, or penalties ("no_play").
  filter(
    !is.na(epa),
    !is.na(posteam),
    play_type %in% c("no_play", "pass", "run")
  ) %>%
  mutate(
    # pass: 1 when `desc` mentions " pass", "sacked", or "scramble".
    pass = if_else(str_detect(desc, "( pass)|(sacked)|(scramble)"), 1, 0),
    # rush: 1 when `desc` names a run direction and the play is not a pass.
    rush = if_else(str_detect(desc, pbp_run_gap_rx) & pass == 0, 1, 0),
    # success: simple definition of a successful play, positive EPA.
    success = ifelse(epa > 0, 1, 0),

    # Passer name: for penalty rows flagged as passes, take the name that
    # directly precedes "pass" / "sack" / "scramble" in `desc`; otherwise keep
    # the existing column value.
    passer_player_name = ifelse(
      play_type == "no_play" & pass == 1,
      str_extract(
        desc,
        paste0("(?<=\\s)", pbp_name_rx, "(?=\\s(pass|sack|scramble))")
      ),
      passer_player_name
    ),
    # Receiver name: for penalty rows whose `desc` mentions "pass", take the
    # name that follows "to "; otherwise keep the existing column value.
    receiver_player_name = ifelse(
      play_type == "no_play" & str_detect(desc, "pass"),
      str_extract(desc, paste0("(?<=to\\s)", pbp_name_rx)),
      receiver_player_name
    ),
    # Rusher name: for penalty rows flagged as rushes, take the name that
    # directly precedes the run-direction phrase; otherwise keep the existing
    # column value.
    rusher_player_name = ifelse(
      play_type == "no_play" & rush == 1,
      str_extract(
        desc,
        paste0("(?<=\\s)", pbp_name_rx, "(?=\\s(", pbp_run_gap_rx, "))")
      ),
      rusher_player_name
    ),
    # name: the passer when there is one, otherwise the rusher.
    name = ifelse(
      !is.na(passer_player_name),
      passer_player_name,
      rusher_player_name
    ),

    # Yardage is blanked on penalty rows.
    yards_gained = ifelse(play_type == "no_play", NA, yards_gained),
    play = 1
  ) %>%
  filter(pass == 1 | rush == 1) # keep only the run or pass plays

In [None]:
# Show the plays whose rusher is recorded as "C.Newton", together with a few
# context columns for each play.
pbp_all_rp %>%
  filter(rusher_player_name == "C.Newton") %>%
  select(
    desc, yardline_100, home_team, defteam,
    play_type, down, season, week
  )