# Result

In [None]:
library(tidyverse)
library(repr)
library(tidymodels)
library(GGally)
library(ISLR)
options(repr.matrix.max.rows = 6)
source("cleanup.R")

In [None]:
# Location of the raw player CSV (hosted on Google Drive as a direct download).
URL <- "https://drive.google.com/uc?export=download&id=1w_vUI6QgOW2d9bF07o1XM4MAaSF3dpea"

# Read the CSV straight from the URL into a tibble and preview it.
player_data <- URL |>
  read_csv()
player_data

In [None]:
# Build the analysis table:
#   * keep only the columns used in the analysis,
#   * drop rows whose played_hours is 0 and rows with a missing Age,
#   * convert gender and experience to factors,
#   * drop rows whose gender is "Agender".
# The steps run in this order on purpose: as_factor() takes its levels from
# the rows that are still present when it is called.
player_data_filtered <- player_data |>
  select(experience, subscribe, played_hours, gender, Age) |>
  filter(played_hours != 0) |>
  drop_na(Age) |>
  mutate(across(c(gender, experience), as_factor)) |>
  filter(gender != "Agender")

player_data_filtered

In [None]:
# initial_split() assigns rows at random, so fix the RNG seed first; without it
# the train/test partition (and every metric computed from it) changes on each
# run of the notebook.
set.seed(2024)

# Hold out 25% of the rows for testing, stratifying on played_hours so both
# partitions have a similar distribution of the response.
players_split <- initial_split(
  player_data_filtered,
  prop = 0.75,
  strata = played_hours
)
training_players <- training(players_split)
testing_players <- testing(players_split)

# Preview both partitions.
training_players
testing_players

The code above splits the dataset into training and testing sets: 75% of the rows go to the training set and 25% to the testing set. Setting `strata = played_hours` ensures the distribution of played_hours is similar in both the training and testing sets.

In [None]:
# Model specification: ordinary least-squares linear regression via lm.
players_spec <- linear_reg() |>
  set_mode("regression") |>
  set_engine("lm")

# Recipe: played_hours is the response, Age is the only predictor.
players_recipe <- recipe(played_hours ~ Age, data = training_players)

# Bundle the recipe and model into a workflow and fit it on the training set.
players_fit <- workflow() |>
  add_model(players_spec) |>
  add_recipe(players_recipe) |>
  fit(training_players)
players_fit

This code builds a linear regression model to predict how many hours someone plays based on their age.

In [None]:
# Evaluate the fitted Age model on the held-out test set: predict, attach the
# predictions to the test rows, then compute the regression metrics comparing
# the observed played_hours against the predictions in .pred.
players_test_result <- predict(players_fit, testing_players) |>
  bind_cols(testing_players) |>
  metrics(truth = played_hours, estimate = .pred)

players_test_result

In [None]:
# Youngest and oldest ages present in the filtered data.
players_age_min <- player_data_filtered |>
  pull(Age) |>
  min()
players_age_max <- player_data_filtered |>
  pull(Age) |>
  max()

# Two-row tibble (min age, max age) whose single column is named Age, matching
# the predictor name used in the Age model's recipe.
players_hrs_prediction <- tibble(Age = c(players_age_min, players_age_max))

players_hrs_prediction

The code above finds the minimum and maximum age in our filtered dataset and stores them in a tibble.

In [None]:
# Predicted played_hours at the youngest and oldest ages. A straight
# regression line is fully determined by these two end points, so this is all
# geom_line() needs. (players_preds was previously referenced by the plot but
# never created.)
players_preds <- players_fit |>
  predict(players_hrs_prediction) |>
  bind_cols(players_hrs_prediction)

# Scatter plot of the observed data with the fitted regression line overlaid
# in blue.
age_plot <- ggplot(player_data_filtered, aes(x = Age, y = played_hours)) +
  geom_point(alpha = 0.4) +
  geom_line(
    data = players_preds,
    mapping = aes(x = Age, y = .pred),
    color = "blue"
  ) +
  xlab("Age (in Years)") +
  ylab("Hours Played (in hours)") +
  ggtitle("The Linear Regression of Played Hours and Age") +
  theme(text = element_text(size = 20))
age_plot

### Gender

In [None]:
# initial_split() assigns rows at random, so fix the RNG seed first; without it
# the train/test partition used for the gender model changes on each run.
set.seed(2024)

# Hold out 25% of the rows for testing, stratifying on played_hours so both
# partitions have a similar distribution of the response.
players_split <- initial_split(
  player_data_filtered,
  prop = 0.75,
  strata = played_hours
)
training_players <- training(players_split)
testing_players <- testing(players_split)

# Preview both partitions.
training_players
testing_players

In [None]:
# Model specification: ordinary least-squares linear regression via lm.
players_spec <- linear_reg() |>
  set_mode("regression") |>
  set_engine("lm")

# Recipe: played_hours is the response, gender (a factor) is the only predictor.
players_recipe <- recipe(played_hours ~ gender, data = training_players)

# Bundle the recipe and model into a workflow and fit it on the training set.
players_fit <- workflow() |>
  add_model(players_spec) |>
  add_recipe(players_recipe) |>
  fit(training_players)
players_fit

In [None]:
# Predict played_hours for the test rows with the gender model and attach the
# predictions (.pred) to the original test columns for plotting.
players_test_result <- predict(players_fit, testing_players) |>
  bind_cols(testing_players)
players_test_result

In [None]:
# Observed played_hours per gender on the test set, with the gender model's
# predictions overlaid in blue.
gender_plot <- ggplot(players_test_result, aes(x = gender, y = played_hours)) +
  geom_point(alpha = 0.4) +
  # gender is discrete, so ggplot2 would otherwise place each level in its own
  # group and geom_line() would never connect the predictions across genders
  # (nothing visible is drawn). group = 1 joins them into a single line.
  geom_line(
    aes(y = .pred, group = 1),
    color = "blue"
  ) +
  xlab("gender") +
  ylab("Hours Played (in hours)") +
  ggtitle("The Linear Regression of Played Hours and gender") +
  theme(text = element_text(size = 20))
gender_plot