# Individual Project Proposal - Yolanda Peng

In [None]:
# loading libraries
library(tidyverse)
library(repr)
library(tidymodels)
options(repr.matrix.max.rows = 6)

# PART 1: Data Description

In [None]:
# Read the two project CSV files into data frames with read_csv().
# The paths are relative, so the notebook must be run from the directory
# that contains the data/ folder.
players_data <- read_csv("data/players.csv")
sessions_data <- read_csv("data/sessions.csv")

# Display both tables; the number of rows shown is limited by the
# repr.matrix.max.rows option set in the library-loading cell.
players_data
sessions_data

### Explaining the datasets
The observations in the above datasets were collected through a Minecraft server set up by a research group in Computer Science at UBC, led by Frank Wood, that is studying how individuals play video games. The first dataset lists all unique players, along with information about each player. The second dataset lists the individual play sessions of each player, along with information about each session. The server was run through PLAICraft, where players' actions and information were recorded as they navigated through the world.

In [None]:
# Column-by-column overview of each table via base R's summary().
# The results are stored so they can be reused, then displayed.

print("players.csv summary statistics")
players_summary <- players_data |> summary()
players_summary

print("sessions.csv summary statistics")
sessions_summary <- sessions_data |> summary()
sessions_summary


In [None]:
print("**players.csv summary statistics rounded**")

# played_hours: mean, median and range, each rounded to 2 decimal places.
# No na.rm is used here, so a missing value in the column would surface as NA.
players_mean_hours <- with(players_data, round(mean(played_hours), 2))
players_median_hours <- with(players_data, round(median(played_hours), 2))
players_min_hours <- with(players_data, round(min(played_hours), 2))
players_max_hours <- with(players_data, round(max(played_hours), 2))

print(paste("mean time played (hr): ", players_mean_hours))
print(paste("median time played (hr): ", players_median_hours))
print(paste("max time played (hr): ", players_max_hours))
print(paste("min time played (hr): ", players_min_hours))

# Age: same four statistics, skipping missing ages (na.rm = TRUE).
players_mean_age <- with(players_data, round(mean(Age, na.rm = TRUE), 2))
players_median_age <- with(players_data, round(median(Age, na.rm = TRUE), 2))
players_min_age <- with(players_data, round(min(Age, na.rm = TRUE), 2))
players_max_age <- with(players_data, round(max(Age, na.rm = TRUE), 2))

print(paste("mean age (year): ", players_mean_age))
print(paste("median age (year): ", players_median_age))
print(paste("max age (year): ", players_max_age))
print(paste("min age (year): ", players_min_age))

### Descriptive Summary
#### `players.csv` Dataset
- There are a total of **196 observations** included in this dataset (corresponding to the number of rows)
- There are a total of **7 variables** (corresponding to the number of columns): 
    - `experience` - How much experience a user has with the game; 5 categories: `Veteran`, `Pro`, `Regular`, `Beginner`, `Amateur`
    - `subscribe` - Whether the player is subscribed to a game-related newsletter, can be `TRUE` or `FALSE`
    - `hashedEmail` - The email of the player, hashed for privacy/anonymity reasons
    - `played_hours` - The amount of time the player spent playing the game in hours
    - `name` - The provided name of the player
    - `gender` - The gender of the player, specified by the player
    - `Age` - The age of the player in years, provided by the player
<br>
<br>

 - Variable types:
    - `experience`, `hashedEmail`, `name`, `gender` have type `<chr>`
    - `played_hours`, `Age` have type `<dbl>`
    - `subscribe` has type `<lgl>`

In [None]:
print("**sessions.csv summary statistics rounded**")

# NOTE(review): the printed labels say "(s)", but nothing in this notebook
# establishes the units of original_start_time / original_end_time - confirm.

# original_start_time: mean, median and range, rounded to 2 decimal places.
# No na.rm is used here, so a missing start time would surface as NA.
mean_start_time <- with(sessions_data, round(mean(original_start_time), 2))
median_start_time <- with(sessions_data, round(median(original_start_time), 2))
min_start_time <- with(sessions_data, round(min(original_start_time), 2))
max_start_time <- with(sessions_data, round(max(original_start_time), 2))

print(paste("mean start time (s): ", mean_start_time))
print(paste("median start time (s): ", median_start_time))
print(paste("max start time (s): ", max_start_time))
print(paste("min start time (s): ", min_start_time))

# original_end_time: same four statistics, skipping missing end times.
mean_end_time <- with(
  sessions_data,
  round(mean(original_end_time, na.rm = TRUE), 2)
)
median_end_time <- with(
  sessions_data,
  round(median(original_end_time, na.rm = TRUE), 2)
)
min_end_time <- with(
  sessions_data,
  round(min(original_end_time, na.rm = TRUE), 2)
)
max_end_time <- with(
  sessions_data,
  round(max(original_end_time, na.rm = TRUE), 2)
)

print(paste("mean end time (s): ", mean_end_time))
print(paste("median end time (s): ", median_end_time))
# Label corrected from "max aend time" (typo in the printed output).
print(paste("max end time (s): ", max_end_time))
print(paste("min end time (s): ", min_end_time))

#### `sessions.csv` Dataset 
- There are a total of **1535 observations** included in this dataset (corresponding to the number of rows)
- There are a total of **5 variables** (corresponding to the number of columns): 
    - `hashedEmail` - The email of the player, hashed for privacy/anonymity reasons
    - `start_time` - The start time of the session in Day/Month/Year, 24h time format
    - `end_time` - The end time of the session in Day/Month/Year, 24h time format
    - `original_start_time` - The start time before being converted into Day/Month/Year, 24h time format
    - `original_end_time` - The end time before being converted into Day/Month/Year, 24h time format
<br>
<br>

 - Variable types:
    - `hashedEmail`, `start_time`, `end_time` have type `<chr>`
    - `original_start_time`, `original_end_time` have type `<dbl>`

### Issues with the Data
One issue I see in the given data is that it is unclear how exactly the variables `original_start_time` and `original_end_time` are represented and what units they take on. I also believe that it would be helpful to include a variable representing the time between the start and end times of a session, in order to make it easier to find the summary statistics for the length of each session in addition to the start and end times. Additionally, the values of the `hashedEmail` variable are lengthy and not human-readable, which could potentially make it difficult to compare/relate the data from the sessions dataset with the players dataset.

# PART 2: Questions

The broad question I am looking to address is **which "kinds" of players are most likely to contribute a large amount of data so that we can target those players in our recruiting efforts.** The specific question I will address is: **Can `played_hours` and `Age` predict `experience` in the `players.csv` dataset?** The `players.csv` dataset already includes the necessary explanatory and predictive variables needed to answer the question. By plotting the `played_hours` and `Age` variables against each other and coloring the points by class, we may be able to predict which "kind" a player is based on how much they play, and thus we can figure out which "kinds" of players are most likely to have greater play times and thus contribute a larger amount of data. It may be necessary to convert the `experience` and `gender` variables to a categorical type rather than `<chr>`.

# PART 3: Exploratory Data Analysis and Visualization 

In [None]:
# wrangling players.csv
# experience and gender are converted from character to factor so they can be
# treated as categorical variables (as_factor keeps levels in order of first
# appearance).
players_data_wrangled <- players_data |>
  as_tibble() |>
  mutate(experience = as_factor(experience), gender = as_factor(gender))
players_data_wrangled

# wrangling sessions.csv
# start_time / end_time are read as text in Day/Month/Year, 24h-clock form
# (see the dataset description), which as_datetime()'s ISO-style parsing does
# not handle. They are parsed with an explicit day-month-year order instead.
# NOTE(review): assumes the strings carry hours and minutes but no seconds -
# confirm against the CSV; use lubridate::dmy_hms() if seconds are present.
sessions_data_wrangled <- sessions_data |>
  as_tibble() |>
  mutate(
    start_time = lubridate::dmy_hm(start_time),
    end_time = lubridate::dmy_hm(end_time)
  )
sessions_data_wrangled

# creating players.csv mean value table
# Reuses the rounded means computed in the players summary cell; c() already
# yields a plain numeric vector, so no further unlisting is needed.
players_mean_table <- tibble(
  variable = c("played_hours", "Age"),
  mean = c(players_mean_hours, players_mean_age)
)
players_mean_table

In [None]:
# creating exploratory visuals
# Set the rendered size of all subsequent plots in the notebook.
options(repr.plot.width = 12, repr.plot.height = 8)

# Scatter plot of age against hours played, one point per player, coloured by
# the player's experience level.
age_vs_hours_plot <- ggplot(
  players_data_wrangled,
  aes(x = Age, y = played_hours, color = experience)
) +
  geom_point() +
  labs(
    x = "age of the player (in years)",
    y = "total playtime (in hours)",
    color = "levels of player experience",
    title = "age vs. playtime"
  ) +
  theme(text = element_text(size = 20))
age_vs_hours_plot

# Unused alternative kept for reference: number of players per experience level.
# players_data_count <- players_data_wrangled |>
#                     group_by(experience) |>
#                     summarise(count = n())

# Bar chart of hours played by experience level. With stat = "identity" each
# player's hours are stacked, so a bar's height is the sum for that level.
playtime_bar_plot <- ggplot(
  players_data_wrangled,
  aes(x = experience, y = played_hours)
) +
  geom_bar(stat = "identity") +
  labs(
    x = "type of player experience",
    y = "total playtime (in hours)",
    title = "playtime vs. player experience"
  ) +
  theme(text = element_text(size = 20))
playtime_bar_plot

### Insig