## Title: (just an example) "Target Profile of Players Who Contribute the Most Data"
### Introduction:
* provide some relevant background information on the topic so that someone unfamiliar with it will be prepared to understand the rest of your report
* clearly state the question you tried to answer with your project
* identify and fully describe the dataset that was used to answer the question

Analyzing data from Minecraft users...
QUESTION 2.

### Methods & Results:
* describe the methods you used to perform your analysis from beginning to end, narrating the analysis code.
* your report should include code which:
    * loads data 
    * wrangles and cleans the data to the format necessary for the planned analysis
    * performs a summary of the data set that is relevant for exploratory data analysis related to the planned analysis 
    * creates a visualization of the dataset that is relevant for exploratory data analysis related to the planned analysis
    * performs the data analysis
    * creates a visualization of the analysis
       
* note: all figures should have a figure number and a legend
### Discussion:
* summarize what you found
* discuss whether this is what you expected to find
* discuss what impact such findings could have
* discuss what future questions this could lead to
### References
* You may include references if necessary, as long as they all have a consistent citation style.

In [None]:
### Run this cell before continuing.
library(tidyverse)
library(tidymodels)
library(tidyclust)
library(repr)
library(GGally)
library(cowplot)

#### Read the Datasets from URLs

In [None]:
# Load the players table from the project repository (remote CSV).
players_data <- read_csv(
  file = "https://raw.githubusercontent.com/wojpc/dsci100-project-008-group09/refs/heads/main/Data/players.csv"
)

# Load the sessions table from the same repository.
sessions_data <- read_csv(
  file = "https://raw.githubusercontent.com/wojpc/dsci100-project-008-group09/refs/heads/main/Data/sessions.csv"
)

In [None]:
# Preview the first rows of each table to check column names and parsed types.
players_data |> head()
sessions_data |> head()

#### Left Join Data on Hashed Email and Wrangle Time
The sessions data is first combined with the player information by joining on hashedEmail. The start and end times are then converted from strings to date-time values, and a new column called session_time_elapsed is added to hold the elapsed time of each session. The original_start_time and original_end_time columns are not needed for this analysis, so they are removed to keep the table tidy and less redundant.

In [None]:
# Attach player attributes to every session row. hashedEmail is the column
# shared by both tables; a left join keeps every session even when no player
# row matches.
sessions_players_joined <- sessions_data |>
  left_join(players_data, by = "hashedEmail")

# Parse the "dd/mm/YYYY HH:MM" strings into POSIXct and compute the length of
# each session in minutes.
# difftime() is called with explicit units on purpose: `end_time - start_time`
# chooses its unit automatically from the smallest difference in the column
# (seconds as soon as one session is shorter than a minute), which would
# silently change what session_time_elapsed means.
sessions_players_elapsed <- sessions_players_joined |>
  mutate(
    end_time = as.POSIXct(end_time, format = "%d/%m/%Y %H:%M"),
    start_time = as.POSIXct(start_time, format = "%d/%m/%Y %H:%M"),
    session_time_elapsed = as.numeric(
      difftime(end_time, start_time, units = "mins")
    )
  ) |>
  # The raw original_* time columns are not used once the parsed times exist.
  select(-original_start_time, -original_end_time)

head(sessions_players_elapsed)

#### Preliminary Visualizations

In [None]:
options(repr.plot.width = 7, repr.plot.height = 7)

# Age vs. total hours played. Players with zero hours are dropped because the
# y axis uses a log10 scale.
players_data1 <- filter(players_data, played_hours > 0)
age_vs_hours <- ggplot(players_data1, aes(x = Age, y = played_hours)) +
  geom_point() +
  labs(x = "Age (yrs)",
       y = "Total Hours Played (hours)") +
  scale_y_log10() +
  theme_bw()

# Age vs. length of each individual session (sessions with a positive length).
sessions_players_elapsed1 <- filter(sessions_players_elapsed,
                                    session_time_elapsed > 0)
age_vs_sesstime <- ggplot(sessions_players_elapsed1,
                          aes(x = Age, y = session_time_elapsed)) +
  geom_point() +
  labs(x = "Age (yrs)",
       y = "Total Session Time (minutes)") +
  theme_bw()

# Hours played per experience level, stacked and coloured by gender.
players_data_col <- select(players_data, experience, played_hours, gender)
experience_playtime <- ggplot(players_data_col, aes(x = experience,
                                                    y = played_hours,
                                                    color = gender,
                                                    fill = gender)) +
  geom_col() +
  labs(x = "Experience",
       y = "Total Hours Played",
       fill = "Gender") +
  scale_fill_brewer(palette = "Set1") +
  scale_color_brewer(palette = "Set1") +
  # The outline colour duplicates the fill, so only the fill legend is shown.
  guides(color = "none") +
  theme_bw()

# Pairwise plots of the numeric variables.
# theme_bw() is a complete theme and replaces every earlier theme setting, so
# it must be added BEFORE the text-size tweak for the larger font to apply.
player_pairs <- sessions_players_elapsed |>
  select(played_hours, Age, session_time_elapsed) |>
  ggpairs(aes(alpha = 0.05)) +
  theme_bw() +
  theme(text = element_text(size = 20))

# Isolate the times when playing starts and ignore dates.
# Re-parsing only "HH:MM" places every session on one common date, so the
# start times can share a single 24-hour axis. feature_sin / feature_cos encode
# the hour of day (minutes are ignored) as a point on the unit circle.
starting_times <- sessions_players_elapsed |>
  mutate(
    start_times = as.POSIXct(format(start_time, "%H:%M"),
                             format = "%H:%M", tz = "UTC"),
    feature_sin = sin(2 * pi * hour(start_times) / 24),
    feature_cos = cos(2 * pi * hour(start_times) / 24)
  )

# Distribution of session start times over the day.
session_time_plot2 <- starting_times |>
  ggplot(aes(x = start_times)) +
  geom_histogram() +
  labs(x = "Session Start Time (HH:MM)",
       y = "Count",
       title = "Histogram 1.5. Distribution of Starting Times for Sessions") +
  scale_x_datetime(date_labels = "%H:%M",
                   date_breaks = "1 hour") +
  theme_bw() +
  theme(text = element_text(size = 15),
        plot.title = element_text(hjust = 0.5),
        axis.text.x = element_text(angle = -45,
                                   vjust = 0.1))

# Session length against the time of day at which the session started.
start_vs_sesslen <- ggplot(starting_times,
                           aes(x = start_times, y = session_time_elapsed)) +
  geom_point() +
  labs(x = "Session Start Time (HH:MM)",
       y = "Sess time (minutes)") +
  scale_x_datetime(date_labels = "%H:%M",
                   date_breaks = "1 hour") +
  theme_bw() +
  theme(text = element_text(size = 15),
        plot.title = element_text(hjust = 0.5),
        axis.text.x = element_text(angle = -45,
                                   vjust = 0.1))

# Session length against the cyclic encodings of the start hour.
# The y axis is session_time_elapsed, so it is labelled as session time
# (it was previously mislabelled as total hours played).
sintime_vs_sesslen <- ggplot(starting_times,
                             aes(x = feature_sin, y = session_time_elapsed)) +
  geom_point() +
  labs(x = "Sine Time",
       y = "Session Time (minutes)")

costime_vs_sesslen <- ggplot(starting_times,
                             aes(x = feature_cos, y = session_time_elapsed)) +
  geom_point() +
  labs(x = "Cos Time",
       y = "Session Time (minutes)")

# Display the exploratory figures (the sine/cosine plots are built above but
# not displayed here).
session_time_plot2
start_vs_sesslen
experience_playtime
age_vs_hours
age_vs_sesstime
player_pairs

#### K-Means Clustering for Total Hours Played

In [None]:
# Set seed for consistent reproduction (k-means uses random starting centres).
set.seed(888)

# Plot size
options(repr.plot.width = 12, repr.plot.height = 5)

## Tune for best k ##
# Keep only players with a known age and a positive number of played hours;
# the positive filter is required because played_hours is log-transformed.
total_played_clustdata <- players_data |>
  select(Age, played_hours) |>
  filter(played_hours > 0, !is.na(Age))

# Log-transform played_hours, then scale and centre both variables.
total_played_recipe <- recipe(~ ., total_played_clustdata) |>
  step_log(played_hours, base = 10) |>
  step_scale(all_predictors(), na_rm = TRUE) |>
  step_center(all_predictors(), na_rm = TRUE)

ks <- tibble(num_clusters = 1:10)

player_spec_nstart <- k_means(num_clusters = tune()) |>
  set_engine("stats", nstart = 100)

# Total within-cluster sum of squares for each candidate k, computed on the
# full data set (apparent() resamples the data as-is).
elbow_stats <- workflow() |>
  add_recipe(total_played_recipe) |>
  add_model(player_spec_nstart) |>
  tune_cluster(resamples = apparent(total_played_clustdata), grid = ks) |>
  collect_metrics() |>
  filter(.metric == "sse_within_total") |>
  mutate(total_WSSD = mean) |>
  select(num_clusters, total_WSSD)

# theme_bw() is a complete theme and would discard an earlier theme() call, so
# it is applied first and the text size afterwards.
elbow_plot <- elbow_stats |>
  ggplot(aes(x = num_clusters, y = total_WSSD)) +
  geom_point() +
  geom_line() +
  labs(x = "K",
       y = "Total within-cluster sum of squares") +
  scale_x_continuous(breaks = 1:10) +
  theme_bw() +
  theme(text = element_text(size = 15))

## Clustering with Optimal K ##
# Use the same nstart as during tuning so the final fit is the best of many
# random starts rather than the result of a single one.
total_played_spec <- k_means(num_clusters = 3) |>
  set_engine("stats", nstart = 100)

total_played_clustering <- workflow() |>
  add_recipe(total_played_recipe) |>
  add_model(total_played_spec) |>
  fit(data = total_played_clustdata)

# Adds the .pred_cluster column to the original (untransformed) data.
clustered_players <- augment(total_played_clustering, total_played_clustdata)

clustering_plot <- clustered_players |>
  ggplot(aes(x = Age, y = played_hours, color = .pred_cluster)) +
  geom_point() +
  labs(x = "Age",
       y = "Hours Played",
       color = "Cluster") +
  scale_y_log10() +
  theme_bw()

plot_grid(elbow_plot, clustering_plot, align = "hv")

#### K-Means Clustering for Session Times

In [None]:
# Set seed for consistent reproduction (k-means uses random starting centres).
set.seed(888)

# Plot size
options(repr.plot.width = 12, repr.plot.height = 5)

## Tune for best k ##
# One row per session; rows with a missing age or session length are dropped.
sess_clustdata <- sessions_players_elapsed |>
  select(Age, session_time_elapsed) |>
  filter(!is.na(Age), !is.na(session_time_elapsed))

# Scale and centre both variables before clustering.
sess_recipe <- recipe(~ ., sess_clustdata) |>
  step_scale(all_predictors(), na_rm = TRUE) |>
  step_center(all_predictors(), na_rm = TRUE)

ks <- tibble(num_clusters = 1:10)

sess_spec_nstart <- k_means(num_clusters = tune()) |>
  set_engine("stats", nstart = 100)

# Total within-cluster sum of squares for each candidate k, computed on the
# full data set (apparent() resamples the data as-is).
elbow_stats_sess <- workflow() |>
  add_recipe(sess_recipe) |>
  add_model(sess_spec_nstart) |>
  tune_cluster(resamples = apparent(sess_clustdata), grid = ks) |>
  collect_metrics() |>
  filter(.metric == "sse_within_total") |>
  mutate(total_WSSD = mean) |>
  select(num_clusters, total_WSSD)

# theme_bw() is a complete theme and would discard an earlier theme() call, so
# it is applied first and the text size afterwards.
elbow_plot_sess <- elbow_stats_sess |>
  ggplot(aes(x = num_clusters, y = total_WSSD)) +
  geom_point() +
  geom_line() +
  labs(x = "K",
       y = "Total within-cluster sum of squares") +
  scale_x_continuous(breaks = 1:10) +
  theme_bw() +
  theme(text = element_text(size = 15))

## Clustering with Optimal K ##
# Use the same nstart as during tuning so the final fit is the best of many
# random starts rather than the result of a single one.
sess_spec <- k_means(num_clusters = 3) |>
  set_engine("stats", nstart = 100)

sess_clustering <- workflow() |>
  add_recipe(sess_recipe) |>
  add_model(sess_spec) |>
  fit(data = sess_clustdata)

# Adds the .pred_cluster column to the original (untransformed) data.
clustered_sess <- augment(sess_clustering, sess_clustdata)

clustering_plot_sess <- clustered_sess |>
  ggplot(aes(x = Age, y = session_time_elapsed, color = .pred_cluster)) +
  geom_point() +
  labs(x = "Age",
       y = "Session Time (minutes)",
       color = "Cluster") +
  theme_bw()

plot_grid(elbow_plot_sess, clustering_plot_sess)

#### Clustering for Age, Session Time and Total Play Time

In [None]:
# Set seed for consistent reproduction (k-means uses random starting centres).
set.seed(888)

# Plot size
options(repr.plot.width = 8, repr.plot.height = 8)

## Tune for best k ##
# Average session length per player.
# Players are identified by hashedEmail, the same key used to join the two
# tables; grouping by name would merge different players who share a name.
avg_sesstime <- sessions_players_elapsed |>
  select(hashedEmail, Age, session_time_elapsed, played_hours) |>
  filter(!is.na(Age),
         !is.na(session_time_elapsed),
         !is.na(played_hours)) |>
  group_by(hashedEmail) |>
  summarize(avg_session_mins = mean(session_time_elapsed))

# One row per player with at least one usable session; the average session
# length is converted to hours (avg_session_mins / 60).
ast_clustdata <- left_join(players_data, avg_sesstime, by = "hashedEmail") |>
  filter(!is.na(avg_session_mins)) |>
  mutate(avg_session_hours = avg_session_mins / 60) |>
  select(Age, played_hours, avg_session_hours)

#ast_clustdata

# Scale and centre all three variables before clustering.
ast_recipe <- recipe(~ ., ast_clustdata) |>
  step_scale(all_predictors(), na_rm = TRUE) |>
  step_center(all_predictors(), na_rm = TRUE)

ks <- tibble(num_clusters = 1:10)

ast_spec_nstart <- k_means(num_clusters = tune()) |>
  set_engine("stats", nstart = 100)

# Total within-cluster sum of squares for each candidate k, computed on the
# full data set (apparent() resamples the data as-is).
elbow_stats_ast <- workflow() |>
  add_recipe(ast_recipe) |>
  add_model(ast_spec_nstart) |>
  tune_cluster(resamples = apparent(ast_clustdata), grid = ks) |>
  collect_metrics() |>
  filter(.metric == "sse_within_total") |>
  mutate(total_WSSD = mean) |>
  select(num_clusters, total_WSSD)

# theme_bw() is a complete theme and would discard an earlier theme() call, so
# it is applied first and the text size afterwards.
elbow_plot_ast <- elbow_stats_ast |>
  ggplot(aes(x = num_clusters, y = total_WSSD)) +
  geom_point() +
  geom_line() +
  labs(x = "K",
       y = "Total within-cluster sum of squares") +
  scale_x_continuous(breaks = 1:10) +
  theme_bw() +
  theme(text = element_text(size = 15))

## Clustering with Optimal K ##
# Use the same nstart as during tuning so the final fit is the best of many
# random starts rather than the result of a single one.
ast_spec <- k_means(num_clusters = 3) |>
  set_engine("stats", nstart = 100)

ast_clustering <- workflow() |>
  add_recipe(ast_recipe) |>
  add_model(ast_spec) |>
  fit(data = ast_clustdata)

# NOTE: this reuses the name clustered_sess from the session-time clustering
# cell and overwrites it with the per-player result.
clustered_sess <- augment(ast_clustering, ast_clustdata)

# Density of each variable within each cluster, one facet per variable.
clustered_sess_density <- clustered_sess |>
  pivot_longer(cols = -.pred_cluster,
               names_to = 'category',
               values_to = 'value') |>
  ggplot(aes(value, fill = .pred_cluster)) +
  geom_density(alpha = 0.4, colour = 'white') +
  facet_wrap(facets = vars(category), scales = 'free') +
  theme_minimal() +
  theme(text = element_text(size = 20))
clustered_sess
elbow_plot_ast

options(repr.plot.width = 12, repr.plot.height = 8)
clustered_sess_density