# **UNSW**
# ZZBU6404 Human Resource Analytics
## R Notebook
### By Thomas Bierly

# O-Week

In [None]:
# Address the CSV is fetched from
# NOTE(review): example.com looks like a placeholder host — confirm a real dataset URL is intended here
url <- "http://example.com/data.csv"

# Download the file and parse it into a data frame
data <- read.csv(file = url)

# Show the top of the table as a quick sanity check
head(x = data)


# Week 2

In [None]:
# Address of the Week 2 employee-satisfaction CSV
url <- "https://raw.githubusercontent.com/thomasbierly/DataScienceForHR/main/Week_2_Employee_Satisfaction.csv"

# Download the file and parse it into a data frame
data <- read.csv(file = url)

# Show the top of the table as a quick sanity check
head(x = data)

In [None]:
# Pull the rating column out once so each statistic reads from the same vector
ratings <- data[["Satisfaction.rating"]]

mean_value <- mean(ratings)
median_value <- median(ratings)

# Base R has no statistical-mode function: tabulate the ratings, order the
# counts from most to least frequent, and read the label of the first entry.
rating_counts <- sort(table(ratings), decreasing = TRUE)
mode_value <- as.numeric(names(rating_counts[1]))


In [None]:
# Report each statistic on its own line as "<label> <value>"
stat_labels <- c("Mean:", "Median:", "Mode:")
stat_values <- list(mean_value, median_value, mode_value)
for (i in seq_along(stat_labels)) {
  print(paste(stat_labels[i], stat_values[[i]]))
}

In [None]:
library(ggplot2)

In [None]:
# Histogram of the satisfaction ratings, overlaid with one vertical line each
# for the mean, median and mode; the colour legend identifies the lines.
ggplot(data = data, mapping = aes(x = Satisfaction.rating)) +
  geom_histogram(binwidth = 1, fill = "blue", alpha = 0.7) +
  geom_vline(
    mapping = aes(xintercept = mean_value, color = "Mean"),
    linetype = "dashed",
    linewidth = 1.5
  ) +
  geom_vline(
    mapping = aes(xintercept = median_value, color = "Median"),
    linetype = "solid",
    linewidth = 1.5
  ) +
  geom_vline(
    mapping = aes(xintercept = mode_value, color = "Mode"),
    linetype = "dotdash",
    linewidth = 1.5
  ) +
  scale_color_manual(
    values = c("Mean" = "red", "Median" = "green", "Mode" = "blue")
  ) +
  labs(
    title = "Distribution of Employee Satisfaction Scores",
    x = "Satisfaction Score",
    y = "Frequency",
    color = "Line Type"
  ) +
  theme_minimal()

# Week 3

### T-Test

In [None]:
# Address of the Week 3 gender/salary CSV
url <- "https://raw.githubusercontent.com/thomasbierly/DataScienceForHR/main/Week_3_Gender_Salary.csv"

# Download the file and parse it into a data frame
data <- read.csv(file = url)

# Show the top of the table as a quick sanity check
head(x = data)

In [None]:
# Treat gender as a categorical grouping variable for the t-test below
data[["gender"]] <- factor(data[["gender"]])
head(x = data)

In [None]:
# Two-sample t-test comparing salary between the gender groups
t_test_result <- t.test(
  salary ~ gender,
  data = data
)

# Display the test statistic, degrees of freedom, p-value and group means
t_test_result

### ANOVA

In [None]:
# Address of the Week 3 race/performance CSV
url <- "https://raw.githubusercontent.com/thomasbierly/DataScienceForHR/main/Week_3_Race_Performance.csv"

# Download the file and parse it into a data frame
data <- read.csv(file = url)

# Show the top of the table as a quick sanity check
head(x = data)

In [None]:
# Treat Race as a categorical grouping variable for the ANOVA below
data[["Race"]] <- factor(data[["Race"]])
head(x = data)

In [None]:
# One-way ANOVA: does mean Performance_Score differ across Race groups?
res.aov <- aov(
  formula = Performance_Score ~ Race,
  data = data
)

# ANOVA table (sums of squares, F statistic, p-value)
summary(object = res.aov)

In [None]:
# Post-hoc pairwise comparisons between groups (95% family-wise confidence level, the default)
TukeyHSD(x = res.aov, conf.level = 0.95)

# Week 4

### Linear Regression

In [None]:
# Address of the Week 4 IQ/performance CSV
url <- "https://raw.githubusercontent.com/thomasbierly/DataScienceForHR/main/Week_4_IQ_Performance.csv"

# Download the file and parse it into a data frame
data <- read.csv(file = url)

# Show the top of the table as a quick sanity check
head(x = data)


In [None]:
# Fit a simple linear model: performance rating explained by employee IQ
model <- lm(
  formula = performance_rating ~ employee_iq,
  data = data
)

# Coefficients, standard errors, R-squared and overall F-test
summary(object = model)

In [None]:
# Scatter plot of the raw observations (filled blue points)
plot(
  x = data$employee_iq,
  y = data$performance_rating,
  main = "Employee IQ vs Performance Rating",
  xlab = "Employee IQ",
  ylab = "Performance Rating",
  pch = 19,
  col = "blue"
)

# Overlay the fitted line from `model` in red
abline(model, col = "red", lwd = 2)

### Multiple Regression

In [None]:
# Address of the Week 4 multi-predictor CSV
url <- "https://raw.githubusercontent.com/thomasbierly/DataScienceForHR/main/Week_4_IQ_Multi.csv"

# Download the file and parse it into a data frame
data <- read.csv(file = url)

# Show the top of the table as a quick sanity check
head(x = data)

In [None]:
# Multiple regression: performance rating explained by five predictors
model <- lm(
  formula = Performance_Rating ~
    Employee_IQ +
    Educational_Background +
    Work_Experience +
    Emotional_Intelligence +
    Motivation_Level,
  data = data
)

# Coefficients, standard errors, R-squared and overall F-test
summary(object = model)

In [None]:
# Draw one scatter plot per predictor against Performance_Rating on a 2x3 grid,
# each with its own single-predictor regression line in red.
# par() returns the previous settings; keep them so the grid layout can be
# undone afterwards instead of leaking into every later base-graphics plot.
old_par <- par(mfrow = c(2, 3))

# Every column except the response is treated as a predictor
predictor_names <- setdiff(names(data), "Performance_Rating")

for (var in predictor_names) {
  plot(data[[var]], data$Performance_Rating,
       xlab = var,  # x-axis label is the column name
       ylab = "Performance Rating",
       main = paste("Performance Rating vs", var))
  # Build the formula `Performance_Rating ~ <var>` from the column name so the
  # model refers to the column by name rather than to an extracted vector.
  single_fit <- lm(reformulate(var, response = "Performance_Rating"), data = data)
  abline(single_fit, col = "red")
}

# Restore the graphics layout that was active before this cell
par(old_par)

# Week 5

In [None]:
# Address of the Week 5 interview-outcomes CSV
url <- "https://raw.githubusercontent.com/thomasbierly/DataScienceForHR/main/interview_outcomes.csv"

# Download the file and parse it into a data frame
data <- read.csv(file = url)

# Show the top of the table as a quick sanity check
head(x = data)


In [None]:
  # Logistic regression (binomial family): interview outcome explained by five predictors
  model <- glm(
    formula = Interview_Outcome ~
      Years_of_Relevant_Experience +
      Education_Level +
      Certifications_Count +
      Industry_References +
      CV_Strength_Score,
    family = "binomial",
    data = data
  )

# Coefficient table (log-odds scale) and deviance summary
summary(object = model)

In [None]:
# Exponentiate the fitted log-odds coefficients to obtain odds ratios
log_odds <- coef(model)
odds_ratios <- exp(log_odds)

# Display one odds ratio per model term
odds_ratios

In [None]:
library(ggplot2)

# Coefficient table of the fitted model; the "Estimate" column holds the log-odds
model_coef <- coef(summary(model))

# Move the estimates and their confidence limits from the log-odds scale to the odds-ratio scale
odds_ratios <- exp(model_coef[, "Estimate"])
conf_int <- exp(confint(model))

# One row per model term: point estimate plus lower/upper confidence limits
plot_data <- data.frame(
  Predictor = rownames(model_coef),
  OR = odds_ratios,
  Lower = conf_int[, 1],
  Upper = conf_int[, 2]
)

# Dot-and-whisker chart; the dashed grey line at 1 marks "no effect"
ggplot(
  data = plot_data,
  mapping = aes(y = Predictor, x = OR, xmin = Lower, xmax = Upper)
) +
  geom_point() +
  geom_errorbarh(height = 0.2) +
  geom_vline(xintercept = 1, linetype = "dashed", color = "grey") +
  theme_minimal() +
  labs(title = "Odds Ratios with 95% Confidence Intervals", x = "Odds Ratio")

# Week 6

### Random Forest

In [None]:
# Packages this section needs:
#   randomForest - the model itself
#   caret        - data splitting and evaluation
required_packages <- c("randomForest", "caret")

# Install only what is not already available, so re-running the cell does not
# re-download and rebuild packages every time.
missing_packages <- required_packages[
  !vapply(required_packages, requireNamespace, logical(1), quietly = TRUE)
]
if (length(missing_packages) > 0) {
  install.packages(missing_packages)
}

# Load the packages
library(randomForest)
library(caret)

In [None]:
# Address of the high-potential employee CSV
url <- "https://raw.githubusercontent.com/thomasbierly/DataScienceForHR/main/HighPotential.csv"

# Download the file and parse it into a data frame
data <- read.csv(file = url)

# Show the top of the table as a quick sanity check
head(x = data)


In [None]:
# Per-column summary; NA counts (if any) show up here
summary(object = data)

# The target must be a factor so the forest is fitted as a classifier
data[["HighPotential"]] <- as.factor(data[["HighPotential"]])

# 80/20 train/test split on the target column
set.seed(123)  # fixed seed so the split is reproducible
indexes <- createDataPartition(
  y = data$HighPotential,
  p = 0.8,
  list = FALSE
)
train_data <- data[indexes, ]
test_data <- data[-indexes, ]

In [None]:
# Fit a 100-tree random forest predicting HighPotential from every other column
rf_model <- randomForest(
  HighPotential ~ .,
  data = train_data,
  ntree = 100
)

# Show the fitted model's printed summary
print(rf_model)

In [None]:
# Score the held-out rows with the fitted forest
predictions <- predict(rf_model, newdata = test_data)

# Align the prediction levels with the reference labels so both factors
# share an identical level set
reference_labels <- test_data$HighPotential
predictions <- factor(predictions, levels = levels(reference_labels))

# Confusion matrix and derived accuracy statistics
confusionMatrix(data = predictions, reference = reference_labels)

In [None]:
# Table of per-predictor importance scores from the fitted forest
importance(x = rf_model)

# Same information as a plot
varImpPlot(x = rf_model)

### Neural Network

In [None]:
# Packages this section needs:
#   neuralnet - the model itself
#   caret     - data preprocessing and splitting
required_packages <- c("neuralnet", "caret")

# Install only what is not already available, so re-running the cell does not
# re-download and rebuild packages every time.
missing_packages <- required_packages[
  !vapply(required_packages, requireNamespace, logical(1), quietly = TRUE)
]
if (length(missing_packages) > 0) {
  install.packages(missing_packages)
}

# Load the packages
library(neuralnet)
library(caret)

In [None]:
# Address of the high-potential employee CSV
url <- "https://raw.githubusercontent.com/thomasbierly/DataScienceForHR/main/HighPotential.csv"

# Download the file and parse it into a data frame
data <- read.csv(file = url)

# Show the top of the table as a quick sanity check
head(x = data)


In [None]:
# Convert 'HighPotential' to a factor
data$HighPotential <- as.factor(data$HighPotential)

# Every column except the target is a predictor to be normalised.
# NOTE(review): scale() needs numeric input — assumes all predictors are numeric; confirm.
predictor_cols <- setdiff(names(data), "HighPotential")

# Decide the train/test split BEFORE normalising, so the centring and scaling
# statistics can be taken from the training rows only.
set.seed(123)
indexes <- createDataPartition(data$HighPotential, p = 0.8, list = FALSE)

# Centre and scale values learned from the training rows alone; using the
# full dataset here would leak test-set information into training.
train_scaled <- scale(data[indexes, predictor_cols, drop = FALSE])
train_center <- attr(train_scaled, "scaled:center")
train_spread <- attr(train_scaled, "scaled:scale")

# Apply the training-derived transformation to every row; the target column
# is left untouched.
data_norm <- data
data_norm[, predictor_cols] <- scale(
  data[, predictor_cols, drop = FALSE],
  center = train_center,
  scale = train_spread
)

# Materialise the two partitions
train_data <- data_norm[indexes, ]
test_data <- data_norm[-indexes, ]

In [None]:
# Fit a network with a single hidden layer of 5 units, predicting HighPotential
# from every other column; linear.output = FALSE applies the activation
# function to the output units as well.
nn_model <- neuralnet(
  formula = HighPotential ~ .,
  data = train_data,
  hidden = c(5),
  linear.output = FALSE
)

# Show the fitted network object
print(nn_model)

In [None]:
# Reference labels for the held-out rows. This must be defined here: the
# thresholding and the confusion matrix below both read it.
actual_values <- test_data$HighPotential

# Feed only the predictor columns to the network (drop = FALSE keeps a data
# frame even if a single predictor remains)
predictor_cols <- setdiff(names(test_data), "HighPotential")
nn_predictions <- compute(nn_model, test_data[, predictor_cols, drop = FALSE])
predicted_values <- nn_predictions$net.result

# Second output column is taken as the score for the second factor level
# NOTE(review): assumes HighPotential has exactly two levels — confirm
predicted_probabilities <- predicted_values[, 2]

# Threshold at 0.5: above -> second level, otherwise -> first level
predicted_classes <- ifelse(predicted_probabilities > 0.5, levels(actual_values)[2], levels(actual_values)[1])

# Build the factor with the full reference level set, so the confusion matrix
# still works when every prediction falls into a single class
predicted_classes <- factor(predicted_classes, levels = levels(actual_values))

# Evaluating the model with Confusion Matrix
confusionMatrix(predicted_classes, actual_values)


In [None]:
# Plot the neural network.
# rep = "best" draws the lowest-error repetition on the current graphics
# device; with the default (rep = NULL) plot.nn opens a new device per
# repetition, which notebook/knitr front-ends do not render inline.
plot(nn_model, rep = "best")