# US Visa Approval Classification

The main goal of the project is to check if Visa get approved or not based on the given dataset.

This can be used to Recommend a suitable profile for the applicants for whom the visa should be certified or denied based on the certain criteria which influences the decision.

The data consists of 25480 Rows and 12 Columns

# Importing required libraries

In [1]:
# Install necessary packages if not already installed
if (!require("ggplot2")) install.packages("ggplot2")
if (!require("dplyr")) install.packages("dplyr")
if (!require("plotly")) install.packages("plotly")
if (!require("readr")) install.packages("readr")
if (!require("tidyverse")) install.packages("tidyverse")

Loading required package: ggplot2

Loading required package: dplyr

"package 'dplyr' was built under R version 4.3.3"

Attaching package: 'dplyr'


The following objects are masked from 'package:stats':

    filter, lag


The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union


Loading required package: plotly

"package 'plotly' was built under R version 4.3.3"

Attaching package: 'plotly'


The following object is masked from 'package:ggplot2':

    last_plot


The following object is masked from 'package:stats':

    filter


The following object is masked from 'package:graphics':

    layout


Loading required package: readr

Loading required package: tidyverse

── [1mAttaching core tidyverse packages[22m ──────────────────────────────────────────────────────────────── tidyverse 2.0.0 ──
[32m✔[39m [34mforcats  [39m 1.0.0     [32m✔[39m [34mstringr  [39m 1.5.1
[32m✔[39m [34mlubridate[39m 1.9.3     [32m✔[39m [34mtibble   [39m 3.

In [2]:
# Load libraries
library(ggplot2)
library(dplyr)
library(plotly)
library(readr)
library(tidyverse)

# Step 1: Preliminary Analysis

In [3]:
# Read CSV file
df <- read_csv("EasyVisa.csv")

[1mRows: [22m[34m25480[39m [1mColumns: [22m[34m12[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","
[31mchr[39m (9): case_id, continent, education_of_employee, has_job_experience, requ...
[32mdbl[39m (3): no_of_employees, yr_of_estab, prevailing_wage

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.


In [4]:
# Display the first few records of the dataframe
head(df)

case_id,continent,education_of_employee,has_job_experience,requires_job_training,no_of_employees,yr_of_estab,region_of_employment,prevailing_wage,unit_of_wage,full_time_position,case_status
<chr>,<chr>,<chr>,<chr>,<chr>,<dbl>,<dbl>,<chr>,<dbl>,<chr>,<chr>,<chr>
EZYV01,Asia,High School,N,N,14513,2007,West,592.2029,Hour,Y,Denied
EZYV02,Asia,Master's,Y,N,2412,2002,Northeast,83425.65,Year,Y,Certified
EZYV03,Asia,Bachelor's,N,Y,44444,2008,West,122996.86,Year,Y,Denied
EZYV04,Asia,Bachelor's,N,N,98,1897,West,83434.03,Year,Y,Denied
EZYV05,Africa,Master's,Y,N,1082,2005,South,149907.39,Year,Y,Certified
EZYV06,Asia,Master's,Y,N,2339,2012,South,78252.14,Year,Y,Certified


## Number of independent and dependent variables:

In [5]:
# Identifying independent and dependent variables
independent_vars <- names(df)[1:(ncol(df) - 1)]
dependent_var <- names(df)[ncol(df)]
# Print information about variables and the data frame
cat("Number of independent variables:", length(independent_vars), "\n")
cat("Number of dependent variables:", 1, "\n")

Number of independent variables: 11 
Number of dependent variables: 1 


## Number of records:¶

In [6]:
cat("Number of records:", nrow(df), "\n")

Number of records: 25480 


In [7]:
# Print the dimensions of the data frame
dim(df)

## Categorical Features

In [8]:
# Identify categorical features
categorical_features <- df %>% select_if(~is.character(.)) %>% names()
print(categorical_features)
cat('Number of Categorical Features :', length(categorical_features), '\n')

[1] "case_id"               "continent"             "education_of_employee"
[4] "has_job_experience"    "requires_job_training" "region_of_employment" 
[7] "unit_of_wage"          "full_time_position"    "case_status"          
Number of Categorical Features : 9 


## Unique counts of categories in columns

In [9]:
# Print details about categorical features
for(feature in categorical_features) {
  cat(sprintf("The feature is %s and the number of categories are %d\n", feature, n_distinct(df[[feature]])))
}

The feature is case_id and the number of categories are 25480
The feature is continent and the number of categories are 6
The feature is education_of_employee and the number of categories are 4
The feature is has_job_experience and the number of categories are 2
The feature is requires_job_training and the number of categories are 2
The feature is region_of_employment and the number of categories are 5
The feature is unit_of_wage and the number of categories are 4
The feature is full_time_position and the number of categories are 2
The feature is case_status and the number of categories are 2


## Numerical Features

In [10]:
# Identify numerical features
numerical_features <- df %>% select_if(~is.numeric(.)) %>% names()
print(numerical_features)
cat('Num of Numerical Features :', length(numerical_features), '\n')

[1] "no_of_employees" "yr_of_estab"     "prevailing_wage"
Num of Numerical Features : 3 


In [11]:
# Reprint numerical features, excluding character features
numerical_features <- df %>% select_if(~is.character(.)) %>% names()
print(numerical_features)
cat("Number of numerical variables:", length(numerical_features), '\n')
# Display first few rows of numerical features
head(select(df, all_of(numerical_features)))

[1] "case_id"               "continent"             "education_of_employee"
[4] "has_job_experience"    "requires_job_training" "region_of_employment" 
[7] "unit_of_wage"          "full_time_position"    "case_status"          
Number of numerical variables: 9 


case_id,continent,education_of_employee,has_job_experience,requires_job_training,region_of_employment,unit_of_wage,full_time_position,case_status
<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
EZYV01,Asia,High School,N,N,West,Hour,Y,Denied
EZYV02,Asia,Master's,Y,N,Northeast,Year,Y,Certified
EZYV03,Asia,Bachelor's,N,Y,West,Year,Y,Denied
EZYV04,Asia,Bachelor's,N,N,West,Year,Y,Denied
EZYV05,Africa,Master's,Y,N,South,Year,Y,Certified
EZYV06,Asia,Master's,Y,N,South,Year,Y,Certified


## Binary Features

In [12]:
# Check for binary columns
for(col in names(df)) {
  if(n_distinct(df[[col]]) == 2) {
    cat(sprintf("%s might be binary\n", col))
  }
}

has_job_experience might be binary
requires_job_training might be binary
full_time_position might be binary
case_status might be binary


## Discrete Features

In [13]:
# Identifying discrete and continuous features
discrete_features <- df %>%
  select_if(~is.numeric(.)) %>%
  select_if(~n_distinct(.) <= 25) %>%
  names()

cat('We have', length(discrete_features), 'discrete features:', discrete_features, '\n')
cat('Number of Discrete Features:', length(discrete_features), '\n')

We have 0 discrete features:  
Number of Discrete Features: 0 


## Continous Features

In [14]:
continuous_features <- df %>%
  select_if(~is.numeric(.)) %>%
  select_if(~n_distinct(.) > 25) %>%
  names()

cat('\nWe have', length(continuous_features), 'continuous_features:', continuous_features, '\n')
cat('Number of Continuous Features:', length(continuous_features), '\n')


We have 3 continuous_features: no_of_employees yr_of_estab prevailing_wage 
Number of Continuous Features: 3 


## Data types of variables:

In [15]:
# Print data types of variables
str(df)

spc_tbl_ [25,480 × 12] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
 $ case_id              : chr [1:25480] "EZYV01" "EZYV02" "EZYV03" "EZYV04" ...
 $ continent            : chr [1:25480] "Asia" "Asia" "Asia" "Asia" ...
 $ education_of_employee: chr [1:25480] "High School" "Master's" "Bachelor's" "Bachelor's" ...
 $ has_job_experience   : chr [1:25480] "N" "Y" "N" "N" ...
 $ requires_job_training: chr [1:25480] "N" "N" "Y" "N" ...
 $ no_of_employees      : num [1:25480] 14513 2412 44444 98 1082 ...
 $ yr_of_estab          : num [1:25480] 2007 2002 2008 1897 2005 ...
 $ region_of_employment : chr [1:25480] "West" "Northeast" "West" "West" ...
 $ prevailing_wage      : num [1:25480] 592 83426 122997 83434 149907 ...
 $ unit_of_wage         : chr [1:25480] "Hour" "Year" "Year" "Year" ...
 $ full_time_position   : chr [1:25480] "Y" "Y" "Y" "Y" ...
 $ case_status          : chr [1:25480] "Denied" "Certified" "Denied" "Denied" ...
 - attr(*, "spec")=
  .. cols(
  ..   case_id = [31mcol_character

In [16]:
# Proportion of count data on categorical columns
for(col in categorical_features) {
  print(prop.table(table(df[[col]])) * 100)
  cat('---------------------------\n')
}


     EZYV01      EZYV02      EZYV03      EZYV04      EZYV05      EZYV06 
0.003924647 0.003924647 0.003924647 0.003924647 0.003924647 0.003924647 
     EZYV07      EZYV08      EZYV09      EZYV10     EZYV100    EZYV1000 
0.003924647 0.003924647 0.003924647 0.003924647 0.003924647 0.003924647 
  EZYV10000   EZYV10001   EZYV10002   EZYV10003   EZYV10004   EZYV10005 
0.003924647 0.003924647 0.003924647 0.003924647 0.003924647 0.003924647 
  EZYV10006   EZYV10007   EZYV10008   EZYV10009    EZYV1001   EZYV10010 
0.003924647 0.003924647 0.003924647 0.003924647 0.003924647 0.003924647 
  EZYV10011   EZYV10012   EZYV10013   EZYV10014   EZYV10015   EZYV10016 
0.003924647 0.003924647 0.003924647 0.003924647 0.003924647 0.003924647 
  EZYV10017   EZYV10018   EZYV10019    EZYV1002   EZYV10020   EZYV10021 
0.003924647 0.003924647 0.003924647 0.003924647 0.003924647 0.003924647 
  EZYV10022   EZYV10023   EZYV10024   EZYV10025   EZYV10026   EZYV10027 
0.003924647 0.003924647 0.003924647 0.003924647 0.

**Insights**
 - `case_id` have unique vlaues for each column which can be dropped as it it of no importance
 - `continent` column is highly biased towards asia. hence we can combine other categories to form a single category.
 - `unit_of_wage` seems to be an important column as most of them are yearly contracts.

## Summary Statistics

In [17]:
# Load the necessary libraries
library(dplyr)
library(tidyr)

In [18]:
# Summary statistics for numerical columns
summary(df)

   case_id           continent         education_of_employee has_job_experience
 Length:25480       Length:25480       Length:25480          Length:25480      
 Class :character   Class :character   Class :character      Class :character  
 Mode  :character   Mode  :character   Mode  :character      Mode  :character  
                                                                               
                                                                               
                                                                               
 requires_job_training no_of_employees   yr_of_estab   region_of_employment
 Length:25480          Min.   :   -26   Min.   :1800   Length:25480        
 Class :character      1st Qu.:  1022   1st Qu.:1976   Class :character    
 Mode  :character      Median :  2109   Median :1997   Mode  :character    
                       Mean   :  5667   Mean   :1979                       
                       3rd Qu.:  3504   3rd Qu.:2005        

# Step 2: Data Cleaning

## Checking for Null Values

In [19]:
# Counting missing values for each column
colSums(is.na(df))

## Checking for unique values

In [20]:
# Number of unique values per column
sapply(df, function(x) length(unique(x)))

## All Features

In [21]:
names(df)

## Checking for low variance:

In [22]:
# Checking for columns with a single unique value
low_variance_cols <- names(df)[sapply(df, function(x) length(unique(na.omit(x))) <= 1)]
cat("Columns with low variance (possible candidates for removal):", low_variance_cols, "\n")

Columns with low variance (possible candidates for removal):  


## Checking & handling for duplicates

In [23]:
# Checking for duplicates
if(any(duplicated(df))) {
  cat("Duplicates found:", sum(duplicated(df)), "\n")
  df <- df[!duplicated(df), ]
  cat("Duplicates have been removed.\n")
} else {
  cat("No duplicates found.\n")
}

No duplicates found.


## Removing Case_Id

In [24]:
# Dropping a column
df <- df %>% select(-case_id)

## Handling Missing Values

In [25]:
library(lubridate)

In [26]:
# Checking for missing values in all columns
missing_data <- colSums(is.na(df))
missing_data <- missing_data[missing_data > 0]

if (length(missing_data) > 0) {
  print("Missing values found in the following columns:")
  print(missing_data)
  # Handling missing values (example using median imputation for numerical columns)
  df <- df %>%
    mutate(across(where(is.numeric), ~ifelse(is.na(.), median(., na.rm = TRUE), .))) %>%
    mutate(across(where(is.factor), ~ifelse(is.na(.), Mode(.)[1], .))) # For categorical data, using mode
  print("Missing values have been handled.")
} else {
  print("No missing values found.")
}

[1] "No missing values found."


## Copy of Original Data

In [27]:
df1 <- df
head(df1)

continent,education_of_employee,has_job_experience,requires_job_training,no_of_employees,yr_of_estab,region_of_employment,prevailing_wage,unit_of_wage,full_time_position,case_status
<chr>,<chr>,<chr>,<chr>,<dbl>,<dbl>,<chr>,<dbl>,<chr>,<chr>,<chr>
Asia,High School,N,N,14513,2007,West,592.2029,Hour,Y,Denied
Asia,Master's,Y,N,2412,2002,Northeast,83425.65,Year,Y,Certified
Asia,Bachelor's,N,Y,44444,2008,West,122996.86,Year,Y,Denied
Asia,Bachelor's,N,N,98,1897,West,83434.03,Year,Y,Denied
Africa,Master's,Y,N,1082,2005,South,149907.39,Year,Y,Certified
Asia,Master's,Y,N,2339,2012,South,78252.14,Year,Y,Certified


# Step 3: Feature Engineering

## Feature Extraction

In [28]:
# Getting today's date and calculating 'company_age'
todays_date <- Sys.Date()
current_year <- year(todays_date)
print(current_year)

[1] 2024


In [29]:
df1 <- df1 %>%
  mutate(company_age = current_year - yr_of_estab) %>%
  select(-yr_of_estab)
head(df1)

continent,education_of_employee,has_job_experience,requires_job_training,no_of_employees,region_of_employment,prevailing_wage,unit_of_wage,full_time_position,case_status,company_age
<chr>,<chr>,<chr>,<chr>,<dbl>,<chr>,<dbl>,<chr>,<chr>,<chr>,<dbl>
Asia,High School,N,N,14513,West,592.2029,Hour,Y,Denied,17
Asia,Master's,Y,N,2412,Northeast,83425.65,Year,Y,Certified,22
Asia,Bachelor's,N,Y,44444,West,122996.86,Year,Y,Denied,16
Asia,Bachelor's,N,N,98,West,83434.03,Year,Y,Denied,127
Africa,Master's,Y,N,1082,South,149907.39,Year,Y,Certified,19
Asia,Master's,Y,N,2339,South,78252.14,Year,Y,Certified,12


In [30]:
# Function to calculate mode
Mode <- function(x) {
  ux <- unique(x)
  ux[which.max(tabulate(match(x, ux)))]
}

# Step 4: Train & Test Split

In [32]:
if (!require("tidymodels")) install.packages("tidymodels")

Loading required package: tidymodels

"there is no package called 'tidymodels'"
Installing package into 'C:/Users/SANA JALGAONKAR/AppData/Local/R/win-library/4.3'
(as 'lib' is unspecified)

also installing the dependencies 'warp', 'lhs', 'DiceDesign', 'patchwork', 'furrr', 'slider', 'doFuture', 'GPfit', 'modelenv', 'dials', 'infer', 'modeldata', 'parsnip', 'rsample', 'rstudioapi', 'tune', 'workflows', 'workflowsets', 'yardstick'




package 'warp' successfully unpacked and MD5 sums checked
package 'lhs' successfully unpacked and MD5 sums checked
package 'DiceDesign' successfully unpacked and MD5 sums checked
package 'patchwork' successfully unpacked and MD5 sums checked
package 'furrr' successfully unpacked and MD5 sums checked
package 'slider' successfully unpacked and MD5 sums checked
package 'doFuture' successfully unpacked and MD5 sums checked
package 'GPfit' successfully unpacked and MD5 sums checked
package 'modelenv' successfully unpacked and MD5 sums checked
package 'dials' successfully unpacked and MD5 sums checked
package 'infer' successfully unpacked and MD5 sums checked
package 'modeldata' successfully unpacked and MD5 sums checked
package 'parsnip' successfully unpacked and MD5 sums checked
package 'rsample' successfully unpacked and MD5 sums checked
package 'rstudioapi' successfully unpacked and MD5 sums checked
package 'tune' successfully unpacked and MD5 sums checked
package 'workflows' successfull

In [33]:
if (!require("recipes")) install.packages("recipes")

Loading required package: recipes

"package 'recipes' was built under R version 4.3.3"

Attaching package: 'recipes'


The following object is masked from 'package:stringr':

    fixed


The following object is masked from 'package:stats':

    step




In [34]:
library(tidymodels)
library(dplyr)
library(recipes)

"package 'tidymodels' was built under R version 4.3.3"
── [1mAttaching packages[22m ────────────────────────────────────────────────────────────────────────────── tidymodels 1.2.0 ──

[32m✔[39m [34mbroom       [39m 1.0.5     [32m✔[39m [34mrsample     [39m 1.2.1
[32m✔[39m [34mdials       [39m 1.2.1     [32m✔[39m [34mtune        [39m 1.2.1
[32m✔[39m [34minfer       [39m 1.0.7     [32m✔[39m [34mworkflows   [39m 1.1.4
[32m✔[39m [34mmodeldata   [39m 1.3.0     [32m✔[39m [34mworkflowsets[39m 1.1.0
[32m✔[39m [34mparsnip     [39m 1.2.1     [32m✔[39m [34myardstick   [39m 1.3.1

"package 'dials' was built under R version 4.3.3"
"package 'infer' was built under R version 4.3.3"
"package 'modeldata' was built under R version 4.3.3"
"package 'parsnip' was built under R version 4.3.3"
"package 'rsample' was built under R version 4.3.3"
"package 'tune' was built under R version 4.3.3"
"package 'workflows' was built under R version 4.3.3"
"package 'workflowsets

**Manual encoding target column**

In [35]:
# Define the target variable and encode it
df1$case_status <- ifelse(df1$case_status == "Denied", 1, 0)

In [42]:
# Define predictor matrix X and target vector y
X <- select(df1, -case_status)
y <- df1$case_status

## **Feature Encoding and Scaling**

In [43]:
# Identify numerical and categorical features
num_features <- names(select_if(X, is.numeric))

In [44]:
num_features

In [45]:
cat_features <- names(select_if(X, is.character))
cat_features

### **Preprocessing using Column Transformer**

In [46]:
# Define column roles
or_columns <- c('has_job_experience', 'requires_job_training', 'full_time_position', 'education_of_employee')
oh_columns <- c('continent', 'unit_of_wage', 'region_of_employment')
transform_columns <- c('no_of_employees', 'company_age')

In [48]:
# Now create the recipe including y
recipe <- recipe(case_status ~ ., data = df1) %>%
  step_dummy(all_nominal(), -all_outcomes()) %>%
  step_YeoJohnson(all_numeric(), -all_outcomes()) %>%
  step_normalize(all_numeric(), -all_outcomes())

In [49]:
# Prepare and bake the recipe
prepped_recipe <- prep(recipe)
X_transformed <- bake(prepped_recipe, new_data = NULL)

In [50]:
print(X_transformed)

[90m# A tibble: 25,480 × 22[39m
   no_of_employees prevailing_wage company_age case_status continent_Asia
             [3m[90m<dbl>[39m[23m           [3m[90m<dbl>[39m[23m       [3m[90m<dbl>[39m[23m       [3m[90m<dbl>[39m[23m          [3m[90m<dbl>[39m[23m
[90m 1[39m          1.52           -[31m1[39m[31m.[39m[31m94[39m     -[31m0[39m[31m.[39m[31m952[39m              1          0.715
[90m 2[39m          0.030[4m9[24m          0.345    -[31m0[39m[31m.[39m[31m463[39m              0          0.715
[90m 3[39m          2.88            0.916    -[31m1[39m[31m.[39m[31m0[39m[31m8[39m               1          0.715
[90m 4[39m         -[31m1[39m[31m.[39m[31m34[39m            0.345     1.63               1          0.715
[90m 5[39m         -[31m0[39m[31m.[39m[31m432[39m           1.26     -[31m0[39m[31m.[39m[31m734[39m              0         -[31m1[39m[31m.[39m[31m40[39m 
[90m 6[39m          0.011[4m2[24m       

### **Resampling**

In [51]:
# Install and load necessary packages
if (!require('ROSE')) install.packages('ROSE', dependencies=TRUE)
if (!require('caret')) install.packages('caret', dependencies=TRUE)
library(ROSE)
library(caret)

Loading required package: ROSE

"there is no package called 'ROSE'"
Installing package into 'C:/Users/SANA JALGAONKAR/AppData/Local/R/win-library/4.3'
(as 'lib' is unspecified)

also installing the dependency 'tree'




package 'tree' successfully unpacked and MD5 sums checked
package 'ROSE' successfully unpacked and MD5 sums checked

The downloaded binary packages are in
	C:\Users\SANA JALGAONKAR\AppData\Local\Temp\Rtmpo7sGzC\downloaded_packages


Loading required package: caret

"package 'caret' was built under R version 4.3.3"
Loading required package: lattice


Attaching package: 'caret'


The following objects are masked from 'package:yardstick':

    precision, recall, sensitivity, specificity


The following object is masked from 'package:purrr':

    lift


"package 'ROSE' was built under R version 4.3.3"
Loaded ROSE 0.0-4




In [54]:
# Prepare the dataset, assuming df1 and target y have been defined and properly prepared
data_frame <- cbind(X, y = as.factor(y))  # Ensure target variable 'y' is a factor

In [55]:
# Setting N to be at least the size of the original data or larger
N_size <- max(2000, nrow(data_frame))

In [56]:
# Resampling using SMOTE (ROSE package)
set.seed(42)
data_balanced <- ovun.sample(y ~ ., data = data_frame, method = "over", N = N_size, seed = 42)$data

# Separate the balanced data
X_res <- data_balanced[, -ncol(data_balanced)]
y_res <- data_balanced$y

In [57]:
# Splitting the dataset into training and testing
set.seed(42)
split <- createDataPartition(y_res, p = 0.7, list = FALSE)
X_train <- X_res[split, ]
X_test <- X_res[-split, ]
y_train <- y_res[split]
y_test <- y_res[-split]

## Print the number of records in the training and testing data

In [58]:
cat("Training records:", nrow(X_train), "Testing records:", nrow(X_test), "\n")

Training records: 17837 Testing records: 7643 


## Save training data to CSV

In [59]:
# Saving training and testing data
write.csv(cbind(X_train, y_train), 'visa_training_data_R.csv', row.names = FALSE)
cat("Training data has been saved to 'visa_training_data_R.csv'.\n")

Training data has been saved to 'visa_training_data_R.csv'.


## Save testing data to CSV

In [60]:
write.csv(cbind(X_test, y_test), 'visa_testing_data_R.csv', row.names = FALSE)
cat("Testing data has been saved to 'visa_testing_data_R.csv'.\n")

Testing data has been saved to 'visa_testing_data_R.csv'.


# Step 6: Logistic Regression using SkLearn

In [61]:
# Install necessary R packages
if (!require(caret)) install.packages('caret', dependencies=TRUE)
if (!require(pROC)) install.packages('pROC', dependencies=TRUE)
if (!require(Metrics)) install.packages('Metrics', dependencies=TRUE)
library(caret)
library(pROC)
library(Metrics)

Loading required package: pROC

"package 'pROC' was built under R version 4.3.3"
Type 'citation("pROC")' for a citation.


Attaching package: 'pROC'


The following objects are masked from 'package:stats':

    cov, smooth, var


Loading required package: Metrics

"there is no package called 'Metrics'"
Installing package into 'C:/Users/SANA JALGAONKAR/AppData/Local/R/win-library/4.3'
(as 'lib' is unspecified)



package 'Metrics' successfully unpacked and MD5 sums checked

The downloaded binary packages are in
	C:\Users\SANA JALGAONKAR\AppData\Local\Temp\Rtmpo7sGzC\downloaded_packages


"package 'Metrics' was built under R version 4.3.3"

Attaching package: 'Metrics'


The following object is masked from 'package:pROC':

    auc


The following objects are masked from 'package:caret':

    precision, recall


The following objects are masked from 'package:yardstick':

    accuracy, mae, mape, mase, precision, recall, rmse, smape




In [71]:
library(caret)
library(pROC)

In [72]:
# Ensure factor levels consistency by defining them explicitly
y_train <- factor(y_train, levels = c(0, 1))
y_test <- factor(y_test, levels = c(0, 1))

In [73]:
# Train Logistic Regression model
lr_model <- glm(y_train ~ ., data = as.data.frame(X_train), family = binomial())

In [74]:
# Make predictions and convert probabilities to binary outcome based on threshold
y_train_pred_prob <- predict(lr_model, type = "response", newdata = as.data.frame(X_train))
y_test_pred_prob <- predict(lr_model, type = "response", newdata = as.data.frame(X_test))

In [75]:
y_train_pred_class <- factor(ifelse(y_train_pred_prob > 0.5, 1, 0), levels = c(0, 1))
y_test_pred_class <- factor(ifelse(y_test_pred_prob > 0.5, 1, 0), levels = c(0, 1))

In [82]:
# Evaluation function
evaluate_clf <- function(true, predicted) {
  # Ensure both true and predicted are factors and have the same levels
  true <- factor(true, levels = c(0, 1))
  predicted <- factor(predicted, levels = c(0, 1))
  
  conf_matrix <- confusionMatrix(predicted, true)
  
  acc <- conf_matrix$overall['Accuracy']
  precision <- conf_matrix$byClass['Precision']
  recall <- conf_matrix$byClass['Sensitivity']
  f1 <- 2 * (precision * recall) / (precision + recall)
  
  # roc_curve <- roc(as.numeric(true), as.numeric(predicted))  # ensure numeric input for roc function
  # roc_auc <- auc(roc_curve)
  
  return(list(Accuracy = acc, F1 = f1, Precision = precision, Recall = recall))
}

In [83]:
# Evaluate performance on training set
model_train_metrics <- evaluate_clf(y_train, y_train_pred_class)
cat('Model performance for Training set\n')
print(model_train_metrics)

Model performance for Training set
$Accuracy
 Accuracy 
0.7444638 

$F1
Precision 
 0.821311 

$Precision
Precision 
0.7705039 

$Recall
Sensitivity 
  0.8792915 



In [84]:
# Evaluate performance on test set
model_test_metrics <- evaluate_clf(y_test, y_test_pred_class)
cat('Model performance for Test set\n')
print(model_test_metrics)

Model performance for Test set
$Accuracy
 Accuracy 
0.7345283 

$F1
Precision 
0.8155287 

$Precision
Precision 
0.7609433 

$Recall
Sensitivity 
  0.8785504 



# Step 8: Random Forest Using SkLearn

In [85]:
# Install and load necessary packages
if (!require(randomForest)) install.packages('randomForest', dependencies=TRUE)
library(randomForest)
library(caret)
library(pROC)

Loading required package: randomForest

"package 'randomForest' was built under R version 4.3.3"
randomForest 4.7-1.1

Type rfNews() to see new features/changes/bug fixes.


Attaching package: 'randomForest'


The following object is masked from 'package:dplyr':

    combine


The following object is masked from 'package:ggplot2':

    margin




In [86]:
# Install and load necessary packages
if (!require(randomForest)) install.packages('randomForest', dependencies=TRUE)
library(randomForest)
library(caret)
library(pROC)

In [87]:
# Train RandomForest model
rf_model <- randomForest(x = X_train, y = y_train, ntree = 500)  # setting ntree as an example

In [88]:
# Make predictions
y_train_pred <- predict(rf_model, X_train)
y_test_pred <- predict(rf_model, X_test)

In [89]:
# Training set performance
rfc_metrics_train <- evaluate_clf(y_train, y_train_pred)
cat("Model performance for Training set:\n")
print(rfc_metrics_train)

Model performance for Training set:
$Accuracy
 Accuracy 
0.9342378 

$F1
Precision 
0.9522141 

$Precision
Precision 
0.9250435 

$Recall
Sensitivity 
  0.9810291 



In [90]:
# Test set performance
rfc_metrics_test <- evaluate_clf(y_test, y_test_pred)
cat("Model performance for Test set:\n")
print(rfc_metrics_test)

Model performance for Test set:
$Accuracy
 Accuracy 
0.7962842 

$F1
Precision 
0.8536517 

$Precision
Precision 
0.8205638 

$Recall
Sensitivity 
  0.8895201 



In [91]:
# Additional evaluation metrics
rfc_accuracy <- rfc_metrics_test$Accuracy
cat("Random Forest Accuracy (Test Set):", rfc_accuracy, "\n")

Random Forest Accuracy (Test Set): 0.7962842 
