In [1]:
## Data Analysis packages
import numpy as np
import pandas as pd

import warnings

## Data Visualization packages
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns
import matplotlib

# sklearn library
import sklearn

### sklearn preprocessing tools
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer 
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import StratifiedKFold,train_test_split
from sklearn.metrics import classification_report,confusion_matrix,roc_curve,auc,accuracy_score,roc_auc_score
from sklearn.preprocessing import StandardScaler, RobustScaler, QuantileTransformer, PowerTransformer,FunctionTransformer,OneHotEncoder

# Error Metrics 
from sklearn.metrics import confusion_matrix ,classification_report
from sklearn.metrics import accuracy_score, precision_score,recall_score,f1_score


### Machine learning classification Models
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import SGDClassifier  # stochastic gradient descent classifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import AdaBoostClassifier


#crossvalidation
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import LeaveOneOut


# hyperparameter tuning
from sklearn.model_selection import GridSearchCV,cross_val_score,RandomizedSearchCV


In [2]:
### Initial settings
%matplotlib inline
sns.set_style("darkgrid")
matplotlib.rcParams["font.size"] = 10
matplotlib.rcParams["figure.figsize"] = (8,6)
matplotlib.rcParams["figure.facecolor"] = '#00000000' 
sns.set(rc={
            "font.size":10,
            "axes.titlesize":10,
            "axes.labelsize":15},
             style="darkgrid",
            ) 

warnings.filterwarnings('ignore')
pd.set_option('display.max_colwidth', None)

## Information about the dataset.

- employee_id : Unique Identity of each employee

- age : The actual age of the employee

- gender : Male or Female

- marital_status : Single, Married, or Divorced

- salary : The actual salary of the employee

- employment_type : Full time / Part time / Contract

- region : West / Midwest / Northeast / South and so on

- has_dependents : Yes or No

- tenure_years  : Number of years enrolled

- enrolled : Target variable (1 for enrolled, 0 for not enrolled)

## `Data-Import`

In [3]:
# Location of the employee dataset (CSV hosted on GitHub).
DATA_URL = "https://github.com/neustackapp/assignment/raw/refs/heads/main/employee_data.csv"

# Read the CSV into a DataFrame and preview its first rows.
raw_data = pd.read_csv(DATA_URL)
raw_data.head()

Unnamed: 0,employee_id,age,gender,marital_status,salary,employment_type,region,has_dependents,tenure_years,enrolled
0,10001,60,Female,Single,55122.97,Part-time,West,No,1.5,0
1,10002,50,Female,Single,89549.66,Full-time,West,Yes,12.8,1
2,10003,36,Male,Divorced,74145.66,Part-time,Midwest,No,3.8,0
3,10004,64,Female,Married,53877.83,Full-time,Northeast,No,3.3,0
4,10005,29,Male,Single,63404.63,Contract,Midwest,Yes,10.0,0


## `Train-Test-Split:`

### Applying any transformation before splitting the data leaks information from the test set into training and leads to a `deceitful model`.

In [None]:
# Hold out a test set before any preprocessing is done.
# train_test_split is already imported in the notebook's import cell, so it
# is not imported again here.
# test_size=0.25 spells out the library default this cell previously relied
# on (7500 / 2500 rows); the fixed random_state keeps the split reproducible.
train_df, test_df = train_test_split(raw_data, test_size=0.25, random_state=2)

# Plain strings: there is nothing to interpolate, so no f-prefix is needed.
print("Train data shape : ", train_df.shape)
print("Test data shape : ", test_df.shape)

Train data shape :  (7500, 10)
Test data shape :  (2500, 10)


In [5]:
# Preview the first five rows of the training split.
train_df.head(n=5)

Unnamed: 0,employee_id,age,gender,marital_status,salary,employment_type,region,has_dependents,tenure_years,enrolled
2753,12754,56,Male,Married,69297.05,Full-time,South,Yes,0.5,1
2824,12825,51,Female,Single,76583.42,Part-time,South,Yes,1.6,1
2597,12598,45,Female,Married,94439.96,Full-time,West,Yes,1.3,1
3973,13974,22,Male,Divorced,61050.07,Part-time,Midwest,Yes,3.1,0
3335,13336,50,Female,Married,28380.67,Full-time,South,Yes,13.0,1


In [6]:
## Summary statistics (count, mean, std, min, quartiles, max) of the training split
train_summary = train_df.describe()
train_summary

Unnamed: 0,employee_id,age,salary,tenure_years,enrolled
count,7500.0,7500.0,7500.0,7500.0,7500.0
mean,14998.246533,42.8628,65003.785275,4.00428,0.6148
std,2877.092961,12.264106,14972.851589,3.886657,0.486675
min,10001.0,22.0,2207.79,0.0,0.0
25%,12518.5,32.0,54686.4375,1.2,0.0
50%,15004.5,43.0,64967.675,2.8,1.0
75%,17470.25,53.0,75045.1425,5.6,1.0
max,19999.0,64.0,120312.0,36.0,1.0


### `Observations:`

- The average salary of the employees is approximately 65,000, with a standard deviation of approximately 15,000.

- The median of `tenure_years` is 2.8 (approximately 3), which means about half of the employees have a tenure of more than 2.8 years.
