<a href="https://colab.research.google.com/github/vivekkumarshiv/Cogentix-Analysis/blob/main/cogentix.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import RandomOverSampler
from sklearn.preprocessing import LabelEncoder

Here are all the imports required. Now we will load the dataset.

In [None]:
# Location of the raw data, relative to the notebook's working directory.
DATASET_PATH = "DATASET.csv"

df = pd.read_csv(DATASET_PATH)
df.head()

Unnamed: 0,DOJ Extended,Duration to accept offer,Notice period,Offered band,Pecent hike expected in CTC,Percent hike offered in CTC,Percent difference CTC,Joining Bonus,Candidate relocate actual,Gender,Candidate Source,Rex in Yrs,LOB,Location,Age,Status
0,Yes,14,30,E2,-20.79,13.16,42.86,No,No,Female,Agency,7,ERS,Noida,34,1
1,No,18,30,E2,50.0,320.0,180.0,No,No,Male,Employee Referral,8,INFRA,Chennai,34,1
2,No,3,45,E2,42.84,42.84,0.0,No,No,Male,Agency,4,INFRA,Noida,27,1
3,No,26,30,E2,42.84,42.84,0.0,No,No,Male,Employee Referral,4,INFRA,Noida,34,1
4,Yes,1,120,E2,42.59,42.59,0.0,No,Yes,Male,Employee Referral,6,INFRA,Noida,34,1


In [None]:
# Columns that are label-encoded to integer codes before modelling.
categorical_columns = [
    'DOJ Extended',
    'Offered band',
    'Joining Bonus',
    'Candidate relocate actual',
    'Gender',
    'Candidate Source',
    'LOB',
    'Location',
]

# A single encoder is re-fitted column by column, so after this statement
# `le` only remembers the mapping of the last column in the list.
le = LabelEncoder()
df[categorical_columns] = df[categorical_columns].apply(le.fit_transform)

df.head()

Unnamed: 0,DOJ Extended,Duration to accept offer,Notice period,Offered band,Pecent hike expected in CTC,Percent hike offered in CTC,Percent difference CTC,Joining Bonus,Candidate relocate actual,Gender,Candidate Source,Rex in Yrs,LOB,Location,Age,Status
0,1,14,30,2,-20.79,13.16,42.86,0,0,0,0,7,4,8,34,1
1,0,18,30,2,50.0,320.0,180.0,0,0,1,2,8,7,2,34,1
2,0,3,45,2,42.84,42.84,0.0,0,0,1,0,4,7,8,27,1
3,0,26,30,2,42.84,42.84,0.0,0,0,1,2,4,7,8,34,1
4,1,1,120,2,42.59,42.59,0.0,0,1,1,2,6,7,8,34,1


In [None]:
# For every column except the trailing target ("Status"), overlay the
# density-normalised histograms of the two target classes.
for label in df.columns[:-1]:
  fig, ax = plt.subplots()
  # Legend entries name the actual values of the "Status" column being compared.
  ax.hist(df[df["Status"]==1][label], color='blue', label='Status = 1', alpha=0.7, density=True)
  ax.hist(df[df["Status"]==0][label], color='red', label='Status = 0', alpha=0.7, density=True)
  ax.set_title(label)
  ax.set_ylabel("Probability")
  ax.set_xlabel(label)
  ax.legend()
  plt.show()
  # One figure is created per column; close it so figures do not accumulate in memory.
  plt.close(fig)

Let's check whether there are any duplicate rows or missing values.

In [None]:
# Rows that repeat an earlier row exactly (the first occurrence is not flagged).
duplicates = df.loc[df.duplicated()]

print("Duplicate Entries:", duplicates, sep="\n")

Duplicate Entries:
Empty DataFrame
Columns: [DOJ Extended, Duration to accept offer, Notice period, Offered band, Pecent hike expected in CTC, Percent hike offered in CTC, Percent difference CTC, Joining Bonus, Candidate relocate actual, Gender, Candidate Source, Rex in Yrs, LOB, Location, Age, Status]
Index: []


In [None]:

# Keep only the first occurrence of each fully identical row.
df = df[~df.duplicated()]


In [None]:
# Number of null cells per column.
missing_values = df.isna().sum()

print("Columns with missing values:")
# Show only the columns that actually contain nulls.
print(missing_values.loc[missing_values.gt(0)])

Columns with missing values:
Series([], dtype: int64)


**Now let's see whether the data is balanced or not.**

In [None]:
# Number of rows for each value of the target column.
status_counts = df.loc[:, 'Status'].value_counts()
print(status_counts)

1    7194
0    1657
Name: Status, dtype: int64


Clearly the dataset is imbalanced. In order to train the model, we need to randomly oversample the minority class.

In [None]:
# Shuffle once with a fixed seed so the 60/20/20 train/valid/test split, and
# every metric computed from it, is reproducible after a kernel restart.
SPLIT_SEED = 42
shuffled = df.sample(frac=1, random_state=SPLIT_SEED)

# Same cut points as before (60% and 80% of the rows); positional slicing is
# used instead of calling np.split on a DataFrame.
train_end = int(0.6 * len(shuffled))
valid_end = int(0.8 * len(shuffled))
train = shuffled.iloc[:train_end]
valid = shuffled.iloc[train_end:valid_end]
test = shuffled.iloc[valid_end:]

In [None]:
def scale_dataset(dataframe, oversample=False, scaler=None):
  """Standardise the feature columns of `dataframe` and optionally oversample.

  The last column of `dataframe` is treated as the target; all preceding
  columns are treated as features.

  Args:
    dataframe: DataFrame whose final column holds the labels.
    oversample: when True, the scaled features and labels are resampled with
      RandomOverSampler before being returned.
    scaler: optional, already fitted object exposing ``transform``. When
      given, it is applied without re-fitting, so a scaler fitted on the
      training split can be reused for validation/test data. When None (the
      default) a new StandardScaler is fitted on `dataframe` itself.

  Returns:
    A tuple ``(data, X, y)`` where ``X`` is the scaled feature array, ``y``
    the label array and ``data`` is ``X`` with ``y`` appended as last column.
  """
  X = dataframe[dataframe.columns[:-1]].values
  y = dataframe[dataframe.columns[-1]].values

  if scaler is None:
    # Default behaviour: fit the scaling statistics on this very dataframe.
    X = StandardScaler().fit_transform(X)
  else:
    # Reuse statistics fitted elsewhere (e.g. on the training split).
    X = scaler.transform(X)

  if oversample:
    ros = RandomOverSampler()
    X, y = ros.fit_resample(X, y)

  # y is reshaped to a column so it can be stacked next to the features.
  data = np.hstack((X, np.reshape(y, (-1, 1))))

  return data, X, y

In [None]:
# Size of each target class inside the training split (Status 1 first, then 0).
for status_value in (1, 0):
  print(len(train[train['Status'] == status_value]))

4298
1012


In [None]:
# Standardise every split with statistics learned from the TRAINING split only.
# Calling scale_dataset once per split fits a separate scaler each time, which
# puts valid/test on a different scale from the one the models are trained on.
feature_columns = train.columns[:-1]
target_column = train.columns[-1]
feature_scaler = StandardScaler().fit(train[feature_columns].values)

X_train, X_valid, X_test = (
    feature_scaler.transform(split[feature_columns].values)
    for split in (train, valid, test)
)
y_train, y_valid, y_test = (
    split[target_column].values for split in (train, valid, test)
)

# `train`, `valid` and `test` are deliberately left as DataFrames (they are no
# longer overwritten with arrays), so this cell can be re-run safely.
# NOTE(review): no oversampling is applied, matching the previous
# oversample=False calls, although the text above describes the classes as
# unbalanced -- confirm whether the training split should be resampled.


In [None]:
# Number of training samples labelled 1.
(y_train == 1).sum()



4298

In [None]:

# Number of training samples labelled 0.
(y_train == 0).sum()

1012

# **KNN**

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report

# Baseline k-nearest-neighbours classifier with default settings.
knn = KNeighborsClassifier().fit(X_train, y_train)

# Per-class precision / recall / F1 on the held-out test split.
y_pred = knn.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.43      0.27      0.33       300
           1       0.86      0.93      0.89      1471

    accuracy                           0.82      1771
   macro avg       0.65      0.60      0.61      1771
weighted avg       0.79      0.82      0.80      1771



*Parameter optimization by finding best parameters*

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV

# Search space: number of neighbours, vote weighting and Minkowski power `p`.
param_grid = {
    'n_neighbors': [3, 5, 7, 9, 11],
    'weights': ['uniform', 'distance'],
    'p': [1, 2]
}

knn = KNeighborsClassifier()

# Exhaustive search scored by 5-fold cross-validated accuracy on the training split.
grid_search = GridSearchCV(knn, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
# best_score_ is the mean cross-validated accuracy of the winning combination,
# not an accuracy measured on the training data itself.
best_accuracy = grid_search.best_score_

# With the default refit=True the search has already refitted the winning
# configuration on all of X_train, so no manual re-fit is needed.
best_knn = grid_search.best_estimator_

test_accuracy = best_knn.score(X_test, y_test)

print("Best Hyperparameters:", best_params)
print("Best Cross-Validation Accuracy:", best_accuracy)
print("Test Accuracy with Best Hyperparameters:", test_accuracy)

Best Hyperparameters: {'n_neighbors': 11, 'p': 1, 'weights': 'uniform'}
Best Training Accuracy: 0.8128060263653485
Test Accuracy with Best Hyperparameters: 0.8260869565217391


In [None]:
# Number of training samples labelled 1.
(y_train == 1).sum()

4319

# **LOGISTIC REGRESSION**

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Baseline logistic regression with default settings.
lg_model = LogisticRegression().fit(X_train, y_train)

# Per-class precision / recall / F1 on the held-out test split.
y_pred = lg_model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.64      0.09      0.16       300
           1       0.84      0.99      0.91      1471

    accuracy                           0.84      1771
   macro avg       0.74      0.54      0.53      1771
weighted avg       0.81      0.84      0.78      1771



*Parameter optimization by finding best parameters*

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split






# Search space: inverse regularisation strength and penalty type.
param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],
    'penalty': ['l1', 'l2']
}

# The default lbfgs solver rejects the 'l1' penalty, which made every 'l1'
# fit in the grid fail and be scored as nan. liblinear accepts both penalties.
lr = LogisticRegression(solver='liblinear')

# Exhaustive search scored by 5-fold cross-validated accuracy on the training split.
grid_search = GridSearchCV(lr, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
# best_score_ is the mean cross-validated accuracy of the winning combination,
# not an accuracy measured on the training data itself.
best_accuracy = grid_search.best_score_

# Use the estimator refitted by the search (refit=True by default): it keeps
# the solver chosen above, which rebuilding from best_params alone would drop.
best_lr = grid_search.best_estimator_

test_accuracy = best_lr.score(X_test, y_test)

print("Best Hyperparameters:", best_params)
print("Best Cross-Validation Accuracy:", best_accuracy)
print("Test Accuracy with Best Hyperparameters:", test_accuracy)


Best Hyperparameters: {'C': 0.1, 'penalty': 'l2'}
Best Training Accuracy: 0.8099811676082863
Test Accuracy with Best Hyperparameters: 0.8198757763975155


30 fits failed out of a total of 60.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
30 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_logistic.py", line 1162, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_logistic.py", line 54, in _check_solver
    raise ValueError(
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

        nan 0.80979284        nan 0.80

*Gauging Features*

In [None]:
from sklearn.linear_model import LogisticRegression


lr = LogisticRegression(C=0.1)
lr.fit(X_train, y_train)

# coef_ has one row per decision function; take the first row.
coefficients = lr.coef_[0]

# Feature names are read from the DataFrame (every column except the trailing
# target) instead of a hand-written list, so the target column can no longer
# slip into the names and the order always matches the training matrix.
feature_names = list(df.columns[:-1])
assert len(feature_names) == len(coefficients), "feature names and coefficients are misaligned"

feature_coefficients = dict(zip(feature_names, coefficients))

# Rank by absolute weight: the sign only gives the direction of the effect.
sorted_features = sorted(feature_coefficients.items(), key=lambda x: abs(x[1]), reverse=True)

for feature, coefficient in sorted_features:
    print(f"{feature}: {coefficient}")


Candidate relocate actual: 1.3255019803889874
Notice period: -0.45445320369496517
Candidate Source: 0.2428308411854634
Percent hike offered in CTC: 0.19382110910664146
Age: 0.16458943131326173
LOB: 0.153033116919581
Location: 0.12851005927375878
Pecent hike expected in CTC: -0.12192799831175379
Rex in Yrs: -0.11708033707167154
Gender: -0.08548165852890834
Percent difference CTC: -0.07611502175019434
DOJ Extended: 0.05544437518936717
Joining Bonus: -0.04446987095592393
Duration to accept offer: 0.017327899640115368
Offered band: 0.008319112059438555


# **SVC**

In [None]:
from sklearn.svm import SVC

# Baseline support-vector classifier with default settings.
svm_model = SVC().fit(X_train, y_train)

# Per-class precision / recall / F1 on the held-out test split.
y_pred = svm_model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.68      0.08      0.15       300
           1       0.84      0.99      0.91      1471

    accuracy                           0.84      1771
   macro avg       0.76      0.54      0.53      1771
weighted avg       0.81      0.84      0.78      1771



*Gauging Features*

In [None]:
from sklearn.svm import SVC

# A linear kernel is required here: coef_ is only exposed for linear SVMs.
svm = SVC(C=0.001, kernel='linear')
svm.fit(X_train, y_train)

coefficients = svm.coef_[0]

# Feature names are read from the DataFrame (every column except the trailing
# target) instead of a hand-written list, so the target column can no longer
# slip into the names and the order always matches the training matrix.
feature_names = list(df.columns[:-1])
assert len(feature_names) == len(coefficients), "feature names and coefficients are misaligned"

feature_coefficients = dict(zip(feature_names, coefficients))

# Rank by absolute weight: the sign only gives the direction of the effect.
sorted_features = sorted(feature_coefficients.items(), key=lambda x: abs(x[1]), reverse=True)

for feature, coefficient in sorted_features:
    print(f"{feature}: {coefficient}")


Candidate relocate actual: 0.0004981891306072991
Percent hike offered in CTC: 0.0004167138229326068
Pecent hike expected in CTC: -0.0003312794851312701
Percent difference CTC: -0.00029806460649069936
Location: -0.00011346260284086787
Offered band: -6.461111236989465e-05
Joining Bonus: -6.195562840474137e-05
Notice period: -5.532117386905023e-05
LOB: 5.428786499272987e-05
Age: 4.641730982175179e-05
Rex in Yrs: 3.584079762248551e-05
Candidate Source: 3.3079834029717535e-05
DOJ Extended: 2.7040426902576317e-05
Duration to accept offer: -1.2770889593905974e-05
Gender: -3.7358466275082703e-06


# **Gradient Boosting**

In [None]:
import xgboost as xgb
# Baseline XGBoost classifier with default settings.
xgb_classifier = xgb.XGBClassifier().fit(X_train, y_train)

# Per-class precision / recall / F1 on the held-out test split.
y_pred = xgb_classifier.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.37      0.18      0.24       300
           1       0.85      0.94      0.89      1471

    accuracy                           0.81      1771
   macro avg       0.61      0.56      0.57      1771
weighted avg       0.77      0.81      0.78      1771



*Parameter optimization by finding best parameters*

In [None]:
from sklearn.model_selection import GridSearchCV
import xgboost as xgb
from sklearn.model_selection import train_test_split






# Search space: shrinkage, tree depth, number of trees and L1/L2 regularisation.
param_grid = {
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 4, 5],
    'n_estimators': [50, 100, 200],
    'reg_alpha': [0, 1, 2],
    'reg_lambda': [0, 1, 2]
}

xgb_classifier = xgb.XGBClassifier()

# Exhaustive search scored by 5-fold cross-validated accuracy on the training split.
grid_search = GridSearchCV(xgb_classifier, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
# best_score_ is the mean cross-validated accuracy of the winning combination,
# not an accuracy measured on the training data itself.
best_accuracy = grid_search.best_score_

# With the default refit=True the search has already refitted the winning
# configuration on all of X_train, so no manual re-fit is needed.
best_xgb_classifier = grid_search.best_estimator_

test_accuracy = best_xgb_classifier.score(X_test, y_test)

print("Best Hyperparameters:", best_params)
print("Best Cross-Validation Accuracy:", best_accuracy)
print("Test Accuracy with Best Hyperparameters:", test_accuracy)


Best Hyperparameters: {'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 100, 'reg_alpha': 1, 'reg_lambda': 1}
Best Training Accuracy: 0.8235404896421846
Test Accuracy with Best Hyperparameters: 0.8334274421230943


# Gauging Features

In [None]:
import xgboost as xgb

xgb_model = xgb.XGBClassifier()
xgb_model.fit(X_train, y_train)

feature_importances = xgb_model.feature_importances_

# Feature names are read from the DataFrame (every column except the trailing
# target) instead of a hand-written list, so the target column can no longer
# slip into the names and the order always matches the training matrix.
feature_names = list(df.columns[:-1])
assert len(feature_names) == len(feature_importances), "feature names and importances are misaligned"

feature_importance = dict(zip(feature_names, feature_importances))

# Most important feature first.
sorted_features = sorted(feature_importance.items(), key=lambda x: x[1], reverse=True)

for feature, importance in sorted_features:
    print(f"{feature}: {importance}")


Candidate relocate actual: 0.5048701167106628
Notice period: 0.0628712996840477
Candidate Source: 0.03988249972462654
Joining Bonus: 0.03945528715848923
LOB: 0.039264269173145294
Offered band: 0.03813832625746727
Duration to accept offer: 0.036936674267053604
Gender: 0.03226175904273987
DOJ Extended: 0.031250860542058945
Location: 0.03101413883268833
Percent difference CTC: 0.03032580390572548
Age: 0.028991863131523132
Pecent hike expected in CTC: 0.028897205367684364
Rex in Yrs: 0.02830788865685463
Percent hike offered in CTC: 0.027531979605555534


# **Random Forest**

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Baseline random forest with default settings.
rf_classifier = RandomForestClassifier().fit(X_train, y_train)

# Per-class precision / recall / F1 on the held-out test split.
y_pred = rf_classifier.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.50      0.16      0.25       326
           1       0.84      0.96      0.90      1445

    accuracy                           0.82      1771
   macro avg       0.67      0.56      0.57      1771
weighted avg       0.78      0.82      0.78      1771



*Parameter optimization by finding best parameters*

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split





# Search space: forest size, tree depth and the split / leaf size limits.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}

rf = RandomForestClassifier()

# Exhaustive search scored by 5-fold cross-validated accuracy on the training split.
grid_search = GridSearchCV(rf, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
# best_score_ is the mean cross-validated accuracy of the winning combination,
# not an accuracy measured on the training data itself.
best_accuracy = grid_search.best_score_

# With the default refit=True the search has already refitted the winning
# configuration on all of X_train; reusing it avoids training one more
# (unseeded, hence different) forest by hand.
best_rf = grid_search.best_estimator_

test_accuracy = best_rf.score(X_test, y_test)

print("Best Hyperparameters:", best_params)
print("Best Cross-Validation Accuracy:", best_accuracy)
print("Test Accuracy with Best Hyperparameters:", test_accuracy)


Best Hyperparameters: {'max_depth': 30, 'min_samples_leaf': 2, 'min_samples_split': 10, 'n_estimators': 100}
Best Training Accuracy: 0.8188323917137478
Test Accuracy with Best Hyperparameters: 0.841897233201581


# **Comparison Of the Models**

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier

# (display name, default-configured estimator) pairs to compare.
models = [
    ('Random Forest', RandomForestClassifier()),
    ('SVM', SVC()),
    ('Logistic Regression', LogisticRegression()),
    ('K-Nearest Neighbors', KNeighborsClassifier()),
    ('Gradient Boosting', GradientBoostingClassifier()),
]

# 5-fold cross-validation scores on the training split, one array per model,
# kept in the same order as `models`.
model_names = [name for name, _ in models]
results = [cross_val_score(estimator, X_train, y_train, cv=5) for _, estimator in models]

for name, fold_scores in zip(model_names, results):
    print(f"{name}: Mean Accuracy = {fold_scores.mean():.3f}, Std Dev = {fold_scores.std():.3f}")


Random Forest: Mean Accuracy = 0.816, Std Dev = 0.002
SVM: Mean Accuracy = 0.813, Std Dev = 0.001
Logistic Regression: Mean Accuracy = 0.811, Std Dev = 0.004
K-Nearest Neighbors: Mean Accuracy = 0.796, Std Dev = 0.005
Gradient Boosting: Mean Accuracy = 0.822, Std Dev = 0.004
