# Import Packages

In [1]:
## Data Analysis packages
import os
import sys
import numpy as np
import pandas as pd
import joblib

## Machine learning packages
import sklearn
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from lightgbm import LGBMClassifier
from sklearn.metrics import classification_report, roc_curve, auc
from category_encoders import OrdinalEncoder

import warnings 
warnings.filterwarnings("ignore")

In [2]:
# Report the versions of the key libraries so the run can be reproduced later.
print(
    "\n".join(
        f"{label} Version: {module.__version__}"
        for label, module in (
            ("Pandas", pd),
            ("Numpy", np),
            ("scikit-learn", sklearn),
            ("joblib", joblib),
            ("lightgbm", lgb),
        )
    )
)

Pandas Version: 1.1.5
Numpy Version: 1.19.4
scikit-learn Version: 0.24.0
joblib Version: 1.0.0
lightgbm Version: 3.1.1


## Load data

In [3]:
## Files
data_file = './data/Placement_Data_Full_Class.csv'

# Load the campus placement dataset.
# Only a missing file is handled here; any other failure (parse error,
# permission problem, interrupt) propagates with its original traceback.
try:
    data = pd.read_csv(data_file)
except FileNotFoundError:
    print("The dataset could not be loaded. Is the dataset missing?")
    # Re-raise so the notebook stops here instead of failing later with a
    # confusing NameError on `data`.
    raise
else:
    print("The dataset has {} samples with {} features.".format(*data.shape))

The dataset has 215 samples with 15 features.


## Introduction To The Data

In [4]:
# Preview the first rows of the raw frame to check column names and value formats.
data.head()

Unnamed: 0,sl_no,gender,ssc_p,ssc_b,hsc_p,hsc_b,hsc_s,degree_p,degree_t,workex,etest_p,specialisation,mba_p,status,salary
0,1,M,67.0,Others,91.0,Others,Commerce,58.0,Sci&Tech,No,55.0,Mkt&HR,58.8,Placed,270000.0
1,2,M,79.33,Central,78.33,Others,Science,77.48,Sci&Tech,Yes,86.5,Mkt&Fin,66.28,Placed,200000.0
2,3,M,65.0,Central,68.0,Central,Arts,64.0,Comm&Mgmt,No,75.0,Mkt&Fin,57.8,Placed,250000.0
3,4,M,56.0,Central,52.0,Central,Science,52.0,Sci&Tech,No,66.0,Mkt&HR,59.43,Not Placed,
4,5,M,85.8,Central,73.6,Central,Commerce,73.3,Comm&Mgmt,No,96.8,Mkt&Fin,55.5,Placed,425000.0


## Prepare data for model training

In [5]:
# Columns that are never used as model inputs: the row identifier, the target
# itself, and salary (in the preview above it is empty for the "Not Placed"
# row, so it would presumably leak the label).
exclude_feature = ['sl_no', 'salary', 'status']

# Define the target column: 0 = "Placed", 1 = "Not Placed".
target = data['status'].map({"Placed": 0, "Not Placed": 1})
# Any status outside the mapping becomes NaN; fail loudly rather than train on it.
if target.isna().any():
    raise ValueError("Unexpected values found in the 'status' column.")

# Define numeric and categorical features from the column dtypes
numeric_columns = data.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_columns = data.select_dtypes(include=['object']).columns.tolist()
numeric_features = [col for col in numeric_columns if col not in exclude_feature]
categorical_features = [col for col in categorical_columns if col not in exclude_feature]

# Define final feature list for training and validation
features = numeric_features + categorical_features

# Build the modelling frame under its own name so the raw `data` frame is left
# intact and this cell can be re-run without losing the 'status' column.
model_data = data[features].fillna(0)

# Split data into train and validation sets
X_train, X_valid, y_train, y_valid = train_test_split(model_data, target, test_size=0.15, random_state=10)
# Save the validation rows before encoding, i.e. with the original category strings.
X_valid.to_json(path_or_buf='./data/valid.json', orient='records', lines=True)

# Fit the ordinal encoder on the training split only, then apply it to both splits.
le = OrdinalEncoder(cols=categorical_features)
le.fit(X_train[categorical_features])
# Work on explicit copies so the column assignment below does not write into
# a slice of `model_data` (avoids pandas' SettingWithCopyWarning).
X_train = X_train.copy()
X_valid = X_valid.copy()
X_train[categorical_features] = le.transform(X_train[categorical_features])
X_valid[categorical_features] = le.transform(X_valid[categorical_features])

## Train and evaluate model

In [6]:
 # Perform model training
# Fit a LightGBM classifier with a fixed seed on the training split.
clf = LGBMClassifier(random_state=10).fit(X_train, y_train)

# Score the validation split: take the probability of class 1, then
# build the ROC curve and integrate it to get the AUC.
valid_prediction = clf.predict_proba(X_valid)[:, 1]
fpr, tpr, thresholds = roc_curve(y_true=y_valid, y_score=valid_prediction)
roc_auc = auc(x=fpr, y=tpr)

# Print the AUC framed by separator lines.
print(
    "=====================================",
    "Validation AUC:{}".format(roc_auc),
    "=====================================",
    sep="\n",
)

Validation AUC:0.9135338345864662


In [7]:
# Per-class precision / recall / F1 on the validation split, using the
# classifier's hard label predictions.
print(classification_report(y_true=y_valid, y_pred=clf.predict(X_valid)))

              precision    recall  f1-score   support

           0       0.75      0.95      0.84        19
           1       0.89      0.57      0.70        14

    accuracy                           0.79        33
   macro avg       0.82      0.76      0.77        33
weighted avg       0.81      0.79      0.78        33



## Save model artifacts

In [9]:
# Make sure the output directory exists so the dumps below also work on a
# fresh checkout where ./model has not been created yet.
os.makedirs('./model', exist_ok=True)

# Persist the fitted encoder, the trained model and the two feature lists.
joblib.dump(le, './model/label_encoder.joblib')
joblib.dump(clf, './model/lgb_model.joblib')
joblib.dump(features, './model/features.joblib')
joblib.dump(categorical_features, './model/categorical_features.joblib')

['./model/categorical_features.joblib']