In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder

Load cleaned up version of dataset from 2012 to 2018.  Drop geographic information about employer and worksite to reduce dimensionality

In [2]:
df = pd.read_pickle('/Users/ml/Desktop/Udacity_ML_Capstone/data/H1B_15-18_new.pickle')
df.drop(columns=['EMPLOYER_CITY','JOB_TITLE','EMPLOYER_NAME'], inplace=True)
df.drop(columns=['EMPLOYER_STATE','WORKSITE_STATE'], inplace=True)


In [3]:
ones_df = df[df['CASE_STATUS']=='DENIED']
zeros_df = df[df['CASE_STATUS']=='CERTIFIED']

In [4]:
df = pd.concat([ones_df, zeros_df.sample(frac=0.02, random_state=99)])


In [5]:
ones_df.head()
print(ones_df.shape)

(37310, 11)


In [6]:
zeros_df.head()
print(zeros_df.shape)

(1656369, 11)


We will use column 'CASE_STATUS' as our label.  DENIED will be 1 and CERTIFIED will be 0 after it gets incoded by label_encoder.

Normalize PREVAILING_WAGE using minmax scaler since it varies quite a bit.

In [7]:
scaler = MinMaxScaler()
X = df.drop(columns='CASE_STATUS')

label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df['CASE_STATUS'])

X[['PREVAILING_WAGE']] = scaler.fit_transform(X[['PREVAILING_WAGE']])
X = pd.get_dummies(X)

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2)

Initialize and train a XGBoost classifier

In [9]:
from xgboost import XGBClassifier
xgb = XGBClassifier(random_state=99, nthread=11)

xgb.fit(X_train, y_train)
# print(xgb.feature_importances_)
print(xgb.score(X_test, y_test))
xgb_pred = xgb.predict(X_test)
print(xgb_pred)

  if diff:


0.7238074957410562
[0 0 0 ... 1 1 1]


  if diff:


In [12]:
from sklearn.metrics import precision_score, recall_score, f1_score, fbeta_score, accuracy_score, confusion_matrix
from sklearn.metrics import classification_report, confusion_matrix



print("** XGB Classifier Stats (OVERALL testset)")
print(classification_report(y_test, xgb_pred))

** XGB Classifier Stats (OVERALL testset)
             precision    recall  f1-score   support

          0       0.68      0.78      0.73      6626
          1       0.78      0.67      0.72      7462

avg / total       0.73      0.72      0.72     14088



In [14]:
import operator
nonzero_features = {}
for i in range(len(xgb.feature_importances_)):
    if xgb.feature_importances_[i] > 0.0:
        nonzero_features[list(X_train.columns)[i]] = xgb.feature_importances_[i]
sorted_result = sorted(nonzero_features.items(), key=operator.itemgetter(1)) 
for i in range(len(sorted_result)):
    print(sorted_result[-(i+1)])

# with open('/tmp/feature_importances_first_800k_xgb.pickle', 'wb') as f:
#     pickle.dump(sorted_result, f, protocol=pickle.HIGHEST_PROTOCOL)
# with open('/tmp/feature_importances_first_800k_xgb_dict.pickle', 'wb') as f:
#     pickle.dump(nonzero_features, f, protocol=pickle.HIGHEST_PROTOCOL)

(u'H1B_DEPENDENT_N', 0.22089207)
('WAGE_LOWER_THAN_PW', 0.18790165)
(u'VISA_CLASS_H-1B', 0.121546865)
(u'PW_SOURCE_Other', 0.057632793)
('NAICS_CODE_541511', 0.04291458)
(u'PW_SOURCE_OES', 0.03331062)
(u'PREVAILING_WAGE', 0.028017666)
('NAICS_CODE_611310', 0.02222083)
('SOC_CODE_15-1132', 0.015854584)
('NAICS_CODE_CAPGEMINIAMERICAINC', 0.013849134)
('NAICS_CODE_54151', 0.011991634)
('NAICS_CODE_51121', 0.0117135625)
('SOC_CODE_15-1121', 0.011516797)
('NAICS_CODE_334413', 0.009613371)
('NAICS_CODE_518112', 0.009264715)
('SOC_CODE_15-2041', 0.008028058)
('SOC_CODE_29-1069', 0.0077255527)
('SOC_CODE_29-2011', 0.007722302)
('SOC_CODE_19-1029', 0.007434772)
('NAICS_CODE_541512', 0.0074032987)
('SOC_CODE_19-1042', 0.007357748)
('SOC_CODE_23-1011', 0.007092336)
(u'WAGE_UNIT_OF_PAY_Month', 0.0070633255)
(u'PW_SOURCE_CBA', 0.006945116)
('NAICS_CODE_ACCENTURELLP', 0.0065460377)
('NAICS_CODE_54161', 0.006511824)
('SOC_CODE_15-1199', 0.006456011)
(u'PW_SOURCE_SCA', 0.0060395407)
('SOC_CODE_11-1021

In [15]:
print("** XGB Classifier Scores (OVERALL testset)")
print("Precision: %s" % precision_score(y_test, xgb_pred))
print("Recall: %s"% recall_score(y_test, xgb_pred))
print("Accuracy score: %s"% accuracy_score(y_test, xgb_pred))
print("F-1 score: %s"% f1_score(y_test, xgb_pred))
print("F-beta score with beta=0.5: %s"% fbeta_score(y_test, xgb_pred, beta=0.5))
print("F-beta score with beta=0.2: %s"% fbeta_score(y_test, xgb_pred, beta=0.2))
print("Confusion Matrix: \n%s"% confusion_matrix(y_test, xgb_pred))
print("\n")

** XGB Classifier Scores (OVERALL testset)
Precision: 0.7779853650941927
Recall: 0.6696596086839989
Accuracy score: 0.7238074957410562
F-1 score: 0.7197695354699315
F-beta score with beta=0.5: 0.753604391626953
F-beta score with beta=0.2: 0.7731749555157494
Confusion Matrix: 
[[5200 1426]
 [2465 4997]]




In [17]:
import numpy as np
ones_index = y_test.nonzero()
x_ones = X_test.iloc[ones_index]
y_ones = np.take(y_test, ones_index)[0]

xgb_pred_oneonly = xgb.predict(x_ones)

  if diff:


In [18]:
print("** XGB Classifier Scores (Only rows with label DENIED->1)")
print("Precision: %s" % precision_score(y_ones, xgb_pred_oneonly))
print("Recall: %s"% recall_score(y_ones, xgb_pred_oneonly))
print("Accuracy score: %s"% accuracy_score(y_ones, xgb_pred_oneonly))
print("F-1 score: %s"% f1_score(y_ones, xgb_pred_oneonly))
print("F-beta score with beta=0.5: %s"% fbeta_score(y_ones, xgb_pred_oneonly, beta=0.5))
print("F-beta score with beta=0.2: %s"% fbeta_score(y_ones, xgb_pred_oneonly, beta=0.2))
print("Confusion Matrix: \n%s"% confusion_matrix(y_ones, xgb_pred_oneonly))
print("\n")

** XGB Classifier Scores (Only rows with label DENIED->1)
Precision: 1.0
Recall: 0.6696596086839989
Accuracy score: 0.6696596086839989
F-1 score: 0.802151055461915
F-beta score with beta=0.5: 0.910200364298725
F-beta score with beta=0.2: 0.9813803470129243
Confusion Matrix: 
[[   0    0]
 [2465 4997]]




In [19]:
zeros_index = np.arange(len(y_test))[(y_test==0)]

x_zeros = X_test.iloc[zeros_index]
y_zeros = np.take(y_test, zeros_index)

xgb_pred_zeroonly = xgb.predict(x_zeros)

  if diff:


In [20]:
print("** XGB Classifier Scores (Only rows with label CERTIFIED->0)")
print("Precision: %s" % precision_score(y_zeros, xgb_pred_zeroonly))
print("Recall: %s"% recall_score(y_zeros, xgb_pred_zeroonly))
print("Accuracy score: %s"% accuracy_score(y_zeros, xgb_pred_zeroonly))
print("F-1 score: %s"% f1_score(y_zeros, xgb_pred_zeroonly))
print("F-beta score with beta=0.5: %s"% fbeta_score(y_zeros, xgb_pred_zeroonly, beta=0.5))
print("F-beta score with beta=0.2: %s"% fbeta_score(y_zeros, xgb_pred_zeroonly, beta=0.2))
print("Confusion Matrix: \n%s"% confusion_matrix(y_zeros, xgb_pred_zeroonly))
print("\n")

** XGB Classifier Scores (Only rows with label CERTIFIED->0)
Precision: 0.0
Recall: 0.0
Accuracy score: 0.7847872019317839
F-1 score: 0.0
F-beta score with beta=0.5: 0.0
F-beta score with beta=0.2: 0.0
Confusion Matrix: 
[[5200 1426]
 [   0    0]]




  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
