In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder

Load cleaned up version of dataset from 2012 to 2018.  Drop geographic information about employer and worksite to reduce dimensionality

In [2]:
df = pd.read_pickle('/Users/ml/Desktop/Udacity_ML_Capstone/data/H1B_15-18_new.pickle')
df.drop(columns=['EMPLOYER_CITY','JOB_TITLE','EMPLOYER_NAME'], inplace=True)
df.drop(columns=['EMPLOYER_STATE','WORKSITE_STATE'], inplace=True)


In [3]:
ones_df = df[df['CASE_STATUS']=='DENIED']
zeros_df = df[df['CASE_STATUS']=='CERTIFIED']

In [4]:
df = pd.concat([ones_df, zeros_df.sample(frac=0.02, random_state=99)])


In [5]:
ones_df.head()
print(ones_df.shape)

(37310, 11)


In [6]:
zeros_df.head()
print(zeros_df.shape)

(1656369, 11)


We will use column 'CASE_STATUS' as our label.  DENIED will be 1 and CERTIFIED will be 0 after it gets incoded by label_encoder.

Normalize PREVAILING_WAGE using minmax scaler since it varies quite a bit.

In [7]:
scaler = MinMaxScaler()
X = df.drop(columns='CASE_STATUS')

label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df['CASE_STATUS'])

X[['PREVAILING_WAGE']] = scaler.fit_transform(X[['PREVAILING_WAGE']])
X = pd.get_dummies(X)

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

Check value counts to verify stratify split

In [9]:
X_train.head()

Unnamed: 0,PREVAILING_WAGE,WAGE_LOWER_THAN_PW,FULL_TIME_POSITION_N,FULL_TIME_POSITION_Y,H1B_DEPENDENT_N,H1B_DEPENDENT_Y,NAICS_CODE_11111,NAICS_CODE_111110,NAICS_CODE_11113,NAICS_CODE_111140,...,VISA_CLASS_H-1B,VISA_CLASS_H-1B1 Chile,VISA_CLASS_H-1B1 Singapore,WAGE_UNIT_OF_PAY_Bi-Weekly,WAGE_UNIT_OF_PAY_Hour,WAGE_UNIT_OF_PAY_Month,WAGE_UNIT_OF_PAY_Week,WAGE_UNIT_OF_PAY_Year,WILLFUL_VIOLATOR_N,WILLFUL_VIOLATOR_Y
67398,6.8702e-05,False,0,1,1,0,0,0,0,0,...,1,0,0,0,0,0,0,1,1,0
365901,0.0001049146,False,0,1,1,0,0,0,0,0,...,1,0,0,0,0,0,0,1,1,0
82831,7.225408e-05,False,0,1,1,0,0,0,0,0,...,1,0,0,0,0,0,0,1,1,0
9102,4.529e-08,False,0,1,1,0,0,0,0,0,...,1,0,0,0,1,0,0,0,1,0
76241,7.9934e-05,False,0,1,0,1,0,0,0,0,...,1,0,0,0,0,0,0,1,1,0


Train RandomForestClassifier and predict using X_test

In [10]:
from sklearn.ensemble import RandomForestClassifier


# from sklearn.feature_extraction.text import CountVectorizer

rfc = RandomForestClassifier()
# vec = CountVectorizer()
# X_train_t = vec.fit_transform(X_train)

rfc.fit(X_train, y_train)

rfc_pred = rfc.predict(X_test)

  from numpy.core.umath_tests import inner1d


Print out classification scores stats

In [11]:
from sklearn.metrics import classification_report, confusion_matrix
print("** Random Forest Classifier Stats (OVERALL testset)")
print(classification_report(y_test, rfc_pred))

** Random Forest Classifier Stats (OVERALL testset)
             precision    recall  f1-score   support

          0       0.70      0.74      0.72      6626
          1       0.76      0.72      0.74      7462

avg / total       0.73      0.73      0.73     14088



In [12]:
import operator
# print(len(rfc.feature_importances_))
# print(len(X_train.columns))

nonzero_features = {}
for i in range(len(rfc.feature_importances_)):
    if rfc.feature_importances_[i] > 0.0:
        nonzero_features[list(X_train.columns)[i]] = rfc.feature_importances_[i]
sorted_result = sorted(nonzero_features.items(), key=operator.itemgetter(1)) 
for i in range(len(sorted_result)):
    print(sorted_result[-(i+1)])

(u'PREVAILING_WAGE', 0.376408861617415)
('WAGE_LOWER_THAN_PW', 0.10343544737972435)
(u'H1B_DEPENDENT_N', 0.03745900025584652)
('NAICS_CODE_541511', 0.02911764932722416)
(u'VISA_CLASS_H-1B', 0.025159911013253065)
(u'H1B_DEPENDENT_Y', 0.023342474752840882)
(u'PW_SOURCE_OES', 0.014800264284088014)
(u'PW_SOURCE_Other', 0.011429226793800453)
(u'VISA_CLASS_E-3 Australian', 0.009940922030597996)
(u'WAGE_UNIT_OF_PAY_Year', 0.007297923044884501)
('SOC_CODE_15-1132', 0.007162863360886231)
(u'WAGE_UNIT_OF_PAY_Hour', 0.006031017777359241)
('SOC_CODE_15-1121', 0.005665739032137275)
('NAICS_CODE_541512', 0.0044354870951230545)
('SOC_CODE_15-1199', 0.004250944097272442)
('SOC_CODE_15-1131', 0.0038098455473264264)
('NAICS_CODE_54151', 0.003530323986267757)
('NAICS_CODE_611310', 0.0034005366629094916)
('NAICS_CODE_51121', 0.0033446723337782376)
('NAICS_CODE_CAPGEMINIAMERICAINC', 0.003119544845840404)
('NAICS_CODE_541519', 0.002940352652991395)
('NAICS_CODE_541330', 0.0027403244918788325)
('SOC_CODE_15-

('SOC_CODE_11-9013', 6.622367699521266e-05)
('NAICS_CODE_OPENASSEMBLYINC', 6.610268807604995e-05)
('NAICS_CODE_423840', 6.607022545604673e-05)
('SOC_CODE_17-3023', 6.605640373813456e-05)
('NAICS_CODE_311412', 6.600150856297444e-05)
('NAICS_CODE_FCAUSLLC', 6.5997215273153e-05)
('SOC_CODE_19-3093', 6.594424489273133e-05)
('NAICS_CODE_334414', 6.590380133155947e-05)
('SOC_CODE_53-2011', 6.583351803514063e-05)
('NAICS_CODE_811219', 6.577667038145835e-05)
('NAICS_CODE_221310', 6.560165316952655e-05)
('NAICS_CODE_562211', 6.559219422768829e-05)
('NAICS_CODE_327112', 6.553476751476785e-05)
('NAICS_CODE_512131', 6.538334516069654e-05)
('NAICS_CODE_322212', 6.536704794061118e-05)
('SOC_CODE_21-2099', 6.530904149519859e-05)
('NAICS_CODE_332611', 6.515385118968006e-05)
('NAICS_CODE_921110', 6.507636519929343e-05)
('NAICS_CODE_236115', 6.477136007440057e-05)
('SOC_CODE_19-2011', 6.442902722235519e-05)
('NAICS_CODE_333131', 6.434957340871526e-05)
('NAICS_CODE_42332', 6.405397096854753e-05)
('NAICS_

('NAICS_CODE_LOGICALPARADIGMLLC', 1.9082816067730367e-05)
('NAICS_CODE_532412', 1.904800219332437e-05)
('NAICS_CODE_324191', 1.900022814563014e-05)
('NAICS_CODE_MOONCOLLECTIONINC', 1.889595632455312e-05)
('NAICS_CODE_325182', 1.884466454735265e-05)
('NAICS_CODE_541300', 1.8840893814172682e-05)
('NAICS_CODE_ONCAMINC', 1.8819119291114832e-05)
('NAICS_CODE_LANDCOFINANCIALSERVICESCORPORATION', 1.8804011241906465e-05)
('NAICS_CODE_51511', 1.879015893845903e-05)
('NAICS_CODE_333996', 1.87824054023155e-05)
('NAICS_CODE_ONFORCEINCORPORATED', 1.8740648900013943e-05)
('NAICS_CODE_WESTERNCOMMUNITYACTIONINC', 1.8732064472270475e-05)
('NAICS_CODE_928120', 1.8717035831372385e-05)
('SOC_CODE_11-3111', 1.8665413544016256e-05)
('NAICS_CODE_MCTESTSERVICEINC', 1.862885463459541e-05)
('NAICS_CODE_GERARDFOXLAWPC', 1.8560640643403858e-05)
('NAICS_CODE_SAPLABSLLC', 1.8511068009494987e-05)
('NAICS_CODE_33633', 1.8471484949670078e-05)
('SOC_CODE_39-7011', 1.844714884609245e-05)
('NAICS_CODE_316110', 1.83878246

('NAICS_CODE_TRIMECAUSALLC', 4.3050338845230225e-06)
('NAICS_CODE_MENDELANDCOMPANYCONSTRUCTION', 4.2961975140975525e-06)
('NAICS_CODE_81311', 4.287472795171767e-06)
('NAICS_CODE_313210', 4.2863267093746225e-06)
('NAICS_CODE_81321', 4.28525634726927e-06)
('NAICS_CODE_22131', 4.236621080661125e-06)
('NAICS_CODE_FOUNTAINFOODSINC', 4.232175194996066e-06)
('NAICS_CODE_CAPITALSTONEMANAGEMENT', 4.228547635748289e-06)
('NAICS_CODE_TRINITYVALLEYSCHOOL', 4.2131389968269015e-06)
('NAICS_CODE_EVIDERAINC', 4.199747800551614e-06)
('NAICS_CODE_CARIBEPAPERTRADINGCOMPANYLLC', 4.196719612720067e-06)
('NAICS_CODE_VICENTEFERRERFOUNDATIONUSAINC', 4.181078222035137e-06)
('NAICS_CODE_FMICAPITALADVISORSINC', 4.17663171932824e-06)
('NAICS_CODE_812111', 4.17663171932824e-06)
('NAICS_CODE_PRUDENTACCOUNTINGSERVICESLLC', 4.171678836291083e-06)
('NAICS_CODE_325400', 4.1698978008217195e-06)
('NAICS_CODE_IONTECHNOLOGYINC', 4.150961278276804e-06)
('NAICS_CODE_MODWAYINC', 4.126402603637703e-06)
('NAICS_CODE_INTERTOLLUS

('NAICS_CODE_HOLDENENTERPRISESLLC', 3.5318571121798725e-08)
('SOC_CODE_27-1027', 3.4774828217589506e-08)
('NAICS_CODE_HARRIUSALLC', 3.468453060358023e-08)
('NAICS_CODE_48832', 3.356183539217342e-08)
('SOC_CODE_47-2111', 3.3475494179656216e-08)
('NAICS_CODE_NORTHWESTCHRISTIANUNIVERSITY', 3.339456764673527e-08)
('NAICS_CODE_SOUTHLANDDATAPROCESSING', 3.299801843289802e-08)
('NAICS_CODE_INSURANCEINCORPORATEDOFSOUTHERNCALIFORNIA', 3.218213108957334e-08)
('NAICS_CODE_THEWHITEOAKGROUPLLC', 3.214239927866988e-08)
('SOC_CODE_51-6099', 2.9839496202314686e-08)
('NAICS_CODE_STOBILLC', 2.9030426701935726e-08)
('NAICS_CODE_HARRISMEDIALLC', 2.8798270632364386e-08)
('NAICS_CODE_NORTHWESTCULTUREFOUNDATION', 2.7743538695668476e-08)
('NAICS_CODE_SPRINGUPINC', 2.725198411967772e-08)
('NAICS_CODE_THEMARIAMONTESSORISCHOOL', 2.6944512760756915e-08)
('NAICS_CODE_RICONPHARMALLC', 2.6350290600340793e-08)
('NAICS_CODE_SPRINGDENTALEASTTULSA', 2.587904705011813e-08)
('NAICS_CODE_THEBARNYARDENTERPRISESINC', 2.54110

Evaluate stats for the entire test set

In [13]:
from sklearn.metrics import precision_score, recall_score, f1_score, fbeta_score, accuracy_score, confusion_matrix

print("** Random Forest Classifier Scores (OVERALL testset)")
print("Precision: %s"% precision_score(y_test, rfc_pred))
print("Recall: %s"% recall_score(y_test, rfc_pred))
print("Accuracy score: %s"% accuracy_score(y_test, rfc_pred))
print("F-1 score: %s"% f1_score(y_test, rfc_pred))
print("F-beta score with beta=0.5: %s"% fbeta_score(y_test, rfc_pred, beta=0.5))
print("F-beta score with beta=0.2: %s"% fbeta_score(y_test, rfc_pred, beta=0.2))
print("Confusion Matrix: \n%s"% confusion_matrix(y_test, rfc_pred))

** Random Forest Classifier Scores (OVERALL testset)
Precision: 0.7600960180739904
Recall: 0.7213883677298312
Accuracy score: 0.7318285065303805
F-1 score: 0.7402365236523653
F-beta score with beta=0.5: 0.7520257055043309
F-beta score with beta=0.2: 0.7585306104751994
Confusion Matrix: 
[[4927 1699]
 [2079 5383]]


Evaluate stats for testset with label 1 (DENIED cases only)

In [14]:
import numpy as np
ones_index = y_test.nonzero()
x_ones = X_test.iloc[ones_index]
y_ones = np.take(y_test, ones_index)[0]

rfc_pred_oneonly = rfc.predict(x_ones)

In [15]:
print("** Random Forest Classifier Scores (Only rows with label DENIED->1)")
print("Precision: %s"% precision_score(y_ones, rfc_pred_oneonly))
print("Recall: %s"% recall_score(y_ones, rfc_pred_oneonly))
print("Accuracy score: %s"% accuracy_score(y_ones, rfc_pred_oneonly))
print("F-1 score: %s"% f1_score(y_ones, rfc_pred_oneonly))
print("F-beta score with beta=0.5: %s"% fbeta_score(y_ones, rfc_pred_oneonly, beta=0.5))
print("F-beta score with beta=0.2: %s"% fbeta_score(y_ones, rfc_pred_oneonly, beta=0.2))
print("Confusion Matrix: \n%s"% confusion_matrix(y_ones, rfc_pred_oneonly))

** Random Forest Classifier Scores (Only rows with label DENIED->1)
Precision: 1.0
Recall: 0.7213883677298312
Accuracy score: 0.7213883677298312
F-1 score: 0.8381471389645777
F-beta score with beta=0.5: 0.9282955094157412
F-beta score with beta=0.2: 0.9853629688039032
Confusion Matrix: 
[[   0    0]
 [2079 5383]]


Evaluate stats for testset with label 0 (CERTIFIED cases only)

In [16]:
zeros_index = np.arange(len(y_test))[(y_test==0)]

x_zeros = X_test.iloc[zeros_index]
y_zeros = np.take(y_test, zeros_index)
rfc_pred_zeroonly = rfc.predict(x_zeros)

In [17]:
print("** Random Forest Classifier Scores (Only rows with label CERTIFIED->0)")
print("Precision: %s"% precision_score(y_zeros, rfc_pred_zeroonly))
print("Recall: %s"% recall_score(y_zeros, rfc_pred_zeroonly))
print("Accuracy score: %s"% accuracy_score(y_zeros, rfc_pred_zeroonly))
print("F-1 score: %s"% f1_score(y_zeros, rfc_pred_zeroonly))
print("F-beta score with beta=0.5: %s"% fbeta_score(y_zeros, rfc_pred_zeroonly, beta=0.5))
print("F-beta score with beta=0.2: %s"% fbeta_score(y_zeros, rfc_pred_zeroonly, beta=0.2))
print("Confusion Matrix: \n%s"% confusion_matrix(y_zeros, rfc_pred_zeroonly))


** Random Forest Classifier Scores (Only rows with label CERTIFIED->0)
Precision: 0.0
Recall: 0.0
Accuracy score: 0.7435858738303652
F-1 score: 0.0
F-beta score with beta=0.5: 0.0
F-beta score with beta=0.2: 0.0
Confusion Matrix: 
[[4927 1699]
 [   0    0]]


  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
