In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder

Load cleaned up version of dataset from 2012 to 2018.  Drop geographic information about employer and worksite to reduce dimensionality

In [3]:
df = pd.read_pickle('/Users/minse_chang/PycharmProjects/Udacity_ML_Capstone/data/H1B_15-18_new.pickle')
df = df[['CASE_STATUS','WAGE_LOWER_THAN_PW','PREVAILING_WAGE','SOC_CODE','EMPLOYER_NAME']]
# df = df[['CASE_STATUS','WAGE_LOWER_THAN_PW','PREVAILING_WAGE','SOC_CODE']]
df['CASE_STATUS'] = df['CASE_STATUS'].apply(lambda x : 1 if x=='DENIED' else 0)

In [4]:
df['WAGE_LOWER_THAN_PW'] = df['WAGE_LOWER_THAN_PW'].astype(int)
df.head(10)

Unnamed: 0,CASE_STATUS,WAGE_LOWER_THAN_PW,PREVAILING_WAGE,SOC_CODE,EMPLOYER_NAME
1,0,0,45727.334,25-1032,UNIVERSITYOFOKLAHOMA
3,0,0,70413.2662,17-2072,OMRONOILFIELDANDMARINEINC
4,0,0,103390.0783,15-1131,FEDERALHOMELOANMORTGAGECO
6,0,0,142938.9944,15-1132,VMWAREINC
7,0,0,68372.2865,15-2031,FEDERALHOMELOANMORTGAGECORPORATION
8,0,0,108717.11,15-1121,VMWAREINC
9,1,0,23.962574,13-2011,IMAEXTRADINGCOMPANY
10,0,0,105150.4633,15-1132,VMWAREINC
11,1,1,368080.5,13-2011,LHBINC
12,1,1,71456.6944,15-1021,INTERNATIONALSOLUTIONSGROUPINC


In [5]:
ones_df = df[df['CASE_STATUS']==1]
zeros_df = df[df['CASE_STATUS']==0]

In [6]:
df = pd.concat([ones_df, zeros_df.sample(frac=0.02, random_state=99)])

In [7]:
ones_df.head()
print(ones_df.shape)

(37310, 5)


In [8]:
zeros_df.head()
print(zeros_df.shape)

(2424938, 5)


We will use column 'CASE_STATUS' as our label.  DENIED will be 1 and CERTIFIED will be 0 after it gets incoded by label_encoder.

Normalize PREVAILING_WAGE using minmax scaler since it varies quite a bit.

In [9]:
scaler = MinMaxScaler()
X = df.drop(columns='CASE_STATUS')

# label_encoder = LabelEncoder()
# y = label_encoder.fit_transform(df['CASE_STATUS'])


X[['PREVAILING_WAGE']] = scaler.fit_transform(X[['PREVAILING_WAGE']])
X = pd.get_dummies(X)

In [10]:
y = df['CASE_STATUS']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

Check value counts to verify stratify split

In [17]:
# print(y_train.value_counts())
# print(y_test.value_counts())
X_train.head()

Unnamed: 0,WAGE_LOWER_THAN_PW,PREVAILING_WAGE,SOC_CODE_11-1011,SOC_CODE_11-1021,SOC_CODE_11-2011,SOC_CODE_11-2021,SOC_CODE_11-2022,SOC_CODE_11-2031,SOC_CODE_11-3011,SOC_CODE_11-3021,...,EMPLOYER_NAME_ZYNGA INC.,"EMPLOYER_NAME_ZYNGA, INC.",EMPLOYER_NAME_ZYPE INC,EMPLOYER_NAME_ZYPE INC.,"EMPLOYER_NAME_ZYSTEMSGO, INC.","EMPLOYER_NAME_ZYTO TECHNOLOGIES, INC.",EMPLOYER_NAME_ZYWIE INC.,"EMPLOYER_NAME_[24]7.AI, INC.",EMPLOYER_NAME_ÉTUDES LLC,"EMPLOYER_NAME_ÉTUDES, LLC"
595927,0,6.2e-05,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
624033,0,8.5e-05,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
144585,0,9.5e-05,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
102684,0,6.9e-05,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
447271,0,7.3e-05,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Train DecisionTreeClassifier and predict using only 4 features from X_test

In [78]:
from sklearn.tree import DecisionTreeClassifier

clf = DecisionTreeClassifier()

clf.fit(X_train, y_train)

clf_pred = clf.predict(X_test)

In [79]:
from sklearn.metrics import classification_report, confusion_matrix
print("** Random Forest Classifier Stats (OVERALL testset)")
print(classification_report(y_test, clf_pred))

** Random Forest Classifier Stats (OVERALL testset)
             precision    recall  f1-score   support

          0       0.75      0.81      0.78      9742
          1       0.72      0.64      0.68      7420

avg / total       0.73      0.74      0.73     17162



Print out score statsfor the entire dataset

In [80]:
from sklearn.metrics import precision_score, recall_score, f1_score, fbeta_score, accuracy_score, confusion_matrix

print("** Decision Tree Classifier Scores (OVERALL testset)")
print("Precision: %s"% precision_score(y_test, clf_pred))
print("Recall: %s"% recall_score(y_test, clf_pred))
print("Accuracy score: %s"% accuracy_score(y_test, clf_pred))
print("F-1 score: %s"% f1_score(y_test, clf_pred))
print("F-beta score with beta=0.5: %s"% fbeta_score(y_test, clf_pred, beta=0.5))
print("F-beta score with beta=0.2: %s"% fbeta_score(y_test, clf_pred, beta=0.2))
print("Confusion Matrix: \n%s"% confusion_matrix(y_test, clf_pred))

** Decision Tree Classifier Scores (OVERALL testset)
Precision: 0.7189831024509058
Recall: 0.636522911051213
Accuracy score: 0.7352872625568115
F-1 score: 0.675244835227679
F-beta score with beta=0.5: 0.7008250237416904
F-beta score with beta=0.2: 0.7154184508724403
Confusion Matrix: 
[[7896 1846]
 [2697 4723]]


In [81]:
df.head()

Unnamed: 0,CASE_STATUS,WAGE_LOWER_THAN_PW,PREVAILING_WAGE,SOC_CODE,EMPLOYER_NAME
9,1,0,23.962574,13-2011,IMAEX TRADING COMPANY
11,1,1,368080.5,13-2011,"LHB, INC."
12,1,1,71456.6944,15-1021,"INTERNATIONAL SOLUTIONS GROUP, INC."
37,1,0,21338.0,15-2041,"LHB, INC."
55,1,0,74696.8697,15-1021,"INTERNATIONAL SOLUTIONS GROUP, INC."


Evaluate stats for testset with label 1 (DENIED cases only)

In [82]:
import numpy as np
ones_index = list(y_test.nonzero()[0])

In [83]:
x_ones = X_test.iloc[ones_index]
y_ones = np.take(y_test.values, ones_index)

clf_pred_oneonly = clf.predict(x_ones)

In [84]:
print("** Decision Tree Classifier Scores (Only rows with label DENIED->1)")
print("Precision: %s"% precision_score(y_ones, clf_pred_oneonly))
print("Recall: %s"% recall_score(y_ones, clf_pred_oneonly))
print("Accuracy score: %s"% accuracy_score(y_ones, clf_pred_oneonly))
print("F-1 score: %s"% f1_score(y_ones, clf_pred_oneonly))
print("F-beta score with beta=0.5: %s"% fbeta_score(y_ones, clf_pred_oneonly, beta=0.5))
print("F-beta score with beta=0.2: %s"% fbeta_score(y_ones, clf_pred_oneonly, beta=0.2))
print("Confusion Matrix: \n%s"% confusion_matrix(y_ones, clf_pred_oneonly))

** Decision Tree Classifier Scores (Only rows with label DENIED->1)
Precision: 1.0
Recall: 0.636522911051213
Accuracy score: 0.636522911051213
F-1 score: 0.7778967306266985
F-beta score with beta=0.5: 0.8974992398905443
F-beta score with beta=0.2: 0.9785091039483644
Confusion Matrix: 
[[   0    0]
 [2697 4723]]


Evaluate stats for testset with label 0 (CERTIFIED cases only)

In [85]:
zeros_index = np.arange(len(y_test))[(y_test==0)]

x_zeros = X_test.iloc[zeros_index]
y_zeros = np.take(y_test.values, zeros_index)
clf_pred_zeroonly = clf.predict(x_zeros)

In [86]:
print("** Decision Tree Classifier Scores (Only rows with label CERTIFIED->0)")
print("Precision: %s"% precision_score(y_zeros, clf_pred_zeroonly))
print("Recall: %s"% recall_score(y_zeros, clf_pred_zeroonly))
print("Accuracy score: %s"% accuracy_score(y_zeros, clf_pred_zeroonly))
print("F-1 score: %s"% f1_score(y_zeros, clf_pred_zeroonly))
print("F-beta score with beta=0.5: %s"% fbeta_score(y_zeros, clf_pred_zeroonly, beta=0.5))
print("F-beta score with beta=0.2: %s"% fbeta_score(y_zeros, clf_pred_zeroonly, beta=0.2))
print("Confusion Matrix: \n%s"% confusion_matrix(y_zeros, clf_pred_zeroonly))


** Decision Tree Classifier Scores (Only rows with label CERTIFIED->0)
Precision: 0.0
Recall: 0.0
Accuracy score: 0.8105111886676247
F-1 score: 0.0
F-beta score with beta=0.5: 0.0
F-beta score with beta=0.2: 0.0
Confusion Matrix: 
[[7896 1846]
 [   0    0]]


In [87]:
import operator
nonzero_features = {}
for i in range(len(clf.feature_importances_)):
    if clf.feature_importances_[i] > 0.0:
        nonzero_features[list(X_train.columns)[i]] = clf.feature_importances_[i]
sorted_result = sorted(nonzero_features.items(), key=operator.itemgetter(1)) 
for i in range(len(sorted_result)):
    print(sorted_result[-(i+1)])



(u'PREVAILING_WAGE', 0.18962907678420166)
('WAGE_LOWER_THAN_PW', 0.11245432982346015)
(u'EMPLOYER_NAME_INFOSYS LIMITED', 0.011695117371107145)
('SOC_CODE_15-1121', 0.011211457060237899)
('SOC_CODE_15-1199', 0.010339803088269543)
('SOC_CODE_15-1132', 0.00992767344191534)
('SOC_CODE_15-1131', 0.009396137272003818)
(u'EMPLOYER_NAME_TATA CONSULTANCY SERVICES LIMITED', 0.0067021037711386166)
('SOC_CODE_15-1133', 0.0029931750349588334)
('SOC_CODE_13-1111', 0.0025560385978159253)
('SOC_CODE_15-1141', 0.002349335517094268)
('SOC_CODE_15-1142', 0.0023245161739404857)
('SOC_CODE_15-2031', 0.0022820328424360977)
(u'EMPLOYER_NAME_CONSULTADD INC', 0.002229979980389235)
(u'EMPLOYER_NAME_DELOITTE CONSULTING LLP', 0.00206820251730964)
('SOC_CODE_29-1123', 0.0018440040134659838)
(u'EMPLOYER_NAME_GLOBALLOGIC,INC.', 0.0016767410410562715)
(u'EMPLOYER_NAME_ACCENZ LLC', 0.0016017064362577875)
('SOC_CODE_15-2041', 0.0014999232519278976)
(u'EMPLOYER_NAME_HCL AMERICA, INC.', 0.001442913540259314)
(u'EMPLOYER_

(u'EMPLOYER_NAME_NEAL ROSENBERG ESQ', 5.694813829780987e-05)
(u'EMPLOYER_NAME_DYNAMIC MOTION CONTROL, INC.', 5.694813829780987e-05)
(u'EMPLOYER_NAME_DAVERO SONOMA, INC.', 5.694813829780987e-05)
(u'EMPLOYER_NAME_CHRISTUS HEALTH', 5.694813829780987e-05)
(u'EMPLOYER_NAME_CYBERBRIDGE INTERNATIONAL, INC.', 5.694813829780987e-05)
(u'EMPLOYER_NAME_ITASCA INDEPENDENT SCHOOL DISTRICT', 5.694813829780987e-05)
(u'EMPLOYER_NAME_NPH MEDICAL SERVICES', 5.694813829780987e-05)
(u'EMPLOYER_NAME_MCV TECHNOLOGIES, INC.', 5.694813829780987e-05)
(u'EMPLOYER_NAME_LOTTE CHEMICAL ALABAMA CORP', 5.694813829780987e-05)
(u'EMPLOYER_NAME_PHOTON8, INC.', 5.694813829780987e-05)
(u'EMPLOYER_NAME_NUTRIBIOTECH USA, INC.', 5.694813829780987e-05)
(u'EMPLOYER_NAME_TEAMITSERVE INC', 5.694813829780987e-05)
(u'EMPLOYER_NAME_XS SUPPLY LLC', 5.694813829780987e-05)
(u'EMPLOYER_NAME_BSO NETWORK INC,', 5.694813829780987e-05)
(u'EMPLOYER_NAME_TEXAS A&M UNIVERSITY-COMMERCE', 5.694813829780987e-05)
(u'EMPLOYER_NAME_JC HORIZON LTD',

(u'EMPLOYER_NAME_TRINTECH INC.', 4.613244320209407e-05)
(u'EMPLOYER_NAME_CORNWELL ENGINEERING', 4.612799202122598e-05)
(u'EMPLOYER_NAME_DEPTHCORE, LLC', 4.612799202122598e-05)
(u'EMPLOYER_NAME_EAST COAST SPINE JOINT AND SPORTS MEDICINE PA', 4.612799202122598e-05)
(u'EMPLOYER_NAME_KPIT INFOSYSTEMS INCORPORATED', 4.612799202122598e-05)
(u'EMPLOYER_NAME_INSTITUTE FOR SYSTEMS BIOLOGY', 4.612799202122598e-05)
(u'EMPLOYER_NAME_BARRE SAVINGS BANK', 4.612799202122598e-05)
(u'EMPLOYER_NAME_TOUCHPAL, INC.', 4.612799202122598e-05)
(u'EMPLOYER_NAME_KEYBANK NATIONAL ASSOCIATION', 4.612799202122598e-05)
(u'EMPLOYER_NAME_JAMES B. PIRTLE CONSTRUCTION COMPANY, INC.', 4.612799202122598e-05)
(u'EMPLOYER_NAME_R & A ARCHITECTURE & DESIGN INC.', 4.612799202122598e-05)
(u'EMPLOYER_NAME_E&M DESIGN SOLUTIONS, LLC.', 4.612799202122598e-05)
('SOC_CODE_25-9099', 4.612799202122598e-05)
(u'EMPLOYER_NAME_CHEUNG KONG GRADUATE SCHOOL OF BUSINESS (NORTH AMERICA) INC.', 4.612799202122598e-05)
(u'EMPLOYER_NAME_NEW YORK F

(u'EMPLOYER_NAME_APTUDE INC', 3.6987971138184867e-05)
(u'EMPLOYER_NAME_FLEXLINE LLC', 3.698761206087609e-05)
(u'EMPLOYER_NAME_ROCHE CAROLINA INC.', 3.696474200977186e-05)
(u'EMPLOYER_NAME_UNIVERSITY OF WISCONSIN SYSTEM ADMINISTRATION', 3.696474200977186e-05)
(u'EMPLOYER_NAME_CHICAGO INTERNAL MEDICINE PRACTICE AND RESEARCH, S', 3.695325041962403e-05)
(u'EMPLOYER_NAME_COMSPARK DIVERSITIES', 3.695168431726152e-05)
(u'EMPLOYER_NAME_STRATITUDE, INC.', 3.694295445316096e-05)
(u'EMPLOYER_NAME_HANSEN BANNER LLC', 3.694146672372493e-05)
(u'EMPLOYER_NAME_LUTHERAN MEDICAL GROUP LLC', 3.693715884706021e-05)
(u'EMPLOYER_NAME_ELAVON, INC.', 3.691820608298117e-05)
(u'EMPLOYER_NAME_FAN HUB MEDIA USA LLC', 3.691580889558376e-05)
(u'EMPLOYER_NAME_TAPAD INC.', 3.690910436322843e-05)
(u'EMPLOYER_NAME_MAGELLAN GROUP INVESTMENTS LLC', 3.6907798232256496e-05)
(u'EMPLOYER_NAME_UNIVERSITY SURGICAL ASSOCIATES', 3.6907798232256496e-05)
(u'EMPLOYER_NAME_VINSYS INFORMATION TECHNOLOGY INC', 3.6897533006838545e-05)


(u'EMPLOYER_NAME_IBM INDIA PVT LTD', 1.6447844775150888e-05)
(u'EMPLOYER_NAME_WESTERN DIGITAL TECHNOLOGIES, INC.', 1.63416396854585e-05)
(u'EMPLOYER_NAME_GREATER BOSTON MANUFACTURING PARTNERSHIP INC', 1.6323138996272065e-05)
(u'EMPLOYER_NAME_I2 INFOTECH, LLC', 1.617888476169399e-05)
(u'EMPLOYER_NAME_RAMPS INTERNATIONAL, INC', 1.6037172509149078e-05)
(u'EMPLOYER_NAME_SATYA MARG SOLUTIONS, INC', 1.5943729555542588e-05)
(u'EMPLOYER_NAME_CREDENT TECHNOLOGIES LLC', 1.5851100917887854e-05)
(u'EMPLOYER_NAME_ATOMIK IT, INC.', 1.5759277161703066e-05)
(u'EMPLOYER_NAME_SSB SYSTEMS INC', 1.5668248988741446e-05)
(u'EMPLOYER_NAME_NEW YORK UNIVERSITY HOSPITALS CENTER', 1.566073803189771e-05)
(u'EMPLOYER_NAME_P&S CONSTRUCTION INC.', 1.566073803189771e-05)
(u'EMPLOYER_NAME_CORPTEQ SOLUTIONS INC.', 1.5578007234649134e-05)
(u'EMPLOYER_NAME_PRECISION X-RAY, INC.', 1.550254875884825e-05)
(u'EMPLOYER_NAME_INTEGRETAS, INC', 1.5488542866626606e-05)
(u'EMPLOYER_NAME_TERADYNE INFOTECH', 1.5444099415068446e-05)


Evaluate a naive classifier which predicts a case is always denied if wage is lower than PW. It is always certified otherwise.

In [47]:
status_stats = df.groupby('CASE_STATUS').size().to_frame()
status_stats['percentage'] = status_stats[0] / status_stats[0].sum()
print(status_stats)

                  0  percentage
CASE_STATUS                    
CERTIFIED    270302    0.795825
DENIED        69348    0.204175


In [12]:
from sklearn.metrics import precision_score, recall_score, f1_score, fbeta_score, accuracy_score, confusion_matrix


naive_prediction = X_test['WAGE_LOWER_THAN_PW']
print("** Naive Classifier Scores (Overall)")
print("Precision: %s"% precision_score(y_test, naive_prediction))
print("Recall: %s"% recall_score(y_test, naive_prediction))
print("Accuracy score: %s"% accuracy_score(y_test, naive_prediction))
print("F-1 score: %s"% f1_score(y_test, naive_prediction))
print("F-beta score with beta=0.5: %s"% fbeta_score(y_test, naive_prediction, beta=0.5))
print("F-beta score with beta=0.2: %s"% fbeta_score(y_test, naive_prediction, beta=0.2))
print("Confusion Matrix: \n%s"% confusion_matrix(y_test, naive_prediction))

** Naive Classifier Scores (Overall)
Precision: 0.99568345323741
Recall: 0.18500200507953482
Accuracy score: 0.6443887658781028
F-1 score: 0.3120279562619772
F-beta score with beta=0.5: 0.5306341538225596
F-beta score with beta=0.2: 0.8520754895692736
Confusion Matrix: 
[[9675    6]
 [6097 1384]]
