In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder

Load cleaned up version of dataset from 2012 to 2018.  Drop geographic information about employer and worksite to reduce dimensionality

In [2]:
df = pd.read_pickle('/Users/ml/Desktop/Udacity_ML_Capstone/data/H1B_12-18_COMBINED.pickle')
df.drop(columns=['EMPLOYER_CITY','JOB_TITLE','EMPLOYER_NAME'], inplace=True)
df.drop(columns=['EMPLOYER_STATE','WORKSITE_STATE'], inplace=True)


In [3]:
ones_df = df[df['CASE_STATUS']=='DENIED']
zeros_df = df[df['CASE_STATUS']=='CERTIFIED']

In [4]:
ones_df.shape

(69348, 9)

Show first 5 lines of the dataset

In [5]:
zeros_df.shape

(2703023, 9)

In [6]:
df = pd.concat([ones_df, zeros_df.sample(frac=0.1, random_state=99)])

In [7]:
ones_df.head()
print(ones_df.shape)

(69348, 9)


In [8]:
zeros_df.head()
print(zeros_df.shape)

(2703023, 9)


We will use column 'CASE_STATUS' as our label.  DENIED will be 1 and CERTIFIED will be 0 after it gets incoded by label_encoder.

Normalize PREVAILING_WAGE using minmax scaler since it varies quite a bit.

In [9]:
scaler = MinMaxScaler()
X = df.drop(columns='CASE_STATUS')

label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df['CASE_STATUS'])

X[['PREVAILING_WAGE']] = scaler.fit_transform(X[['PREVAILING_WAGE']])
X = pd.get_dummies(X)

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2)

Check value counts to verify stratify split

In [13]:
# print(y_train.value_counts())
# print(y_test.value_counts())
X_train.head()

Unnamed: 0,PREVAILING_WAGE,WAGE_LOWER_THAN_PW,VISA_CLASS_E-3 Australian,VISA_CLASS_H-1B,VISA_CLASS_H-1B1 Chile,VISA_CLASS_H-1B1 Singapore,VISA_CLASS_Select Visa Classification,SOC_CODE_11-1011,SOC_CODE_11-1021,SOC_CODE_11-1031,...,NAICS_CODE_927110,NAICS_CODE_928110,NAICS_CODE_92812,NAICS_CODE_928120,NAICS_CODE_941611,NAICS_CODE_941720,NAICS_CODE_966984,NAICS_CODE_99867,NAICS_CODE_999990,NAICS_CODE_999999
251195,7.4e-05,False,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
435435,7.7e-05,False,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
233378,8.7e-05,False,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
317743,7.1e-05,False,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
560032,8.6e-05,False,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Train DecisionTreeClassifier and predict using X_test

In [14]:
from sklearn.ensemble import RandomForestClassifier


# from sklearn.feature_extraction.text import CountVectorizer

rfc = RandomForestClassifier()
# vec = CountVectorizer()
# X_train_t = vec.fit_transform(X_train)

rfc.fit(X_train, y_train)

rfc_pred = rfc.predict(X_test)

  from numpy.core.umath_tests import inner1d


Train RandomForestClassifier and predict using X_test

In [15]:
from sklearn.metrics import classification_report, confusion_matrix
print("** Random Forest Classifier Stats (OVERALL testset)")
print(classification_report(y_test, rfc_pred))

** Random Forest Classifier Stats (OVERALL testset)
             precision    recall  f1-score   support

          0       0.86      0.93      0.90     54060
          1       0.62      0.43      0.51     13870

avg / total       0.81      0.83      0.82     67930



In [27]:
import operator
# print(len(rfc.feature_importances_))
# print(len(X_train.columns))

nonzero_features = {}
for i in range(len(rfc.feature_importances_)):
    if rfc.feature_importances_[i] > 0.0:
        nonzero_features[list(X_train.columns)[i]] = rfc.feature_importances_[i]
sorted_result = sorted(nonzero_features.items(), key=operator.itemgetter(1)) 
for i in range(len(sorted_result)):
    print(sorted_result[-(i+1)])

('PREVAILING_WAGE', 0.3611513966103289)
('WAGE_LOWER_THAN_PW', 0.15089086476169783)
(u'VISA_CLASS_E-3 Australian', 0.02949388225984559)
('NAICS_CODE_541511', 0.022251122646762395)
(u'PW_SOURCE_OES', 0.013500226763516593)
(u'VISA_CLASS_H-1B', 0.011035955560476908)
(u'WAGE_UNIT_OF_PAY_Year', 0.010933549785294854)
(u'PW_SOURCE_Other', 0.008440135193809862)
('SOC_CODE_15-1121', 0.007117221366678595)
('SOC_CODE_15-1132', 0.007092697922765677)
(u'WAGE_UNIT_OF_PAY_Hour', 0.006056288090556824)
(u'FULL_TIME_POSITION_N', 0.004834894696918695)
('SOC_CODE_15-1199', 0.003824640719460158)
('SOC_CODE_15-1131', 0.0035049491207538703)
('SOC_CODE_13-2011', 0.002872341277517354)
('NAICS_CODE_611310', 0.002854804080248536)
('NAICS_CODE_541512', 0.0026363408116065255)
(u'FULL_TIME_POSITION_Y', 0.002500223333954229)
('SOC_CODE_13-1161', 0.0024827070397162995)
('SOC_CODE_13-2051', 0.00217663196767683)
('SOC_CODE_13-1111', 0.0020484814198579123)
('NAICS_CODE_541330', 0.0020447469152284534)
('NAICS_CODE_541519

('NAICS_CODE_31331', 0.00010074011348317846)
('NAICS_CODE_812199', 0.00010037020503154197)
('NAICS_CODE_333992', 9.991616708421007e-05)
('NAICS_CODE_452990', 9.987445496070337e-05)
('NAICS_CODE_238340', 9.985431194516547e-05)
('NAICS_CODE_722410', 9.970173557358613e-05)
('SOC_CODE_47-4099', 9.956595679141628e-05)
('NAICS_CODE_339914', 9.896121076392373e-05)
('SOC_CODE_19-3099', 9.896101128384538e-05)
('SOC_CODE_19-3092', 9.884193525827391e-05)
('NAICS_CODE_512290', 9.8723414484852e-05)
('SOC_CODE_19-3091', 9.84035433143211e-05)
('NAICS_CODE_112920', 9.834596138505194e-05)
('NAICS_CODE_54511', 9.82577146886428e-05)
('SOC_CODE_43-3099', 9.81462872115571e-05)
('NAICS_CODE_221310', 9.790347095229286e-05)
('NAICS_CODE_44831', 9.775982099419306e-05)
('NAICS_CODE_312140', 9.745902143436133e-05)
('NAICS_CODE_56159', 9.745285759364723e-05)
('NAICS_CODE_56143', 9.700084564678807e-05)
('NAICS_CODE_33699', 9.692376781706539e-05)
('NAICS_CODE_541300', 9.688001896421597e-05)
('NAICS_CODE_33634', 9.6

('NAICS_CODE_56142', 2.5764172054338658e-05)
('NAICS_CODE_326192', 2.5689920349552914e-05)
('NAICS_CODE_922120', 2.566754372674408e-05)
('NAICS_CODE_32611', 2.565688343205182e-05)
('NAICS_CODE_54150', 2.5611636587110715e-05)
('NAICS_CODE_712120', 2.559810577615738e-05)
('NAICS_CODE_23815', 2.545684129443194e-05)
('NAICS_CODE_32712', 2.5410890007545834e-05)
('SOC_CODE_51-9071', 2.5406360933034532e-05)
('NAICS_CODE_81219', 2.533414408004003e-05)
('NAICS_CODE_713920', 2.530406325180243e-05)
('NAICS_CODE_421910', 2.5288776563590883e-05)
('NAICS_CODE_480000', 2.5281681299266458e-05)
('NAICS_CODE_424430', 2.5263626406417193e-05)
('NAICS_CODE_31522', 2.5251243068086623e-05)
('NAICS_CODE_611410', 2.5219887797840425e-05)
('SOC_CODE_45-2099', 2.5215747836977872e-05)
('NAICS_CODE_63111', 2.521487335742902e-05)
('NAICS_CODE_33111', 2.520551709309031e-05)
('NAICS_CODE_311411', 2.514248708836415e-05)
('NAICS_CODE_45121', 2.512243919614124e-05)
('NAICS_CODE_11411', 2.505762481069718e-05)
('NAICS_CODE

('NAICS_CODE_801100', 3.690608020582347e-06)
('NAICS_CODE_51731', 3.6719921067152487e-06)
('SOC_CODE_41-9011', 3.635036385619758e-06)
('SOC_CODE_26-1069', 3.632238285947651e-06)
('NAICS_CODE_624401', 3.6102481967629735e-06)
('NAICS_CODE_424520', 3.5902847050682027e-06)
('NAICS_CODE_524928', 3.5819391792271805e-06)
('NAICS_CODE_734900', 3.5626636901095286e-06)
('NAICS_CODE_332300', 3.5055906615557322e-06)
('NAICS_CODE_541900', 3.4977681654140964e-06)
('NAICS_CODE_92611', 3.494058375231198e-06)
('NAICS_CODE_322214', 3.4837661937846723e-06)
('NAICS_CODE_611330', 3.476974586804861e-06)
('NAICS_CODE_11111', 3.4603765975674613e-06)
('NAICS_CODE_33643', 3.4120969962895773e-06)
('SOC_CODE_47-2221', 3.4050644271685897e-06)
('NAICS_CODE_611692', 3.402558764422068e-06)
('SOC_CODE_53-3032', 3.395149170824977e-06)
('NAICS_CODE_11140', 3.3854662041835468e-06)
('NAICS_CODE_61000', 3.3704227574906856e-06)
('NAICS_CODE_48551', 3.36902033841547e-06)
('NAICS_CODE_541117', 3.356089803976786e-06)
('NAICS_C

Output the feature importance vector and sort by importance

In [28]:
import pickle
with open('feature_importances_first_800k_rfc.pickle', 'wb') as f:
    pickle.dump(sorted_result, f, protocol=pickle.HIGHEST_PROTOCOL)
with open('feature_importances_first_800k_rfc_dict.pickle', 'wb') as f:
    pickle.dump(nonzero_features, f, protocol=pickle.HIGHEST_PROTOCOL)

In [17]:
from sklearn.externals import joblib
joblib.dump(rfc, 'rfc.pkl', compress=9)

['rfc.pkl']

Output the trained model to a pickle file in case we need it in the future

In [21]:
rfc = joblib.load('rfc.pkl')

['tree.pkl']

To load the pre-trained model, run the cell below.

In [18]:
from sklearn.metrics import precision_score, recall_score, f1_score, fbeta_score, accuracy_score, confusion_matrix

print("** Random Forest Classifier Scores (OVERALL testset)")
print("Precision: %s"% precision_score(y_test, rfc_pred))
print("Recall: %s"% recall_score(y_test, rfc_pred))
print("Accuracy score: %s"% accuracy_score(y_test, rfc_pred))
print("F-1 score: %s"% f1_score(y_test, rfc_pred))
print("F-beta score with beta=0.5: %s"% fbeta_score(y_test, rfc_pred, beta=0.5))
print("F-beta score with beta=0.2: %s"% fbeta_score(y_test, rfc_pred, beta=0.2))
print("Confusion Matrix: \n%s"% confusion_matrix(y_test, rfc_pred))

** Random Forest Classifier Scores (OVERALL testset)
Precision: 0.6199502796768179
Recall: 0.4315068493150685
Accuracy score: 0.8299131458854704
F-1 score: 0.5088420336677436
F-beta score with beta=0.5: 0.5701520405441451
F-beta score with beta=0.2: 0.6097092704333517
Confusion Matrix: 
[[50391  3669]
 [ 7885  5985]]


Evaluate stats for the entire test set

In [19]:
import numpy as np
ones_index = y_test.nonzero()
x_ones = X_test.iloc[ones_index]
y_ones = np.take(y_test, ones_index)[0]

rfc_pred_oneonly = rfc.predict(x_ones)

In [20]:
print("** Random Forest Classifier Scores (Only rows with label DENIED->1)")
print("Precision: %s"% precision_score(y_ones, rfc_pred_oneonly))
print("Recall: %s"% recall_score(y_ones, rfc_pred_oneonly))
print("Accuracy score: %s"% accuracy_score(y_ones, rfc_pred_oneonly))
print("F-1 score: %s"% f1_score(y_ones, rfc_pred_oneonly))
print("F-beta score with beta=0.5: %s"% fbeta_score(y_ones, rfc_pred_oneonly, beta=0.5))
print("F-beta score with beta=0.2: %s"% fbeta_score(y_ones, rfc_pred_oneonly, beta=0.2))
print("Confusion Matrix: \n%s"% confusion_matrix(y_ones, rfc_pred_oneonly))

** Random Forest Classifier Scores (Only rows with label DENIED->1)
Precision: 1.0
Recall: 0.4315068493150685
Accuracy score: 0.4315068493150685
F-1 score: 0.6028708133971292
F-beta score with beta=0.5: 0.7914572864321608
F-beta score with beta=0.2: 0.9517722254503197
Confusion Matrix: 
[[   0    0]
 [7885 5985]]


Evaluate stats for testset with label 1 (DENIED cases only)

In [21]:
zeros_index = np.arange(len(y_test))[(y_test==0)]

x_zeros = X_test.iloc[zeros_index]
y_zeros = np.take(y_test, zeros_index)
rfc_pred_zeroonly = rfc.predict(x_zeros)

In [22]:
print("** Random Forest Classifier Scores (Only rows with label CERTIFIED->0)")
print("Precision: %s"% precision_score(y_zeros, rfc_pred_zeroonly))
print("Recall: %s"% recall_score(y_zeros, rfc_pred_zeroonly))
print("Accuracy score: %s"% accuracy_score(y_zeros, rfc_pred_zeroonly))
print("F-1 score: %s"% f1_score(y_zeros, rfc_pred_zeroonly))
print("F-beta score with beta=0.5: %s"% fbeta_score(y_zeros, rfc_pred_zeroonly, beta=0.5))
print("F-beta score with beta=0.2: %s"% fbeta_score(y_zeros, rfc_pred_zeroonly, beta=0.2))
print("Confusion Matrix: \n%s"% confusion_matrix(y_zeros, rfc_pred_zeroonly))


** Random Forest Classifier Scores (Only rows with label CERTIFIED->0)
Precision: 0.0
Recall: 0.0
Accuracy score: 0.9321309655937847
F-1 score: 0.0
F-beta score with beta=0.5: 0.0
F-beta score with beta=0.2: 0.0
Confusion Matrix: 
[[50391  3669]
 [    0     0]]


  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)


In [23]:
import datetime
datetime.datetime.now()

datetime.datetime(2018, 12, 14, 0, 2, 29, 793994)

In [24]:
from xgboost import XGBClassifier
xgb = XGBClassifier(random_state=99)

xgb.fit(X_train, y_train)
# print(xgb.feature_importances_)
print(xgb.score(X_test, y_test))
xgb_pred = xgb.predict(X_test)
print(xgb_pred)

  if diff:


0.8385985573384366
[0 0 0 ... 0 0 0]


  if diff:


In [25]:
print("** XGB Classifier Stats (OVERALL testset)")
print(classification_report(y_test, xgb_pred))

** XGB Classifier Stats (OVERALL testset)
             precision    recall  f1-score   support

          0       0.84      0.99      0.91     54060
          1       0.85      0.25      0.39     13870

avg / total       0.84      0.84      0.80     67930



In [29]:
nonzero_features = {}
for i in range(len(xgb.feature_importances_)):
    if xgb.feature_importances_[i] > 0.0:
        nonzero_features[list(X_train.columns)[i]] = xgb.feature_importances_[i]
sorted_result = sorted(nonzero_features.items(), key=operator.itemgetter(1)) 
for i in range(len(sorted_result)):
    print(sorted_result[-(i+1)])

with open('/tmp/feature_importances_first_800k_xgb.pickle', 'wb') as f:
    pickle.dump(sorted_result, f, protocol=pickle.HIGHEST_PROTOCOL)
with open('/tmp/feature_importances_first_800k_xgb_dict.pickle', 'wb') as f:
    pickle.dump(nonzero_features, f, protocol=pickle.HIGHEST_PROTOCOL)

('PREVAILING_WAGE', 0.28293413)
('WAGE_LOWER_THAN_PW', 0.08532934)
(u'WAGE_UNIT_OF_PAY_Year', 0.059880238)
(u'PW_SOURCE_OES', 0.05538922)
('NAICS_CODE_541511', 0.049401198)
(u'PW_SOURCE_Other', 0.028443113)
(u'VISA_CLASS_H-1B', 0.028443113)
('NAICS_CODE_611310', 0.023952097)
('SOC_CODE_15-1199', 0.023952097)
('SOC_CODE_15-1132', 0.02245509)
(u'VISA_CLASS_E-3 Australian', 0.019461079)
('SOC_CODE_15-1121', 0.017964073)
(u'PW_SOURCE_SCA', 0.016467066)
('NAICS_CODE_541512', 0.0149700595)
('SOC_CODE_15-1131', 0.013473053)
(u'FULL_TIME_POSITION_N', 0.013473053)
('NAICS_CODE_51121', 0.011976048)
('SOC_CODE_15-2031', 0.010479042)
(u'WAGE_UNIT_OF_PAY_Month', 0.010479042)
('NAICS_CODE_54151', 0.010479042)
('SOC_CODE_11-1021', 0.010479042)
('NAICS_CODE_523110', 0.010479042)
('SOC_CODE_11-1011', 0.008982036)
('NAICS_CODE_561330', 0.008982036)
('SOC_CODE_19-1042', 0.0074850298)
('NAICS_CODE_518112', 0.0074850298)
('NAICS_CODE_561310', 0.0074850298)
('SOC_CODE_29-1131', 0.0074850298)
('SOC_CODE_29-1

In [30]:
from sklearn.externals import joblib
import cPickle
joblib.dump(xgb, 'xgb.pkl', compress=9)

with open('xgb_cpickle.pkl', 'wb') as pkl:
    cPickle.dump(xgb, pkl)  

In [None]:
xgb = joblib.load('xgb.pkl')

In [52]:
print("** XGB Classifier Scores (OVERALL testset)")
print("Precision: %s" % precision_score(y_test, xgb_pred))
print("Recall: %s"% recall_score(y_test, xgb_pred))
print("Accuracy score: %s"% accuracy_score(y_test, xgb_pred))
print("F-1 score: %s"% f1_score(y_test, xgb_pred))
print("F-beta score with beta=0.5: %s"% fbeta_score(y_test, xgb_pred, beta=0.5))
print("F-beta score with beta=0.2: %s"% fbeta_score(y_test, xgb_pred, beta=0.2))
print("Confusion Matrix: \n%s"% confusion_matrix(y_test, xgb_pred))
print("\n")

** XGB Classifier Scores (OVERALL testset)
Precision: 0.8521570528356762
Recall: 0.25349675558759915
Accuracy score: 0.8385985573384366
F-1 score: 0.3907535007779506
F-beta score with beta=0.5: 0.5787844867320735
F-beta score with beta=0.2: 0.7811997949068535
Confusion Matrix: 
[[53450   610]
 [10354  3516]]




In [32]:
xgb_pred_oneonly = xgb.predict(x_ones)

  if diff:


In [53]:
print("** XGB Classifier Scores (Only rows with label DENIED->1)")
print("Precision: %s" % precision_score(y_ones, xgb_pred_oneonly))
print("Recall: %s"% recall_score(y_ones, xgb_pred_oneonly))
print("Accuracy score: %s"% accuracy_score(y_ones, xgb_pred_oneonly))
print("F-1 score: %s"% f1_score(y_ones, xgb_pred_oneonly))
print("F-beta score with beta=0.5: %s"% fbeta_score(y_ones, xgb_pred_oneonly, beta=0.5))
print("F-beta score with beta=0.2: %s"% fbeta_score(y_ones, xgb_pred_oneonly, beta=0.2))
print("Confusion Matrix: \n%s"% confusion_matrix(y_ones, xgb_pred_oneonly))
print("\n")

** XGB Classifier Scores (Only rows with label DENIED->1)
Precision: 1.0
Recall: 0.25349675558759915
Accuracy score: 0.25349675558759915
F-1 score: 0.4044633613252042
F-beta score with beta=0.5: 0.6293405885301068
F-beta score with beta=0.2: 0.8982607841210575
Confusion Matrix: 
[[    0     0]
 [10354  3516]]




In [34]:
xgb_pred_zeroonly = xgb.predict(x_zeros)

  if diff:


In [54]:
print("** XGB Classifier Scores (Only rows with label CERTIFIED->0)")
print("Precision: %s" % precision_score(y_zeros, xgb_pred_zeroonly))
print("Recall: %s"% recall_score(y_zeros, xgb_pred_zeroonly))
print("Accuracy score: %s"% accuracy_score(y_zeros, xgb_pred_zeroonly))
print("F-1 score: %s"% f1_score(y_zeros, xgb_pred_zeroonly))
print("F-beta score with beta=0.5: %s"% fbeta_score(y_zeros, xgb_pred_zeroonly, beta=0.5))
print("F-beta score with beta=0.2: %s"% fbeta_score(y_zeros, xgb_pred_zeroonly, beta=0.2))
print("Confusion Matrix: \n%s"% confusion_matrix(y_zeros, xgb_pred_zeroonly))
print("\n")

** XGB Classifier Scores (Only rows with label CERTIFIED->0)
Precision: 0.0
Recall: 0.0
Accuracy score: 0.9887162412134666
F-1 score: 0.0
F-beta score with beta=0.5: 0.0
F-beta score with beta=0.2: 0.0
Confusion Matrix: 
[[53450   610]
 [    0     0]]




In [37]:
import datetime
datetime.datetime.now()

datetime.datetime(2018, 12, 14, 1, 3, 34, 348231)

In [38]:
from sklearn.ensemble import AdaBoostClassifier
# from sklearn.feature_extraction.text import CountVectorizer

abc = AdaBoostClassifier(random_state=99)
# vec = CountVectorizer()
# X_train_t = vec.fit_transform(X_train)

abc.fit(X_train, y_train)

abc_pred = abc.predict(X_test)

In [39]:
print("** ADABOOST Classifier Stats (OVERALL testset)")
print(classification_report(y_test, abc_pred))

** ADABOOST Classifier Stats (OVERALL testset)
             precision    recall  f1-score   support

          0       0.84      0.98      0.91     54060
          1       0.81      0.26      0.40     13870

avg / total       0.83      0.84      0.80     67930



In [40]:
nonzero_features = {}
for i in range(len(abc.feature_importances_)):
    if abc.feature_importances_[i] > 0.0:
        nonzero_features[list(X_train.columns)[i]] = abc.feature_importances_[i]
sorted_result = sorted(nonzero_features.items(), key=operator.itemgetter(1)) 
for i in range(len(sorted_result)):
    print(sorted_result[-(i+1)])
with open('/tmp/feature_importances_first_800k_abc.pickle', 'wb') as f:
    pickle.dump(sorted_result, f, protocol=pickle.HIGHEST_PROTOCOL)
with open('/tmp/feature_importances_first_800k_abc_dict.pickle', 'wb') as f:
    pickle.dump(nonzero_features, f, protocol=pickle.HIGHEST_PROTOCOL)

('NAICS_CODE_541511', 0.06)
('PREVAILING_WAGE', 0.04)
('SOC_CODE_15-2031', 0.02)
('NAICS_CODE_334111', 0.02)
('NAICS_CODE_51121', 0.02)
('SOC_CODE_27-1024', 0.02)
(u'PW_SOURCE_OES', 0.02)
(u'WAGE_UNIT_OF_PAY_Year', 0.02)
('WAGE_LOWER_THAN_PW', 0.02)
('NAICS_CODE_611310', 0.02)
('NAICS_CODE_54111', 0.02)
('NAICS_CODE_54151', 0.02)
('NAICS_CODE_452910', 0.02)
(u'PW_SOURCE_SCA', 0.02)
('SOC_CODE_15-1199', 0.02)
('SOC_CODE_19-1042', 0.02)
('NAICS_CODE_518112', 0.02)
('NAICS_CODE_336111', 0.02)
('NAICS_CODE_561310', 0.02)
('NAICS_CODE_522110', 0.02)
('SOC_CODE_13-1161', 0.02)
('NAICS_CODE_551112', 0.02)
('NAICS_CODE_541500', 0.02)
('SOC_CODE_15-1131', 0.02)
('SOC_CODE_15-1133', 0.02)
('SOC_CODE_15-1132', 0.02)
('NAICS_CODE_325412', 0.02)
('SOC_CODE_29-1131', 0.02)
('NAICS_CODE_454111', 0.02)
('SOC_CODE_11-1021', 0.02)
('NAICS_CODE_541330', 0.02)
('NAICS_CODE_334220', 0.02)
('NAICS_CODE_813110', 0.02)
('NAICS_CODE_541510', 0.02)
('NAICS_CODE_541512', 0.02)
('NAICS_CODE_523110', 0.02)
('NAICS

In [41]:
joblib.dump(abc, 'abc.pkl', compress=9)

with open('abc_cpickle.pkl', 'wb') as pkl:
    cPickle.dump(abc, pkl)

In [None]:
abc = joblib.load('abc.pkl')

In [49]:
print("** AB Classifier Scores (OVERALL testset)")
print("Precision: %s"% precision_score(y_test, abc_pred))
print("Recall: %s"% recall_score(y_test, abc_pred))
print("Accuracy score: %s"% accuracy_score(y_test, abc_pred))
print("F-1 score: %s"% f1_score(y_test, abc_pred))
print("F-beta score with beta=0.5: %s"% fbeta_score(y_test, abc_pred, beta=0.5))
print("F-beta score with beta=0.2: %s"% fbeta_score(y_test, abc_pred, beta=0.2))
print("Confusion Matrix: \n%s"% confusion_matrix(y_test, abc_pred))

** AB Classifier Scores (OVERALL testset)
Precision: 0.8091249724487547
Recall: 0.26467195385724585
Accuracy score: 0.8371117326659797
F-1 score: 0.39886999511055576
F-beta score with beta=0.5: 0.5732712849022426
F-beta score with beta=0.2: 0.7498016418555324
Confusion Matrix: 
[[53194   866]
 [10199  3671]]


In [43]:
abc_pred_oneonly = abc.predict(x_ones)

In [50]:
print("** AB Classifier Scores (Only rows with label DENIED->1)")
print("Precision: %s"% precision_score(y_ones, abc_pred_oneonly))
print("Recall: %s"% recall_score(y_ones, abc_pred_oneonly))
print("Accuracy score: %s"% accuracy_score(y_ones, abc_pred_oneonly))
print("F-1 score: %s"% f1_score(y_ones, abc_pred_oneonly))
print("F-beta score with beta=0.5: %s"% fbeta_score(y_ones, abc_pred_oneonly, beta=0.5))
print("F-beta score with beta=0.2: %s"% fbeta_score(y_ones, abc_pred_oneonly, beta=0.2))
print("Confusion Matrix: \n%s"% confusion_matrix(y_ones, abc_pred_oneonly))

** AB Classifier Scores (Only rows with label DENIED->1)
Precision: 1.0
Recall: 0.26467195385724585
Accuracy score: 0.26467195385724585
F-1 score: 0.41856222564277973
F-beta score with beta=0.5: 0.6428171184422499
F-beta score with beta=0.2: 0.9034596999384731
Confusion Matrix: 
[[    0     0]
 [10199  3671]]


In [45]:
abc_pred_zeroonly = abc.predict(x_zeros)

In [51]:
print("** AB Classifier Scores (Only rows with label CERTIFIED->0)")
print("Precision: %s"% precision_score(y_zeros, abc_pred_zeroonly))
print("Recall: %s"% recall_score(y_zeros, abc_pred_zeroonly))
print("Accuracy score: %s"% accuracy_score(y_zeros, abc_pred_zeroonly))
print("F-1 score: %s"% f1_score(y_zeros, abc_pred_zeroonly))
print("F-beta score with beta=0.5: %s"% fbeta_score(y_zeros, abc_pred_zeroonly, beta=0.5))
print("F-beta score with beta=0.2: %s"% fbeta_score(y_zeros, abc_pred_zeroonly, beta=0.2))
print("Confusion Matrix: \n%s"% confusion_matrix(y_zeros, abc_pred_zeroonly))

** AB Classifier Scores (Only rows with label CERTIFIED->0)
Precision: 0.0
Recall: 0.0
Accuracy score: 0.9839807621161673
F-1 score: 0.0
F-beta score with beta=0.5: 0.0
F-beta score with beta=0.2: 0.0
Confusion Matrix: 
[[53194   866]
 [    0     0]]


In [47]:
status_stats = df.groupby('CASE_STATUS').size().to_frame()
status_stats['percentage'] = status_stats[0] / status_stats[0].sum()
print(status_stats)

                  0  percentage
CASE_STATUS                    
CERTIFIED    270302    0.795825
DENIED        69348    0.204175


In [48]:
import datetime
datetime.datetime.now()

datetime.datetime(2018, 12, 14, 1, 8, 58, 829183)

In [None]:
status_stats = df.groupby('CASE_STATUS').size().to_frame()
status_stats['percentage'] = status_stats[0] / status_stats[0].sum()
print(status_stats)