In [121]:
import pandas as pd
import numpy as np

#col_names=['age','work_place','fnlwgt','education','education_num','marital_status','occupation',
#           'relationship','race','gender','capital_gain','captial_loss','hours_per_week','native_country','target']


data_path = "./census/"

income_data = pd.read_csv(data_path+'census-income.data', sep=',', header=None)


#pf.columns = col_names
#change column names
new_col_names =['age', 'class_of_worker', 'industry_code', 'occupation_code', 'education', 'wage_per_hour', 'enrolled_in_edu_inst_last_week', 'marital_status', 'major_industry_code', 'major_occupation_code', 'race', 'hispanic_origin', 'sex', 'member_of_labor_union', 'reason_for_unemployment', 'employment_status', 'capital_gains', 'capital_losses', 'dividends', 'tax_filer_status', 'previous_residence_region', 'previous_residence_state', 'detailed_household_summary_stat','detailed_household_summary_household',  'what2', 'migration_msa_change', 'migration_reg_change', 'migration_within_reg_change', 'live_in_this_house_1yr_ago', 'previous_residence_sunbelt', 'num_persons_worked_for_employer', 'family_members_under_18', 'birthplace_father', 'birthplace_mother', 'birthplace_self', 'citizenship', 'self_employed', 'veteran_questionnaire_filler', 'veterans_benefits', 'weeks_worked_in_year', 'year', 'target']
#print(new_col_names)
income_data.columns = new_col_names 


In [122]:
from sklearn.model_selection import train_test_split


X_train, X_test, y_train, y_test = train_test_split(income_data,income_data.target,test_size=0.2, random_state=1500)


In [123]:

def income_data_prep(income_data):
    '''
    input: data frame. here the data propressing is customized for bank data
    '''
    income_data.dropna(inplace=True)  
    
    #remove useless columns
    re_corr_cols= ['industry_code', 'occupation_code', 'education', 'wage_per_hour',"detailed_household_summary_stat","what2"]
    income_data.drop(re_corr_cols,axis=1,inplace=True)
    
    # we found out that clients from year 1995, the entire re_miss_cols has missing values
    re_miss_cols = ['migration_msa_change','migration_reg_change', 'migration_within_reg_change','previous_residence_sunbelt']
    income_data.drop(re_miss_cols,axis=1,inplace=True)   
    
    # Remove leading and trailing whitespace from all values in the dataframe
    income_data = income_data.applymap(lambda x: x.strip() if isinstance(x, str) else x)

    # change ['hispanic_origin'] NA to "Do not know"
    income_data['hispanic_origin'] = income_data['hispanic_origin'].replace("NA", "Do not know")
    
    #change the number to string for object features

    # define a dictionary to map numerical values to string values
    employ_map = {0: 'employee', 1: 'selfemployed',2: 'ownbusiness'}
    # use the replace method to map numerical values to string values
    income_data["self_employed"] = income_data["self_employed"].replace(employ_map)
    
    veterans_map = {0: 'A',1: 'B', 2: 'C'}
    # use the replace method to map numerical values to string values
    income_data['veterans_benefits'] = income_data['veterans_benefits'].replace(veterans_map)

    
    year_map = {94: '1994', 95: '1995'}
    # use the replace method to map numerical values to string values
    income_data['year'] = income_data['year'].replace(year_map)
    
    # get the list of columns that contain '?'    
    cols_with_question_mark =income_data.applymap(lambda x: isinstance(x, str) and '?' in x).any()
    columns_to_replace = cols_with_question_mark[cols_with_question_mark == True].index.tolist()
    #print(columns_to_replace)
    for column in columns_to_replace:
         income_data[column] = income_data[column].replace('?', np.nan)
    
    # replace "?" with NA
   # income_data.replace({'?', np.NaN}, inplace=True)
    #remove NAs
    income_data.dropna(inplace=True)
    
    # change the target values 
    income_data["target"]=  np.where(income_data["target"] =="50000+.",1,0)

    # Get all categorical features
    cat_columns = list(income_data.columns[income_data.dtypes=='object'])   
    # cat_columns.remove('target')
    
    #remove NA string
    #income_data.replace('NA', np.nan, inplace=True)
    #income_data.dropna(inplace=True)

    return(income_data,cat_columns)

In [124]:
#data pre-processing 
#data pre-processing 
df_income, cat_cols=income_data_prep(X_train)
df_income_test, cat_cols = income_data_prep(X_test)


y_train = df_income.target
y_test = df_income_test.target

In [125]:
len(y_train)
len(y_test)

38057

In [126]:
df_income.head().agg(lambda x: print(x["class_of_worker"]), axis=1)

Private
Not in universe
Private
Not in universe
Not in universe


170868    None
110370    None
139696    None
11112     None
106644    None
dtype: object

In [127]:
df_income[cat_cols] = df_income[cat_cols].applymap(lambda x: x.replace(" ","-"))
df_income_test[cat_cols] = df_income_test[cat_cols].applymap(lambda x: x.replace(" ","-"))

In [128]:
df_income["year"]

170868    1994
110370    1994
139696    1995
11112     1994
106644    1995
          ... 
193390    1994
181117    1994
155841    1995
120651    1995
77075     1995
Name: year, Length: 152504, dtype: object

In [129]:
categorical_cols = list(df_income.dtypes[df_income.dtypes=="object"].keys())

In [130]:
categorical_cols

['class_of_worker',
 'enrolled_in_edu_inst_last_week',
 'marital_status',
 'major_industry_code',
 'major_occupation_code',
 'race',
 'hispanic_origin',
 'sex',
 'member_of_labor_union',
 'reason_for_unemployment',
 'employment_status',
 'tax_filer_status',
 'previous_residence_region',
 'previous_residence_state',
 'detailed_household_summary_household',
 'live_in_this_house_1yr_ago',
 'family_members_under_18',
 'birthplace_father',
 'birthplace_mother',
 'birthplace_self',
 'citizenship',
 'self_employed',
 'veteran_questionnaire_filler',
 'veterans_benefits',
 'year']

In [131]:
df_income['stringcat'] = df_income.agg(lambda x: f"{x['class_of_worker']} {x['enrolled_in_edu_inst_last_week']}  {x['marital_status']}  {x['major_industry_code']}  {x['major_occupation_code']}  {x['race']} {x['hispanic_origin']} {x['sex']}  {x['member_of_labor_union']}  {x['reason_for_unemployment']} {x['employment_status']}  {x['tax_filer_status']}  {x['previous_residence_region']}  {x['previous_residence_state']} {x['detailed_household_summary_household']}  {x['live_in_this_house_1yr_ago']}  {x['family_members_under_18']} {x['birthplace_father']}  {x['birthplace_mother']}  {x['birthplace_mother']}  {x['birthplace_self']}   {x['citizenship']}  {x['self_employed']}  {x['veteran_questionnaire_filler']}  {x['veterans_benefits']}  {x['year']} ", axis=1)

In [132]:
from gensim.test.utils import common_texts

from gensim.models import Word2Vec

In [133]:
dimpool = 30
model = Word2Vec(sentences=df_income['stringcat'].str.split(" "), vector_size=dimpool, window=2, min_count=1, workers=4)

model.save("word2vec.model.income")

In [134]:
df_income['stringcat']

170868    Private Not-in-universe  Never-married  Financ...
110370    Not-in-universe Not-in-universe  Divorced  Not...
139696    Private Not-in-universe  Married-civilian-spou...
11112     Not-in-universe Not-in-universe  Married-civil...
106644    Not-in-universe Not-in-universe  Never-married...
                                ...                        
193390    Not-in-universe Not-in-universe  Never-married...
181117    Not-in-universe Not-in-universe  Married-civil...
155841    Not-in-universe Not-in-universe  Never-married...
120651    Private Not-in-universe  Never-married  Constr...
77075     Not-in-universe Not-in-universe  Married-civil...
Name: stringcat, Length: 152504, dtype: object

In [135]:
df_income['stringcat'].str.split(" ")

170868    [Private, Not-in-universe, , Never-married, , ...
110370    [Not-in-universe, Not-in-universe, , Divorced,...
139696    [Private, Not-in-universe, , Married-civilian-...
11112     [Not-in-universe, Not-in-universe, , Married-c...
106644    [Not-in-universe, Not-in-universe, , Never-mar...
                                ...                        
193390    [Not-in-universe, Not-in-universe, , Never-mar...
181117    [Not-in-universe, Not-in-universe, , Married-c...
155841    [Not-in-universe, Not-in-universe, , Never-mar...
120651    [Private, Not-in-universe, , Never-married, , ...
77075     [Not-in-universe, Not-in-universe, , Married-c...
Name: stringcat, Length: 152504, dtype: object

In [136]:
columns_categorical = cat_cols

In [137]:
from tqdm import tqdm_notebook as tqdm

elements = []


for row in tqdm(df_income.iterrows()):
    categorical_embeddings = []
    for i in columns_categorical:
        #print(i,row[1][i])
        try:
            
            categorical_embeddings.append(model.wv[row[1][i]])
        except:
            categorical_embeddings.append(np.zeros((dimpool)))
    elements.append(np.array(categorical_embeddings))

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for row in tqdm(df_income.iterrows()):


0it [00:00, ?it/s]

In [139]:
elements[0]

array([[ 9.71528649e-01,  2.53408879e-01, -4.95787716e+00,
         4.39093113e+00,  5.90402508e+00,  1.73303521e+00,
        -6.23992383e-01, -2.44400716e+00, -1.04024816e+00,
         4.63740921e+00,  1.69965780e+00, -2.97356343e+00,
         1.45111978e+00, -1.36095986e-01,  4.81407070e+00,
         7.47750378e+00,  1.13175106e+00,  1.78077114e+00,
        -5.80858588e-01, -1.99191794e-01,  6.55080700e+00,
         4.93402338e+00,  1.92034924e+00, -6.62984276e+00,
        -4.22747707e+00, -6.42770454e-02, -2.75742888e-01,
        -4.07873631e+00,  2.46035051e+00,  4.46570921e+00],
       [ 7.18490183e-01, -8.33356082e-01,  1.03425428e-01,
        -2.08595857e-01,  4.41765487e-01,  2.10303235e+00,
        -3.18044871e-01, -2.01902300e-01,  3.41060698e-01,
         8.01150084e-01, -1.13940060e+00,  2.54227638e-01,
        -1.73210418e+00,  1.83080387e+00, -1.74183404e+00,
         1.60659492e+00,  1.06867516e+00, -9.48434651e-01,
         6.64179742e-01, -1.52156532e-01, -1.33667541e+

In [140]:
elements

reshaped_x = (np.reshape(elements,(152504,len(columns_categorical)*dimpool)))



In [141]:
df_income.dtypes

age                                      int64
class_of_worker                         object
enrolled_in_edu_inst_last_week          object
marital_status                          object
major_industry_code                     object
major_occupation_code                   object
race                                    object
hispanic_origin                         object
sex                                     object
member_of_labor_union                   object
reason_for_unemployment                 object
employment_status                       object
capital_gains                            int64
capital_losses                           int64
dividends                                int64
tax_filer_status                        object
previous_residence_region               object
previous_residence_state                object
detailed_household_summary_household    object
live_in_this_house_1yr_ago              object
num_persons_worked_for_employer          int64
family_member

In [142]:
del df_income['stringcat']
del df_income['target']

In [143]:
df_income

Unnamed: 0,age,class_of_worker,enrolled_in_edu_inst_last_week,marital_status,major_industry_code,major_occupation_code,race,hispanic_origin,sex,member_of_labor_union,...,family_members_under_18,birthplace_father,birthplace_mother,birthplace_self,citizenship,self_employed,veteran_questionnaire_filler,veterans_benefits,weeks_worked_in_year,year
170868,25,Private,Not-in-universe,Never-married,Finance-insurance-and-real-estate,Adm-support-including-clerical,Black,All-other,Male,Not-in-universe,...,Not-in-universe,United-States,United-States,United-States,Native--Born-in-the-United-States,ownbusiness,Not-in-universe,C,52,1994
110370,37,Not-in-universe,Not-in-universe,Divorced,Not-in-universe-or-children,Not-in-universe,White,All-other,Female,Not-in-universe,...,Not-in-universe,Canada,United-States,United-States,Native--Born-in-the-United-States,employee,Not-in-universe,C,0,1994
139696,33,Private,Not-in-universe,Married-civilian-spouse-present,Finance-insurance-and-real-estate,Executive-admin-and-managerial,White,All-other,Female,No,...,Not-in-universe,United-States,United-States,United-States,Native--Born-in-the-United-States,employee,Not-in-universe,C,52,1995
11112,2,Not-in-universe,Not-in-universe,Married-civilian-spouse-present,Not-in-universe-or-children,Not-in-universe,Asian-or-Pacific-Islander,Puerto-Rican,Male,Not-in-universe,...,Not-in-universe,Puerto-Rico,Puerto-Rico,Puerto-Rico,Native--Born-in-Puerto-Rico-or-U-S-Outlying,employee,Not-in-universe,C,0,1994
106644,48,Not-in-universe,Not-in-universe,Never-married,Not-in-universe-or-children,Not-in-universe,White,Puerto-Rican,Female,Not-in-universe,...,Not-in-universe,Puerto-Rico,Puerto-Rico,Puerto-Rico,Native--Born-in-Puerto-Rico-or-U-S-Outlying,employee,Not-in-universe,C,12,1995
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193390,9,Not-in-universe,Not-in-universe,Never-married,Not-in-universe-or-children,Not-in-universe,Black,All-other,Female,Not-in-universe,...,Mother-only-present,United-States,United-States,United-States,Native--Born-in-the-United-States,employee,Not-in-universe,A,0,1994
181117,85,Not-in-universe,Not-in-universe,Married-civilian-spouse-present,Not-in-universe-or-children,Not-in-universe,White,All-other,Male,Not-in-universe,...,Not-in-universe,United-States,United-States,United-States,Native--Born-in-the-United-States,employee,Not-in-universe,C,0,1994
155841,3,Not-in-universe,Not-in-universe,Never-married,Not-in-universe-or-children,Not-in-universe,White,All-other,Male,Not-in-universe,...,Both-parents-present,United-States,United-States,United-States,Native--Born-in-the-United-States,employee,Not-in-universe,A,0,1995
120651,30,Private,Not-in-universe,Never-married,Construction,Precision-production-craft-&-repair,White,All-other,Male,No,...,Not-in-universe,United-States,United-States,United-States,Native--Born-in-the-United-States,employee,Not-in-universe,C,52,1995


In [144]:
np.where(df_income.dtypes!="object")[0]

array([ 0, 12, 13, 14, 20, 29])

In [145]:
df_income_num = df_income.iloc[:, [ 0, 12, 13, 14, 20, 29]].reset_index()
del df_income_num["index"]

In [146]:
df_income_num.head()

Unnamed: 0,age,capital_gains,capital_losses,dividends,num_persons_worked_for_employer,weeks_worked_in_year
0,25,0,0,0,6,52
1,37,0,0,0,0,0
2,33,0,0,250,6,52
3,2,0,0,0,0,0
4,48,0,0,0,1,12


In [147]:

my_data = pd.concat([df_income_num,pd.DataFrame(reshaped_x)],axis=1)

my_data.head()

Unnamed: 0,age,capital_gains,capital_losses,dividends,num_persons_worked_for_employer,weeks_worked_in_year,0,1,2,3,...,740,741,742,743,744,745,746,747,748,749
0,25,0,0,0,6,52,0.971529,0.253409,-4.957877,4.390931,...,4.165947,-1.904109,3.041353,3.019732,-1.629127,3.065566,0.473898,-3.741247,-1.62171,-0.559038
1,37,0,0,0,0,0,0.71849,-0.833356,0.103425,-0.208596,...,4.165947,-1.904109,3.041353,3.019732,-1.629127,3.065566,0.473898,-3.741247,-1.62171,-0.559038
2,33,0,0,250,6,52,0.971529,0.253409,-4.957877,4.390931,...,4.557215,0.002884,2.63856,0.596689,-1.718802,2.135884,0.993245,-4.360782,-1.234738,-4.189466
3,2,0,0,0,0,0,0.71849,-0.833356,0.103425,-0.208596,...,4.165947,-1.904109,3.041353,3.019732,-1.629127,3.065566,0.473898,-3.741247,-1.62171,-0.559038
4,48,0,0,0,1,12,0.71849,-0.833356,0.103425,-0.208596,...,4.557215,0.002884,2.63856,0.596689,-1.718802,2.135884,0.993245,-4.360782,-1.234738,-4.189466


In [149]:
my_data["target"] = y_train.values

In [150]:
my_data.to_csv('census_w2vec_paired_ttest.csv')

In [105]:
from sklearn.model_selection import train_test_split


X_train2, X_test2, y_train2, y_test2 = train_test_split(my_data,y_train,test_size=0.2, random_state=1500)


In [106]:
X_train2.head()

Unnamed: 0,age,capital_gains,capital_losses,dividends,num_persons_worked_for_employer,weeks_worked_in_year,0,1,2,3,...,740,741,742,743,744,745,746,747,748,749
97034,46,0,0,0,1,52,-2.62079,-0.719483,-1.934697,-0.245091,...,-0.577736,-1.884557,2.849357,4.034204,-1.54795,0.945969,3.525007,2.184367,-1.472942,-1.589378
39782,60,0,0,0,6,52,-4.346836,-0.429085,-4.042089,-1.993753,...,0.021078,-2.28648,1.125948,1.499001,-0.352019,-0.392285,2.319086,1.507847,-0.871454,-3.148458
52743,11,0,0,0,0,0,0.949587,-2.220054,0.689826,0.021175,...,-0.577736,-1.884557,2.849357,4.034204,-1.54795,0.945969,3.525007,2.184367,-1.472942,-1.589378
62458,25,0,0,0,6,52,-4.346836,-0.429085,-4.042089,-1.993753,...,0.021078,-2.28648,1.125948,1.499001,-0.352019,-0.392285,2.319086,1.507847,-0.871454,-3.148458
15742,40,0,0,0,4,52,-4.346836,-0.429085,-4.042089,-1.993753,...,0.021078,-2.28648,1.125948,1.499001,-0.352019,-0.392285,2.319086,1.507847,-0.871454,-3.148458


In [107]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix,classification_report,auc, roc_auc_score


log = LogisticRegression(max_iter=1000)


stc = StandardScaler()

X_scaled = stc.fit_transform(X_train2.values)

log.fit(X_scaled,y_train2)

y_pred = log.predict(stc.transform(X_test2.values))
y_pred_prob = log.predict_proba(stc.transform(X_test2.values))

print(confusion_matrix(y_test2,y_pred))
print(classification_report(y_test2,y_pred))

print(roc_auc_score(y_test2,y_pred_prob[:,1]))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[[28367   229]
 [ 1300   605]]
              precision    recall  f1-score   support

           0       0.96      0.99      0.97     28596
           1       0.73      0.32      0.44      1905

    accuracy                           0.95     30501
   macro avg       0.84      0.65      0.71     30501
weighted avg       0.94      0.95      0.94     30501

0.939424506997473


In [55]:
df_bank_test.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,duration,campaign,pdays,previous,poutcome,y
34578,32,blue-collar,single,high.school,no,yes,no,cellular,may,thu,216,1,999,0,nonexistent,0
36326,21,management,single,university.degree,no,unknown,unknown,cellular,jun,tue,106,1,999,0,nonexistent,0
5595,41,unknown,single,basic.9y,no,no,yes,telephone,may,mon,369,1,999,0,nonexistent,0
1323,49,technician,married,professional.course,no,no,no,telephone,may,thu,63,2,999,0,nonexistent,0
4141,33,blue-collar,married,basic.9y,no,unknown,unknown,telephone,may,mon,215,3,999,0,nonexistent,0


In [108]:
from tqdm import tqdm_notebook as tqdm

elements = []


for row in tqdm(df_income_test.iterrows()):
    categorical_embeddings = []
    for i in columns_categorical:
        #print(i,row[1][i])
        try:
            categorical_embeddings.append(model.wv[row[1][i]])
        except:
            categorical_embeddings.append(np.zeros((dimpool)))
    elements.append(np.array(categorical_embeddings))

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for row in tqdm(df_income_test.iterrows()):


0it [00:00, ?it/s]

Unnamed: 0,index,age,duration,campaign,pdays,previous,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,175,176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191,192,193,194,195,196,197,198,199,200,201,202,203,204,205,206,207,208,209,210,211,212,213,214,215,216,217,218,219,220,221,222,223,224,225,226,227,228,229,230,231,232,233,234,235,236,237,238,239,240,241,242,243,244,245,246,247,248,249,250,251,252,253,254,255,256,257,258,259,260,261,262,263,264,265,266,267,268,269,270,271,272,273,274,275,276,277,278,279,280,281,282,283,284,285,286,287,288,289,290,291,292,293,294,295,296,297,298,299
0,34578,32,216,1,999,0,0.73752,0.123823,-2.235938,-0.008387,1.573915,-1.665571,-0.637934,-2.27809,0.049668,-1.635396,-1.929349,-1.000723,-0.533718,-0.834326,-1.824794,0.038925,-0.129125,0.014223,-0.674284,0.114642,-0.634255,1.230909,-2.404409,-0.725083,-0.574599,-0.13485,-1.260833,-0.123816,-0.340644,0.6565,-1.641073,-0.576843,-0.734858,0.549312,-0.272104,-0.949458,0.959403,0.29089,1.196229,0.68031,0.778703,0.529044,1.104055,0.64198,-0.200062,1.471174,0.059713,0.967924,0.386162,-0.413555,-0.143492,0.694045,-0.718676,-0.677616,-0.28417,-0.283272,-0.465551,0.33551,-0.710641,0.847138,0.217178,0.98721,0.188168,-0.911985,0.395678,-0.40143,-1.298678,-0.035854,-0.485246,-1.353042,0.115888,0.9824,-0.168843,0.773007,0.070154,-0.285727,-2.170016,-0.828807,0.609008,-0.044335,-0.625657,1.036383,0.706459,0.380866,-0.28207,-1.143707,-0.092644,0.145357,1.849624,-0.141301,0.105616,-0.047155,0.00776,0.641873,0.112908,-0.13444,0.138091,-0.58182,0.137857,0.086066,-0.523317,-0.041743,-0.242234,0.337758,-0.150625,0.203721,-0.631735,0.372938,0.057301,-0.127025,0.147902,-0.015795,-0.434472,-0.236199,0.62158,-0.167283,-0.172281,0.268205,-0.3338,0.274109,-0.120261,-0.043971,-0.200144,0.938579,0.239544,0.216742,0.053667,-0.629416,0.159571,0.29339,-0.574056,-0.010323,0.062688,0.785707,-0.095875,-0.010037,-0.444404,0.487239,0.083919,-0.27863,-0.189657,0.027409,-0.743127,0.165857,0.635983,-0.135124,-0.158816,-0.029843,-0.362606,0.123182,0.105616,-0.047155,0.00776,0.641873,0.112908,-0.13444,0.138091,-0.58182,0.137857,0.086066,-0.523317,-0.041743,-0.242234,0.337758,-0.150625,0.203721,-0.631735,0.372938,0.057301,-0.127025,0.147902,-0.015795,-0.434472,-0.236199,0.62158,-0.167283,-0.172281,0.268205,-0.3338,0.274109,-0.342052,0.464329,0.455764,-1.045769,1.34715,0.07957,-0.260341,0.347451,-0.178962,0.18099,1.635039,-1.117023,0.419578,-0.380985,0.231705,0.390557,0.577302,0.831301,-0.320434,0.232938,0.960819,-0.071384,-0.857278,-0.388923,-0.268886,1.548165,1.079972,0.185744,-0.303444,-0.366259,0.031807,-0.220882,-0.606814,1.02843,-0.77484,-1.328458,1.033845,0.537285,-1.283828,-0.616049,-0.826769,0.016151,0.526333,-1.565876,0.208127,-0.959644,0.824665,0.224006,-0.240964,-0.873134,-0.512755,0.237131,-0.173402,-0.590654,-1.699434,-1.579987,-1.16873,-0.435126,-0.345812,-0.367637,-0.171014,-1.605697,-1.400374,-0.37321,0.217223,-1.227038,-0.471033,-0.76819,0.145873,0.000108,-0.346444,0.577568,-0.034946,1.764654,-0.998005,0.3223,0.282416,-0.431087,0.35799,-0.208421,-0.085968,0.11475,-0.082078,0.084027,1.206931,0.69038,0.909911,0.642603,0.15101,0.348313,-0.408219,1.140387,-0.537646,0.667966,0.865984,0.38015,0.590908,-1.093774,0.248988,0.167887,-0.110277,0.663152,-0.492732,-0.466525,-0.044248,0.646224,-1.035201,0.378048,-0.079573,-1.190863,0.14809,0.978977,-0.660713,0.096484,0.045345,-0.202588,-0.534431,-0.484841,-0.785899,1.558576
1,36326,21,106,1,999,0,1.18375,0.166753,-1.711782,-0.066444,-0.027119,-0.825378,-0.727118,-0.882812,0.468634,0.110552,-0.02502,-0.225321,1.397198,0.80829,-0.546209,0.020733,-0.800319,-0.305296,0.160573,-1.06664,0.670329,0.444602,-2.37267,-1.08061,0.324173,-0.89474,-0.102429,0.109571,-1.186333,0.883738,-1.641073,-0.576843,-0.734858,0.549312,-0.272104,-0.949458,0.959403,0.29089,1.196229,0.68031,0.778703,0.529044,1.104055,0.64198,-0.200062,1.471174,0.059713,0.967924,0.386162,-0.413555,-0.143492,0.694045,-0.718676,-0.677616,-0.28417,-0.283272,-0.465551,0.33551,-0.710641,0.847138,0.532838,0.775831,-0.021071,-1.140646,-0.19329,0.599552,-0.67579,-0.207102,-0.951473,-0.566707,0.309191,0.193222,-0.376717,0.245492,0.072888,0.180067,-2.102032,-1.44935,1.939363,-0.563498,-0.644124,1.441648,0.295521,-0.516849,0.639666,-0.336949,0.112537,-0.260369,1.960213,-0.268944,0.105616,-0.047155,0.00776,0.641873,0.112908,-0.13444,0.138091,-0.58182,0.137857,0.086066,-0.523317,-0.041743,-0.242234,0.337758,-0.150625,0.203721,-0.631735,0.372938,0.057301,-0.127025,0.147902,-0.015795,-0.434472,-0.236199,0.62158,-0.167283,-0.172281,0.268205,-0.3338,0.274109,-0.066566,-0.101876,-0.504578,0.84776,0.447376,-0.295172,0.162936,-1.082382,0.165525,-0.29047,-0.803497,0.421197,-0.091065,-0.248764,-0.443314,-0.019006,-0.597748,0.072122,-0.200603,-0.032076,0.014689,-0.306317,-0.23678,0.064137,0.172729,-0.168238,-0.264689,0.330116,-0.292099,0.182384,-0.066566,-0.101876,-0.504578,0.84776,0.447376,-0.295172,0.162936,-1.082382,0.165525,-0.29047,-0.803497,0.421197,-0.091065,-0.248764,-0.443314,-0.019006,-0.597748,0.072122,-0.200603,-0.032076,0.014689,-0.306317,-0.23678,0.064137,0.172729,-0.168238,-0.264689,0.330116,-0.292099,0.182384,-0.342052,0.464329,0.455764,-1.045769,1.34715,0.07957,-0.260341,0.347451,-0.178962,0.18099,1.635039,-1.117023,0.419578,-0.380985,0.231705,0.390557,0.577302,0.831301,-0.320434,0.232938,0.960819,-0.071384,-0.857278,-0.388923,-0.268886,1.548165,1.079972,0.185744,-0.303444,-0.366259,-0.289138,-0.077672,-0.66002,0.712851,-0.409011,-1.129291,1.663137,0.035363,-1.393749,-0.417578,-0.776558,0.106254,0.980925,-1.514186,0.267294,-1.371215,0.530433,-0.313031,-0.554121,-1.120521,-0.538667,-0.105463,0.012812,-0.862455,-1.66978,-1.288714,-0.798667,-0.742461,-0.393326,-0.294072,-0.172127,-1.481207,-1.204882,-0.511225,0.044744,-1.184031,-0.566755,-0.81034,0.279201,0.04127,-0.356709,0.584259,0.025949,1.747729,-1.235396,0.402458,0.156931,-0.486146,0.357052,-0.374766,-0.124984,0.119608,0.196866,-0.024035,1.172147,0.763851,0.719679,0.817186,0.202703,0.461435,-0.408219,1.140387,-0.537646,0.667966,0.865984,0.38015,0.590908,-1.093774,0.248988,0.167887,-0.110277,0.663152,-0.492732,-0.466525,-0.044248,0.646224,-1.035201,0.378048,-0.079573,-1.190863,0.14809,0.978977,-0.660713,0.096484,0.045345,-0.202588,-0.534431,-0.484841,-0.785899,1.558576
2,5595,41,369,1,999,0,-0.066566,-0.101876,-0.504578,0.84776,0.447376,-0.295172,0.162936,-1.082382,0.165525,-0.29047,-0.803497,0.421197,-0.091065,-0.248764,-0.443314,-0.019006,-0.597748,0.072122,-0.200603,-0.032076,0.014689,-0.306317,-0.23678,0.064137,0.172729,-0.168238,-0.264689,0.330116,-0.292099,0.182384,-1.641073,-0.576843,-0.734858,0.549312,-0.272104,-0.949458,0.959403,0.29089,1.196229,0.68031,0.778703,0.529044,1.104055,0.64198,-0.200062,1.471174,0.059713,0.967924,0.386162,-0.413555,-0.143492,0.694045,-0.718676,-0.677616,-0.28417,-0.283272,-0.465551,0.33551,-0.710641,0.847138,0.272228,0.353904,0.128193,-0.672868,-0.120834,-0.141276,-1.472556,-0.966153,-0.980325,-0.975222,0.24397,1.686233,0.057917,0.329196,0.680495,-0.436619,-1.066129,-1.13301,1.244544,0.177002,0.332558,0.129711,-0.124003,0.284603,-0.348194,-1.124995,-0.101651,-0.077752,1.639333,0.580224,0.105616,-0.047155,0.00776,0.641873,0.112908,-0.13444,0.138091,-0.58182,0.137857,0.086066,-0.523317,-0.041743,-0.242234,0.337758,-0.150625,0.203721,-0.631735,0.372938,0.057301,-0.127025,0.147902,-0.015795,-0.434472,-0.236199,0.62158,-0.167283,-0.172281,0.268205,-0.3338,0.274109,0.105616,-0.047155,0.00776,0.641873,0.112908,-0.13444,0.138091,-0.58182,0.137857,0.086066,-0.523317,-0.041743,-0.242234,0.337758,-0.150625,0.203721,-0.631735,0.372938,0.057301,-0.127025,0.147902,-0.015795,-0.434472,-0.236199,0.62158,-0.167283,-0.172281,0.268205,-0.3338,0.274109,-0.120261,-0.043971,-0.200144,0.938579,0.239544,0.216742,0.053667,-0.629416,0.159571,0.29339,-0.574056,-0.010323,0.062688,0.785707,-0.095875,-0.010037,-0.444404,0.487239,0.083919,-0.27863,-0.189657,0.027409,-0.743127,0.165857,0.635983,-0.135124,-0.158816,-0.029843,-0.362606,0.123182,-0.905775,0.627073,0.693905,-1.157644,-0.196461,0.017037,-0.727226,-1.193352,0.586496,-0.825565,1.298624,-0.730723,0.912171,-1.225806,-0.286296,0.576252,0.774334,-0.755445,-1.565033,0.995422,0.294524,0.020977,-0.591274,-0.432368,-0.090343,1.197432,-0.473306,-0.276709,-0.567322,0.329632,0.031807,-0.220882,-0.606814,1.02843,-0.77484,-1.328458,1.033845,0.537285,-1.283828,-0.616049,-0.826769,0.016151,0.526333,-1.565876,0.208127,-0.959644,0.824665,0.224006,-0.240964,-0.873134,-0.512755,0.237131,-0.173402,-0.590654,-1.699434,-1.579987,-1.16873,-0.435126,-0.345812,-0.367637,-0.154433,-1.510103,-1.338466,-0.403064,0.095192,-1.302034,-0.529649,-0.863992,0.326605,-0.024042,-0.328717,0.576783,0.06089,1.744399,-1.177445,0.401472,0.138362,-0.48404,0.314313,-0.259789,-0.191134,0.162616,0.139855,-0.110055,1.181512,0.770473,0.781663,0.877373,0.193223,0.410664,-0.408219,1.140387,-0.537646,0.667966,0.865984,0.38015,0.590908,-1.093774,0.248988,0.167887,-0.110277,0.663152,-0.492732,-0.466525,-0.044248,0.646224,-1.035201,0.378048,-0.079573,-1.190863,0.14809,0.978977,-0.660713,0.096484,0.045345,-0.202588,-0.534431,-0.484841,-0.785899,1.558576
3,1323,49,63,2,999,0,2.105438,0.502962,-1.402525,0.089092,-0.02498,-1.16843,0.417023,-1.496348,-0.440249,-0.87713,0.854587,-0.250386,0.754239,1.735862,-1.329939,0.792554,-0.402507,1.175703,-0.101038,-1.687949,0.501983,0.376127,-1.204813,-1.397898,1.019364,-0.4415,-1.576738,-1.062737,-0.44961,0.701868,-1.497004,-1.682721,-0.657443,0.249652,0.485596,-0.70613,0.705218,-0.532998,0.840226,0.845392,-0.367331,0.152462,0.419869,-0.494277,-0.706555,1.657687,0.537531,0.879349,0.629593,-0.696026,-0.385432,0.395378,-1.063428,-0.582441,-0.538848,0.357713,-0.952133,-0.24536,-0.969099,0.744297,0.268713,0.917914,-0.341198,-1.606804,-0.187101,0.2923,-1.160036,0.328717,-1.292309,0.290831,-0.47379,1.069713,-0.215785,-0.003466,-0.159311,0.27301,-1.710433,-1.010563,0.639543,-0.203161,0.688519,0.496993,-0.044194,0.32354,0.825225,-1.041318,-0.356865,-0.023407,2.31296,0.076189,0.105616,-0.047155,0.00776,0.641873,0.112908,-0.13444,0.138091,-0.58182,0.137857,0.086066,-0.523317,-0.041743,-0.242234,0.337758,-0.150625,0.203721,-0.631735,0.372938,0.057301,-0.127025,0.147902,-0.015795,-0.434472,-0.236199,0.62158,-0.167283,-0.172281,0.268205,-0.3338,0.274109,0.105616,-0.047155,0.00776,0.641873,0.112908,-0.13444,0.138091,-0.58182,0.137857,0.086066,-0.523317,-0.041743,-0.242234,0.337758,-0.150625,0.203721,-0.631735,0.372938,0.057301,-0.127025,0.147902,-0.015795,-0.434472,-0.236199,0.62158,-0.167283,-0.172281,0.268205,-0.3338,0.274109,0.105616,-0.047155,0.00776,0.641873,0.112908,-0.13444,0.138091,-0.58182,0.137857,0.086066,-0.523317,-0.041743,-0.242234,0.337758,-0.150625,0.203721,-0.631735,0.372938,0.057301,-0.127025,0.147902,-0.015795,-0.434472,-0.236199,0.62158,-0.167283,-0.172281,0.268205,-0.3338,0.274109,-0.905775,0.627073,0.693905,-1.157644,-0.196461,0.017037,-0.727226,-1.193352,0.586496,-0.825565,1.298624,-0.730723,0.912171,-1.225806,-0.286296,0.576252,0.774334,-0.755445,-1.565033,0.995422,0.294524,0.020977,-0.591274,-0.432368,-0.090343,1.197432,-0.473306,-0.276709,-0.567322,0.329632,0.031807,-0.220882,-0.606814,1.02843,-0.77484,-1.328458,1.033845,0.537285,-1.283828,-0.616049,-0.826769,0.016151,0.526333,-1.565876,0.208127,-0.959644,0.824665,0.224006,-0.240964,-0.873134,-0.512755,0.237131,-0.173402,-0.590654,-1.699434,-1.579987,-1.16873,-0.435126,-0.345812,-0.367637,-0.171014,-1.605697,-1.400374,-0.37321,0.217223,-1.227038,-0.471033,-0.76819,0.145873,0.000108,-0.346444,0.577568,-0.034946,1.764654,-0.998005,0.3223,0.282416,-0.431087,0.35799,-0.208421,-0.085968,0.11475,-0.082078,0.084027,1.206931,0.69038,0.909911,0.642603,0.15101,0.348313,-0.408219,1.140387,-0.537646,0.667966,0.865984,0.38015,0.590908,-1.093774,0.248988,0.167887,-0.110277,0.663152,-0.492732,-0.466525,-0.044248,0.646224,-1.035201,0.378048,-0.079573,-1.190863,0.14809,0.978977,-0.660713,0.096484,0.045345,-0.202588,-0.534431,-0.484841,-0.785899,1.558576
4,4141,33,215,3,999,0,0.73752,0.123823,-2.235938,-0.008387,1.573915,-1.665571,-0.637934,-2.27809,0.049668,-1.635396,-1.929349,-1.000723,-0.533718,-0.834326,-1.824794,0.038925,-0.129125,0.014223,-0.674284,0.114642,-0.634255,1.230909,-2.404409,-0.725083,-0.574599,-0.13485,-1.260833,-0.123816,-0.340644,0.6565,-1.497004,-1.682721,-0.657443,0.249652,0.485596,-0.70613,0.705218,-0.532998,0.840226,0.845392,-0.367331,0.152462,0.419869,-0.494277,-0.706555,1.657687,0.537531,0.879349,0.629593,-0.696026,-0.385432,0.395378,-1.063428,-0.582441,-0.538848,0.357713,-0.952133,-0.24536,-0.969099,0.744297,0.272228,0.353904,0.128193,-0.672868,-0.120834,-0.141276,-1.472556,-0.966153,-0.980325,-0.975222,0.24397,1.686233,0.057917,0.329196,0.680495,-0.436619,-1.066129,-1.13301,1.244544,0.177002,0.332558,0.129711,-0.124003,0.284603,-0.348194,-1.124995,-0.101651,-0.077752,1.639333,0.580224,0.105616,-0.047155,0.00776,0.641873,0.112908,-0.13444,0.138091,-0.58182,0.137857,0.086066,-0.523317,-0.041743,-0.242234,0.337758,-0.150625,0.203721,-0.631735,0.372938,0.057301,-0.127025,0.147902,-0.015795,-0.434472,-0.236199,0.62158,-0.167283,-0.172281,0.268205,-0.3338,0.274109,-0.066566,-0.101876,-0.504578,0.84776,0.447376,-0.295172,0.162936,-1.082382,0.165525,-0.29047,-0.803497,0.421197,-0.091065,-0.248764,-0.443314,-0.019006,-0.597748,0.072122,-0.200603,-0.032076,0.014689,-0.306317,-0.23678,0.064137,0.172729,-0.168238,-0.264689,0.330116,-0.292099,0.182384,-0.066566,-0.101876,-0.504578,0.84776,0.447376,-0.295172,0.162936,-1.082382,0.165525,-0.29047,-0.803497,0.421197,-0.091065,-0.248764,-0.443314,-0.019006,-0.597748,0.072122,-0.200603,-0.032076,0.014689,-0.306317,-0.23678,0.064137,0.172729,-0.168238,-0.264689,0.330116,-0.292099,0.182384,-0.905775,0.627073,0.693905,-1.157644,-0.196461,0.017037,-0.727226,-1.193352,0.586496,-0.825565,1.298624,-0.730723,0.912171,-1.225806,-0.286296,0.576252,0.774334,-0.755445,-1.565033,0.995422,0.294524,0.020977,-0.591274,-0.432368,-0.090343,1.197432,-0.473306,-0.276709,-0.567322,0.329632,0.031807,-0.220882,-0.606814,1.02843,-0.77484,-1.328458,1.033845,0.537285,-1.283828,-0.616049,-0.826769,0.016151,0.526333,-1.565876,0.208127,-0.959644,0.824665,0.224006,-0.240964,-0.873134,-0.512755,0.237131,-0.173402,-0.590654,-1.699434,-1.579987,-1.16873,-0.435126,-0.345812,-0.367637,-0.154433,-1.510103,-1.338466,-0.403064,0.095192,-1.302034,-0.529649,-0.863992,0.326605,-0.024042,-0.328717,0.576783,0.06089,1.744399,-1.177445,0.401472,0.138362,-0.48404,0.314313,-0.259789,-0.191134,0.162616,0.139855,-0.110055,1.181512,0.770473,0.781663,0.877373,0.193223,0.410664,-0.408219,1.140387,-0.537646,0.667966,0.865984,0.38015,0.590908,-1.093774,0.248988,0.167887,-0.110277,0.663152,-0.492732,-0.466525,-0.044248,0.646224,-1.035201,0.378048,-0.079573,-1.190863,0.14809,0.978977,-0.660713,0.096484,0.045345,-0.202588,-0.534431,-0.484841,-0.785899,1.558576


In [116]:
reshaped_x_test = (np.reshape(elements,(38057,len(columns_categorical)*dimpool)))

df_income_test2 = df_income_test.iloc[:, [ 0, 12, 13, 14, 20, 29]].reset_index()

del df_income_test2["index"]
#del df_bank_test['target']

my_test_data = pd.concat([df_income_test2,pd.DataFrame(reshaped_x_test)],axis=1)
my_test_data


from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix,classification_report,auc, roc_auc_score



y_pred_test = log.predict(stc.transform(my_test_data.values))
y_pred_prob_test = log.predict_proba(stc.transform(my_test_data.values))

print(confusion_matrix(y_test,y_pred_test))
print(classification_report(y_test,y_pred_test))

print(roc_auc_score(y_test,y_pred_prob_test[:,1]))

[[35419   324]
 [ 1632   682]]
              precision    recall  f1-score   support

           0       0.96      0.99      0.97     35743
           1       0.68      0.29      0.41      2314

    accuracy                           0.95     38057
   macro avg       0.82      0.64      0.69     38057
weighted avg       0.94      0.95      0.94     38057

0.9340151365320433
