# 4.1 Imports

In [22]:
# import libraries
import numpy as np
import pandas as pd
import seaborn as sns

from collections import Counter

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

from imblearn.over_sampling import ADASYN

In [2]:
# ignore warnings
import warnings
warnings.filterwarnings('ignore')

In [3]:
# load train data
train = pd.read_csv('data/train_prep_1.csv')

In [4]:
# load test data
test = pd.read_csv('data/test_prep_1.csv')

In [5]:
train.shape

(32462, 12)

In [6]:
test.shape

(16232, 12)

In [7]:
X_train = train.drop(columns=['target'])
X_train.shape

(32462, 11)

In [8]:
X_test = test.drop(columns=['target'])
X_test.shape

(16232, 11)

In [9]:
y_train = train['target']
y_train.shape

(32462,)

In [10]:
y_test = test['target']
y_test.shape

(16232,)

In [11]:
# numerical features
num_features = ['age','education','hours_per_week']

# numerical transformer
num_transformer = StandardScaler()

In [16]:
# categorical features
cat_features = ['workclass','marital_status','occupation','relationship',
               'race','sex','capital_change','native_country']

# categorical transformer
cat_transformer = OneHotEncoder(handle_unknown='ignore')

In [17]:
# preprocessing

preprocessor = ColumnTransformer(transformers=[
    ('num', num_transformer, num_features),
    ('cat', cat_transformer, cat_features)
])

In [18]:
# whole model pipeline

clf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier',LogisticRegression())
])

In [19]:
# model fitting
clf.fit(X_train, y_train)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num', StandardScaler(),
                                                  ['age', 'education',
                                                   'hours_per_week']),
                                                 ('cat',
                                                  OneHotEncoder(handle_unknown='ignore'),
                                                  ['workclass',
                                                   'marital_status',
                                                   'occupation', 'relationship',
                                                   'race', 'sex',
                                                   'capital_change',
                                                   'native_country'])])),
                ('classifier', LogisticRegression())])

In [20]:
# model evaluation
print('model score: %.3f' % clf.score(X_test,y_test))

model score: 0.843


In [24]:
y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.88      0.93      0.90     12397
           1       0.71      0.57      0.63      3835

    accuracy                           0.84     16232
   macro avg       0.79      0.75      0.77     16232
weighted avg       0.84      0.84      0.84     16232



In [25]:
# oversampler
# error: cannot deal with string
# solution: put it in the transformer, do it after encoding(?)
adasyn = ADASYN(random_state=100,n_jobs=-1)

X_res, y_res = adasyn.fit_resample(X_train,y_train)

ValueError: could not convert string to float: 'State-gov'

In [None]:
#END