In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split


# Read the data; the 'ID' column becomes the index of both frames
X_full = pd.read_csv('train_data.csv', index_col='ID')
X_test = pd.read_csv('test_data.csv', index_col='ID')

# Remove rows with missing target, separate target from predictors.
# Reassigning (instead of inplace=True) makes each step an explicit new frame.
X_full = X_full.dropna(axis=0, subset=['app_status'])
y = X_full['app_status']
X_full = X_full.drop(columns=['app_status'])

# Break off a 20% validation set from the training data.
# random_state pins the shuffle so the split, and every metric computed from
# it, is identical on each Restart & Run All.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_full, y, test_size=0.2, random_state=0)



In [8]:
# Preview the first five rows of X_full
X_full.head(n=5)

Unnamed: 0_level_0,parents,has_nurs,form,children,housing,finance,social,health
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,usual,less_proper,complete,3,critical,convenient,problematic,not_recom
2,pretentious,very_crit,completed,1,convenient,inconv,nonprob,not_recom
3,pretentious,proper,incomplete,1,less_conv,convenient,slightly_prob,priority
4,great_pret,improper,complete,1,convenient,convenient,nonprob,recommended
5,great_pret,less_proper,completed,1,convenient,convenient,slightly_prob,priority


In [3]:
from xgboost import XGBClassifier
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

# Preprocessing for categorical data: one-hot encode each column. With
# handle_unknown='ignore', categories not seen during fit do not raise at
# transform time.
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Bundle preprocessing: every column of X_train is routed through the
# categorical transformer.
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, X_train.columns)
    ])

# Define model.
# The recorded validation output shows a two-class target (0 / 1), so the
# binary 'logloss' metric is configured instead of the multiclass 'mlogloss'.
# No eval_set or early stopping is used, so this does not alter predictions.
model = XGBClassifier(use_label_encoder=False,
                      eval_metric='logloss')

# Bundle preprocessing and modeling code in a pipeline
clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('model', model)
                     ])

# Fit the encoder and the model on the training split only
clf.fit(X_train, y_train)

# Apply the fitted pipeline to the validation split and report accuracy.
# NOTE(review): the recorded run printed 1.0 -- a perfect score is unusual;
# worth checking whether a predictor fully determines app_status.
y_pred = clf.predict(X_valid)
print(accuracy_score(y_valid, y_pred))

1.0


In [4]:
# Confusion matrix of validation labels against the pipeline's predictions
print(confusion_matrix(y_true=y_valid, y_pred=y_pred))

[[1352    0]
 [   0  722]]


In [5]:
# Per-class precision / recall / F1 summary on the validation split
print(classification_report(y_true=y_valid, y_pred=y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1352
           1       1.00      1.00      1.00       722

    accuracy                           1.00      2074
   macro avg       1.00      1.00      1.00      2074
weighted avg       1.00      1.00      1.00      2074



In [6]:
# Run the already-fitted pipeline on the test data to get predictions
# (no fitting happens in this cell)
out_put = clf.predict(X_test)

In [7]:
# Pair each test-set ID (the index of X_test) with its predicted app_status
output = pd.DataFrame(
    data={'ID': X_test.index, 'app_status': out_put},
)
# Write the predictions to disk without the DataFrame's own index column
output.to_csv(path_or_buf='submission.csv', index=False)