In [10]:
## Working with data frames
import pandas as pd
import numpy as np
from scipy import stats # Exclude outliers


In [11]:
# Create data frame of people
pple = (
    pd.read_csv("application_record.csv")  # Read the csv file
    .drop_duplicates('ID', keep='last')  # Drop duplicate ids
    # .drop('OCCUPATION_TYPE', axis=1)  # Drop occupation type column
)  # 438 510 rows

In [12]:
# Create data frame of record
record = (
    # Read the csv file
    pd.read_csv("credit_record.csv")  
    # Drop months balance its not needed
    .drop('MONTHS_BALANCE', axis=1)  
     # Replace the text - it means the same thing
    .replace({'C': 0, 'X' : 0}) 
    # Convert to number
    .assign(STATUS=lambda x: pd.to_numeric(x['STATUS']))  
)  # 45985 rows



# Mark the rows where the person owed the debt for two or more monts
# as default marker 1 otherwise 0
record['RESULT'] = record['STATUS'].apply(lambda x:1 if x >= 2 else 0) 

# resulta = df.loc[df['RESULT'] == 1]
# print(resulta.iloc[1])

# Join the two data frames based on the ID inner wise
df = pple.join(record.set_index('ID'), on='ID', how='inner')  # 36457 rows

In [13]:
# Features that need to be manipulated with
numeric_features = ['AMT_INCOME_TOTAL', 'DAYS_BIRTH', 'CNT_CHILDREN', 
                    'DAYS_EMPLOYED', 'CNT_FAM_MEMBERS']

categorical_features = ['CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 
                        'NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE', 
                        'NAME_FAMILY_STATUS','NAME_HOUSING_TYPE']

In [14]:
for column in numeric_features:
    df[(np.abs(stats.zscore(df[column])) < 3)]

In [17]:
# df['RESULT'].value_counts()
df['RESULT'].value_counts()


0    36456
1        1
Name: RESULT, dtype: int64

In [6]:
# # Sving the pipeline
# save_file_name = 'credit_risk_pipeline.pkl'  # Name
# save_path = "../backend/core/"  # Path

In [16]:
df = df.drop_duplicates('ID', keep='last')  # Drop duplicate ids


In [82]:
INCOME_CHOICES = [['commercial', 'Commercial associate'],
                  ['working', 'Working'],
                  ['pensioner', 'Pensioner'],
                  ['state', 'State servant'],
                  ['student', 'Student']]

    
EDUCATION_CHOICES = [['degree', 'Academic degree'],
                     ['higher', 'Higher education'],
                     ['secondary', 'Secondary / secondary special'],
                    [ 'higher_incompl[ete', 'Incomplete higher'],
                    [ 'lower', 'Lower secondary']]

FAMILY_CHOICES =  [['civil', 'Civil marriage'],
                  ['married', 'Married'],
                  ['single', 'Single / not married'],
                  ['separated', 'Separated'],
                  ['widow', 'Widow']]
                
HOUSING_CHOICES = [['rented', 'Rented apartment'],
    ['house', 'House / apartment'],
    ['coop', 'Co-op apartment'],
    ['municipal', 'Municipal apartment'],
    ['office', 'Office apartment'],
    ['parents', 'With parents']]


In [None]:
def replace(word, array):
    for row in array:
        key = row[0]
        value = row[1]
        # print(word, "  ", value)
        if word == value:
            return key
        
    return array[-1][0]

In [84]:
# [
#   {
#     "model": "myapp.person",
#     "pk": 1,
#     "fields": {
#       "first_name": "John",
#       "last_name": "Lennon"
#     }
#   },
#   {
#     "model": "myapp.person",
#     "pk": 2,
#     "fields": {
#       "first_name": "Paul",
#       "last_name": "McCartney"
#     }
#   }
# ]

In [None]:

## TO JSON FORMAT
final_array = []

for index, row in df.iterrows():
    default_object = {}
    default_object['model'] = "website.applicationrecord"
    default_object['pk'] = row['ID']
    default_object['fields'] = {}
    default_object['fields']['code_gender'] = row['CODE_GENDER']
    default_object['fields']['flag_own_car'] = True if row['FLAG_OWN_CAR'] == "Y" else False
    default_object['fields']['flag_own_realty'] = True if row['FLAG_OWN_REALTY'] == "Y" else False
    default_object['fields']['cnt_children'] = row['CNT_CHILDREN']
    default_object['fields']['amt_income_total'] = row['AMT_INCOME_TOTAL']
    default_object['fields']['days_birth'] = row['DAYS_BIRTH']
    default_object['fields']['days_employed'] = row['DAYS_EMPLOYED']
    default_object['fields']['flag_mobil'] = True if row['FLAG_MOBIL'] == "Y" else False
    default_object['fields']['flag_work_phone'] = True if row['FLAG_WORK_PHONE'] == "Y" else False
    default_object['fields']['flag_phone'] = True if row['FLAG_PHONE'] == "Y" else False
    default_object['fields']['flag_email'] = True if row['FLAG_EMAIL'] == "Y" else False
    
    if 'OCCUPATION_TYPE' in row:
        default_object['fields']['occupation_type'] = row['occupation_type']
    else: 
        default_object['fields']['occupation_type'] = ''
    
    
    default_object['fields']['name_income_type'] = replace(row['NAME_INCOME_TYPE'],INCOME_CHOICES)
    default_object['fields']['name_education_type'] = replace(row['NAME_EDUCATION_TYPE'], EDUCATION_CHOICES)
    default_object['fields']['name_family_status'] = replace(row['NAME_FAMILY_STATUS'], FAMILY_CHOICES)
    default_object['fields']['name_housing_type'] = replace(row['NAME_HOUSING_TYPE'], HOUSING_CHOICES)
    default_object['fields']['cnt_fam_members'] = row['CNT_FAM_MEMBERS']
    default_object['fields']['status'] = row['RESULT']
    final_array.append(default_object)
    

import json
# Directly from dictionary
with open('json_data.json', 'w') as outfile:
    json.dump(final_array, outfile)