In [1]:
## Working with data frames
from pathlib import Path

import numpy as np
import pandas as pd
from scipy import stats  # Exclude outliers


In [2]:
# Applicant table: one row per applicant ID, without the occupation column.
pple = (
    pd.read_csv("application_record.csv")
    # OCCUPATION_TYPE is not used by any later cell, so drop it right away.
    .drop(columns='OCCUPATION_TYPE')
    # When an ID occurs more than once, only its last occurrence is kept.
    .drop_duplicates(subset='ID', keep='last')
)  # 438 510 rows (count noted by the original author)

In [3]:
# Credit history table with a numeric STATUS column.
record = (
    pd.read_csv("credit_record.csv")
    # MONTHS_BALANCE is not needed by the rest of the notebook.
    .drop(columns='MONTHS_BALANCE')
    # The letter codes 'C' and 'X' are treated as equivalent to status 0,
    # which also leaves only number-like values behind.
    .replace({'C': 0, 'X': 0})
    # Parse STATUS into a numeric dtype so it can be compared against
    # thresholds later on.
    .assign(STATUS=lambda frame: pd.to_numeric(frame['STATUS']))
)  # 45985 rows (count noted by the original author)



# Flag every credit record on which the person owed the debt for two or more
# months (STATUS >= 2) with 1, all other records with 0.  The marker is kept
# on `record` so the per-record view stays available for inspection.
record['RESULT'] = (record['STATUS'] >= 2).astype(int)

# Collapse the history to ONE row per applicant before joining.
# Joining the raw history produces one row per (applicant, record) pair; a
# later de-duplication on ID then keeps the label of whichever record happens
# to come last, which does not tell whether the applicant was ever flagged.
#   STATUS -> highest status seen for the applicant
#   RESULT -> 1 if any of the applicant's records was flagged, otherwise 0
record_per_id = record.groupby('ID')[['STATUS', 'RESULT']].max()

# Inner join on ID: only applicants that also have a credit history remain.
# `record_per_id` is already indexed by ID, so it can be joined directly, and
# the row index of `pple` is carried over to the result.
df = pple.join(record_per_id, on='ID', how='inner')  # 36457 rows

# Optional sanity check of the flagged applicants:
# df.loc[df['RESULT'] == 1].head()

In [4]:
# Numeric columns; these are the columns screened for outliers.
numeric_features = [
    'AMT_INCOME_TOTAL',
    'DAYS_BIRTH',
    'CNT_CHILDREN',
    'DAYS_EMPLOYED',
    'CNT_FAM_MEMBERS',
]

# Categorical (text / flag) columns.
# NOTE(review): not referenced by any other cell visible in this notebook.
categorical_features = [
    'CODE_GENDER',
    'FLAG_OWN_CAR',
    'FLAG_OWN_REALTY',
    'NAME_INCOME_TYPE',
    'NAME_EDUCATION_TYPE',
    'NAME_FAMILY_STATUS',
    'NAME_HOUSING_TYPE',
]

In [5]:
# Exclude outliers: keep only the rows whose value lies within Z_SCORE_LIMIT
# standard deviations of the column mean for EVERY numeric feature.
# The filtered frame must be assigned back to `df`; a bare `df[mask]`
# expression inside a loop is evaluated and then thrown away, which leaves
# the data untouched.
Z_SCORE_LIMIT = 3

# Work on a plain float array so the z-scores come back as an ndarray whose
# rows line up positionally with `df`, independent of its (possibly
# non-unique) row index.  All z-scores are computed on the unfiltered frame,
# so the outcome does not depend on the order of `numeric_features`.
numeric_values = df[numeric_features].to_numpy(dtype=float)
z_scores = np.abs(stats.zscore(numeric_values, axis=0))

# NOTE(review): a NaN z-score compares False and therefore drops the row;
# confirm the numeric columns hold no missing values and are not constant.
df = df[(z_scores < Z_SCORE_LIMIT).all(axis=1)]

In [6]:
# Guarantee a single row per applicant ID: when an ID occurs several times,
# every occurrence except the last one is discarded.
df = df[~df.duplicated(subset='ID', keep='last')]


In [7]:
# Preview the prepared frame; the notebook renders large frames truncated
# (first and last rows only).
# Uncomment to inspect the balance of the default marker instead:
# df['RESULT'].value_counts()
df

Unnamed: 0,ID,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,DAYS_BIRTH,DAYS_EMPLOYED,FLAG_MOBIL,FLAG_WORK_PHONE,FLAG_PHONE,FLAG_EMAIL,CNT_FAM_MEMBERS,STATUS,RESULT
0,5008804,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,-12005,-4542,1,1,0,0,2.0,0,0
1,5008805,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,-12005,-4542,1,1,0,0,2.0,0,0
2,5008806,M,Y,Y,0,112500.0,Working,Secondary / secondary special,Married,House / apartment,-21474,-1134,1,0,0,0,2.0,0,0
3,5008808,F,N,Y,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,-19110,-3051,1,0,1,1,1.0,0,0
4,5008809,F,N,Y,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,-19110,-3051,1,0,1,1,1.0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
434808,5149828,M,Y,Y,0,315000.0,Working,Secondary / secondary special,Married,House / apartment,-17348,-2420,1,0,0,0,2.0,0,0
434809,5149834,F,N,Y,0,157500.0,Commercial associate,Higher education,Married,House / apartment,-12387,-1325,1,0,1,1,2.0,0,0
434810,5149838,F,N,Y,0,157500.0,Pensioner,Higher education,Married,House / apartment,-12387,-1325,1,0,1,1,2.0,0,0
434811,5150049,F,N,Y,0,283500.0,Working,Secondary / secondary special,Married,House / apartment,-17958,-655,1,0,0,0,2.0,0,0


In [8]:
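# NOTE(review): reference sample only, nothing in this cell is executed.
# It looks like a Django-style JSON fixture; confirm whether this is the
# intended export shape or remove the cell.
# [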
#   {
#     "model": "myapp.person",
#     "pk": 1,
#     "fields": {
#       "first_name": "John",
#       "last_name": "Lennon"
#     }
#   },
#   {
#     "model": "myapp.person",
#     "pk": 2,
#     "fields": {
#       "first_name": "Paul",
#       "last_name": "McCartney"
#     }
#   }
# ]

In [9]:

# Persist the prepared frame as CSV, relative to the current working
# directory.  `Path` is imported in the import cell at the top of the notebook.
filepath = Path('out.csv')

# Make sure the destination directory exists.  For the bare file name used
# here the parent is the current directory, so this only matters once the
# path is pointed at a sub-directory.
filepath.parent.mkdir(parents=True, exist_ok=True)

# The row index is written as well (pandas' default), as the first column.
df.to_csv(filepath)