# Creación de csv para ingesta de base de datos

In [1]:
import pandas as pd
from datetime import datetime, timedelta
import random
import numpy as np
import warnings
warnings.filterwarnings("ignore")
today = datetime.now()

### Creación de csv de clientes

In [2]:
data = pd.read_csv("CUSTOMER.csv")
fecha_actual = datetime.now()
z_code = pd.read_csv("ZIPCODES.csv", encoding='latin')
codes = set(z_code['Country'].dropna())
data['birthday']=pd.to_datetime(data['birthday'])
data['Edad'] = fecha_actual.year - data['birthday'].dt.year
data['birthday'] = data['birthday'].dt.strftime('%Y-%m-%d')
data['zipcode'] = np.random.choice(list(codes), size=len(data))
data.to_csv('customer_data.csv')
data

Unnamed: 0,id,surname,birthday,address,phone_number,Edad,zipcode
0,1,Caryl,1996-01-27,2 Farmco Crossing,595-642-5891,28,51640.0
1,2,Guglielmo,1996-10-06,86943 Chive Way,538-266-7527,28,51730.0
2,3,Aprilette,1995-01-16,697 Sachs Avenue,859-366-3566,29,51676.0
3,4,Dorree,1998-12-30,33 Mayfield Way,883-884-4505,26,51760.0
4,5,Darbee,1998-05-17,969 Loeprich Trail,725-355-0696,26,51800.0
...,...,...,...,...,...,...,...
995,996,Humphrey,1997-10-05,1486 Eastwood Park,345-152-7275,27,51614.0
996,997,Vale,1998-03-16,92 Pearson Circle,559-809-1094,26,51723.0
997,998,Chan,1997-10-27,5731 Hansons Court,429-541-9910,27,51620.0
998,999,Mannie,1996-01-17,71509 Carioca Park,196-985-4712,28,51773.0


### Creación de csv de contratos

In [3]:
data = pd.read_csv("Contracts.csv")
data

Unnamed: 0,id,Contract_start,Contract_end,total_amount
0,362,28/01/2023,28/02/2021,20474
1,476,02/08/2024,11/05/2018,25062
2,318,02/12/2015,01/05/2021,24979
3,405,13/12/2020,13/03/2023,19372
4,454,30/03/2022,31/10/2019,89682
...,...,...,...,...
995,326,15/08/2016,26/11/2024,68398
996,18,18/05/2024,31/07/2021,45222
997,454,18/12/2019,08/11/2022,46674
998,272,30/11/2018,22/12/2019,17708


In [4]:
#Ajustar formato de columnas de tiempo
def day_transform(row):
    row['Contract_start']=row['Contract_start'].replace('-','/')
    try:
        timeN = pd.to_datetime(row['Contract_start'])
    except:
        row['Contract_start']=row['Contract_start'].replace(row['Contract_start'][:2],'01')
        timeN = pd.to_datetime(row['Contract_start'])
    return timeN
def endday_transform(row):
    row['Contract_end']=row['Contract_end'].replace('-','/')
    try:
        timeN = pd.to_datetime(row['Contract_end'])
    except:
        row['Contract_end']=row['Contract_end'].replace(row['Contract_end'][:2],'01')
        timeN = pd.to_datetime(row['Contract_end'])
    return timeN
data['Contract_start']= data.apply(day_transform, axis=1)
data['Contract_end']= data.apply(endday_transform, axis=1)
data

Unnamed: 0,id,Contract_start,Contract_end,total_amount
0,362,2023-01-28,2021-02-28,20474
1,476,2024-02-08,2018-11-05,25062
2,318,2015-02-12,2021-01-05,24979
3,405,2020-12-13,2023-03-13,19372
4,454,2022-03-30,2019-10-31,89682
...,...,...,...,...
995,326,2016-08-15,2024-11-26,68398
996,18,2024-05-18,2021-07-31,45222
997,454,2019-12-18,2022-08-11,46674
998,272,2018-11-30,2019-12-22,17708


In [5]:
#Acomodar fechas de inicio y fin de los contratos
def swap_values(row):
    if row['Contract_end'] < row['Contract_start']:
        temp = row['Contract_start']
        row['Contract_start'] = row['Contract_end']
        row['Contract_end'] = temp
    return row

data = data.apply(swap_values, axis=1)

In [67]:
data

Unnamed: 0,id,Contract_start,Contract_end,total_amount
0,362,2021-02-28,2023-01-28,20474
1,476,2018-11-05,2024-02-08,25062
2,318,2015-02-12,2021-01-05,24979
3,405,2020-12-13,2023-03-13,19372
4,454,2019-10-31,2022-03-30,89682
...,...,...,...,...
995,326,2016-08-15,2024-11-26,68398
996,18,2021-07-31,2024-05-18,45222
997,454,2019-12-18,2022-08-11,46674
998,272,2018-11-30,2019-12-22,17708


In [6]:
def get_days(row):
    d=max(1, round(row['days_duration'] / 31))
    return d
data['days_duration'] = (data['Contract_end'] - data['Contract_start']).dt.days

payment_frequency = 'M'

data['payment_freq'] = data.apply(get_days, axis=1)

data['payment_amount'] = data['total_amount'] / data['payment_freq']
data.to_csv('LOAN_DATA.csv', index=False)
data

id                                362
Contract_start    2021-02-28 00:00:00
Contract_end      2023-01-28 00:00:00
total_amount                    20474
days_duration                     699
Name: 0, dtype: object
id                                476
Contract_start    2018-11-05 00:00:00
Contract_end      2024-02-08 00:00:00
total_amount                    25062
days_duration                    1921
Name: 1, dtype: object
id                                318
Contract_start    2015-02-12 00:00:00
Contract_end      2021-01-05 00:00:00
total_amount                    24979
days_duration                    2154
Name: 2, dtype: object
id                                405
Contract_start    2020-12-13 00:00:00
Contract_end      2023-03-13 00:00:00
total_amount                    19372
days_duration                     820
Name: 3, dtype: object
id                                454
Contract_start    2019-10-31 00:00:00
Contract_end      2022-03-30 00:00:00
total_amount                    89

Unnamed: 0,id,Contract_start,Contract_end,total_amount,days_duration,payment_freq,payment_amount
0,362,2021-02-28,2023-01-28,20474,699,23,890.173913
1,476,2018-11-05,2024-02-08,25062,1921,62,404.225806
2,318,2015-02-12,2021-01-05,24979,2154,69,362.014493
3,405,2020-12-13,2023-03-13,19372,820,26,745.076923
4,454,2019-10-31,2022-03-30,89682,881,28,3202.928571
...,...,...,...,...,...,...,...
995,326,2016-08-15,2024-11-26,68398,3025,98,697.938776
996,18,2021-07-31,2024-05-18,45222,1022,33,1370.363636
997,454,2019-12-18,2022-08-11,46674,967,31,1505.612903
998,272,2018-11-30,2019-12-22,17708,387,12,1475.666667


### Creación de csv de pagos

In [7]:
# Función para generar fechas de pago dentro del rango entre Contract_start y Contract_end
def generate_payment_dates(start_date, end_date, freq):
    payment_dates = []
    diference = end_date - start_date
    days = diference.days
    number_days=days/freq
    fecha_nueva=start_date
    dates=[]
    for i in range(freq):
        fecha_nueva = fecha_nueva + timedelta(days=number_days)
        dates.append(fecha_nueva)
    return dates

def is_paid(row):
    if row['payment_dates'] < today:
            status = random.choice(['make', 'miss']) 
    else:
        status = 'waiting'  
    # print(status)
    return status

payments_df = pd.DataFrame(columns=['id', 'payment_date', 'status'])
dframes=[]

# Iterar sobre cada fila del DataFrame original
for index, row in data.iterrows():
    # Generar fechas de pago
    payment_dates = generate_payment_dates(row['Contract_start'], row['Contract_end'], row['payment_freq'])
    df = pd.DataFrame({'id': row['id'], 'payment_dates':  payment_dates})
    dframes.append(df)

payments=pd.concat(dframes)
payments['status']=payments.apply(is_paid, axis=1)
payments


Unnamed: 0,id,payment_dates,status
0,362,2021-03-30 09:23:28.695652,miss
1,362,2021-04-29 18:46:57.391304,make
2,362,2021-05-30 04:10:26.086956,miss
3,362,2021-06-29 13:33:54.782608,make
4,362,2021-07-29 22:57:23.478260,miss
...,...,...,...
8,441,2023-07-28 16:36:55.384614,miss
9,441,2023-08-28 18:27:41.538460,make
10,441,2023-09-28 20:18:27.692306,make
11,441,2023-10-29 22:09:13.846152,make


In [8]:
payments.to_csv('PAYMENT_DATA.csv', index=False)