In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.stattools import adfuller
from statsmodels.graphics.tsaplots import plot_acf
from statsmodels.graphics.tsaplots import plot_pacf
import pmdarima as pm
from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.tsa.arima_model import ARIMA

In [2]:
# Load the full cleaned dataset workbook; openpyxl is the engine used to parse the .xlsx file.
data = pd.read_excel(
    '../raw_data/full_data_clean.xlsx',
    engine='openpyxl',
)

In [3]:
# Drop the extra index column ('Unnamed: 0') that came in with the workbook.
# errors='ignore' keeps this cell re-runnable: `data` is rebound without the
# column, so a second execution would otherwise raise KeyError.
data = data.drop(columns='Unnamed: 0', errors='ignore')

# Preview the first rows to confirm the remaining columns.
data.head()

Unnamed: 0,status,insurance_type,disease,claim_type,provider_type,state,sex,age,country_id,amount,date_issue,age_range,h_type
0,PAGADO,INDIVIDUAL,Hepatitis Viral(A-B-C),EMERGENCIA,CLINICA,Distrito Capital,F,50,29,142,2018-09-01,50-59,Ambulatorio
1,PAGADO,INDIVIDUAL,Laringitis Y Traqueitis Agudas,EMERGENCIA,CLINICA,Distrito Capital,F,48,29,26,2018-09-01,40-49,Ambulatorio
2,PAGADO,INDIVIDUAL,Amigdalo-Faringitis Aguda,EMERGENCIA,CLINICA,Zulia,M,42,29,21,2018-09-02,40-49,Ambulatorio
3,PAGADO,INDIVIDUAL,Bronquitis Aguda,EMERGENCIA,CLINICA,Distrito Capital,F,59,29,311,2018-09-02,50-59,Hospitalización
4,PAGADO,COLECTIVO,Dolor Abdominal Y Pelvico,EMERGENCIA,CLINICA,Miranda,F,47,29,14,2018-09-02,40-49,Ambulatorio


In [4]:
# Show the dtype pandas assigned to each column of `data`.
data.dtypes

status                    object
insurance_type            object
disease                   object
claim_type                object
provider_type             object
state                     object
sex                       object
age                        int64
country_id                 int64
amount                     int64
date_issue        datetime64[ns]
age_range                 object
h_type                    object
dtype: object

In [5]:
# Frequency of each distinct disease label, most common first.
data['disease'].value_counts()

Hipertension Arterial Y Sus Complicaciones                      14545
Bronquitis Aguda                                                 5406
Embarazo, Parto Y Puerperio                                      4520
Control General De Salud De Rutina De Subpoblaciones Definid     4178
Diarrea Y Gastroenteritis De Presunto Origen Infeccioso          4012
                                                                ...  
Sindactilia                                                         1
Trombosis De Seno                                                   1
Coartacion Aortica                                                  1
Acidez                                                              1
Leptospirosis                                                       1
Name: disease, Length: 535, dtype: int64

In [6]:
# Flag each claim as Covid-related (1) or not (0) based on a literal,
# case-sensitive 'Covid' substring in the disease label.
# The vectorised match with na=False treats a missing disease label as
# "not Covid" instead of raising TypeError, which a per-row `'Covid' in x`
# check does when x is NaN. regex=False keeps the match a plain substring test.
data['covid_claims'] = (
    data['disease']
    .str.contains('Covid', regex=False, na=False)
    .astype('int64')
)

In [7]:
# How many claims carry the Covid flag (1) versus not (0).
data['covid_claims'].value_counts()

0    146720
1      4531
Name: covid_claims, dtype: int64

In [15]:
# Collapse the claim-level rows into one row per issue date, summing both the
# claim amount and the Covid flag (i.e. the number of Covid claims that day).
# as_index=False keeps 'date_issue' as a regular column.
data_daily = (
    data
    .groupby('date_issue', as_index = False)
    .agg({'amount': 'sum', 'covid_claims': 'sum'})
)
data_daily

Unnamed: 0,date_issue,amount,covid_claims
0,2018-09-01,168,0
1,2018-09-02,346,0
2,2018-09-03,16169,0
3,2018-09-04,28529,0
4,2018-09-05,64135,0
...,...,...,...
915,2021-03-04,350720,42
916,2021-03-05,293427,45
917,2021-03-06,333658,38
918,2021-03-07,139824,26


In [16]:
# Roll the daily totals up into weekly totals. With 'W-Mon' each bin is labelled
# by a Monday and holds that Monday plus the days leading up to it (e.g. the
# 2018-09-03 row sums the 1st, 2nd and 3rd of September).
data_weekly = (
    data_daily
    .resample('W-Mon', on='date_issue')
    .sum()
    .reset_index()                    # bring 'date_issue' back as a column
    .sort_values(by='date_issue')     # chronological order
)
data_weekly

Unnamed: 0,date_issue,amount,covid_claims
0,2018-09-03,16683,0
1,2018-09-10,201774,0
2,2018-09-17,263661,0
3,2018-09-24,799703,0
4,2018-10-01,900305,0
...,...,...,...
127,2021-02-08,1949930,194
128,2021-02-15,1725225,185
129,2021-02-22,2232202,207
130,2021-03-01,2666662,271


In [17]:
# Exporting to excel
#
# DataFrame.to_excel writes the workbook to disk and returns None, so the calls
# are made purely for their side effect and nothing is assigned.
# NOTE(review): the default index=True also writes the row index as an unnamed
# first column, which shows up as 'Unnamed: 0' when the file is read back;
# pass index=False if downstream readers do not expect that column — confirm first.
data_daily.to_excel('../raw_data/daily_data_clean_with_covid.xlsx', engine='xlsxwriter')
data_weekly.to_excel('../raw_data/weekly_data_clean_with_covid.xlsx', engine='xlsxwriter')