### Importando bibliotecas

In [1]:
# Disable all warnings for the rest of the session.
# NOTE(review): this blanket filter also silences pandas' chained-assignment
# and deprecation warnings; consider narrowing it to specific categories.
import warnings
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd
import numpy as np

### Importando bases de dados

In [3]:
# Columns of solar_wind.csv that are loaded; anything else in the file is
# skipped by read_csv.
colunas_solar_wind = [
    'period',
    'timedelta',
    'bx_gse',
    'by_gse',
    'bz_gse',
    'bt',
    'density',
    'speed',
    'temperature',
    'source',
]

# Forward slashes resolve on Windows and on POSIX systems, whereas a
# backslash-separated relative path is only valid on Windows.
df_solar_wind = pd.read_csv('../data/solar_wind.csv',
                            usecols=colunas_solar_wind)
df_solar_wind.head(3)

Unnamed: 0,period,timedelta,bx_gse,by_gse,bz_gse,bt,density,speed,temperature,source
0,train_a,0 days 00:00:00,-5.55,3.0,1.25,6.8,1.53,383.92,110237.0,ac
1,train_a,0 days 00:01:00,-5.58,3.16,1.17,6.83,1.69,381.79,123825.0,ac
2,train_a,0 days 00:02:00,-5.15,3.66,0.85,6.77,1.97,389.11,82548.0,ac


In [4]:
# Load the satellite position table. Forward slashes keep the relative path
# valid on Windows and POSIX systems alike (backslashes are Windows-only).
df_satellite_pos = pd.read_csv('../data/satellite_pos.csv')
df_satellite_pos.head(3)

Unnamed: 0,period,timedelta,gse_x_ace,gse_y_ace,gse_z_ace,gse_x_dscovr,gse_y_dscovr,gse_z_dscovr
0,train_a,0 days,1522376.9,143704.6,149496.7,,,
1,train_a,1 days,1525410.9,136108.8,151034.1,,,
2,train_a,2 days,1528484.9,128470.5,152387.7,,,


In [5]:
# Load the sunspot table. Forward slashes keep the relative path valid on
# Windows and POSIX systems alike (backslashes are Windows-only).
df_sunspots = pd.read_csv('../data/sunspots.csv')
df_sunspots.head(3)

Unnamed: 0,period,timedelta,smoothed_ssn
0,train_a,0 days 00:00:00,65.4
1,train_a,13 days 00:00:00,72.0
2,train_a,44 days 00:00:00,76.9


In [6]:
# Load the label (dst) table. Forward slashes keep the relative path valid on
# Windows and POSIX systems alike (backslashes are Windows-only).
df_labels = pd.read_csv('../data/labels.csv')
df_labels.head(3)

Unnamed: 0,period,timedelta,dst
0,train_a,0 days 00:00:00,-7
1,train_a,0 days 01:00:00,-10
2,train_a,0 days 02:00:00,-10


### Tratando tipos de dados

##### Tratando tipos de dados do df_solar_wind

In [7]:
# Print the column dtypes and memory usage of df_solar_wind.
df_solar_wind.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8392320 entries, 0 to 8392319
Data columns (total 10 columns):
 #   Column       Dtype  
---  ------       -----  
 0   period       object 
 1   timedelta    object 
 2   bx_gse       float64
 3   by_gse       float64
 4   bz_gse       float64
 5   bt           float64
 6   density      float64
 7   speed        float64
 8   temperature  float64
 9   source       object 
dtypes: float64(7), object(3)
memory usage: 640.3+ MB


In [8]:
# Downcast every float64 column to the narrowest float dtype that
# pd.to_numeric selects for it.
col_num_solar_wind = df_solar_wind.select_dtypes(include='float64').columns
for coluna in col_num_solar_wind:
    df_solar_wind[coluna] = pd.to_numeric(df_solar_wind[coluna],
                                          downcast='float')

# Parse the textual timedelta column into a real timedelta dtype. This runs
# before the object-column selection below so that timedelta is not picked
# up as a categorical column.
timedelta_convertido = pd.to_timedelta(df_solar_wind["timedelta"])
df_solar_wind["timedelta"] = timedelta_convertido

# Store the remaining object (string) columns as categoricals.
col_cat_solar_wind = df_solar_wind.select_dtypes(include='object').columns
for coluna in col_cat_solar_wind:
    df_solar_wind[coluna] = df_solar_wind[coluna].astype('category')

df_solar_wind.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8392320 entries, 0 to 8392319
Data columns (total 10 columns):
 #   Column       Dtype          
---  ------       -----          
 0   period       category       
 1   timedelta    timedelta64[ns]
 2   bx_gse       float32        
 3   by_gse       float32        
 4   bz_gse       float32        
 5   bt           float32        
 6   density      float32        
 7   speed        float32        
 8   temperature  float32        
 9   source       category       
dtypes: category(2), float32(7), timedelta64[ns](1)
memory usage: 304.1 MB


##### Tratando tipos de dados do df_satellite_pos

In [9]:
# Print the column dtypes, non-null counts and memory usage of df_satellite_pos.
df_satellite_pos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5828 entries, 0 to 5827
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   period        5828 non-null   object 
 1   timedelta     5828 non-null   object 
 2   gse_x_ace     5828 non-null   float64
 3   gse_y_ace     5828 non-null   float64
 4   gse_z_ace     5828 non-null   float64
 5   gse_x_dscovr  1034 non-null   float64
 6   gse_y_dscovr  1034 non-null   float64
 7   gse_z_dscovr  1034 non-null   float64
dtypes: float64(6), object(2)
memory usage: 364.4+ KB


In [10]:
# Downcast every float64 column to the narrowest float dtype that
# pd.to_numeric selects for it.
col_num_satellite_pos = df_satellite_pos.select_dtypes(include='float64').columns
for coluna in col_num_satellite_pos:
    df_satellite_pos[coluna] = pd.to_numeric(df_satellite_pos[coluna],
                                             downcast='float')

# Parse the textual timedelta column into a real timedelta dtype.
timedelta_convertido = pd.to_timedelta(df_satellite_pos["timedelta"])
df_satellite_pos["timedelta"] = timedelta_convertido

# Store the period label as a categorical.
periodo_categorico = df_satellite_pos["period"].astype('category')
df_satellite_pos["period"] = periodo_categorico

df_satellite_pos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5828 entries, 0 to 5827
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype          
---  ------        --------------  -----          
 0   period        5828 non-null   category       
 1   timedelta     5828 non-null   timedelta64[ns]
 2   gse_x_ace     5828 non-null   float64        
 3   gse_y_ace     5828 non-null   float64        
 4   gse_z_ace     5828 non-null   float64        
 5   gse_x_dscovr  1034 non-null   float32        
 6   gse_y_dscovr  1034 non-null   float32        
 7   gse_z_dscovr  1034 non-null   float32        
dtypes: category(1), float32(3), float64(3), timedelta64[ns](1)
memory usage: 256.4 KB


##### Tratando tipos de dados do df_sunspots

In [11]:
# Print the column dtypes, non-null counts and memory usage of df_sunspots.
df_sunspots.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 192 entries, 0 to 191
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   period        192 non-null    object 
 1   timedelta     192 non-null    object 
 2   smoothed_ssn  192 non-null    float64
dtypes: float64(1), object(2)
memory usage: 4.6+ KB


In [12]:
# Downcast every float64 column to the narrowest float dtype that
# pd.to_numeric selects for it.
col_num_sunspots = df_sunspots.select_dtypes(include='float64').columns
for coluna in col_num_sunspots:
    df_sunspots[coluna] = pd.to_numeric(df_sunspots[coluna], downcast='float')

# Parse the textual timedelta column into a real timedelta dtype.
timedelta_convertido = pd.to_timedelta(df_sunspots["timedelta"])
df_sunspots["timedelta"] = timedelta_convertido

# Store the period label as a categorical.
periodo_categorico = df_sunspots["period"].astype('category')
df_sunspots["period"] = periodo_categorico

df_sunspots.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 192 entries, 0 to 191
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype          
---  ------        --------------  -----          
 0   period        192 non-null    category       
 1   timedelta     192 non-null    timedelta64[ns]
 2   smoothed_ssn  192 non-null    float32        
dtypes: category(1), float32(1), timedelta64[ns](1)
memory usage: 2.7 KB


##### Tratando tipos de dados do df_labels

In [13]:
# Print the column dtypes, non-null counts and memory usage of df_labels.
df_labels.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 139872 entries, 0 to 139871
Data columns (total 3 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   period     139872 non-null  object
 1   timedelta  139872 non-null  object
 2   dst        139872 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 3.2+ MB


In [14]:
# Downcast every int64 column to the narrowest integer dtype that
# pd.to_numeric selects for it.
col_num_labels = df_labels.select_dtypes(include='int64').columns
for coluna in col_num_labels:
    df_labels[coluna] = pd.to_numeric(df_labels[coluna], downcast='integer')

# Parse the textual timedelta column into a real timedelta dtype.
timedelta_convertido = pd.to_timedelta(df_labels["timedelta"])
df_labels["timedelta"] = timedelta_convertido

# Store the period label as a categorical.
periodo_categorico = df_labels["period"].astype('category')
df_labels["period"] = periodo_categorico

df_labels.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 139872 entries, 0 to 139871
Data columns (total 3 columns):
 #   Column     Non-Null Count   Dtype          
---  ------     --------------   -----          
 0   period     139872 non-null  category       
 1   timedelta  139872 non-null  timedelta64[ns]
 2   dst        139872 non-null  int16          
dtypes: category(1), int16(1), timedelta64[ns](1)
memory usage: 1.5 MB


### Removendo Valores fora de escala do df_solar_wind

In [15]:
# Value ranges the author expects for each column, checked against the
# summary statistics produced by describe():
# bx_gse: -200 to +200 nT -> ok
# by_gse: -200 to +200 nT -> ok
# bz_gse: -200 to +200 nT -> ok
# bt: 0 to 200 nT -> ok
# density: 0 to 200 N/cm³ -> ok
# speed: 200 to 2,000 km/s -> speed = 0 is questionable
# temperature: 10,000 K to 10,000,000 K -> temperature = 0 is questionable
df_solar_wind.describe()

Unnamed: 0,timedelta,bx_gse,by_gse,bz_gse,bt,density,speed,temperature
count,8392320,8066432.0,8066432.0,8066432.0,8066432.0,7707430.0,7702765.0,7580552.0
mean,1044 days 16:41:54.955388112,-0.6610162,0.109787,-0.0227423,5.606227,4.421726,430.5854,115096.7
std,660 days 08:03:01.062266016,3.66973,3.97534,3.351972,3.110002,4.331911,100.5777,120312.0
min,0 days 00:00:00,-54.63,-51.69,-55.0,0.03,0.0,0.0,0.0
25%,485 days 15:59:45,-3.27,-2.46,-1.7,3.64,1.79,356.8,39787.0
50%,971 days 07:59:30,-0.67,0.11,0.01,4.95,3.34,409.7,77371.0
75%,1584 days 23:59:15,2.02,2.64,1.69,6.72,5.71,485.66,151220.0
max,2435 days 23:59:00,55.55,57.6,74.03,80.53,199.7,1198.49,6223700.0


In [16]:
# Replace out-of-range speed and temperature readings with NaN.
#
# The assignment goes through a single df.loc[rows, column] call. The chained
# form df[column].loc[rows] = value writes into an intermediate Series, which
# is not guaranteed to update the frame and never does under Copy-on-Write.
velocidade_invalida = (
    (df_solar_wind["speed"] < 200) | (df_solar_wind["speed"] > 2000))
df_solar_wind.loc[velocidade_invalida, "speed"] = np.nan

temperatura_invalida = (
    (df_solar_wind["temperature"] < 10_000)
    | (df_solar_wind["temperature"] > 10_000_000))
df_solar_wind.loc[temperatura_invalida, "temperature"] = np.nan

### Preenchendo fontes ausentes do df_solar_wind com a moda de cada período

In [17]:
# Count the rows whose source value is missing.
df_solar_wind["source"].isna().sum()

np.int64(316816)

In [18]:
def preencher_source_com_moda(df, periodo):
    """Return the rows of one period with missing ``source`` values filled.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the ``period`` and ``source`` columns.
    periodo : str
        Value of ``period`` whose rows are selected.

    Returns
    -------
    pandas.DataFrame
        Independent copy of the selected rows in which missing ``source``
        entries are replaced by the most frequent ``source`` of that period.
        When the period has no known ``source`` at all, the rows are returned
        unchanged.
    """
    # Work on an explicit copy so the fill is a plain column assignment
    # rather than an in-place fillna on a column of a sliced frame, which is
    # chained assignment and is not guaranteed to reach the frame.
    df_periodo = df.loc[df["period"] == periodo].copy()
    moda = df_periodo["source"].mode()
    if not moda.empty:
        df_periodo["source"] = df_periodo["source"].fillna(moda.iloc[0])
    return df_periodo


df_solar_wind_period_a = preencher_source_com_moda(df_solar_wind, "train_a")
df_solar_wind_period_b = preencher_source_com_moda(df_solar_wind, "train_b")
df_solar_wind_period_c = preencher_source_com_moda(df_solar_wind, "train_c")

# Stack the three periods back together (a, b, c) with a fresh RangeIndex.
df_solar_wind = pd.concat(
    [df_solar_wind_period_a,
     df_solar_wind_period_b,
     df_solar_wind_period_c],
    ignore_index=True)

df_solar_wind.head(3)

Unnamed: 0,period,timedelta,bx_gse,by_gse,bz_gse,bt,density,speed,temperature,source
0,train_a,0 days 00:00:00,-5.55,3.0,1.25,6.8,1.53,383.920013,110237.0,ac
1,train_a,0 days 00:01:00,-5.58,3.16,1.17,6.83,1.69,381.790009,123825.0,ac
2,train_a,0 days 00:02:00,-5.15,3.66,0.85,6.77,1.97,389.109985,82548.0,ac


### Consolidando base de dados

##### Preparando df_solar_wind e df_satellite_pos

In [19]:
# Add the whole-day component of timedelta as an integer "dia" column on both
# frames; it is the key later used to match them.
for quadro in (df_solar_wind, df_satellite_pos):
    quadro["dia"] = quadro["timedelta"].dt.days

In [20]:
# Attach the solar-wind source to each position row by (period, dia).
# The right-hand frame is not de-duplicated, so a position row is repeated
# once for every solar-wind row that shares its (period, dia).
fontes_solar_wind = df_solar_wind[["period", "dia", "source"]]
df_satellite_pos = df_satellite_pos.merge(
    fontes_solar_wind,
    how="left",
    on=["period", "dia"])

In [21]:
# Build one coordinate column per GSE axis: rows whose source is "ac" take the
# gse_*_ace value, every other row (including rows without a source) takes
# the gse_*_dscovr value.
#
# The previous version first created the target columns as NaN and then
# tested them with isnull(), a condition that was always true; it is dropped
# here, and the three copy-pasted statements are folded into one loop.
usa_ace = df_satellite_pos["source"] == "ac"
for eixo in ("x", "y", "z"):
    df_satellite_pos[f"gse_{eixo}_satellite"] = np.where(
        usa_ace,
        df_satellite_pos[f"gse_{eixo}_ace"],
        df_satellite_pos[f"gse_{eixo}_dscovr"])

# Keep only the keys and the unified coordinate columns.
df_satellite_pos = df_satellite_pos[['period',
                                     'timedelta',
                                     'dia',
                                     'source',
                                     'gse_x_satellite',
                                     'gse_y_satellite',
                                     'gse_z_satellite']]

##### Consolidando df_solar_wind e df_satellite_pos

In [22]:
# Attach the satellite coordinates to every solar-wind row by key.
#
# The previous version compared columns of df_solar_wind and df_satellite_pos
# row by row, which only works while both frames have exactly the same length
# and row order. Matching on (period, dia, source) gives the same values in
# that case and stays correct when the frames are not aligned.
chaves_posicao = ["period", "dia", "source"]
colunas_posicao = ["gse_x_satellite", "gse_y_satellite", "gse_z_satellite"]

# df_satellite_pos may repeat a key many times; keep one row per key so the
# merge cannot multiply solar-wind rows.
posicoes_por_chave = (
    df_satellite_pos[chaves_posicao + colunas_posicao]
    .drop_duplicates(subset=chaves_posicao))

df_solar_wind = (
    df_solar_wind
    # Dropping leftovers first keeps the cell re-runnable without _x/_y
    # suffixed duplicates of the coordinate columns.
    .drop(columns=colunas_posicao, errors="ignore")
    .merge(posicoes_por_chave,
           on=chaves_posicao,
           how="left",
           validate="many_to_one"))

df_solar_wind.head(3)

Unnamed: 0,period,timedelta,bx_gse,by_gse,bz_gse,bt,density,speed,temperature,source,dia,gse_x_satellite,gse_y_satellite,gse_z_satellite
0,train_a,0 days 00:00:00,-5.55,3.0,1.25,6.8,1.53,383.920013,110237.0,ac,0,1522376.9,143704.6,149496.7
1,train_a,0 days 00:01:00,-5.58,3.16,1.17,6.83,1.69,381.790009,123825.0,ac,0,1522376.9,143704.6,149496.7
2,train_a,0 days 00:02:00,-5.15,3.66,0.85,6.77,1.97,389.109985,82548.0,ac,0,1522376.9,143704.6,149496.7


##### Consolidando df_solar_wind e df_sunspots

In [23]:
# Left-join the sunspot values onto the solar-wind rows. Only rows whose
# (period, timedelta) pair also exists in df_sunspots receive a value; every
# other row gets NaN in the added columns.
chaves_sunspots = ['period', 'timedelta']
df_solar_wind = df_solar_wind.merge(df_sunspots,
                                    how='left',
                                    on=chaves_sunspots)
df_solar_wind.head(3)

Unnamed: 0,period,timedelta,bx_gse,by_gse,bz_gse,bt,density,speed,temperature,source,dia,gse_x_satellite,gse_y_satellite,gse_z_satellite,smoothed_ssn
0,train_a,0 days 00:00:00,-5.55,3.0,1.25,6.8,1.53,383.920013,110237.0,ac,0,1522376.9,143704.6,149496.7,65.400002
1,train_a,0 days 00:01:00,-5.58,3.16,1.17,6.83,1.69,381.790009,123825.0,ac,0,1522376.9,143704.6,149496.7,
2,train_a,0 days 00:02:00,-5.15,3.66,0.85,6.77,1.97,389.109985,82548.0,ac,0,1522376.9,143704.6,149496.7,


##### Consolidando df_solar_wind e df_labels

In [24]:
# Left-join the labels onto the solar-wind rows. Only rows whose
# (period, timedelta) pair also exists in df_labels receive a value; every
# other row gets NaN in the added columns.
chaves_labels = ['period', 'timedelta']
df_solar_wind = df_solar_wind.merge(df_labels,
                                    how='left',
                                    on=chaves_labels)
df_solar_wind.head(3)

Unnamed: 0,period,timedelta,bx_gse,by_gse,bz_gse,bt,density,speed,temperature,source,dia,gse_x_satellite,gse_y_satellite,gse_z_satellite,smoothed_ssn,dst
0,train_a,0 days 00:00:00,-5.55,3.0,1.25,6.8,1.53,383.920013,110237.0,ac,0,1522376.9,143704.6,149496.7,65.400002,-7.0
1,train_a,0 days 00:01:00,-5.58,3.16,1.17,6.83,1.69,381.790009,123825.0,ac,0,1522376.9,143704.6,149496.7,,
2,train_a,0 days 00:02:00,-5.15,3.66,0.85,6.77,1.97,389.109985,82548.0,ac,0,1522376.9,143704.6,149496.7,,


##### Removendo coluna dia

In [25]:
# The helper "dia" column is no longer needed once the frames are merged.
df_solar_wind = df_solar_wind.drop(columns="dia")
df_solar_wind.head(3)

Unnamed: 0,period,timedelta,bx_gse,by_gse,bz_gse,bt,density,speed,temperature,source,gse_x_satellite,gse_y_satellite,gse_z_satellite,smoothed_ssn,dst
0,train_a,0 days 00:00:00,-5.55,3.0,1.25,6.8,1.53,383.920013,110237.0,ac,1522376.9,143704.6,149496.7,65.400002,-7.0
1,train_a,0 days 00:01:00,-5.58,3.16,1.17,6.83,1.69,381.790009,123825.0,ac,1522376.9,143704.6,149496.7,,
2,train_a,0 days 00:02:00,-5.15,3.66,0.85,6.77,1.97,389.109985,82548.0,ac,1522376.9,143704.6,149496.7,,


##### Exportando base de dados consolidada

In [26]:
# Write the consolidated frame to Parquet. Forward slashes keep the relative
# path valid on Windows and POSIX systems alike (backslashes are Windows-only).
df_solar_wind.to_parquet("../data/df_consolidado.parquet")

### Removendo valores ausentes para base de dados de validação

In [27]:
# Validation set: keep only rows with no missing value in any column.
# NOTE(review): smoothed_ssn and dst are NaN on every row without an exact
# (period, timedelta) match in their source tables, so how='any' may discard
# most rows — confirm this is intended.
df_validacao = df_solar_wind.dropna(how='any')

# One frame per training period.
periodo_validacao = df_validacao["period"]
df_validacao_period_a = df_validacao[periodo_validacao == "train_a"]
df_validacao_period_b = df_validacao[periodo_validacao == "train_b"]
df_validacao_period_c = df_validacao[periodo_validacao == "train_c"]

##### Exportando bases de dados de validação

In [28]:
# Write one Parquet file per validation period. Forward slashes keep the
# relative paths valid on Windows and POSIX systems alike (backslashes are
# Windows-only).
df_validacao_period_a.to_parquet('../data/df_validacao_period_a.parquet')
df_validacao_period_b.to_parquet('../data/df_validacao_period_b.parquet')
df_validacao_period_c.to_parquet('../data/df_validacao_period_c.parquet')

### Preenchendo valores ausentes para base de dados de desenvolvimento

##### Segmentando base de dados por período

In [29]:
# Development set: starts as a full copy of the consolidated frame.
df_desenvolvimento = df_solar_wind.copy()

# One frame per training period. The explicit .copy() makes each of them an
# independent frame, so assigning columns to it later is a plain assignment
# and not a write into a slice of df_desenvolvimento (which pandas flags with
# SettingWithCopyWarning).
df_desenvolvimento_period_a = (
    df_desenvolvimento.loc[df_desenvolvimento["period"] == "train_a"].copy())

df_desenvolvimento_period_b = (
    df_desenvolvimento.loc[df_desenvolvimento["period"] == "train_b"].copy())

df_desenvolvimento_period_c = (
    df_desenvolvimento.loc[df_desenvolvimento["period"] == "train_c"].copy())

##### Preenchendo valores ausentes dos ventos solares com interpolação linear

In [30]:
# Solar-wind measurement columns whose gaps are filled.
colunas_vento_solar = [
    "bx_gse",
    "by_gse",
    "bz_gse",
    "bt",
    "density",
    "speed",
    "temperature",
]

# Fill the gaps of each column by linear interpolation, separately for each
# period so values never leak across period boundaries. limit_direction='both'
# also fills gaps at the very start and end of a period.
# This loop replaces 21 copy-pasted statements (7 columns x 3 periods) and
# performs exactly the same assignments.
for df_periodo in (df_desenvolvimento_period_a,
                   df_desenvolvimento_period_b,
                   df_desenvolvimento_period_c):
    for coluna in colunas_vento_solar:
        df_periodo[coluna] = (
            df_periodo[coluna]
            .interpolate(method='linear', limit_direction='both'))

##### Preenchendo valores ausentes das manchas solares com interpolação linear

In [31]:
# Fill the gaps of smoothed_ssn by linear interpolation within each period;
# limit_direction='both' also fills gaps at the start and end of a period.
for df_periodo in (df_desenvolvimento_period_a,
                   df_desenvolvimento_period_b,
                   df_desenvolvimento_period_c):
    ssn_preenchido = df_periodo["smoothed_ssn"].interpolate(
        method='linear', limit_direction='both')
    df_periodo["smoothed_ssn"] = ssn_preenchido

##### Preenchendo valores ausentes de Dst com interpolação linear

In [32]:
# Fill the gaps of dst by linear interpolation within each period;
# limit_direction='both' also fills gaps at the start and end of a period.
for df_periodo in (df_desenvolvimento_period_a,
                   df_desenvolvimento_period_b,
                   df_desenvolvimento_period_c):
    dst_preenchido = df_periodo["dst"].interpolate(
        method='linear', limit_direction='both')
    df_periodo["dst"] = dst_preenchido

##### Exportando bases de dados de desenvolvimento

In [33]:
# Write one Parquet file per development period. Forward slashes keep the
# relative paths valid on Windows and POSIX systems alike (backslashes are
# Windows-only).
df_desenvolvimento_period_a.to_parquet(
    '../data/df_desenvolvimento_period_a.parquet')
df_desenvolvimento_period_b.to_parquet(
    '../data/df_desenvolvimento_period_b.parquet')
df_desenvolvimento_period_c.to_parquet(
    '../data/df_desenvolvimento_period_c.parquet')