### Importando bibliotecas

In [1]:
# Disable all Python warnings for the rest of the session.
# NOTE(review): this blanket filter also hides pandas warnings raised by later
# cells (e.g. about assigning to filtered slices) — consider narrowing it.
import warnings
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd
import numpy as np
from datetime import datetime as dt

### Importando bases de dados

In [36]:
# Load the solar-wind measurements, reading only the listed columns.
# The relative path uses forward slashes so it resolves on Windows, Linux and
# macOS alike (a backslash is an ordinary filename character outside Windows).
df_solar_wind = pd.read_csv(
    '../data/solar_wind.csv',
    usecols=[
        'period',
        'timedelta',
        'bx_gse',
        'by_gse',
        'bz_gse',
        'bt',
        'density',
        'speed',
        'temperature',
        'source',
    ],
)
df_solar_wind.head(3)

Unnamed: 0,period,timedelta,bx_gse,by_gse,bz_gse,bt,density,speed,temperature,source
0,train_a,0 days 00:00:00,-5.55,3.0,1.25,6.8,1.53,383.92,110237.0,ac
1,train_a,0 days 00:01:00,-5.58,3.16,1.17,6.83,1.69,381.79,123825.0,ac
2,train_a,0 days 00:02:00,-5.15,3.66,0.85,6.77,1.97,389.11,82548.0,ac


In [4]:
# Load the satellite positions. Forward slashes keep the relative path valid
# on every operating system, not only on Windows.
df_satellite_pos = pd.read_csv('../data/satellite_pos.csv')
df_satellite_pos.head(3)

Unnamed: 0,period,timedelta,gse_x_ace,gse_y_ace,gse_z_ace,gse_x_dscovr,gse_y_dscovr,gse_z_dscovr
0,train_a,0 days,1522376.9,143704.6,149496.7,,,
1,train_a,1 days,1525410.9,136108.8,151034.1,,,
2,train_a,2 days,1528484.9,128470.5,152387.7,,,


In [5]:
# Load the smoothed sunspot numbers. Forward slashes keep the relative path
# valid on every operating system, not only on Windows.
df_sunspots = pd.read_csv('../data/sunspots.csv')
df_sunspots.head(3)

Unnamed: 0,period,timedelta,smoothed_ssn
0,train_a,0 days 00:00:00,65.4
1,train_a,13 days 00:00:00,72.0
2,train_a,44 days 00:00:00,76.9


In [6]:
# Load the Dst labels. Forward slashes keep the relative path valid on every
# operating system, not only on Windows.
df_labels = pd.read_csv('../data/labels.csv')
df_labels.head(3)

Unnamed: 0,period,timedelta,dst
0,train_a,0 days 00:00:00,-7
1,train_a,0 days 01:00:00,-10
2,train_a,0 days 02:00:00,-10


### Tratando tipos de dados

##### Tratando tipos de dados do df_solar_wind

In [7]:
# Show the column dtypes and memory usage of the solar-wind frame as loaded.
df_solar_wind.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8392320 entries, 0 to 8392319
Data columns (total 10 columns):
 #   Column       Dtype  
---  ------       -----  
 0   period       object 
 1   timedelta    object 
 2   bx_gse       float64
 3   by_gse       float64
 4   bz_gse       float64
 5   bt           float64
 6   density      float64
 7   speed        float64
 8   temperature  float64
 9   source       object 
dtypes: float64(7), object(3)
memory usage: 640.3+ MB


In [8]:
# Downcast the float64 measurement columns to the smallest float dtype that fits.
col_num_solar_wind = df_solar_wind.select_dtypes(include='float64').columns
df_solar_wind[col_num_solar_wind] = (df_solar_wind[col_num_solar_wind]
                                     .apply(pd.to_numeric,
                                            downcast='float'))

# Parse the textual offsets into real Timedelta values.
df_solar_wind["timedelta"] = pd.to_timedelta(df_solar_wind["timedelta"])

# Start date of each training period; a row's timestamp is start + timedelta.
train_a_date = dt(year=1998, month=2, day=16, hour=0, minute=0, second=0)
train_b_date = dt(year=2013, month=6, day=1, hour=0, minute=0, second=0)
train_c_date = dt(year=2004, month=5, day=1, hour=0, minute=0, second=0)

# Commented-out dates kept from the original cell.
# NOTE(review): all three were named test_a_date; presumably they are the start
# dates of three different test periods — confirm before using them.
# test_a_date = dt(year=2001, month=6, day=1, hour=0, minute=0, second=0)
# test_a_date = dt(year=2011, month=1, day=1, hour=0, minute=0, second=0)
# test_a_date = dt(year=2019, month=6, day=1, hour=0, minute=0, second=0)

period_frames = []
for period_name, start_date in (('train_a', train_a_date),
                                ('train_b', train_b_date),
                                ('train_c', train_c_date)):
    # .copy() turns the selection into an independent frame, so adding a column
    # is not an assignment onto a slice of df_solar_wind.
    period_frame = df_solar_wind[df_solar_wind['period'] == period_name].copy()
    period_frame['date'] = period_frame['timedelta'] + start_date
    period_frames.append(period_frame)

# Keep the per-period frames available under their original names.
(df_solar_wind_period_a,
 df_solar_wind_period_b,
 df_solar_wind_period_c) = period_frames

# Stack the periods back together (a, b, c order) and drop the relative offset,
# which the absolute `date` column now replaces.
df_solar_wind = (pd.concat(period_frames, ignore_index=True)
                 .drop(columns=['timedelta']))

# Store the remaining object (string) columns as categoricals.
col_cat_solar_wind = df_solar_wind.select_dtypes(include='object').columns
df_solar_wind[col_cat_solar_wind] = (df_solar_wind[col_cat_solar_wind]
                                     .astype('category'))
df_solar_wind.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8392320 entries, 0 to 8392319
Data columns (total 10 columns):
 #   Column       Dtype         
---  ------       -----         
 0   period       category      
 1   bx_gse       float32       
 2   by_gse       float32       
 3   bz_gse       float32       
 4   bt           float32       
 5   density      float32       
 6   speed        float32       
 7   temperature  float32       
 8   source       category      
 9   date         datetime64[ns]
dtypes: category(2), datetime64[ns](1), float32(7)
memory usage: 304.1 MB


##### Tratando tipos de dados do df_satellite_pos

In [9]:
# Show dtypes, non-null counts and memory usage of the satellite-position frame.
df_satellite_pos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5828 entries, 0 to 5827
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   period        5828 non-null   object 
 1   timedelta     5828 non-null   object 
 2   gse_x_ace     5828 non-null   float64
 3   gse_y_ace     5828 non-null   float64
 4   gse_z_ace     5828 non-null   float64
 5   gse_x_dscovr  1034 non-null   float64
 6   gse_y_dscovr  1034 non-null   float64
 7   gse_z_dscovr  1034 non-null   float64
dtypes: float64(6), object(2)
memory usage: 364.4+ KB


In [10]:
# Downcast the float64 position columns to the smallest float dtype that fits.
col_num_satellite_pos = df_satellite_pos.select_dtypes(include='float64').columns
df_satellite_pos[col_num_satellite_pos] = (
    df_satellite_pos[col_num_satellite_pos]
    .apply(pd.to_numeric, downcast='float'))

# Parse the textual offsets into real Timedelta values.
df_satellite_pos["timedelta"] = pd.to_timedelta(df_satellite_pos["timedelta"])

# Start date of each training period; a row's timestamp is start + timedelta.
train_a_date = dt(year=1998, month=2, day=16, hour=0, minute=0, second=0)
train_b_date = dt(year=2013, month=6, day=1, hour=0, minute=0, second=0)
train_c_date = dt(year=2004, month=5, day=1, hour=0, minute=0, second=0)

period_frames = []
for period_name, start_date in (('train_a', train_a_date),
                                ('train_b', train_b_date),
                                ('train_c', train_c_date)):
    # .copy() turns the selection into an independent frame, so adding a column
    # is not an assignment onto a slice of df_satellite_pos.
    period_frame = df_satellite_pos[df_satellite_pos['period'] == period_name].copy()
    period_frame['date'] = period_frame['timedelta'] + start_date
    period_frames.append(period_frame)

# Keep the per-period frames available under their original names.
(df_satellite_pos_period_a,
 df_satellite_pos_period_b,
 df_satellite_pos_period_c) = period_frames

# Stack the periods back together (a, b, c order); `period` and `timedelta`
# are dropped because the absolute `date` column replaces them.
df_satellite_pos = (pd.concat(period_frames, ignore_index=True)
                    .drop(columns=['period', 'timedelta']))

##### Tratando tipos de dados do df_sunspots

In [11]:
# Show dtypes, non-null counts and memory usage of the sunspot frame.
df_sunspots.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 192 entries, 0 to 191
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   period        192 non-null    object 
 1   timedelta     192 non-null    object 
 2   smoothed_ssn  192 non-null    float64
dtypes: float64(1), object(2)
memory usage: 4.6+ KB


In [12]:
# Downcast the float64 columns to the smallest float dtype that fits.
col_num_sunspots = df_sunspots.select_dtypes(include='float64').columns
df_sunspots[col_num_sunspots] = (
    df_sunspots[col_num_sunspots]
    .apply(pd.to_numeric, downcast='float'))

# Parse the textual offsets into real Timedelta values.
df_sunspots["timedelta"] = pd.to_timedelta(df_sunspots["timedelta"])

# Start date of each training period; a row's timestamp is start + timedelta.
train_a_date = dt(year=1998, month=2, day=16, hour=0, minute=0, second=0)
train_b_date = dt(year=2013, month=6, day=1, hour=0, minute=0, second=0)
train_c_date = dt(year=2004, month=5, day=1, hour=0, minute=0, second=0)

period_frames = []
for period_name, start_date in (('train_a', train_a_date),
                                ('train_b', train_b_date),
                                ('train_c', train_c_date)):
    # .copy() turns the selection into an independent frame, so adding a column
    # is not an assignment onto a slice of df_sunspots.
    period_frame = df_sunspots[df_sunspots['period'] == period_name].copy()
    period_frame['date'] = period_frame['timedelta'] + start_date
    period_frames.append(period_frame)

# Keep the per-period frames available under their original names.
(df_sunspots_period_a,
 df_sunspots_period_b,
 df_sunspots_period_c) = period_frames

# Stack the periods back together (a, b, c order); `period` and `timedelta`
# are dropped because the absolute `date` column replaces them.
df_sunspots = (pd.concat(period_frames, ignore_index=True)
               .drop(columns=['period', 'timedelta']))

df_sunspots.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 192 entries, 0 to 191
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   smoothed_ssn  192 non-null    float32       
 1   date          192 non-null    datetime64[ns]
dtypes: datetime64[ns](1), float32(1)
memory usage: 2.4 KB


##### Tratando tipos de dados do df_labels

In [13]:
# Show dtypes, non-null counts and memory usage of the labels frame.
df_labels.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 139872 entries, 0 to 139871
Data columns (total 3 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   period     139872 non-null  object
 1   timedelta  139872 non-null  object
 2   dst        139872 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 3.2+ MB


In [14]:
# Downcast the int64 columns to the smallest integer dtype that fits.
col_num_labels = df_labels.select_dtypes(include='int64').columns
df_labels[col_num_labels] = (
    df_labels[col_num_labels]
    .apply(pd.to_numeric, downcast='integer'))

# Parse the textual offsets into real Timedelta values.
df_labels["timedelta"] = pd.to_timedelta(df_labels["timedelta"])

# Start date of each training period; a row's timestamp is start + timedelta.
train_a_date = dt(year=1998, month=2, day=16, hour=0, minute=0, second=0)
train_b_date = dt(year=2013, month=6, day=1, hour=0, minute=0, second=0)
train_c_date = dt(year=2004, month=5, day=1, hour=0, minute=0, second=0)

period_frames = []
for period_name, start_date in (('train_a', train_a_date),
                                ('train_b', train_b_date),
                                ('train_c', train_c_date)):
    # .copy() turns the selection into an independent frame, so adding a column
    # is not an assignment onto a slice of df_labels.
    period_frame = df_labels[df_labels['period'] == period_name].copy()
    period_frame['date'] = period_frame['timedelta'] + start_date
    period_frames.append(period_frame)

# Keep the per-period frames available under their original names.
(df_labels_period_a,
 df_labels_period_b,
 df_labels_period_c) = period_frames

# Stack the periods back together (a, b, c order); `period` and `timedelta`
# are dropped because the absolute `date` column replaces them.
df_labels = (pd.concat(period_frames, ignore_index=True)
             .drop(columns=['period', 'timedelta']))

df_labels.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 139872 entries, 0 to 139871
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype         
---  ------  --------------   -----         
 0   dst     139872 non-null  int16         
 1   date    139872 non-null  datetime64[ns]
dtypes: datetime64[ns](1), int16(1)
memory usage: 1.3 MB


### Removendo Valores fora de escala do df_solar_wind

In [15]:
# Expected range of each measurement, to be compared with describe() below.
# NOTE(review): "ok" presumably means the observed min/max fall inside the
# range, and "?" marks the zero minimum seen for that column — confirm.
# bx_gse: -200 to +200 nT ok
# by_gse: -200 to +200 nT ok
# bz_gse: -200 to +200 nT ok
# bt: 0 to 200 nT ok
# density: 0 to 200 N/cm³ ok
# speed: 200 to 2,000 km/s speed=0 ?
# temperature: 10,000 K to 10,000,000 K temperature=0 ?
df_solar_wind.describe()

Unnamed: 0,bx_gse,by_gse,bz_gse,bt,density,speed,temperature,date
count,8066432.0,8066432.0,8066432.0,8066432.0,7707430.0,7702765.0,7580552.0,8392320
mean,-0.6610162,0.109787,-0.0227423,5.606227,4.421726,430.5854,115096.7,2009-04-30 02:42:49.313661952
min,-54.63,-51.69,-55.0,0.03,0.0,0.0,0.0,1998-02-16 00:00:00
25%,-3.27,-2.46,-1.7,3.64,1.79,356.8,39787.0,2005-01-11 23:59:45
50%,-0.67,0.11,0.01,4.95,3.34,409.7,77371.0,2009-01-07 23:59:30
75%,2.02,2.64,1.69,6.72,5.71,485.66,151220.0,2015-06-04 23:59:15
max,55.55,57.6,74.03,80.53,199.7,1198.49,6223700.0,2019-05-31 23:59:00
std,3.66973,3.97534,3.351972,3.110002,4.331911,100.5777,120312.0,


In [16]:
# Replace out-of-range readings with NaN: speed outside 200..2000 and
# temperature outside 10_000..10_000_000.
# The write goes through a single .loc[rows, column] call on the frame itself.
# The chained form df[col].loc[mask] = ... assigns to an intermediate Series
# and leaves the frame unchanged when pandas Copy-on-Write is active.
speed_out_of_range = (
    (df_solar_wind["speed"] < 200) | (df_solar_wind["speed"] > 2000))
df_solar_wind.loc[speed_out_of_range, "speed"] = np.nan

temperature_out_of_range = (
    (df_solar_wind["temperature"] < 10_000)
    | (df_solar_wind["temperature"] > 10_000_000))
df_solar_wind.loc[temperature_out_of_range, "temperature"] = np.nan

### Preenchendo fontes ausentes do df_solar_wind com a moda de cada período

In [17]:
# Count the rows whose `source` value is missing.
df_solar_wind["source"].isna().sum()

np.int64(316816)

In [18]:
# Split by period. .copy() gives each period an independent frame, so the
# `source` column can be reassigned without writing onto a slice of
# df_solar_wind.
df_solar_wind_period_a = (
    df_solar_wind.loc[df_solar_wind["period"] == "train_a"].copy())

df_solar_wind_period_b = (
    df_solar_wind.loc[df_solar_wind["period"] == "train_b"].copy())

df_solar_wind_period_c = (
    df_solar_wind.loc[df_solar_wind["period"] == "train_c"].copy())

# Most frequent source within each period.
moda_a = df_solar_wind_period_a["source"].mode()[0]
moda_b = df_solar_wind_period_b["source"].mode()[0]
moda_c = df_solar_wind_period_c["source"].mode()[0]

# Fill missing sources with the period's mode. The result is assigned back
# explicitly: frame[col].fillna(..., inplace=True) acts on an intermediate
# Series and does not update the frame when pandas Copy-on-Write is active.
df_solar_wind_period_a["source"] = (
    df_solar_wind_period_a["source"].fillna(moda_a))
df_solar_wind_period_b["source"] = (
    df_solar_wind_period_b["source"].fillna(moda_b))
df_solar_wind_period_c["source"] = (
    df_solar_wind_period_c["source"].fillna(moda_c))

# Stack the periods back together in a, b, c order with a fresh index.
df_solar_wind = pd.concat(
    [df_solar_wind_period_a,
     df_solar_wind_period_b,
     df_solar_wind_period_c],
     ignore_index=True)

df_solar_wind.head(3)

Unnamed: 0,period,bx_gse,by_gse,bz_gse,bt,density,speed,temperature,source,date
0,train_a,-5.55,3.0,1.25,6.8,1.53,383.920013,110237.0,ac,1998-02-16 00:00:00
1,train_a,-5.58,3.16,1.17,6.83,1.69,381.790009,123825.0,ac,1998-02-16 00:01:00
2,train_a,-5.15,3.66,0.85,6.77,1.97,389.109985,82548.0,ac,1998-02-16 00:02:00


### Consolidando base de dados

##### Adicionando fontes dos dados ao df_satellite_pos

In [19]:
# Attach to each satellite-position row the `source` value that the
# solar-wind data holds for the same timestamp (left join on `date`).
source_by_date = df_solar_wind[["date", "source"]]
df_satellite_pos = df_satellite_pos.merge(source_by_date,
                                          how="left",
                                          on='date')

df_satellite_pos

Unnamed: 0,gse_x_ace,gse_y_ace,gse_z_ace,gse_x_dscovr,gse_y_dscovr,gse_z_dscovr,date,source
0,1522376.9,143704.6,149496.7,,,,1998-02-16,ac
1,1525410.9,136108.8,151034.1,,,,1998-02-17,ac
2,1528484.9,128470.5,152387.7,,,,1998-02-18,ac
3,1531570.3,120818.4,153561.4,,,,1998-02-19,ac
4,1534633.1,113178.5,154534.6,,,,1998-02-20,ac
...,...,...,...,...,...,...,...,...
5823,1413526.2,218154.6,141105.6,,,,2010-12-27,ac
5824,1416712.3,222471.2,141261.8,,,,2010-12-28,ac
5825,1420050.2,226690.3,141247.2,,,,2010-12-29,ac
5826,1423478.8,230817.3,141070.4,,,,2010-12-30,ac


In [20]:
# Build one coordinate triple per row: the ACE columns where source is "ac",
# the DSCOVR columns otherwise.
# The original pre-filled the target columns with NaN and then tested them
# with isnull(); that test was always true, so the source check alone decides.
coordinate_columns = (
    ("gse_x_satellite", "gse_x_ace", "gse_x_dscovr"),
    ("gse_y_satellite", "gse_y_ace", "gse_y_dscovr"),
    ("gse_z_satellite", "gse_z_ace", "gse_z_dscovr"),
)
for target_col, ace_col, dscovr_col in coordinate_columns:
    df_satellite_pos[target_col] = np.where(
        df_satellite_pos["source"] == "ac",
        df_satellite_pos[ace_col],
        df_satellite_pos[dscovr_col])

# Keep only the join keys and the unified coordinates.
df_satellite_pos = df_satellite_pos[['date',
                                     'source',
                                     'gse_x_satellite',
                                     'gse_y_satellite',
                                     'gse_z_satellite']]

##### Consolidando df_solar_wind e df_satellite_pos

In [21]:
# Left-join the unified satellite coordinates onto the solar-wind rows,
# matching on both timestamp and source.
df_solar_wind = df_solar_wind.merge(
    df_satellite_pos,
    how='left',
    on=['date', 'source'],
)
df_solar_wind.head(3)

Unnamed: 0,period,bx_gse,by_gse,bz_gse,bt,density,speed,temperature,source,date,gse_x_satellite,gse_y_satellite,gse_z_satellite
0,train_a,-5.55,3.0,1.25,6.8,1.53,383.920013,110237.0,ac,1998-02-16 00:00:00,1522376.9,143704.6,149496.7
1,train_a,-5.58,3.16,1.17,6.83,1.69,381.790009,123825.0,ac,1998-02-16 00:01:00,,,
2,train_a,-5.15,3.66,0.85,6.77,1.97,389.109985,82548.0,ac,1998-02-16 00:02:00,,,


##### Consolidando df_solar_wind e df_sunspots

In [22]:
# Left-join the smoothed sunspot numbers onto the solar-wind rows by timestamp.
df_solar_wind = df_solar_wind.merge(
    df_sunspots,
    how='left',
    on=['date'],
)
df_solar_wind.head(3)

Unnamed: 0,period,bx_gse,by_gse,bz_gse,bt,density,speed,temperature,source,date,gse_x_satellite,gse_y_satellite,gse_z_satellite,smoothed_ssn
0,train_a,-5.55,3.0,1.25,6.8,1.53,383.920013,110237.0,ac,1998-02-16 00:00:00,1522376.9,143704.6,149496.7,65.400002
1,train_a,-5.58,3.16,1.17,6.83,1.69,381.790009,123825.0,ac,1998-02-16 00:01:00,,,,
2,train_a,-5.15,3.66,0.85,6.77,1.97,389.109985,82548.0,ac,1998-02-16 00:02:00,,,,


##### Consolidando df_solar_wind e df_labels

In [23]:
# Left-join the Dst labels onto the solar-wind rows by timestamp.
df_solar_wind = df_solar_wind.merge(
    df_labels,
    how='left',
    on=['date'],
)
df_solar_wind.head(3)

Unnamed: 0,period,bx_gse,by_gse,bz_gse,bt,density,speed,temperature,source,date,gse_x_satellite,gse_y_satellite,gse_z_satellite,smoothed_ssn,dst
0,train_a,-5.55,3.0,1.25,6.8,1.53,383.920013,110237.0,ac,1998-02-16 00:00:00,1522376.9,143704.6,149496.7,65.400002,-7.0
1,train_a,-5.58,3.16,1.17,6.83,1.69,381.790009,123825.0,ac,1998-02-16 00:01:00,,,,,
2,train_a,-5.15,3.66,0.85,6.77,1.97,389.109985,82548.0,ac,1998-02-16 00:02:00,,,,,


### Configurando a data como índice do df consolidado (df_solar_wind)

In [24]:
# Use the timestamp as the index; the `date` column itself is removed.
df_solar_wind = df_solar_wind.set_index('date', drop=True)

### Exportando base de dados consolidada

In [25]:
# Persist the consolidated frame. Forward slashes keep the relative path valid
# on every operating system, not only on Windows.
df_solar_wind.to_parquet("../data/df_consolidado.parquet")

### Removendo valores ausentes para base de dados de validação

In [26]:
# Validation set: only the rows with no missing value in any column.
# NOTE(review): this also requires the merged position, sunspot and Dst
# columns to be present, which the previews show as NaN for most rows —
# confirm that so strict a filter is intended.
df_validacao = df_solar_wind.dropna(how='any')

# One frame per period, without the now-constant `period` column.
is_period_a = df_validacao["period"] == "train_a"
df_validacao_period_a = df_validacao.loc[is_period_a].drop('period', axis=1)

is_period_b = df_validacao["period"] == "train_b"
df_validacao_period_b = df_validacao.loc[is_period_b].drop('period', axis=1)

is_period_c = df_validacao["period"] == "train_c"
df_validacao_period_c = df_validacao.loc[is_period_c].drop('period', axis=1)

##### Exportando bases de dados de validação

In [27]:
# Persist the validation frames (per period and combined). Forward slashes
# keep the relative paths valid on every operating system, not only on Windows.
df_validacao_period_a.to_parquet('../data/df_validacao_period_a.parquet')
df_validacao_period_b.to_parquet('../data/df_validacao_period_b.parquet')
df_validacao_period_c.to_parquet('../data/df_validacao_period_c.parquet')
df_validacao.to_parquet('../data/df_consolidado_sem_nulos.parquet')

### Preenchendo valores ausentes para base de dados de desenvolvimento

##### Segmentando base de dados por período

In [28]:
# Development set: work on a copy so df_solar_wind keeps its missing values.
df_desenvolvimento = df_solar_wind.copy()

# One independent frame per period. The explicit .copy() matters because the
# following cells assign interpolated columns into these frames; without it
# those assignments would target slices of df_desenvolvimento.
df_desenvolvimento_period_a = (
    df_desenvolvimento.loc[df_desenvolvimento["period"] == "train_a"].copy())

df_desenvolvimento_period_b = (
    df_desenvolvimento.loc[df_desenvolvimento["period"] == "train_b"].copy())

df_desenvolvimento_period_c = (
    df_desenvolvimento.loc[df_desenvolvimento["period"] == "train_c"].copy())

##### Preenchendo valores ausentes dos ventos solares com interpolação linear

In [29]:
# Linearly interpolate the gaps of every solar-wind measurement, one period
# frame at a time so no value is interpolated across two periods.
# limit_direction='both' lets the fill run both forwards and backwards.
solar_wind_measurements = ("bx_gse", "by_gse", "bz_gse", "bt",
                           "density", "speed", "temperature")

for period_frame in (df_desenvolvimento_period_a,
                     df_desenvolvimento_period_b,
                     df_desenvolvimento_period_c):
    for measurement in solar_wind_measurements:
        period_frame[measurement] = period_frame[measurement].interpolate(
            method='linear', limit_direction='both')

##### Preenchendo Valores ausentes das posições dos satélites com interpolação linear

In [30]:
# Linearly interpolate the satellite coordinates within each period frame.
# limit_direction='both' lets the fill run both forwards and backwards.
satellite_coordinates = ("gse_x_satellite",
                         "gse_y_satellite",
                         "gse_z_satellite")

for period_frame in (df_desenvolvimento_period_a,
                     df_desenvolvimento_period_b,
                     df_desenvolvimento_period_c):
    for coordinate in satellite_coordinates:
        period_frame[coordinate] = period_frame[coordinate].interpolate(
            method='linear', limit_direction='both')

##### Preenchendo Valores ausentes das manchas solares com interpolação linear

In [31]:
# Linearly interpolate the smoothed sunspot number within each period frame.
# limit_direction='both' lets the fill run both forwards and backwards.
for period_frame in (df_desenvolvimento_period_a,
                     df_desenvolvimento_period_b,
                     df_desenvolvimento_period_c):
    period_frame["smoothed_ssn"] = period_frame["smoothed_ssn"].interpolate(
        method='linear', limit_direction='both')

##### Preenchendo valores ausentes de Dst com interpolação linear

In [32]:
# Linearly interpolate the Dst values within each period frame.
# limit_direction='both' lets the fill run both forwards and backwards.
for period_frame in (df_desenvolvimento_period_a,
                     df_desenvolvimento_period_b,
                     df_desenvolvimento_period_c):
    period_frame["dst"] = period_frame["dst"].interpolate(
        method='linear', limit_direction='both')

##### Consolidando dados preenchidos no df_desenvolvimento

In [33]:
# Stack the three filled period frames row-wise (a, b, c order), keeping the
# date index of each frame.
filled_period_frames = [
    df_desenvolvimento_period_a,
    df_desenvolvimento_period_b,
    df_desenvolvimento_period_c,
]
df_preenchidos = pd.concat(filled_period_frames, axis=0)

df_preenchidos

Unnamed: 0_level_0,period,bx_gse,by_gse,bz_gse,bt,density,speed,temperature,source,gse_x_satellite,gse_y_satellite,gse_z_satellite,smoothed_ssn,dst
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1998-02-16 00:00:00,train_a,-5.55,3.00,1.25,6.80,1.53,383.920013,110237.0,ac,1.522377e+06,143704.600000,149496.700000,65.400002,-7.00
1998-02-16 00:01:00,train_a,-5.58,3.16,1.17,6.83,1.69,381.790009,123825.0,ac,1.522379e+06,143699.325139,149497.767639,65.400352,-7.05
1998-02-16 00:02:00,train_a,-5.15,3.66,0.85,6.77,1.97,389.109985,82548.0,ac,1.522381e+06,143694.050278,149498.835278,65.400703,-7.10
1998-02-16 00:03:00,train_a,-5.20,3.68,0.68,6.74,1.97,389.109985,82548.0,ac,1.522383e+06,143688.775417,149499.902917,65.401062,-7.15
1998-02-16 00:04:00,train_a,-5.12,3.68,0.49,6.65,1.77,384.260010,94269.0,ac,1.522385e+06,143683.500556,149500.970556,65.401413,-7.20
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2010-12-31 23:55:00,train_c,-1.18,2.00,-3.92,4.56,4.64,348.730011,23368.0,ac,1.426938e+06,234813.700000,140701.800000,42.500000,-14.00
2010-12-31 23:56:00,train_c,-1.34,2.12,-3.83,4.58,4.27,346.359985,26497.0,ac,1.426938e+06,234813.700000,140701.800000,42.500000,-14.00
2010-12-31 23:57:00,train_c,-1.62,2.33,-3.63,4.61,3.95,344.119995,27050.0,ac,1.426938e+06,234813.700000,140701.800000,42.500000,-14.00
2010-12-31 23:58:00,train_c,-2.27,2.48,-3.22,4.65,2.81,338.519989,33257.0,ac,1.426938e+06,234813.700000,140701.800000,42.500000,-14.00


##### Exportando bases de dados de desenvolvimento

In [34]:
# Persist the filled development frames (per period and combined). Forward
# slashes keep the relative paths valid on every operating system, not only
# on Windows.
df_desenvolvimento_period_a.to_parquet(
    '../data/df_desenvolvimento_period_a.parquet')
df_desenvolvimento_period_b.to_parquet(
    '../data/df_desenvolvimento_period_b.parquet')
df_desenvolvimento_period_c.to_parquet(
    '../data/df_desenvolvimento_period_c.parquet')
df_preenchidos.to_parquet(
    '../data/df_consolidado_preenchido.parquet')