#### Importando bibliotecas

In [1]:
# desabilita os warnings
import warnings
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd
import numpy as np

import plotly.express as px
import matplotlib.pyplot as plt


# from datetime import datetime as dt

#### Tratando dados de solar_wind

##### Importando o conjunto de dados

In [3]:
# Load the solar wind CSV. The relative path uses forward slashes so it resolves
# on Windows, Linux and macOS alike (a backslash is a separator only on Windows).
df_solar_wind = pd.read_csv('../data/solar_wind.csv')
# Preview the first three rows.
df_solar_wind.head(3)

Unnamed: 0,period,timedelta,bx_gse,by_gse,bz_gse,theta_gse,phi_gse,bx_gsm,by_gsm,bz_gsm,theta_gsm,phi_gsm,bt,density,speed,temperature,source
0,train_a,0 days 00:00:00,-5.55,3.0,1.25,11.09,153.37,-5.55,3.0,1.25,11.09,153.37,6.8,1.53,383.92,110237.0,ac
1,train_a,0 days 00:01:00,-5.58,3.16,1.17,10.1,151.91,-5.58,3.16,1.17,10.1,151.91,6.83,1.69,381.79,123825.0,ac
2,train_a,0 days 00:02:00,-5.15,3.66,0.85,7.87,146.04,-5.15,3.66,0.85,7.87,146.04,6.77,1.97,389.11,82548.0,ac


##### Tratando tipos de dados

In [4]:
# Inspect the column dtypes and memory footprint as loaded from the CSV.
df_solar_wind.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8392320 entries, 0 to 8392319
Data columns (total 17 columns):
 #   Column       Dtype  
---  ------       -----  
 0   period       object 
 1   timedelta    object 
 2   bx_gse       float64
 3   by_gse       float64
 4   bz_gse       float64
 5   theta_gse    float64
 6   phi_gse      float64
 7   bx_gsm       float64
 8   by_gsm       float64
 9   bz_gsm       float64
 10  theta_gsm    float64
 11  phi_gsm      float64
 12  bt           float64
 13  density      float64
 14  speed        float64
 15  temperature  float64
 16  source       object 
dtypes: float64(14), object(3)
memory usage: 1.1+ GB


In [5]:
# Parse the textual durations first, so that "timedelta" is no longer an object
# column when the categorical candidates are selected further down.
df_solar_wind["timedelta"] = pd.to_timedelta(df_solar_wind["timedelta"])

# Shrink the float64 columns to the smallest float dtype able to hold their values.
col_num_solar_wind = df_solar_wind.select_dtypes(include='float64').columns
df_solar_wind[col_num_solar_wind] = df_solar_wind[col_num_solar_wind].apply(
    lambda column: pd.to_numeric(column, downcast='float')
)

# Whatever is still object-typed is stored as a categorical.
col_cat_solar_wind = df_solar_wind.select_dtypes(include='object').columns
df_solar_wind[col_cat_solar_wind] = df_solar_wind[col_cat_solar_wind].astype('category')

# Show the resulting dtypes and memory usage.
df_solar_wind.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8392320 entries, 0 to 8392319
Data columns (total 17 columns):
 #   Column       Dtype          
---  ------       -----          
 0   period       category       
 1   timedelta    timedelta64[ns]
 2   bx_gse       float32        
 3   by_gse       float32        
 4   bz_gse       float32        
 5   theta_gse    float32        
 6   phi_gse      float32        
 7   bx_gsm       float32        
 8   by_gsm       float32        
 9   bz_gsm       float32        
 10  theta_gsm    float32        
 11  phi_gsm      float32        
 12  bt           float32        
 13  density      float32        
 14  speed        float32        
 15  temperature  float32        
 16  source       category       
dtypes: category(2), float32(14), timedelta64[ns](1)
memory usage: 528.2 MB


In [6]:
# Export solar_wind with the converted dtypes. Forward slashes keep the relative
# path valid on every operating system, not only on Windows.
df_solar_wind.to_parquet('../data/df_solar_wind.parquet')

##### Tratando Valores fora de escala

In [7]:
# Reference range noted for each column, compared with the summary statistics below
# ("ok" = the observed min/max fall inside the range):
# bx_gse: -200 to +200 nT ok
# by_gse: -200 to +200 nT ok
# bz_gse: -200 to +200 nT ok
# theta_gse: -90° to 90° ok
# phi_gse: 0° to 360° ok
# bx_gsm: -200 to +200 nT ok
# by_gsm: -200 to +200 nT ok
# bz_gsm: -200 to +200 nT ok
# theta_gsm: -90° to 90° ok
# phi_gsm: 0° to 360° ok
# bt: 0 to 200 nT ok
# density: 0 to 200 N/cm³ ok
# speed: 200 to 2,000 km/s -- why is there speed=0?
# temperature: 10,000 K to 10,000,000 K -- why is there temperature=0?
df_solar_wind.describe()

Unnamed: 0,timedelta,bx_gse,by_gse,bz_gse,theta_gse,phi_gse,bx_gsm,by_gsm,bz_gsm,theta_gsm,phi_gsm,bt,density,speed,temperature
count,8392320,8066432.0,8066432.0,8066432.0,8066432.0,8065932.0,8066432.0,8066432.0,8066432.0,8066432.0,8065932.0,8066432.0,7707430.0,7702765.0,7580552.0
mean,1044 days 16:41:54.955388112,-0.6610162,0.109787,-0.0227423,0.1545628,194.1759,-0.6608664,0.08842117,-0.02912839,0.1967691,194.3713,5.606227,4.421726,430.5854,115096.7
std,660 days 08:03:01.062266016,3.66973,3.97534,3.351972,32.12916,96.62813,3.667605,3.909885,3.43058,33.27889,96.82818,3.110002,4.331911,100.5777,120312.0
min,0 days 00:00:00,-54.63,-51.69,-55.0,-89.89,0.0,-54.63,-52.68,-55.69,-89.91,0.0,0.03,0.0,0.0,0.0
25%,485 days 15:59:45,-3.27,-2.46,-1.7,-21.45,124.07,-3.27,-2.38,-1.8,-22.79,124.69,3.64,1.79,356.8,39787.0
50%,971 days 07:59:30,-0.67,0.11,0.01,0.14,176.86,-0.67,0.08,0.0,0.06,177.66,4.95,3.34,409.7,77371.0
75%,1584 days 23:59:15,2.02,2.64,1.69,21.68,286.42,2.02,2.54,1.79,23.13,286.56,6.72,5.71,485.66,151220.0
max,2435 days 23:59:00,55.55,57.6,74.03,89.94,360.0,55.54,53.88,72.45,89.96,360.0,80.53,199.7,1198.49,6223700.0


In [8]:
# Replace out-of-range speed and temperature readings with NaN.
# Row mask and column label go through a single .loc call on the DataFrame itself:
# the chained form df["col"].loc[mask] = ... assigns through an intermediate Series,
# which pandas does not guarantee to write back to df_solar_wind (and which never
# does under Copy-on-Write).
# NOTE(review): the parquet export earlier in this notebook runs before this cleanup,
# so the exported file still holds these values -- confirm that is intended.
df_solar_wind.loc[(df_solar_wind["speed"] < 200) | (df_solar_wind["speed"] > 2000), "speed"] = np.nan
df_solar_wind.loc[(df_solar_wind["temperature"] < 10_000) | (df_solar_wind["temperature"] > 10_000_000), "temperature"] = np.nan

In [9]:
# List the distinct values of the source column (missing values included).
df_solar_wind["source"].unique()

['ac', NaN, 'ds']
Categories (2, object): ['ac', 'ds']

#### Tratando dados de satellite_pos

##### Importando o conjunto de dados

In [10]:
# Load the satellite position CSV. The relative path uses forward slashes so it
# resolves on Windows, Linux and macOS alike (a backslash is a separator only on Windows).
df_satellite_pos = pd.read_csv('../data/satellite_pos.csv')
# Preview the first three rows.
df_satellite_pos.head(3)

Unnamed: 0,period,timedelta,gse_x_ace,gse_y_ace,gse_z_ace,gse_x_dscovr,gse_y_dscovr,gse_z_dscovr
0,train_a,0 days,1522376.9,143704.6,149496.7,,,
1,train_a,1 days,1525410.9,136108.8,151034.1,,,
2,train_a,2 days,1528484.9,128470.5,152387.7,,,


##### Tratando tipos de dados

In [11]:
# Inspect the column dtypes, non-null counts and memory footprint as loaded from the CSV.
df_satellite_pos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5828 entries, 0 to 5827
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   period        5828 non-null   object 
 1   timedelta     5828 non-null   object 
 2   gse_x_ace     5828 non-null   float64
 3   gse_y_ace     5828 non-null   float64
 4   gse_z_ace     5828 non-null   float64
 5   gse_x_dscovr  1034 non-null   float64
 6   gse_y_dscovr  1034 non-null   float64
 7   gse_z_dscovr  1034 non-null   float64
dtypes: float64(6), object(2)
memory usage: 364.4+ KB


In [12]:
# Downcast the float64 columns to the smallest float dtype able to hold their values.
col_num_satellite_pos = df_satellite_pos.select_dtypes(include='float64').columns
df_satellite_pos[col_num_satellite_pos] = df_satellite_pos[col_num_satellite_pos].apply(pd.to_numeric, downcast='float')

# Parse the textual durations into timedelta64 values.
df_satellite_pos["timedelta"] = pd.to_timedelta(df_satellite_pos["timedelta"])

# Store the remaining object columns as categoricals.
# The values must come from df_satellite_pos itself: reading them from df_solar_wind
# index-aligns that frame's rows into this one and overwrites the real values.
col_cat_satellite_pos = df_satellite_pos.select_dtypes(include='object').columns
df_satellite_pos[col_cat_satellite_pos] = df_satellite_pos[col_cat_satellite_pos].astype('category')

# Show the resulting dtypes and memory usage.
df_satellite_pos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5828 entries, 0 to 5827
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype          
---  ------        --------------  -----          
 0   period        5828 non-null   category       
 1   timedelta     5828 non-null   timedelta64[ns]
 2   gse_x_ace     5828 non-null   float64        
 3   gse_y_ace     5828 non-null   float64        
 4   gse_z_ace     5828 non-null   float64        
 5   gse_x_dscovr  1034 non-null   float32        
 6   gse_y_dscovr  1034 non-null   float32        
 7   gse_z_dscovr  1034 non-null   float32        
dtypes: category(1), float32(3), float64(3), timedelta64[ns](1)
memory usage: 256.4 KB


In [13]:
# Export df_satellite_pos with the converted dtypes.
# The frame written must be df_satellite_pos (not df_solar_wind), otherwise this
# file ends up holding a copy of the solar wind data.
# Forward slashes keep the relative path valid on every operating system.
df_satellite_pos.to_parquet('../data/df_satellite_pos.parquet')

#### Tratando dados de sunspots

##### Importando o conjunto de dados

In [14]:
# Load the sunspots CSV. The relative path uses forward slashes so it resolves
# on Windows, Linux and macOS alike (a backslash is a separator only on Windows).
df_sunspots = pd.read_csv('../data/sunspots.csv')
# Preview the first three rows.
df_sunspots.head(3)

Unnamed: 0,period,timedelta,smoothed_ssn
0,train_a,0 days 00:00:00,65.4
1,train_a,13 days 00:00:00,72.0
2,train_a,44 days 00:00:00,76.9


##### Tratando tipos de dados

In [15]:
# Inspect the column dtypes, non-null counts and memory footprint as loaded from the CSV.
df_sunspots.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 192 entries, 0 to 191
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   period        192 non-null    object 
 1   timedelta     192 non-null    object 
 2   smoothed_ssn  192 non-null    float64
dtypes: float64(1), object(2)
memory usage: 4.6+ KB


In [16]:
# Store the period labels as a categorical.
df_sunspots["period"] = df_sunspots["period"].astype('category')

# Parse the textual durations into timedelta64 values.
df_sunspots["timedelta"] = pd.to_timedelta(df_sunspots["timedelta"])

# Shrink the float64 columns to the smallest float dtype able to hold their values.
col_num_sunspots = df_sunspots.select_dtypes(include='float64').columns
df_sunspots[col_num_sunspots] = df_sunspots[col_num_sunspots].apply(
    lambda column: pd.to_numeric(column, downcast='float')
)

# Show the resulting dtypes and memory usage.
df_sunspots.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 192 entries, 0 to 191
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype          
---  ------        --------------  -----          
 0   period        192 non-null    category       
 1   timedelta     192 non-null    timedelta64[ns]
 2   smoothed_ssn  192 non-null    float32        
dtypes: category(1), float32(1), timedelta64[ns](1)
memory usage: 2.7 KB


In [17]:
# Export df_sunspots with the converted dtypes.
# The frame written must be df_sunspots (not df_solar_wind), otherwise this
# file ends up holding a copy of the solar wind data.
# Forward slashes keep the relative path valid on every operating system.
df_sunspots.to_parquet('../data/df_sunspots.parquet')

#### Tratando dados de labels

##### Importando o conjunto de dados

In [18]:
# Load the labels CSV. The relative path uses forward slashes so it resolves
# on Windows, Linux and macOS alike (a backslash is a separator only on Windows).
df_labels = pd.read_csv('../data/labels.csv')
# Preview the first three rows.
df_labels.head(3)

Unnamed: 0,period,timedelta,dst
0,train_a,0 days 00:00:00,-7
1,train_a,0 days 01:00:00,-10
2,train_a,0 days 02:00:00,-10


##### Tratando tipos de dados

In [19]:
# Inspect the column dtypes, non-null counts and memory footprint as loaded from the CSV.
df_labels.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 139872 entries, 0 to 139871
Data columns (total 3 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   period     139872 non-null  object
 1   timedelta  139872 non-null  object
 2   dst        139872 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 3.2+ MB


In [20]:
# Store the period labels as a categorical.
df_labels["period"] = df_labels["period"].astype('category')

# Parse the textual durations into timedelta64 values.
df_labels["timedelta"] = pd.to_timedelta(df_labels["timedelta"])

# Shrink the int64 columns to the smallest integer dtype able to hold their values.
col_num_labels = df_labels.select_dtypes(include='int64').columns
df_labels[col_num_labels] = df_labels[col_num_labels].apply(
    lambda column: pd.to_numeric(column, downcast='integer')
)

# Show the resulting dtypes and memory usage.
df_labels.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 139872 entries, 0 to 139871
Data columns (total 3 columns):
 #   Column     Non-Null Count   Dtype          
---  ------     --------------   -----          
 0   period     139872 non-null  category       
 1   timedelta  139872 non-null  timedelta64[ns]
 2   dst        139872 non-null  int16          
dtypes: category(1), int16(1), timedelta64[ns](1)
memory usage: 1.5 MB


In [21]:
# Export df_labels with the converted dtypes.
# The frame written must be df_labels (not df_solar_wind), otherwise this
# file ends up holding a copy of the solar wind data.
# Forward slashes keep the relative path valid on every operating system.
df_labels.to_parquet('../data/df_labels.parquet')