In [105]:
import pandas as pd
import numpy as np

### Stock Market Dataset

In [106]:
stock_df = pd.read_csv('./data/Stock_Market_Dataset.csv', index_col=0)
stock_df.head()

Unnamed: 0,Date,Natural_Gas_Price,Natural_Gas_Vol.,Crude_oil_Price,Crude_oil_Vol.,Copper_Price,Copper_Vol.,Bitcoin_Price,Bitcoin_Vol.,Platinum_Price,...,Berkshire_Price,Berkshire_Vol.,Netflix_Price,Netflix_Vol.,Amazon_Price,Amazon_Vol.,Meta_Price,Meta_Vol.,Gold_Price,Gold_Vol.
0,02-02-2024,2.079,,72.28,,3.8215,,43194.7,42650.0,901.6,...,589498,10580.0,564.64,4030000.0,171.81,117220000.0,474.99,84710000.0,2053.7,
1,01-02-2024,2.05,161340.0,73.82,577940.0,3.8535,,43081.4,47690.0,922.3,...,581600,9780.0,567.51,3150000.0,159.28,66360000.0,394.78,25140000.0,2071.1,260920.0
2,31-01-2024,2.1,142860.0,75.85,344490.0,3.906,,42580.5,56480.0,932.6,...,578020,9720.0,564.11,4830000.0,155.2,49690000.0,390.14,20010000.0,2067.4,238370.0
3,30-01-2024,2.077,139750.0,77.82,347240.0,3.911,,42946.2,55130.0,931.7,...,584680,9750.0,562.85,6120000.0,159.0,42290000.0,400.06,18610000.0,2050.9,214590.0
4,29-01-2024,2.49,3590.0,76.78,331930.0,3.879,,43299.8,45230.0,938.3,...,578800,13850.0,575.79,6880000.0,161.26,42840000.0,401.02,17790000.0,2034.9,1780.0


In [107]:
print(f'Initial number of columns: {len(stock_df.columns)}')
print(f'Initial number of records: {len(stock_df)}')

Initial number of columns: 38
Initial number of records: 1243


#### Check types

In [108]:
types = stock_df.dtypes
wrong_types = types[types != 'float64'].index.tolist()

In [109]:
changed_cols = stock_df[wrong_types[1:]].replace(',', '', regex=True).astype('float64')

types = changed_cols.dtypes
types[types != 'float64'].index.tolist()


[]

In [110]:
changed_cols.columns

Index(['Bitcoin_Price', 'Platinum_Price', 'Ethereum_Price', 'S&P_500_Price',
       'Nasdaq_100_Price', 'Berkshire_Price', 'Gold_Price'],
      dtype='object')

In [111]:
for column in changed_cols.columns.tolist():
    stock_df[column] = changed_cols[column]

### NaN check

In [112]:
nan_count = stock_df.isna().sum()
nan_count[nan_count > 0]

Natural_Gas_Vol.      4
Crude_oil_Vol.       23
Copper_Vol.          37
Platinum_Vol.       607
Nasdaq_100_Vol.       1
Silver_Vol.          47
Gold_Vol.             2
dtype: int64

In [113]:
len(stock_df.columns)

38

In [114]:
stock_df.drop(columns=['Platinum_Vol.'], inplace=True)

In [115]:
stock_df.interpolate(inplace=True)

  stock_df.interpolate(inplace=True)


In [116]:
nan_count = stock_df.isna().sum()
nan_count[nan_count > 0]

Natural_Gas_Vol.     1
Crude_oil_Vol.       1
Copper_Vol.         26
Silver_Vol.          1
Gold_Vol.            1
dtype: int64

In [117]:
stock_df.drop(columns=['Copper_Vol.'], inplace=True)

In [118]:
stock_df.fillna(method='bfill', inplace=True)

  stock_df.fillna(method='bfill', inplace=True)


In [119]:
nan_count = stock_df.isna().sum()
nan_count[nan_count > 0]

Series([], dtype: int64)

In [120]:
print(f'Final number of columns: {len(stock_df.columns)}')
print(f'Final number of records: {len(stock_df)}')

Final number of columns: 36
Final number of records: 1243


### Date

In [121]:
stock_df['Date'] = pd.to_datetime(stock_df['Date'], format="%d-%m-%Y")
stock_df['Date'] = stock_df['Date'].dt.strftime('%Y-%m-%d')

In [124]:
stock_df = stock_df.sort_values(by='Date')

In [125]:
stock_df.to_csv('./preprocessed_data/stock_market.csv', index=False)

## Ozone dataset

In [127]:
from ucimlrepo import fetch_ucirepo 
  
# fetch dataset 
ozone_level_detection = fetch_ucirepo(id=172) 
column_names = ozone_level_detection.variables['name'].tolist()[1:]
  
ozone_df = pd.read_csv('./data/ozone/onehr.data')
ozone_df.columns = column_names
ozone_df.head()

Unnamed: 0,Date,WSR0,WSR1,WSR2,WSR3,WSR4,WSR5,WSR6,WSR7,WSR8,...,RH50,U50,V50,HT50,KI,TT,SLP,SLP_,Precp,Class
0,1/2/1998,2.8,3.2,3.3,2.7,3.3,3.2,2.9,2.8,3.1,...,0.48,8.39,3.84,5805,14.05,29,10275,-55,0.0,0.0
1,1/3/1998,2.9,2.8,2.6,2.1,2.2,2.5,2.5,2.7,2.2,...,0.6,6.94,9.8,5790,17.9,41.3,10235,-40,0.0,0.0
2,1/4/1998,4.7,3.8,3.7,3.8,2.9,3.1,2.8,2.5,2.4,...,0.49,8.73,10.54,5775,31.15,51.7,10195,-40,2.08,0.0
3,1/5/1998,2.6,2.1,1.6,1.4,0.9,1.5,1.2,1.4,1.3,...,?,?,?,?,?,?,?,?,0.58,0.0
4,1/6/1998,3.1,3.5,3.3,2.5,1.6,1.7,1.6,1.6,2.3,...,0.09,11.98,11.28,5770,27.95,46.25,10120,?,5.84,0.0


In [128]:
print(f'Initial number of columns: {len(ozone_df.columns)}')
print(f'Initial number of records: {len(ozone_df)}')

Initial number of columns: 74
Initial number of records: 2535


### Type check

In [129]:
ozone_df.dtypes

Date      object
WSR0      object
WSR1      object
WSR2      object
WSR3      object
          ...   
TT        object
SLP       object
SLP_      object
Precp     object
Class    float64
Length: 74, dtype: object

In [130]:
ozone_df = ozone_df.replace('?', np.nan)
ozone_no_date = ozone_df[ozone_df.columns[1:]].astype('float64')
ozone_no_date.dtypes

WSR0     float64
WSR1     float64
WSR2     float64
WSR3     float64
WSR4     float64
          ...   
TT       float64
SLP      float64
SLP_     float64
Precp    float64
Class    float64
Length: 73, dtype: object

In [131]:
ozone_df = pd.concat([ozone_df['Date'], ozone_no_date], axis=1)

### NaN check

In [132]:
nan_count = ozone_df.isna().sum()
nan_count[nan_count > 0]

WSR0     299
WSR1     292
WSR2     294
WSR3     292
WSR4     293
        ... 
KI       136
TT       125
SLP       95
SLP_     159
Precp      2
Length: 72, dtype: int64

In [133]:
ozone_df.interpolate(inplace=True)

  ozone_df.interpolate(inplace=True)


In [134]:
nan_count = ozone_df.isna().sum()
nan_count[nan_count > 0]

Series([], dtype: int64)

In [135]:
print(f'Initial number of columns: {len(ozone_df.columns)}')
print(f'Initial number of records: {len(ozone_df)}')

Initial number of columns: 74
Initial number of records: 2535


### Date

In [136]:
ozone_df['Date'] = pd.to_datetime(ozone_df['Date'])
ozone_df['Date'] = ozone_df['Date'].dt.strftime('%Y-%m-%d')

In [138]:
ozone_df.to_csv('./preprocessed_data/ozone.csv', index=False)