# Extracting, Loading & Cleaning the Data
"All data is logically cleaned, preprocessed and transformed into the correct format for analysis"

In [1]:
import pandas as pd

### Loading the raw 'Global Finance Data' dataset from my GitHub repository (for easier access)

In [2]:
# Location of the raw CSV; kept in a named variable so the read call stays readable.
raw_csv_url = "https://raw.githubusercontent.com/tishsrisasi/my-first-project/refs/heads/main/Global%20finance%20data.csv"
df = pd.read_csv(raw_csv_url)
df.head(n=5)  # preview the first 5 records to confirm the data loaded successfully

Unnamed: 0,Country,Date,Stock_Index,Index_Value,Daily_Change_Percent,Market_Cap_Trillion_USD,GDP_Growth_Rate_Percent,Inflation_Rate_Percent,Interest_Rate_Percent,Unemployment_Rate_Percent,...,Commodity_Index,Oil_Price_USD_Barrel,Gold_Price_USD_Ounce,Bond_Yield_10Y_Percent,Credit_Rating,Political_Risk_Score,Banking_Sector_Health,Real_Estate_Index,Export_Growth_Percent,Import_Growth_Percent
0,United States,2024-08-15,S&P_500,5437.2,0.34,51.2,2.8,2.9,5.5,3.7,...,1.12,77.85,2487.5,4.25,AAA,8.1,Strong,145.6,3.2,2.8
1,China,2024-08-15,Shanghai_Composite,2891.6,-0.82,12.4,5.2,0.8,3.1,5.2,...,0.98,77.85,2487.5,2.15,A+,6.7,Moderate,98.7,8.9,6.1
2,Japan,2024-08-15,Nikkei_225,36789.1,1.24,6.8,0.9,2.8,-0.1,2.4,...,1.05,77.85,2487.5,0.89,A+,8.4,Strong,89.3,5.1,4.7
3,Germany,2024-08-15,DAX,18234.5,0.67,2.9,0.3,2.2,4.5,3.1,...,1.08,77.85,2487.5,2.31,AAA,8.7,Strong,112.4,2.1,1.8
4,United Kingdom,2024-08-15,FTSE_100,8156.3,-0.15,3.1,1.1,2.0,5.25,4.2,...,1.06,77.85,2487.5,3.89,AA,7.9,Moderate,97.8,0.9,1.2


### Creating a copy of the dataframe to use (application of Best Practice)

In [3]:
# Work on an independent (deep) copy so the originally loaded `df` is left untouched.
gfd_df = df.copy(deep=True)

In [4]:
# Same preview as before, this time on the working copy.
gfd_df.head(n=5)

Unnamed: 0,Country,Date,Stock_Index,Index_Value,Daily_Change_Percent,Market_Cap_Trillion_USD,GDP_Growth_Rate_Percent,Inflation_Rate_Percent,Interest_Rate_Percent,Unemployment_Rate_Percent,...,Commodity_Index,Oil_Price_USD_Barrel,Gold_Price_USD_Ounce,Bond_Yield_10Y_Percent,Credit_Rating,Political_Risk_Score,Banking_Sector_Health,Real_Estate_Index,Export_Growth_Percent,Import_Growth_Percent
0,United States,2024-08-15,S&P_500,5437.2,0.34,51.2,2.8,2.9,5.5,3.7,...,1.12,77.85,2487.5,4.25,AAA,8.1,Strong,145.6,3.2,2.8
1,China,2024-08-15,Shanghai_Composite,2891.6,-0.82,12.4,5.2,0.8,3.1,5.2,...,0.98,77.85,2487.5,2.15,A+,6.7,Moderate,98.7,8.9,6.1
2,Japan,2024-08-15,Nikkei_225,36789.1,1.24,6.8,0.9,2.8,-0.1,2.4,...,1.05,77.85,2487.5,0.89,A+,8.4,Strong,89.3,5.1,4.7
3,Germany,2024-08-15,DAX,18234.5,0.67,2.9,0.3,2.2,4.5,3.1,...,1.08,77.85,2487.5,2.31,AAA,8.7,Strong,112.4,2.1,1.8
4,United Kingdom,2024-08-15,FTSE_100,8156.3,-0.15,3.1,1.1,2.0,5.25,4.2,...,1.06,77.85,2487.5,3.89,AA,7.9,Moderate,97.8,0.9,1.2


In [5]:
# Print each column's name, non-null count and dtype for the working copy.
gfd_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39 entries, 0 to 38
Data columns (total 26 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   Country                              39 non-null     object 
 1   Date                                 39 non-null     object 
 2   Stock_Index                          39 non-null     object 
 3   Index_Value                          39 non-null     float64
 4   Daily_Change_Percent                 39 non-null     float64
 5   Market_Cap_Trillion_USD              39 non-null     float64
 6   GDP_Growth_Rate_Percent              39 non-null     float64
 7   Inflation_Rate_Percent               39 non-null     float64
 8   Interest_Rate_Percent                39 non-null     float64
 9   Unemployment_Rate_Percent            39 non-null     float64
 10  Currency_Code                        39 non-null     object 
 11  Exchange_Rate_USD                 

In [6]:
#Best Practice - using the category types for the below
gfd_df['Credit_Rating'] = gfd_df['Credit_Rating'].astype('category')
gfd_df['Banking_Sector_Health'] = gfd_df['Banking_Sector_Health'].astype('category')
gfd_df.info() #confirm that category data types are in place

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39 entries, 0 to 38
Data columns (total 26 columns):
 #   Column                               Non-Null Count  Dtype   
---  ------                               --------------  -----   
 0   Country                              39 non-null     object  
 1   Date                                 39 non-null     object  
 2   Stock_Index                          39 non-null     object  
 3   Index_Value                          39 non-null     float64 
 4   Daily_Change_Percent                 39 non-null     float64 
 5   Market_Cap_Trillion_USD              39 non-null     float64 
 6   GDP_Growth_Rate_Percent              39 non-null     float64 
 7   Inflation_Rate_Percent               39 non-null     float64 
 8   Interest_Rate_Percent                39 non-null     float64 
 9   Unemployment_Rate_Percent            39 non-null     float64 
 10  Currency_Code                        39 non-null     object  
 11  Exchange_Rate_USD    

In [7]:
# Display the current index of the working frame.
gfd_df.index

RangeIndex(start=0, stop=39, step=1)

In [8]:
# Multi-indexing: move the three identifying columns out of the data columns
# and into a MultiIndex (Stock_Index, Country, Currency_Code).
gfd_df = gfd_df.set_index(keys=['Stock_Index', 'Country', 'Currency_Code'])
gfd_df.index  # show the resulting MultiIndex

MultiIndex([(           'S&P_500',  'United States', 'USD'),
            ('Shanghai_Composite',          'China', 'CNY'),
            (        'Nikkei_225',          'Japan', 'JPY'),
            (               'DAX',        'Germany', 'EUR'),
            (          'FTSE_100', 'United Kingdom', 'GBP'),
            (            'CAC_40',         'France', 'EUR'),
            (            'Sensex',          'India', 'INR'),
            (               'TSX',         'Canada', 'CAD'),
            (           'Bovespa',         'Brazil', 'BRL'),
            (           'ASX_200',      'Australia', 'AUD'),
            (             'KOSPI',    'South Korea', 'KRW'),
            (              'MOEX',         'Russia', 'RUB'),
            (               'IPC',         'Mexico', 'MXN'),
            (          'FTSE_MIB',          'Italy', 'EUR'),
            (           'IBEX_35',          'Spain', 'EUR'),
            (               'AEX',    'Netherlands', 'EUR'),
            (           

In [9]:
# Preview the first 5 rows to confirm the new index levels are in place.
gfd_df.head(n=5)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Date,Index_Value,Daily_Change_Percent,Market_Cap_Trillion_USD,GDP_Growth_Rate_Percent,Inflation_Rate_Percent,Interest_Rate_Percent,Unemployment_Rate_Percent,Exchange_Rate_USD,Currency_Change_YTD_Percent,...,Commodity_Index,Oil_Price_USD_Barrel,Gold_Price_USD_Ounce,Bond_Yield_10Y_Percent,Credit_Rating,Political_Risk_Score,Banking_Sector_Health,Real_Estate_Index,Export_Growth_Percent,Import_Growth_Percent
Stock_Index,Country,Currency_Code,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
S&P_500,United States,USD,2024-08-15,5437.2,0.34,51.2,2.8,2.9,5.5,3.7,1.0,0.0,...,1.12,77.85,2487.5,4.25,AAA,8.1,Strong,145.6,3.2,2.8
Shanghai_Composite,China,CNY,2024-08-15,2891.6,-0.82,12.4,5.2,0.8,3.1,5.2,7.28,2.3,...,0.98,77.85,2487.5,2.15,A+,6.7,Moderate,98.7,8.9,6.1
Nikkei_225,Japan,JPY,2024-08-15,36789.1,1.24,6.8,0.9,2.8,-0.1,2.4,147.2,-8.9,...,1.05,77.85,2487.5,0.89,A+,8.4,Strong,89.3,5.1,4.7
DAX,Germany,EUR,2024-08-15,18234.5,0.67,2.9,0.3,2.2,4.5,3.1,0.92,1.8,...,1.08,77.85,2487.5,2.31,AAA,8.7,Strong,112.4,2.1,1.8
FTSE_100,United Kingdom,GBP,2024-08-15,8156.3,-0.15,3.1,1.1,2.0,5.25,4.2,0.78,-0.9,...,1.06,77.85,2487.5,3.89,AA,7.9,Moderate,97.8,0.9,1.2


### Data Cleaning Tasks
1. Check for null values
2. Apply the Pandas dropna() method to the whole dataframe
3. Drop features with constant values: ["Date", "Oil_Price_USD_Barrel", "Gold_Price_USD_Ounce"]

In [10]:
# Boolean mask of missing values (True = missing); only the first 5 records are shown.
gfd_df.isna().head(n=5)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Date,Index_Value,Daily_Change_Percent,Market_Cap_Trillion_USD,GDP_Growth_Rate_Percent,Inflation_Rate_Percent,Interest_Rate_Percent,Unemployment_Rate_Percent,Exchange_Rate_USD,Currency_Change_YTD_Percent,...,Commodity_Index,Oil_Price_USD_Barrel,Gold_Price_USD_Ounce,Bond_Yield_10Y_Percent,Credit_Rating,Political_Risk_Score,Banking_Sector_Health,Real_Estate_Index,Export_Growth_Percent,Import_Growth_Percent
Stock_Index,Country,Currency_Code,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
S&P_500,United States,USD,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
Shanghai_Composite,China,CNY,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
Nikkei_225,Japan,JPY,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
DAX,Germany,EUR,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
FTSE_100,United Kingdom,GBP,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [11]:
# Count the missing values in each column.
gfd_df.isna().sum()

Date                                   0
Index_Value                            0
Daily_Change_Percent                   0
Market_Cap_Trillion_USD                0
GDP_Growth_Rate_Percent                0
Inflation_Rate_Percent                 0
Interest_Rate_Percent                  0
Unemployment_Rate_Percent              0
Exchange_Rate_USD                      0
Currency_Change_YTD_Percent            0
Government_Debt_GDP_Percent            0
Current_Account_Balance_Billion_USD    0
FDI_Inflow_Billion_USD                 0
Commodity_Index                        0
Oil_Price_USD_Barrel                   0
Gold_Price_USD_Ounce                   0
Bond_Yield_10Y_Percent                 0
Credit_Rating                          0
Political_Risk_Score                   0
Banking_Sector_Health                  0
Real_Estate_Index                      0
Export_Growth_Percent                  0
Import_Growth_Percent                  0
dtype: int64

In [12]:
# DataFrame.dropna() returns a NEW frame and, by default, removes ROWS (records)
# that contain any NaN value; it does not modify gfd_df in place. The result must
# be assigned back, otherwise this cleaning step has no effect on the data used
# by the cells that follow.
gfd_df = gfd_df.dropna()
gfd_df  # display the frame after dropping records with missing values

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Date,Index_Value,Daily_Change_Percent,Market_Cap_Trillion_USD,GDP_Growth_Rate_Percent,Inflation_Rate_Percent,Interest_Rate_Percent,Unemployment_Rate_Percent,Exchange_Rate_USD,Currency_Change_YTD_Percent,...,Commodity_Index,Oil_Price_USD_Barrel,Gold_Price_USD_Ounce,Bond_Yield_10Y_Percent,Credit_Rating,Political_Risk_Score,Banking_Sector_Health,Real_Estate_Index,Export_Growth_Percent,Import_Growth_Percent
Stock_Index,Country,Currency_Code,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
S&P_500,United States,USD,2024-08-15,5437.2,0.34,51.2,2.8,2.9,5.5,3.7,1.0,0.0,...,1.12,77.85,2487.5,4.25,AAA,8.1,Strong,145.6,3.2,2.8
Shanghai_Composite,China,CNY,2024-08-15,2891.6,-0.82,12.4,5.2,0.8,3.1,5.2,7.28,2.3,...,0.98,77.85,2487.5,2.15,A+,6.7,Moderate,98.7,8.9,6.1
Nikkei_225,Japan,JPY,2024-08-15,36789.1,1.24,6.8,0.9,2.8,-0.1,2.4,147.2,-8.9,...,1.05,77.85,2487.5,0.89,A+,8.4,Strong,89.3,5.1,4.7
DAX,Germany,EUR,2024-08-15,18234.5,0.67,2.9,0.3,2.2,4.5,3.1,0.92,1.8,...,1.08,77.85,2487.5,2.31,AAA,8.7,Strong,112.4,2.1,1.8
FTSE_100,United Kingdom,GBP,2024-08-15,8156.3,-0.15,3.1,1.1,2.0,5.25,4.2,0.78,-0.9,...,1.06,77.85,2487.5,3.89,AA,7.9,Moderate,97.8,0.9,1.2
CAC_40,France,EUR,2024-08-15,7389.2,0.28,3.0,1.3,2.3,4.5,7.4,0.92,1.8,...,1.08,77.85,2487.5,2.95,AA,7.3,Moderate,103.2,1.8,2.1
Sensex,India,INR,2024-08-15,80456.7,0.89,4.3,6.8,4.9,6.5,3.2,83.7,-0.5,...,1.15,77.85,2487.5,7.04,BBB-,6.8,Moderate,134.8,13.2,10.9
TSX,Canada,CAD,2024-08-15,22567.8,0.45,2.8,2.9,2.8,4.75,6.1,1.37,4.2,...,1.09,77.85,2487.5,3.42,AAA,8.9,Strong,126.7,4.1,3.8
Bovespa,Brazil,BRL,2024-08-15,129834.2,1.67,1.4,2.1,4.5,10.75,7.8,5.47,-8.9,...,0.95,77.85,2487.5,10.84,BB-,5.4,Weak,89.3,8.7,6.9
ASX_200,Australia,AUD,2024-08-15,7923.4,0.22,1.8,1.7,3.8,4.35,4.1,1.52,5.6,...,1.11,77.85,2487.5,4.07,AAA,8.6,Strong,118.9,6.2,5.4


In [13]:
# Remove the features listed in the cleaning tasks as having constant values.
gfd_df = gfd_df.drop(columns=['Date', 'Oil_Price_USD_Barrel', 'Gold_Price_USD_Ounce'])

In [14]:
gfd_df.info() # verify that Date, Oil_Price_USD_Barrel and Gold_Price_USD_Ounce no longer appear in the column list

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 39 entries, ('S&P_500', 'United States', 'USD') to ('TA_125', 'Israel', 'ILS')
Data columns (total 20 columns):
 #   Column                               Non-Null Count  Dtype   
---  ------                               --------------  -----   
 0   Index_Value                          39 non-null     float64 
 1   Daily_Change_Percent                 39 non-null     float64 
 2   Market_Cap_Trillion_USD              39 non-null     float64 
 3   GDP_Growth_Rate_Percent              39 non-null     float64 
 4   Inflation_Rate_Percent               39 non-null     float64 
 5   Interest_Rate_Percent                39 non-null     float64 
 6   Unemployment_Rate_Percent            39 non-null     float64 
 7   Exchange_Rate_USD                    39 non-null     float64 
 8   Currency_Change_YTD_Percent          39 non-null     float64 
 9   Government_Debt_GDP_Percent          39 non-null     float64 
 10  Current_Account_Balance_Billi

### Finally, saving the cleaned DataFrame as a new file, ready for the feature enrichment process


In [15]:
# Persist the cleaned frame. index=True writes the index levels
# (Stock_Index, Country, Currency_Code) to the file alongside the data columns.
gfd_df.to_csv(path_or_buf='global_finance_data_cleaned.csv', index=True)