In [1]:
# Import the required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re

from sklearn.impute import KNNImputer

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the "Table" sheet of the average-income workbook as a raw DataFrame;
# the layout is cleaned up in the following cells.
AverageWage = pd.read_excel(r'/dataset/economic/Average Income.xlsx', sheet_name="Table")

In [None]:
# Strip the export junk (columns 1-3 and the first row), then pivot the table so
# that the "Time period" values become the column labels.
AverageWage = (
    AverageWage
    .drop(columns=AverageWage.columns[1:4])
    .drop(index=AverageWage.index[0])
    .set_index("Time period")
    .T
)

In [None]:
# Remove duplicate columns: when a column label occurs more than once, only the
# last occurrence is kept (keep='last' marks the earlier ones as duplicates).
mask = ~AverageWage.columns.duplicated(keep='last')
AverageWage = AverageWage.loc[:, mask]

In [None]:
# Load the "Table" sheet of the USD PPP average-income workbook (raw layout).
AverageWageUSDPPP = pd.read_excel(r'/dataset/economic/Average Income USD PPP.xlsx', sheet_name="Table")

In [None]:
# Same clean-up as for AverageWage: drop columns 1-3 and the first row, then
# pivot so the "Time period" values become the column labels.
AverageWageUSDPPP = (
    AverageWageUSDPPP
    .drop(columns=AverageWageUSDPPP.columns[1:4])
    .drop(index=AverageWageUSDPPP.index[0])
    .set_index("Time period")
    .T
)

# Keep only the last occurrence of any repeated column label
mask = ~AverageWageUSDPPP.columns.duplicated(keep='last')
AverageWageUSDPPP = AverageWageUSDPPP.loc[:, mask]

# Row-over-row percentage change, displayed as the cell output
AverageWageUSDPPP_perc_change = AverageWageUSDPPP.pct_change() * 100
AverageWageUSDPPP_perc_change.head()

In [None]:
# Pick the seaborn look. Matplotlib 3.6 renamed the bundled seaborn styles to
# 'seaborn-v0_8*' and newer releases no longer accept the bare 'seaborn' name,
# so use whichever name this installation actually provides.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')

# Create the figure with a larger size
plt.figure(figsize=(12, 6))

# One line per country: (dataframe column, legend label, marker)
for column, label, marker in [
    ('Türkiye', 'Turkiye', 'o'),
    ('United Kingdom', 'UK', 's'),
    ('Canada', 'Canada', '^'),
    ('Korea', 'South Korea', 'd'),
]:
    plt.plot(AverageWageUSDPPP.index, AverageWageUSDPPP[column], linewidth=2,
             label=label, marker=marker, markersize=4)

# Customize the plot
plt.title('Average Wage', fontsize=14, pad=20)
plt.xlabel('Year', fontsize=12)
plt.ylabel('Average Wage', fontsize=12)
plt.grid(True, linestyle='--', alpha=0.7)
plt.legend(fontsize=10, bbox_to_anchor=(1.05, 1), loc='upper left')

# Rotate x-axis labels for better readability
plt.xticks(rotation=45)

# Adjust layout to prevent label cutoff
plt.tight_layout()

# Show the plot
plt.show()

In [None]:
# Corporate income tax, like personal income tax, is easier to use as an Excel
# export from the OECD website. Only a single workbook is needed this time.

CorpIncomeTax = pd.read_excel(r'/dataset/economic/Corporate income tax.xlsx', sheet_name="Table")

In [None]:
# Remove the first column and the first row of the raw export (junk cells).
# NOTE(review): positional and in place -- re-running this cell removes one more
# column and row each time.

CorpIncomeTax.drop(CorpIncomeTax.columns[0], axis=1, inplace=True)
CorpIncomeTax.drop(CorpIncomeTax.index[0], axis=0, inplace=True)

In [None]:
# Use the (now) first column as the row index
CorpIncomeTax.set_index(CorpIncomeTax.columns[0], inplace=True)

In [None]:
# Promote the first data row to be the header, then remove that row from the data
CorpIncomeTax.columns = CorpIncomeTax.iloc[0]  # Set column names to first row
CorpIncomeTax.drop(CorpIncomeTax.index[0], inplace=True)  # Remove the first row

In [None]:
# Drop one more junk column and one more junk row left over after the header fix

CorpIncomeTax.drop(CorpIncomeTax.columns[0], axis=1, inplace=True)
CorpIncomeTax.drop(CorpIncomeTax.index[0], axis=0, inplace=True)

In [None]:
# Swap rows and columns (the following cell casts the resulting index to integer years)
CorpIncomeTax = CorpIncomeTax.transpose()

In [None]:
# Cast the year labels in the index to integers so they match the other dataframes
CorpIncomeTax.index = CorpIncomeTax.index.astype(int)

# Drop the first 15 year rows, which the author removed for lack of data.
# NOTE(review): positional and non-idempotent -- re-running this cell drops 15 more rows.
CorpIncomeTax.drop(CorpIncomeTax.index[0:15], axis=0, inplace=True)

In [None]:
# Narrow the dataset to columns 1 through 38 (positional slice)
CorpIncomeTax = CorpIncomeTax.iloc[:, 1:39]

In [None]:
# Pick the seaborn look. Matplotlib 3.6 renamed the bundled seaborn styles to
# 'seaborn-v0_8*' and newer releases no longer accept the bare 'seaborn' name,
# so use whichever name this installation actually provides.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')

# Create the figure with a larger size
plt.figure(figsize=(12, 6))

# One line per country: (dataframe column, legend label, marker)
for column, label, marker in [
    ('Türkiye', 'Turkiye', 'o'),
    ('United Kingdom', 'UK', 's'),
    ('Canada', 'Canada', '^'),
    ('Korea', 'South Korea', 'd'),
]:
    plt.plot(CorpIncomeTax.index, CorpIncomeTax[column], linewidth=2,
             label=label, marker=marker, markersize=4)

# Customize the plot
plt.title('Corporate Income Tax Rates Comparison (2000-2023)', fontsize=14, pad=20)
plt.xlabel('Year', fontsize=12)
plt.ylabel('Corporate Income Tax Rate (%)', fontsize=12)
plt.grid(True, linestyle='--', alpha=0.7)
plt.legend(fontsize=10, bbox_to_anchor=(1.05, 1), loc='upper left')

# Rotate x-axis labels for better readability
plt.xticks(rotation=45)

# Adjust layout to prevent label cutoff
plt.tight_layout()

# Show the plot
plt.show()

In [None]:
# Year-over-year percentage change of every column; head() is the cell output
CorpIncomeTax_perc_change = CorpIncomeTax.pct_change() * 100
CorpIncomeTax_perc_change.head()

In [None]:
# Add an all-NaN row for each year 1990-1999 (an existing row with that label would be overwritten).
# New labels are appended at the end, so the index is not in chronological order afterwards.
for i in range(1990, 2000):
    CorpIncomeTax.loc[i] = [np.nan] * len(CorpIncomeTax.columns)

In [None]:
# Same padding for the percentage-change frame: all-NaN rows for 1990-1999
for i in range(1990, 2000):
    CorpIncomeTax_perc_change.loc[i] = [np.nan] * len(CorpIncomeTax_perc_change.columns)

In [None]:
# Load the "Table" sheet of the LCperUSD workbook (raw layout)
CurrencyPerDollar = pd.read_excel(r"/dataset/economic/LCperUSD.xlsx", sheet_name="Table")

In [None]:
# Use the first column as the row index (first step of the clean-up)

CurrencyPerDollar.set_index(CurrencyPerDollar.columns[0], inplace=True)


In [None]:
# Swap rows and columns, then drop the first column of the transposed frame
CurrencyPerDollar = CurrencyPerDollar.transpose()
CurrencyPerDollar.drop(CurrencyPerDollar.columns[0], axis=1, inplace=True)

In [None]:
#Standardising the Year (index) and Country (Columns) names to match with other dataframes as they translated weirdly

# Clean column names
def remove_parentheses(text):
    """Return what follows the last ')' in ``text``, stripped of whitespace.

    When ``text`` contains no ')', the whole string is returned (stripped).
    """
    _, _, tail = text.rpartition(')')
    return tail.strip()

# Rename columns: keep only the text after the last ')' of each label
CurrencyPerDollar.columns = [remove_parentheses(col) for col in CurrencyPerDollar.columns]

# Clean index: keep only the last whitespace-separated token of each label.
# NOTE(review): the resulting labels are still strings, not integers like the
# year index of CorpIncomeTax -- confirm this is handled before frames are aligned.
CurrencyPerDollar.index = CurrencyPerDollar.index.str.split().str[-1]

In [None]:
# Pick the seaborn look. Matplotlib 3.6 renamed the bundled seaborn styles to
# 'seaborn-v0_8*' and newer releases no longer accept the bare 'seaborn' name,
# so use whichever name this installation actually provides.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')

# Create the figure with a larger size
plt.figure(figsize=(12, 6))

# One line per country: (dataframe column, legend label, marker)
for column, label, marker in [
    ('Türkiye', 'Türkiye', 'o'),
    ('United Kingdom', 'UK', 's'),
    ('Canada', 'Canada', '^'),
    ('Korea', 'South Korea', 'd'),
]:
    plt.plot(CurrencyPerDollar.index, CurrencyPerDollar[column], linewidth=2,
             label=label, marker=marker, markersize=4)

# Customize the plot (title typo "Exhange" corrected)
plt.title('Exchange Rates Comparison (1960-2023)', fontsize=14, pad=20)
plt.xlabel('Year', fontsize=12)
plt.ylabel('Exchange Rate', fontsize=12)
plt.grid(True, linestyle='--', alpha=0.7)
plt.legend(fontsize=10, bbox_to_anchor=(1.05, 1), loc='upper left')

# Rotate x-axis labels for better readability
plt.xticks(rotation=45)

# Adjust layout to prevent label cutoff
plt.tight_layout()

# Show the plot
plt.show()

In [None]:
# Preview the first ten rows of the cleaned exchange-rate frame
CurrencyPerDollar.head(10)

In [None]:
# Row-over-row percentage change of every column; head() is the cell output
CurrencyPerDollar_perc_change = CurrencyPerDollar.pct_change() * 100
CurrencyPerDollar_perc_change.head()

In [None]:
# The OECD data website only allows a limited amount of data per download (6000 cells),
# so the data for all countries had to be exported as three separate Excel workbooks
# (Excel exports were a lot more usable than the CSV format).

IncomeTax1 = pd.read_excel(r'/dataset/economic/Income tax 1.xlsx', sheet_name="Table")
IncomeTax2 = pd.read_excel(r'/dataset/economic/Income tax 2.xlsx', sheet_name="Table")
IncomeTax3 = pd.read_excel(r'/dataset/economic/Income tax 3.xlsx', sheet_name="Table")

# The three workbooks are formatted the same way, so the same operations can be
# repeated on each of them by looping over this list.

IncomeTax = [IncomeTax1, IncomeTax2, IncomeTax3]

In [None]:
# Remove the first column and the first row of each raw export.
# The drops are in place, so the frames referenced by IncomeTax1..3 are modified too.

for df in IncomeTax:
    df.drop(df.columns[0], axis=1, inplace=True)
    df.drop(df.index[0], axis=0, inplace=True)

In [None]:
# Use the (now) first column of each frame as its row index
for df in IncomeTax:
        df.set_index(df.columns[0], inplace=True)

In [None]:
# Promote the first data row of each frame to be its header, then remove that row
for df in IncomeTax:
    df.columns = df.iloc[0]  # Set column names to first row
    df.drop(df.index[0], inplace=True)  # Remove the first row

In [None]:
# Drop one more junk column and one more junk row from each frame
for df in IncomeTax:
    df.drop(df.columns[0], axis=1, inplace=True)
    df.drop(df.index[0], axis=0, inplace=True)

In [None]:
# transpose() returns a new frame instead of modifying in place, so each name is
# rebound explicitly and the list is rebuilt to point at the transposed frames.
IncomeTax1 = IncomeTax1.transpose()
IncomeTax2 = IncomeTax2.transpose()
IncomeTax3 = IncomeTax3.transpose()

IncomeTax = [IncomeTax1, IncomeTax2, IncomeTax3]

In [None]:
  #Standardising the Year (index) names to match with other dataframes as they translated weirdly
# Cast the year labels in each index to integers
for df in IncomeTax:
    df.index = df.index.astype(int)

In [None]:
#Now I have my dataset and need to organise it into a usable form.
#I believe we could split it up into different datasets.
#This can be done for each of the downloaded datasets, and the final products can then be concatenated together

# Positional column slices of each download, kept under their own names
PersonalAllowance1 = IncomeTax1.iloc[:, 1:11]
PersonalAllowance2 = IncomeTax2.iloc[:, 1:10]
PersonalAllowance3 = IncomeTax3.iloc[:, 1:3]

# Join the three slices side by side, then order the columns by label
PersonalAllowance = pd.concat(
    [PersonalAllowance1, PersonalAllowance2, PersonalAllowance3], axis=1
).sort_index(axis=1)

In [None]:
#Now the Process can be repeated to get a series of dataframes

# Positional column slices of each download, kept under their own names
MarginalRate1_1 = IncomeTax1.iloc[:, 30:47]
MarginalRate1_2 = IncomeTax2.iloc[:, 26:43]
MarginalRate1_3 = IncomeTax3.iloc[:, 7:11]

# Join the three slices side by side, then order the columns by label
Marginalrate = pd.concat(
    [MarginalRate1_1, MarginalRate1_2, MarginalRate1_3], axis=1
).sort_index(axis=1)

In [None]:
# Positional column slices of each download, kept under their own names
Threshold1_1 = IncomeTax1.iloc[:, 48:65]
Threshold1_2 = IncomeTax2.iloc[:, 44:60]
Threshold1_3 = IncomeTax3.iloc[:, 12:16]

# Join the three slices side by side, then order the columns by label
Threshold1 = pd.concat(
    [Threshold1_1, Threshold1_2, Threshold1_3], axis=1
).sort_index(axis=1)

In [None]:
# Positional column slices of each download, kept under their own names
MarginalRate2_1 = IncomeTax1.iloc[:, 66:82]
MarginalRate2_2 = IncomeTax2.iloc[:, 61:77]
MarginalRate2_3 = IncomeTax3.iloc[:, 17:21]

# Join the three slices side by side, then order the columns by label
Marginalrate2 = pd.concat(
    [MarginalRate2_1, MarginalRate2_2, MarginalRate2_3], axis=1
).sort_index(axis=1)

In [None]:

# Positional column slices of each download, kept under their own names
Threshold2_1 = IncomeTax1.iloc[:, 83:100]
Threshold2_2 = IncomeTax2.iloc[:, 78:92]
Threshold2_3 = IncomeTax3.iloc[:, 22:26]

# Join the three slices side by side, then order the columns by label
Threshold2 = pd.concat(
    [Threshold2_1, Threshold2_2, Threshold2_3], axis=1
).sort_index(axis=1)

In [None]:
# Positional column slices of each download, kept under their own names
MarginalRate3_1 = IncomeTax1.iloc[:, 101:117]
MarginalRate3_2 = IncomeTax2.iloc[:, 93:107]
MarginalRate3_3 = IncomeTax3.iloc[:, 27:31]

# Join the three slices side by side, then order the columns by label
Marginalrate3 = pd.concat(
    [MarginalRate3_1, MarginalRate3_2, MarginalRate3_3], axis=1
).sort_index(axis=1)

In [None]:

# Positional column slices of each download, kept under their own names
Threshold3_1 = IncomeTax1.iloc[:, 118:133]
Threshold3_2 = IncomeTax2.iloc[:, 108:118]
Threshold3_3 = IncomeTax3.iloc[:, 32:35]

# Join the three slices side by side, then order the columns by label
Threshold3 = pd.concat(
    [Threshold3_1, Threshold3_2, Threshold3_3], axis=1
).sort_index(axis=1)

In [None]:
# Positional column slices of each download, kept under their own names
MarginalRate4_1 = IncomeTax1.iloc[:, 134:149]
MarginalRate4_2 = IncomeTax2.iloc[:, 119:129]
MarginalRate4_3 = IncomeTax3.iloc[:, 36:39]

# Join the three slices side by side, then order the columns by label
Marginalrate4 = pd.concat(
    [MarginalRate4_1, MarginalRate4_2, MarginalRate4_3], axis=1
).sort_index(axis=1)

In [None]:
# Positional column slices of each download, kept under their own names
Threshold4_1 = IncomeTax1.iloc[:, 150:164]
Threshold4_2 = IncomeTax2.iloc[:, 130:139]
Threshold4_3 = IncomeTax3.iloc[:, 40:43]

# Join the three slices side by side, then order the columns by label
Threshold4 = pd.concat(
    [Threshold4_1, Threshold4_2, Threshold4_3], axis=1
).sort_index(axis=1)

In [None]:
# Positional column slices of each download, kept under their own names
MarginalRate5_1 = IncomeTax1.iloc[:, 165:179]
MarginalRate5_2 = IncomeTax2.iloc[:, 140:149]
MarginalRate5_3 = IncomeTax3.iloc[:, 44:47]

# Join the three slices side by side, then order the columns by label
Marginalrate5 = pd.concat(
    [MarginalRate5_1, MarginalRate5_2, MarginalRate5_3], axis=1
).sort_index(axis=1)

In [None]:
# Now importing the data from 1990-1999.
# It was downloaded in a different format to the other years: one Excel workbook
# containing a sheet per year. The first 5 rows of every sheet are irrelevant and
# are skipped.
#
# Passing a list of sheet names makes pandas open and parse the workbook once and
# return a {sheet name: DataFrame} dict, instead of re-reading the file ten times.
_income_tax_sheet_names = [str(year) for year in range(1990, 2000)]
_income_tax_sheets = pd.read_excel(
    r'/dataset/economic/Income tax 1981-1999.xlsx',
    sheet_name=_income_tax_sheet_names,
    skiprows=5,
)

# Keep the per-year names that the following cells refer to
(IncomeTax1990, IncomeTax1991, IncomeTax1992, IncomeTax1993, IncomeTax1994,
 IncomeTax1995, IncomeTax1996, IncomeTax1997, IncomeTax1998, IncomeTax1999) = (
    _income_tax_sheets[sheet_name] for sheet_name in _income_tax_sheet_names
)

In [None]:
# Yearly sheets in chronological order (1990 first); later cells rely on this order
IncomeTaxA = [IncomeTax1990, IncomeTax1991, IncomeTax1992, IncomeTax1993, IncomeTax1994, IncomeTax1995, 
    IncomeTax1996, IncomeTax1997, IncomeTax1998, IncomeTax1999]

In [None]:
#Performing transformation on the data to make it usable.

def clean_income_tax_df(df):
    """Return a cleaned copy of one raw yearly income-tax sheet.

    Steps, in order:
      1. use the first row as the header and drop the rows labelled 0, 1 and 2;
      2. drop rows and columns that are entirely NaN;
      3. name the first column 'Country Name' and drop rows/columns whose label
         is NaN, as well as rows without a country name;
      4. drop the third column (the surtax column, which is not used);
      5. name the second column 'PersonalAllowance' and the third 'Marginal rate';
      6. turn the textual missing-value markers "n.a", "n.a." and "-" into NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw sheet with a default integer index that contains the labels 0, 1, 2.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame. The input frame is left untouched.

    Raises
    ------
    KeyError
        If one of the row labels 0, 1, 2 is missing from ``df``.
    """
    # Work on a copy: the original version half-modified the caller's frame in
    # place and then returned a different object.
    df = df.copy()

    # Fixing columns: the first row holds the header
    df.columns = df.iloc[0]

    # Dropping irrelevant rows, then rows/columns that are entirely NaN
    df = df.drop(index=[0, 1, 2])
    df = df.dropna(axis=0, how="all").dropna(axis=1, how="all")

    # Renaming the country column
    df.columns = ['Country Name'] + list(df.columns[1:])

    # Dropping NaN labelled rows and columns, and rows without a country
    df = df.loc[~df.index.isna(), ~df.columns.isna()]
    df = df.dropna(subset=['Country Name'])

    # Dropping Surtax as it is not used (dropped by label, like the original)
    df = df.drop(columns=df.columns[2])

    # Renaming wrongly named columns
    df.columns = [df.columns[0], 'PersonalAllowance', 'Marginal rate'] + list(df.columns[3:])

    # Some NaNs are labelled differently, so replace them with NaN
    df = df.replace(["n.a", "n.a.", "-"], np.nan)

    return df

# Replace every yearly sheet in IncomeTaxA by the frame returned from the cleaner
for i, df in enumerate(IncomeTaxA):
    IncomeTaxA[i] = clean_income_tax_df(df)

In [None]:
# Dealing with column names repeating
def rename_duplicate_columns(df):
    """Make repeated column labels unique, in place.

    The first occurrence of a label is kept as is; the second becomes
    ``<label>2``, the third ``<label>3`` and so on. Returns None.
    """
    occurrences = {}
    unique_labels = []

    for label in df.columns:
        nth = occurrences.get(label, 0) + 1
        occurrences[label] = nth
        # Only repeats get a numeric suffix
        unique_labels.append(label if nth == 1 else f"{label}{nth}")

    df.columns = unique_labels

# Make the column labels of every yearly sheet unique (in place)
for df in IncomeTaxA:
    rename_duplicate_columns(df)

In [None]:
# Dealing with currencies being displayed differently
def reindex_and_convert_currency(df):
    """Reset the index and scale amounts of countries reported in thousands.

    Rows whose 'Country Name' contains the marker "(´000)" have their monetary
    columns multiplied by 1000. The country-name column and the marginal-rate
    columns are left alone; the marginal-rate match ignores case and spaces, so
    both "Marginal rate2" and "Marginalrate2" are recognised.

    Only real numbers are scaled. Text cells are left untouched, because
    multiplying a string by 1000 would repeat it instead of scaling it.
    NOTE(review): amounts that are still stored as text in such rows therefore
    stay unscaled -- confirm whether they need converting first.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'Country Name' column and unique column labels.

    Returns
    -------
    pandas.DataFrame
        The same object, modified in place.
    """
    # Reindexing
    df.reset_index(drop=True, inplace=True)

    def _is_amount_column(col):
        normalised = str(col).replace(" ", "").lower()
        return normalised != "countryname" and "marginalrate" not in normalised

    # Converting currencies which are divided by 1000
    columns_to_operate = [col for col in df.columns if _is_amount_column(col)]
    for i, country in enumerate(df["Country Name"]):
        if not (isinstance(country, str) and "(´000)" in country):
            continue
        for col in columns_to_operate:
            value = df.at[i, col]
            is_number = isinstance(value, (int, float, np.number)) and not isinstance(value, (bool, np.bool_))
            if is_number:
                df.at[i, col] = value * 1000

    return df

# Reindex every yearly sheet and scale the amounts of countries reported in thousands
for i, df in enumerate(IncomeTaxA):
    IncomeTaxA[i] = reindex_and_convert_currency(df)

In [None]:
# Fixing Country Names
# Country names made of two words; every other name is reduced to its first word.
# (The original "Czech Republic " entry had a trailing space, so it never matched.)
two_word_countries = ["Czech Republic", "New Zealand", "Slovak Republic", "United Kingdom", "United States"]
def filter_country_name(country):
    """Reduce a raw country label to a plain country name.

    The characters '*', '#' and ',' are removed and surrounding whitespace is
    stripped. If the result starts with one of ``two_word_countries`` that name
    is returned, so trailing annotations such as footnotes or "(´000)" do not
    break the match. Otherwise the first whitespace-separated word is returned.
    An empty label is returned unchanged instead of raising IndexError.

    Parameters
    ----------
    country : str
        Raw label from the 'Country Name' column.

    Returns
    -------
    str
        The cleaned country name.
    """
    country = country.replace('*', '').replace('#', '').replace(',', '').strip()  # Remove special characters
    for name in two_word_countries:
        name = name.strip()
        if country.startswith(name):
            return name
    words = country.split()
    return words[0] if words else country

# Clean the country label of every row in every yearly sheet
for df in IncomeTaxA:
    df['Country Name'] = df['Country Name'].apply(filter_country_name)

In [None]:
# Fixing Personal Allowance column

def clean_personal_allowance(value):
    """Extract the personal allowance from a raw cell as an integer.

    * Real numbers are truncated to an int directly. The original went through
      ``str(value)``, which turned floats printed in scientific notation into
      wrong values (``1e16`` became ``1``).
    * Text is searched for the first run of digits, optionally containing
      thousands separators, e.g. ``"4,500 (a)"`` gives ``4500``. The run must
      start with a digit, so a stray leading comma no longer raises ValueError.
    * ``None``, NaN, infinities and text without digits give ``np.nan``
      explicitly, rather than an implicit ``None``.

    Parameters
    ----------
    value : object
        Raw cell content (number, string or missing value).

    Returns
    -------
    int or float
        The allowance as an int, or ``np.nan`` when none can be extracted.
    """
    if value is None:
        return np.nan

    # Numeric cells: no string round-trip needed (bool is not treated as a number)
    if isinstance(value, (int, float, np.number)) and not isinstance(value, (bool, np.bool_)):
        if pd.isna(value) or not np.isfinite(value):
            return np.nan
        return int(value)

    # Text cells: first digit run, commas allowed inside it
    numeric_part = re.search(r'\d[\d,]*', str(value))
    if numeric_part is None:
        return np.nan
    return int(numeric_part.group().replace(",", ""))

# Convert the PersonalAllowance cell of every row in every yearly sheet
for df in IncomeTaxA:
    df['PersonalAllowance'] = df['PersonalAllowance'].apply(clean_personal_allowance)

In [None]:
# Remove spaces from the column labels (e.g. "Country Name" becomes "CountryName")
# so that they match the statistic names used when the frames are combined.
for df in IncomeTaxA:
    df.columns = [col.replace(" ", "") for col in df.columns]

In [None]:
# Index each yearly sheet by country and transpose it, so that countries become
# the columns and the statistics become the rows.

for i, df in enumerate(IncomeTaxA):
    IncomeTaxA[i] = df.set_index("CountryName").T

In [None]:
# Align country names with the other dataframes: 'Turkey' becomes 'Türkiye' and
# 'Czech Republic' becomes 'Czechia' in the column labels.

for df in IncomeTaxA:
    df.columns = df.columns.str.replace(r'Turkey', 'Türkiye', regex=True)
    df.columns = df.columns.str.replace(r'Czech Republic', 'Czechia', regex=True)

In [None]:
# Rebind the per-year names to the processed frames now stored in IncomeTaxA
[IncomeTax1990, IncomeTax1991, IncomeTax1992, IncomeTax1993, IncomeTax1994, IncomeTax1995, 
    IncomeTax1996, IncomeTax1997, IncomeTax1998, IncomeTax1999] = IncomeTaxA

In [None]:
# Statistic-specific frames; the order must match the statistic_names list used
# when they are combined with the yearly sheets.
IncomeTaxB = [PersonalAllowance, Marginalrate, Marginalrate2, Marginalrate3, Marginalrate4, Marginalrate5, 
Threshold1, Threshold2, Threshold3, Threshold4]

In [None]:
# Now I have my List of fixed dataframes, I can concatenate them into the dataframes containing data from 2000-2018

# Combine yearly data (1990-1999) with statistic-specific data (2000-2018)
# One combined frame per statistic, keyed by the statistic's row label
combined_dfs = {}

# Row labels of the yearly sheets, in the same order as the frames in IncomeTaxB
statistic_names=["PersonalAllowance", "Marginalrate", "Marginalrate2", "Marginalrate3", "Marginalrate4", 
    "Marginalrate5", "Threshold", "Threshold2", "Threshold3", "Threshold4"]

for i, stat_df in enumerate(IncomeTaxB):
    statistic = statistic_names[i]

    # Pull this statistic's row out of every yearly sheet and label it with its year
    # (IncomeTaxA is assumed to be ordered 1990, 1991, ..., 1999)
    yearly_stat_data = []
    year = 1990
    for yearly_df in IncomeTaxA:
        stat_row = yearly_df.loc[statistic].to_frame().T
        stat_row.index = [year]
        yearly_stat_data.append(stat_row)
        year += 1
    year -= 1  # leave `year` at the last processed year, as a plain for-loop would

    # Yearly rows first, followed by the statistic-specific frame
    combined_df = pd.concat([*yearly_stat_data, stat_df])
    combined_dfs[statistic] = combined_df

In [None]:
# Publish every combined frame as a module-level variable named after its key.
# This rebinds the existing names (PersonalAllowance, Marginalrate, ...) and
# creates a new one for the key "Threshold".
for name, df in combined_dfs.items():
    globals()[name] = df

In [None]:
# Preview the first rows of the combined frame
Marginalrate.head()

In [None]:
# Row-over-row percentage change of every combined tax frame.
# Note that Threshold1_perc_change is computed from the frame named `Threshold`.
Threshold1_perc_change = Threshold.pct_change()*100
Threshold2_perc_change = Threshold2.pct_change()*100
Threshold3_perc_change = Threshold3.pct_change()*100
Threshold4_perc_change = Threshold4.pct_change()*100
MarginalRate1_perc_change = Marginalrate.pct_change()*100
MarginalRate2_perc_change = Marginalrate2.pct_change()*100
MarginalRate3_perc_change = Marginalrate3.pct_change()*100
MarginalRate4_perc_change = Marginalrate4.pct_change()*100
MarginalRate5_perc_change = Marginalrate5.pct_change()*100
PersonalAllowance_perc_change = PersonalAllowance.pct_change()*100

In [None]:
# Now with all my statistics on tax, I can calculate tax paid by the median person, which will be done later after
# imputing data.

In [None]:
#Now we have a number of usable dataframes, including personal allowance, marginal rates and thresholds.

In [None]:
#Before using the dataframes to alter each other, we need to check that all the names of the countries match
#between the dataframes. This can be done by comparing the columns of the dataframes.

# Column labels present in PersonalAllowance but absent from CurrencyPerDollar
unmatched_names_1 = set(PersonalAllowance.columns) - set(CurrencyPerDollar.columns)

unmatched_names_1

In [None]:
#This will be continued after all the dataframes I need are created, to allow standardisation on every dataframe
#at once.

In [None]:
# Load the raw inflation table
Inflation = pd.read_csv(r'/dataset/economic/Inflation.csv')

In [None]:
# Use the first column (the country name) as the row index
Inflation.set_index(Inflation.columns[0], inplace=True)

In [None]:
# Remove the first three remaining columns (junk); note this drops columns, not rows
Inflation.drop(Inflation.columns[0:3], axis=1, inplace=True)

In [None]:
# Transpose so that the years become the index and the countries the columns
Inflation = Inflation.transpose()

In [None]:
# Year-over-year percentage change of the inflation figures; head() is the cell output
Inflation_perc_change = Inflation.pct_change() * 100
Inflation_perc_change.head()

In [None]:
# Pick the seaborn look. Matplotlib 3.6 renamed the bundled seaborn styles to
# 'seaborn-v0_8*' and newer releases no longer accept the bare 'seaborn' name,
# so use whichever name this installation actually provides.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')

# Create the figure with a larger size
plt.figure(figsize=(12, 6))

# One line per country: (dataframe column, legend label, marker)
for column, label, marker in [
    ('Turkiye', 'Turkiye', 'o'),
    ('United Kingdom', 'UK', 's'),
    ('Bulgaria', 'Bulgaria', '^'),
    ('Korea, Rep.', 'South Korea', 'd'),
]:
    plt.plot(Inflation.index, Inflation[column], linewidth=2,
             label=label, marker=marker, markersize=4)

# Customize the plot
plt.title('Inflation Rates Comparison (1960-2023)', fontsize=14, pad=20)
plt.xlabel('Year', fontsize=12)
plt.ylabel('Inflation', fontsize=12)
plt.grid(True, linestyle='--', alpha=0.7)
plt.legend(fontsize=10, bbox_to_anchor=(1.05, 1), loc='upper left')

# Rotate x-axis labels for better readability
plt.xticks(rotation=45)

# Adjust layout to prevent label cutoff
plt.tight_layout()

# Show the plot
plt.show()

In [None]:
# Load the raw unemployment table
Unemployment = pd.read_csv(r'/dataset/economic/Unemployment.csv')

In [None]:
# Use the first column (the country name) as the row index
Unemployment.set_index(Unemployment.columns[0], inplace=True)

In [None]:
# Remove the first three remaining columns (junk); note this drops columns, not rows
Unemployment.drop(Unemployment.columns[0:3], axis=1, inplace=True)

In [None]:
# Transpose so that the years become the index and the countries the columns
Unemployment = Unemployment.transpose()

In [None]:
# Drop the first 30 year rows, which the author removed for lack of data.
# NOTE(review): positional and non-idempotent -- re-running this cell drops 30 more rows.
Unemployment.drop(Unemployment.index[0:30], axis=0, inplace=True)

In [None]:
# Year-over-year percentage change of the unemployment figures
Unemployment_perc_change = Unemployment.pct_change() * 100

In [None]:
# Pick the seaborn look. Matplotlib 3.6 renamed the bundled seaborn styles to
# 'seaborn-v0_8*' and newer releases no longer accept the bare 'seaborn' name,
# so use whichever name this installation actually provides.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')

# Create the figure with a larger size
plt.figure(figsize=(12, 6))

# One line per country: (dataframe column, legend label, marker)
for column, label, marker in [
    ('Turkiye', 'Turkiye', 'o'),
    ('United Kingdom', 'UK', 's'),
    ('Bulgaria', 'Bulgaria', '^'),
    ('Korea, Rep.', 'South Korea', 'd'),
]:
    plt.plot(Unemployment.index, Unemployment[column], linewidth=2,
             label=label, marker=marker, markersize=4)

# Customize the plot
plt.title('Unemployment Rates Comparison (1991-2023)', fontsize=14, pad=20)
plt.xlabel('Year', fontsize=12)
plt.ylabel('Unemployment Rate (%)', fontsize=12)
plt.grid(True, linestyle='--', alpha=0.7)
plt.legend(fontsize=10, bbox_to_anchor=(1.05, 1), loc='upper left')

# Rotate x-axis labels for better readability
plt.xticks(rotation=45)

# Adjust layout to prevent label cutoff
plt.tight_layout()

# Show the plot
plt.show()

In [None]:
# Load the raw VAT table (first sheet of the workbook, as no sheet name is given)
VAT = pd.read_excel(r"/dataset/economic/VAT.xlsx")

In [None]:
# Use the first column (the country name) as the row index
VAT.set_index(VAT.columns[0], inplace=True)

In [None]:
# Remove junk columns by position. The drops run from the highest positions down,
# so the positions of the columns still to be dropped are not shifted.
VAT.drop(VAT.columns[20:24], axis=1, inplace=True)
VAT.drop(VAT.columns[18], axis=1, inplace=True)
VAT.drop(VAT.columns[16], axis=1, inplace=True)

In [None]:
# Transpose so that the countries become the columns
VAT = VAT.transpose()

In [None]:
# Turn the missing-value marker ' - ' (a dash padded with spaces) into NaN

VAT.replace({' - ': np.nan}, inplace=True)

In [None]:
# Remove a trailing "*" from the country labels so they match the other datasets;
# head() is the cell output
VAT.columns = VAT.columns.str.replace(r'\*$', '', regex=True)
VAT.head()

In [None]:
# Year-over-year percentage change of the VAT rates; head() is the cell output
VAT_perc_change = VAT.pct_change() * 100
VAT_perc_change.head()

In [None]:
# Add an all-NaN row for each year 1990-2004 (an existing row with that label would be overwritten).
# New labels are appended at the end, so the index is not in chronological order afterwards.
for i in range(1990, 2005):
    VAT.loc[i] = [np.nan] * len(VAT.columns)

In [None]:
# Pad the percentage-change frame with all-NaN rows for the same years as VAT
# itself (1990-2004). The original padded only 1990-1999, which left the two
# frames with different indexes (2000-2004 missing here).
for i in range(1990, 2005):
    VAT_perc_change.loc[i] = [np.nan] * len(VAT_perc_change.columns)

In [None]:
# Pick the seaborn look. Matplotlib 3.6 renamed the bundled seaborn styles to
# 'seaborn-v0_8*' and newer releases no longer accept the bare 'seaborn' name,
# so use whichever name this installation actually provides.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')

# Create the figure with a larger size
plt.figure(figsize=(12, 6))

# One line per country: (dataframe column, legend label, marker)
for column, label, marker in [
    ('Türkiye', 'Turkiye', 'o'),
    ('United Kingdom', 'UK', 's'),
    ('Canada', 'Canada', '^'),
    ('Spain', 'Spain', 'd'),
]:
    plt.plot(VAT.index, VAT[column], linewidth=2,
             label=label, marker=marker, markersize=4)

# Customize the plot
plt.title('VAT Comparison (2005-2023)', fontsize=14, pad=20)
plt.xlabel('Year', fontsize=12)
plt.ylabel('VAT (%)', fontsize=12)
plt.grid(True, linestyle='--', alpha=0.7)
plt.legend(fontsize=10, bbox_to_anchor=(1.05, 1), loc='upper left')

# Rotate x-axis labels for better readability
plt.xticks(rotation=45)

# Adjust layout to prevent label cutoff
plt.tight_layout()

# Show the plot
plt.show()

In [None]:
#Now to make all my dataframes into a usable format and deal with issues with NaN before concatenation.

In [None]:
# Placeholders with the same shape as AverageWageUSDPPP. They are independent
# copies: plain assignment would make all three names refer to one object, so an
# in-place change to any of them would silently change the wage data as well.
AverageTaxPaid = AverageWageUSDPPP.copy()
AverageTaxPaid_perc_change = AverageWageUSDPPP.copy()

In [None]:
# All dataframes that receive the shared clean-up steps below.
# Dataframes and Dataframes_names are parallel lists: entry i of one corresponds
# to entry i of the other, so both must be kept in the same order.
Dataframes = [
    AverageTaxPaid, AverageTaxPaid_perc_change, AverageWage, AverageWageUSDPPP, AverageWageUSDPPP_perc_change,
    CorpIncomeTax, CorpIncomeTax_perc_change, CurrencyPerDollar, CurrencyPerDollar_perc_change, Inflation, 
    Inflation_perc_change, Marginalrate, MarginalRate1_perc_change, Marginalrate2, MarginalRate2_perc_change, 
    Marginalrate3, MarginalRate3_perc_change, Marginalrate4, MarginalRate4_perc_change,
    Marginalrate5, MarginalRate5_perc_change, PersonalAllowance, PersonalAllowance_perc_change, Threshold,
    Threshold1_perc_change, Threshold2, Threshold2_perc_change,
    Threshold3, Threshold3_perc_change, Threshold4, Threshold4_perc_change, Unemployment,
    Unemployment_perc_change, VAT, VAT_perc_change]

Dataframes_names =["AverageTaxPaid", "AverageTaxPaid_perc_change",
"AverageWage", "AverageWageUSDPPP","AverageWageUSDPPP_perc_change", "CorpIncomeTax", "CorpIncomeTax_perc_change","CurrencyPerDollar", "CurrencyPerDollar_perc_change", "Inflation", "Inflation_perc_change",
    "Marginalrate", "MarginalRate1_perc_change", "Marginalrate2", "MarginalRate2_perc_change", 
    "Marginalrate3", "MarginalRate3_perc_change", "Marginalrate4", "MarginalRate4_perc_change",
    "Marginalrate5", "MarginalRate5_perc_change", "PersonalAllowance", "PersonalAllowance_perc_change", "Threshold",
    "Threshold1_perc_change", "Threshold2", "Threshold2_perc_change",
    "Threshold3", "Threshold3_perc_change", "Threshold4", "Threshold4_perc_change", "Unemployment",
    "Unemployment_perc_change", "VAT", "VAT_perc_change"]



In [None]:
# Standardise the names of countries that are spelled differently across the
# dataframes, so that every frame uses the same column labels:
#   'Türkiye' / 'Turkey'  -> 'Turkiye'
#   'Czech Republic'      -> 'Czechia'
#   'Korea, Rep.'         -> 'South Korea' (whole-label match only)
#   any other label containing 'Korea' but not 'South Korea' gets 'Korea'
#   replaced by 'South Korea'
# Finally the columns of each frame are sorted by label, in place.
for df in Dataframes:
    df.columns = df.columns.str.replace(r'Türkiye', 'Turkiye', regex=True)
    df.columns = df.columns.str.replace(r'Turkey', 'Turkiye', regex=True)
    df.columns = df.columns.str.replace(r'Czech Republic', 'Czechia', regex=True)
    df.columns = df.columns.str.replace(r'^Korea, Rep\.$', 'South Korea', regex=True)
    df.columns = [col if 'South Korea' in col else col.replace('Korea', 'South Korea') for col in df.columns]
    df.sort_index(axis=1, inplace=True)

In [None]:
# Country names to keep, spelled the way the standardised column labels spell
# them (e.g. 'Turkiye', 'Czechia', 'South Korea'). Columns with any other label
# are removed from the dataframes.

oecd_countries = [
    'Australia','Austria','Belgium', 'Canada', 'Chile', 'Colombia', 'Costa Rica', 'Czechia', 'Denmark', 
    'Estonia', 'Finland','France','Germany','Greece','Hungary','Iceland','Ireland','Israel','Italy','Japan',
    'Latvia','Lithuania','Luxembourg','Mexico','Netherlands','New Zealand','Norway','Poland','Portugal',
    'Slovak Republic','Slovenia','South Korea','Spain','Sweden','Switzerland','Turkiye','United Kingdom',
    'United States']

In [None]:
# Dropping Non-OECD countries

def filter_oecd_countries(df, oecd_countries):
    """Drop, in place, every column of ``df`` whose label is not in ``oecd_countries``.

    Returns None; the remaining columns keep their order.
    """
    is_member = df.columns.isin(oecd_countries)
    df.drop(columns=df.columns[~is_member], inplace=True)

# Remove the non-OECD columns from every dataframe (in place)
for df in Dataframes:
    filter_oecd_countries(df, oecd_countries)

In [None]:
#Dealing with NaN values

#If NaN values are above 20%, I will remove the column. If not, I will replace them using KNN interpolation.

In [None]:
# Replace infinities and extremely large values with NaN in every dataframe
for df in Dataframes:
    df.replace([np.inf, -np.inf], np.nan, inplace=True)  # Replace inf with NaN
    # NOTE(review): 1e308 is close to the largest finite float64, so this catches very little beyond inf
    df[df > 1e308] = np.nan  # Replace extremely large values with NaN

In [None]:
# The indexes of these frames must match, so cast all of them to integer years
dfs = [AverageWage, PersonalAllowance, Threshold, Threshold2, Threshold3, Threshold4,
       Marginalrate, Marginalrate2, Marginalrate3, Marginalrate4, Marginalrate5]

for df in dfs:
    df.index = df.index.astype(int)  # Convert index to integers

# Removing 2023 as there isn't tax data for it.
# The row is dropped by label rather than with iloc[:-1]: that keeps the cell
# idempotent (re-running it no longer removes one more year each time) and does
# not depend on 2023 being the last row.

AverageWage = AverageWage.drop(index=2023, errors="ignore")

In [None]:
# Frames whose country columns are compared, keyed by a display name
dfs = dict(
    AverageWage=AverageWage,
    PersonalAllowance=PersonalAllowance,
    Threshold1=Threshold,
    Threshold2=Threshold2,
    Threshold3=Threshold3,
    Threshold4=Threshold4,
    MarginalRate1=Marginalrate,
    MarginalRate2=Marginalrate2,
    MarginalRate3=Marginalrate3,
    MarginalRate4=Marginalrate4,
    MarginalRate5=Marginalrate5,
)

# Country labels of each frame, and the union of the labels of all frames
country_sets = {name: set(df.columns) for name, df in dfs.items()}
all_countries = set().union(*country_sets.values())

# For every frame, the countries that appear in some frame but not in this one
missing_countries = {}
for name, countries in country_sets.items():
    missing_countries[name] = all_countries - countries

# Report only the frames that lack at least one country
for name, missing in missing_countries.items():
    if not missing:
        continue
    print(f"{name} is missing: {missing}")

In [None]:
# Hard-coded mapping from frame name to the countries that frame lacks.
# This rebinds `missing_countries`, replacing any value it held before.
missing_countries = {
    "PersonalAllowance": {'Colombia', 'Costa Rica'},
    "Threshold2": {'Lithuania'},
    "Threshold3": {'Lithuania'},
    "Threshold4": {'Lithuania', 'Czechia'},
    "MarginalRate3": {'Lithuania'},
    "MarginalRate4": {'Lithuania'},
    "MarginalRate5": {'Lithuania', 'Czechia'},
}

# The frames named in the mapping above (same keys)
dfs = {
    "PersonalAllowance": PersonalAllowance,
    "Threshold2": Threshold2,
    "Threshold3": Threshold3,
    "Threshold4": Threshold4,
    "MarginalRate3": Marginalrate3,
    "MarginalRate4": Marginalrate4,
    "MarginalRate5": Marginalrate5,
}

# Add each missing country as an all-NaN column; columns that already exist are
# left alone, so re-running the cell is harmless.
for name, missing_cols in missing_countries.items():
    for col in missing_cols:
        if col not in dfs[name].columns:
            dfs[name][col] = np.nan

In [None]:
# Collect the inputs calculate_tax needs for one (year, country), with defaults for missing values
def prepare_tax_parameters(year, country):
    """Look up the wage and the tax schedule for ``country`` in ``year``.

    Reads the notebook-level frames AverageWage, PersonalAllowance,
    Threshold / Threshold2 / Threshold3 / Threshold4 and
    Marginalrate / Marginalrate2 / ... / Marginalrate5 via ``.loc[year, country]``.

    Missing (NaN) schedule entries are replaced as follows:
      * personal allowance -> 0
      * threshold          -> 1000000000 (1 billion, standing in for "no upper limit")
      * marginal rate      -> 0
    The wage itself is returned exactly as found, NaN included.

    Returns
    -------
    tuple
        ``(income, personal_allowance, thresholds, rates)``; ``thresholds`` holds 4 values
        and ``rates`` 5 values, each in the order the frames are listed above.
    """
    def _value_or(frame, fallback):
        # Cell value for this year/country, or the fallback when the cell is NaN
        found = frame.loc[year, country]
        return fallback if pd.isna(found) else found

    income = AverageWage.loc[year, country]
    personal_allowance = _value_or(PersonalAllowance, 0)
    thresholds = [
        _value_or(frame, 1000000000)
        for frame in (Threshold, Threshold2, Threshold3, Threshold4)
    ]
    rates = [
        _value_or(frame, 0)
        for frame in (Marginalrate, Marginalrate2, Marginalrate3, Marginalrate4, Marginalrate5)
    ]

    return income, personal_allowance, thresholds, rates

In [None]:
# Now that the data has been imputed, build one last dataset from the others: the tax paid by an
# average-wage earner, calculated from the tax frames and the average-wage frame.

# One cell per (year, country), laid out like AverageWage. The frame is declared as float so the
# computed tax values are stored numerically instead of as generic Python objects.
AverageTaxPaid = pd.DataFrame(index=AverageWage.index, columns=AverageWage.columns, dtype=float)

def calculate_tax(income, personal_allowance, thresholds, rates):
    """Return the progressive income tax due on ``income``.

    Income above ``personal_allowance`` is taxed in slices: the slice between the allowance
    and ``thresholds[0]`` at ``rates[0]``, the slice between ``thresholds[i-1]`` and
    ``thresholds[i]`` at ``rates[i]``, and whatever is left after the last threshold at
    ``rates[-1]``.

    Parameters
    ----------
    income : number
        Wage to be taxed. A missing (NaN) wage gives a NaN result.
    personal_allowance : number
        Amount exempt from tax. NaN is treated as 0, the same default
        prepare_tax_parameters uses.
    thresholds : sequence of numbers
        Upper bound of each bracket. A NaN bound makes that bracket open-ended.
    rates : sequence of numbers
        Marginal rates in percent; ``rates[i]`` applies to bracket ``i`` and ``rates[-1]``
        to income above the final threshold.

    Returns
    -------
    number
        Tax due, in the same units as ``income``; NaN when ``income`` is missing.
    """
    # A missing wage means the tax is unknown. Every comparison against NaN is False, so
    # without this guard the bracket arithmetic below would report a tax of 0.
    if pd.isna(income):
        return np.nan

    # No allowance on record: tax applies from the first unit of income.
    if pd.isna(personal_allowance):
        personal_allowance = 0

    # No tax below personal allowance
    taxable_income = max(income - personal_allowance, 0)

    # Convert rates from percentages to decimals
    decimal_rates = [rate / 100 for rate in rates]

    tax = 0
    lower_bound = personal_allowance
    for i, upper_bound in enumerate(thresholds):
        bracket_width = upper_bound - lower_bound
        if pd.isna(bracket_width):
            # Unknown bound: the bracket absorbs everything that is still untaxed
            bracket_income = taxable_income
        else:
            bracket_income = max(0, min(taxable_income, bracket_width))

        # Only brackets the income actually reaches contribute. Skipping the others keeps a
        # NaN rate in an unused higher bracket from turning the total into NaN (0 * NaN).
        if bracket_income > 0:
            tax += bracket_income * decimal_rates[i]
            taxable_income -= bracket_income

        lower_bound = upper_bound

    # Tax for income above the highest threshold
    if taxable_income > 0:
        tax += taxable_income * decimal_rates[-1]

    return tax

# Compute the tax owed by an average-wage earner for every country and year.
# The inputs come from prepare_tax_parameters, which substitutes explicit defaults for missing
# schedule entries (allowance -> 0, threshold -> effectively unbounded, rate -> 0), so gaps in
# the allowance or in higher brackets cannot distort the result.
# NOTE(review): with those defaults a country-year that has no schedule data at all evaluates to
# a tax of 0 — confirm that is acceptable before the later imputation step.
for country in AverageWage.columns:
    for year in AverageWage.index:
        # Wage and NaN-safe tax parameters for this country and year
        income, personal_allowance, thresholds, rates = prepare_tax_parameters(year, country)

        # Calculate tax for this average wage earner
        tax = calculate_tax(income, personal_allowance, thresholds, rates)

        # Store the result
        AverageTaxPaid.loc[year, country] = tax

In [None]:
# Percentage change of the tax paid relative to the previous row (the index holds the time periods)

AverageTaxPaid_perc_change = 100 * AverageTaxPaid.pct_change()

In [None]:
# Slots 0 and 1 of Dataframes take the freshly computed tax frames
Dataframes[0], Dataframes[1] = AverageTaxPaid, AverageTaxPaid_perc_change

In [None]:
# Turn each frame's index into a leading "Year" column
Dataframes = [frame.reset_index(names="Year") for frame in Dataframes]

In [None]:
# Coerce every column of every frame to numeric; values that cannot be parsed become NaN.
# The list is rebuilt because DataFrame.apply returns a new frame: assigning the result to the
# loop variable would discard it and leave Dataframes unchanged.
Dataframes = [df.apply(pd.to_numeric, errors='coerce') for df in Dataframes]

In [None]:
# Keep only the rows from 1990 onwards in every frame

for i in range(len(Dataframes)):
    frame = Dataframes[i]

    # A text-typed 'Year' column (object dtype) has to become numeric before it can be compared
    if frame['Year'].dtype == object:
        try:
            frame['Year'] = pd.to_numeric(frame['Year'], errors='raise')
        except ValueError as e:
            print(f"DataFrame at index {i} has non-numeric 'Year' values that cannot be converted to integers: {e}")
            continue  # this frame is left as it is, unfiltered

    # Replace the list entry with the filtered rows
    Dataframes[i] = frame[frame['Year'] >= 1990]

In [None]:
#First, removing the columns with too many NaN values

def remove_nan(df):
    """Drop, in place, every column of ``df`` whose share of missing values exceeds 0.9.

    The frame passed in is modified directly and the same object is returned.
    """
    missing_share = df.isnull().sum() / len(df)
    too_sparse = list(missing_share[missing_share > 0.9].index)
    df.drop(columns=too_sparse, inplace=True)
    return df
        

# remove_nan edits each frame in place, so the list itself needs no update
for frame in Dataframes:
    remove_nan(frame)

In [None]:
#Then using KNN for the remaining Dataframes

def make_ts(df, idx):
    """Clean one frame and fill its remaining gaps with KNN imputation.

    Steps, in order:
      1. Parse the index with format ``"%Y"`` and keep only the year part; labels that do
         not parse become missing. This assignment is made on the frame passed in, so the
         caller's object gets the new index too.
      2. Coerce every column to numeric (on a new frame); unparseable cells become NaN.
      3. Replace +/-inf with NaN, then drop rows and columns that are entirely NaN.
      4. Fill the NaNs that are left with ``KNNImputer(n_neighbors=2)``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean.
    idx : int
        Position of the frame in its list; only used in the printed messages.

    Returns
    -------
    pandas.DataFrame
        The input itself when it is empty; otherwise the cleaned frame, imputed unless it
        became empty or the imputer raised ``ValueError`` (a message is printed in each case).
    """
    if df.empty:  # nothing to clean
        print(f"Skipping empty DataFrame at index {idx}")
        return df

    # Index labels -> years
    df.index = pd.to_datetime(df.index, format="%Y", errors="coerce").year

    # Everything numeric from here on
    cleaned = df.apply(pd.to_numeric, errors="coerce")

    # Infinite values count as missing
    if np.isinf(cleaned.values).any():
        print(f"Replacing infinities in DataFrame at index {idx}")
        cleaned = cleaned.replace([np.inf, -np.inf], np.nan)

    # Rows first, then columns, that hold no data at all
    cleaned = cleaned.dropna(axis=0, how="all").dropna(axis=1, how="all")

    if cleaned.empty:
        print(f"DataFrame at index {idx} became empty after cleaning, skipping.")
        return cleaned

    # Impute missing values using KNN; on failure the cleaned frame is returned un-imputed
    imputer = KNNImputer(n_neighbors=2)
    try:
        cleaned.loc[:, :] = imputer.fit_transform(cleaned)
    except ValueError as e:
        print(f"Imputation failed for DataFrame at index {idx}: {e}")

    return cleaned

# Clean and impute every frame; the list position is passed along for the log messages
Dataframes = [make_ts(frame, position) for position, frame in enumerate(Dataframes)]

In [None]:
# Now that all the names are standardised, values can be converted using other datasets.
# The tax paid, Personal Allowance and Threshold frames are expressed in local currency; they are
# collected here so they can be standardised to USD with the "CurrencyPerDollar" dataset.

Need_converting = [AverageTaxPaid, PersonalAllowance, Threshold, Threshold2, Threshold3, Threshold4]

In [None]:
# Coerce every frame awaiting conversion to numeric; unparseable values become NaN.
# The list is rebuilt because DataFrame.apply returns a new frame: assigning the result to the
# loop variable would discard it and leave Need_converting unchanged.
Need_converting = [df.apply(pd.to_numeric, errors='coerce') for df in Need_converting]

In [None]:
def convert_to_dollars(need_converting, currency_per_dollar):
    """Divide local-currency frames by an exchange-rate frame, country by country.

    Parameters
    ----------
    need_converting : list of pandas.DataFrame
        Frames with one column per country. They are not modified; each is first copied
        through a numeric coercion.
    currency_per_dollar : pandas.DataFrame
        Exchange rates with one column per country, used as the divisor.

    Returns
    -------
    list of pandas.DataFrame
        One converted frame per input, in the same order. Columns without a matching
        exchange-rate column are passed through unconverted (after numeric coercion).

    Notes
    -----
    * Exchange rates are aligned to each frame's index with forward fill.
    * Missing amounts are converted as 0 and missing exchange rates as 1, so gaps do not
      come out as NaN. An exchange rate of exactly 0 is not guarded against.
    * Dtypes and every converted column are printed as debugging output.
    """
    # Debug: dtypes of all inputs
    print("Data Types in Need_converting:")
    for frame in need_converting:
        print(frame.dtypes)

    print("\nData Types in CurrencyPerDollar:")
    print(currency_per_dollar.dtypes)

    converted_dfs = []
    for frame in need_converting:
        # Work on a numeric copy so the caller's frame stays untouched
        numeric = frame.apply(pd.to_numeric, errors='coerce')

        for country in numeric.columns:
            # Countries without an exchange rate are left as they are
            if country not in currency_per_dollar.columns:
                continue

            # Numeric exchange rate, aligned to this frame's index (forward-filled)
            rate = pd.to_numeric(currency_per_dollar[country], errors='coerce')
            exchange_rate_aligned = rate.reindex(numeric.index, method='ffill')

            # Debug: values before conversion and the rate they will be divided by
            print(f"\nCountry: {country}")
            print("DataFrame Values:")
            print(numeric[country])
            print("Exchange Rate Values:")
            print(exchange_rate_aligned)

            # Missing amount -> 0, missing rate -> 1, then local currency / rate
            numeric[country] = numeric[country].fillna(0) / exchange_rate_aligned.fillna(1)

        converted_dfs.append(numeric)

    return converted_dfs
# Convert the local-currency frames; the result list follows the order of Need_converting.
# NOTE(review): converted_dfs is not referenced again in the cells visible after this one —
# confirm whether the converted frames were meant to replace the originals.
converted_dfs = convert_to_dollars(Need_converting, CurrencyPerDollar)

In [None]:
# Now I have imputed, I can convert the dtype of the index to integer and make it a column

In [None]:
# Give every cleaned frame its name back, by position in Dataframes.
# Each base frame is followed by its percentage-change companion, except AverageWage (slot 2),
# which has none. The trailing comments give the list positions.
(
    AverageTaxPaid, AverageTaxPaid_perc_change,            # 0-1
    AverageWage,                                           # 2
    AverageWageUSDPPP, AverageWageUSDPPP_perc_change,      # 3-4
    CorpIncomeTax, CorpIncomeTax_perc_change,              # 5-6
    CurrencyPerDollar, CurrencyPerDollar_perc_change,      # 7-8
    Inflation, Inflation_perc_change,                      # 9-10
    Marginalrate, MarginalRate1_perc_change,               # 11-12
    Marginalrate2, MarginalRate2_perc_change,              # 13-14
    Marginalrate3, MarginalRate3_perc_change,              # 15-16
    Marginalrate4, MarginalRate4_perc_change,              # 17-18
    Marginalrate5, MarginalRate5_perc_change,              # 19-20
    PersonalAllowance, PersonalAllowance_perc_change,      # 21-22
    Threshold, Threshold1_perc_change,                     # 23-24
    Threshold2, Threshold2_perc_change,                    # 25-26
    Threshold3, Threshold3_perc_change,                    # 27-28
    Threshold4, Threshold4_perc_change,                    # 29-30
    Unemployment, Unemployment_perc_change,                # 31-32
    VAT, VAT_perc_change,                                  # 33-34
) = Dataframes[:35]

In [None]:
#I now have my set of standardised Dataframes :)

In [None]:
# Melting the dfs to reformat them 

In [None]:
def melt_by_country(wide, value_name):
    """Reshape a wide Year-by-country frame into long (Year, Country Name, value) rows.

    Parameters
    ----------
    wide : pandas.DataFrame
        Frame with a 'Year' column followed by one column per country. Every column after
        the first is melted, so 'Year' is assumed to be the first column.
    value_name : str
        Name given to the column holding the melted values.

    Returns
    -------
    pandas.DataFrame
        Columns 'Year', 'Country Name' and ``value_name``, sorted by year then country,
        with a fresh 0..n-1 index.
    """
    long = pd.melt(
        wide,
        id_vars=['Year'],               # keep 'Year' as the identifier
        value_vars=wide.columns[1:],    # melt all columns except the first ('Year')
        var_name='Country Name',
        value_name=value_name,
    )
    return long.sort_values(by=['Year', 'Country Name']).reset_index(drop=True)


# Average tax paid and average wage (USD PPP)
AverageTaxPaid_melted = melt_by_country(AverageTaxPaid, 'AverageTaxPaid')
AverageTaxPaid_perc_change_melted = melt_by_country(AverageTaxPaid_perc_change, 'AverageTaxPaid_perc_change')
AverageWageUSDPPP_melted = melt_by_country(AverageWageUSDPPP, 'AverageWageUSDPPP')
AverageWageUSDPPP_perc_change_melted = melt_by_country(AverageWageUSDPPP_perc_change, 'AverageWageUSDPPP_perc_change')

# Corporate income tax
CorpIncomeTax_melted = melt_by_country(CorpIncomeTax, 'CorpIncomeTax')
CorpIncomeTax_perc_change_melted = melt_by_country(CorpIncomeTax_perc_change, 'CorpIncomeTax_perc_change')

# Exchange rates and inflation
CurrencyPerDollar_melted = melt_by_country(CurrencyPerDollar, 'CurrencyPerDollar')
CurrencyPerDollar_perc_change_melted = melt_by_country(CurrencyPerDollar_perc_change, 'CurrencyPerDollar_perc_change')
Inflation_melted = melt_by_country(Inflation, 'Inflation')
Inflation_perc_change_melted = melt_by_country(Inflation_perc_change, 'Inflation_perc_change')

# Marginal rates 1-5 (the level columns keep the 'Marginalrate...' spelling of their source frames)
MarginalRate1_melted = melt_by_country(Marginalrate, 'Marginalrate')
MarginalRate1_perc_change_melted = melt_by_country(MarginalRate1_perc_change, 'MarginalRate1_perc_change')
MarginalRate2_melted = melt_by_country(Marginalrate2, 'Marginalrate2')
MarginalRate2_perc_change_melted = melt_by_country(MarginalRate2_perc_change, 'MarginalRate2_perc_change')
MarginalRate3_melted = melt_by_country(Marginalrate3, 'Marginalrate3')
MarginalRate3_perc_change_melted = melt_by_country(MarginalRate3_perc_change, 'MarginalRate3_perc_change')
MarginalRate4_melted = melt_by_country(Marginalrate4, 'Marginalrate4')
MarginalRate4_perc_change_melted = melt_by_country(MarginalRate4_perc_change, 'MarginalRate4_perc_change')
MarginalRate5_melted = melt_by_country(Marginalrate5, 'Marginalrate5')
MarginalRate5_perc_change_melted = melt_by_country(MarginalRate5_perc_change, 'MarginalRate5_perc_change')

# Personal allowance
PersonalAllowance_melted = melt_by_country(PersonalAllowance, 'PersonalAllowance')
PersonalAllowance_perc_change_melted = melt_by_country(PersonalAllowance_perc_change, 'PersonalAllowance_perc_change')

# Thresholds 1-4 (the first level column is named 'Threshold', without a number)
Threshold1_melted = melt_by_country(Threshold, 'Threshold')
Threshold1_perc_change_melted = melt_by_country(Threshold1_perc_change, 'Threshold1_perc_change')
Threshold2_melted = melt_by_country(Threshold2, 'Threshold2')
Threshold2_perc_change_melted = melt_by_country(Threshold2_perc_change, 'Threshold2_perc_change')
Threshold3_melted = melt_by_country(Threshold3, 'Threshold3')
Threshold3_perc_change_melted = melt_by_country(Threshold3_perc_change, 'Threshold3_perc_change')
Threshold4_melted = melt_by_country(Threshold4, 'Threshold4')
Threshold4_perc_change_melted = melt_by_country(Threshold4_perc_change, 'Threshold4_perc_change')

# Unemployment and VAT
Unemployment_melted = melt_by_country(Unemployment, 'Unemployment')
Unemployment_perc_change_melted = melt_by_country(Unemployment_perc_change, 'Unemployment_perc_change')
VAT_melted = melt_by_country(VAT, 'VAT')
VAT_perc_change_melted = melt_by_country(VAT_perc_change, 'VAT_perc_change')

In [None]:
# Long-format frames to be merged, in merge order.
# NOTE(review): CorpIncomeTax_melted and CorpIncomeTax_perc_change_melted are built in the melt
# cell but are not listed here — confirm the omission is intentional.
Melted_Dataframes = [
    AverageTaxPaid_melted,
    AverageTaxPaid_perc_change_melted,
    AverageWageUSDPPP_melted,
    AverageWageUSDPPP_perc_change_melted,
    CurrencyPerDollar_melted,
    CurrencyPerDollar_perc_change_melted,
    Inflation_melted,
    Inflation_perc_change_melted,
    MarginalRate1_melted,
    MarginalRate1_perc_change_melted,
    MarginalRate2_melted,
    MarginalRate2_perc_change_melted,
    MarginalRate3_melted,
    MarginalRate3_perc_change_melted,
    MarginalRate4_melted,
    MarginalRate4_perc_change_melted,
    MarginalRate5_melted,
    MarginalRate5_perc_change_melted,
    PersonalAllowance_melted,
    PersonalAllowance_perc_change_melted,
    Threshold1_melted,
    Threshold1_perc_change_melted,
    Threshold2_melted,
    Threshold2_perc_change_melted,
    Threshold3_melted,
    Threshold3_perc_change_melted,
    Threshold4_melted,
    Threshold4_perc_change_melted,
    Unemployment_melted,
    Unemployment_perc_change_melted,
    VAT_melted,
    VAT_perc_change_melted,
]

In [None]:
# Outer-join every melted frame on (Year, Country Name) into one wide table.
# The first list entry seeds the merge and the loop adds the rest, so the seed always matches
# the list being iterated.
Matt = Melted_Dataframes[0]
for df in Melted_Dataframes[1:]:
    Matt = pd.merge(Matt, df, on=['Year', 'Country Name'], how='outer')
Matt

In [None]:
# Count the missing values left in each column of the merged table
Matt.isna().sum()

In [None]:
# Write the merged table to CSV, leaving out the row index
Matt.to_csv('/dataset/econ_merged_2.csv', index=False)