We use z-score standardization to scale our data: each feature is centered on its mean and divided by its standard deviation.

In [10]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Read the indicator table; the first CSV column is used as the row index.
df = pd.read_csv("final_tech_adoption_index_data.csv", index_col=0)

# Learn the per-column scaling parameters, then apply them to the same data.
# (fit followed by transform is what fit_transform does in a single call.)
scaler = StandardScaler()
scaler.fit(df)
zscore_scaled = scaler.transform(df)

# The scaler hands back a bare array, so re-attach the original row and
# column labels.
df_zscore = pd.DataFrame(data=zscore_scaled, index=df.index, columns=df.columns)

# Show the complete standardized table as plain text.
print(df_zscore.to_string())

# Persist the standardized table next to the notebook.
df_zscore.to_csv("tech_adoption_zscore.csv")


               Internet_Users_Pct  Fixed_Broadband_per_100  R&D_Expenditure_Pct_GDP  Mobile_Cellular_per_100  High_Technology_Exports_USD  ICT_Graduates_Pct  Problem_Solving_Tech_Level3_Pct  B2C_Index_2020  DAI_Overall  DAI_Business  DAI_People   DAI_Gov
Country Name                                                                                                                                                                                                                                                  
United States           -0.140575                 0.210816                 1.069450                -0.863663                     2.244790          -0.516384                        -0.177795        0.431718    -0.208791     -0.802425   -0.267266  0.225745
Denmark                  1.323302                 1.262288                 0.490707                 0.214851                    -0.412011          -0.093020                         0.366061        0.942558     0.425243      1.333502   

In [6]:
from sklearn.preprocessing import StandardScaler
# NOTE: this cell does not define `df` or `pd`; both must already exist in the
# kernel from a previously executed cell.
scaler = StandardScaler()

# Fit the scaler on `df` and transform the same frame (returns a bare array).
standardized_data = scaler.fit(df).transform(df)

# Wrap the array in a DataFrame that carries the labels of `df`.
df_standardized = pd.DataFrame(
    data=standardized_data,
    index=df.index,
    columns=df.columns,
)


In [16]:
import pandas as pd
from scipy.stats.mstats import winsorize
import matplotlib.pyplot as plt

# Load the merged dataset; the first CSV column is used as the row index.
df = pd.read_csv("final_tech_adoption_index_data.csv", index_col=0)

# Min-max normalization, column by column: (value - min) / (max - min).
column_min = df.min()
column_range = df.max() - column_min

# A constant column has max == min, so the plain formula would compute 0 / 0
# and silently fill the whole column with NaN. Dividing by 1 instead maps such
# a column to 0.0 (the same convention scikit-learn's MinMaxScaler uses).
# Columns with a non-zero range are unaffected, and an all-NaN column keeps
# its NaN range because NaN != 0.
column_range = column_range.where(column_range != 0, 1)

normalized_df = (df - column_min) / column_range

# Save normalized dataset to CSV
normalized_df.to_csv("normalized_tech_adoption_data.csv")

# Print the complete normalized table as plain text.
print(normalized_df.to_string())


               Internet_Users_Pct  Fixed_Broadband_per_100  R&D_Expenditure_Pct_GDP  Mobile_Cellular_per_100  High_Technology_Exports_USD  ICT_Graduates_Pct  Problem_Solving_Tech_Level3_Pct  B2C_Index_2020  DAI_Overall  DAI_Business  DAI_People   DAI_Gov
Country Name                                                                                                                                                                                                                                                  
United States            0.569620                 0.642901                 0.518845                 0.296803                     0.784999           0.284858                         0.374846        0.824818     0.551709      0.305918    0.505520  0.653337
Denmark                  0.930380                 0.963264                 0.381258                 0.553590                     0.056869           0.388722                         0.526510        0.952555     0.722930      0.899744   

In [21]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import RobustScaler

# Load the raw dataset; the first CSV column is used as the row index.
df = pd.read_csv("final_tech_adoption_index_data.csv", index_col=0)

# Define column groups.
# NOTE: the names `zscore_cols` / `df_zscore` are kept for compatibility with
# the rest of the notebook, but these columns are scaled with RobustScaler
# (median / IQR), which is not a mean / standard-deviation z-score.
minmax_cols = [
    'Internet_Users_Pct', 'Fixed_Broadband_per_100', 'Mobile_Cellular_per_100',
    'ICT_Graduates_Pct', 'Problem_Solving_Tech_Level3_Pct', 'B2C_Index_2020',
    'DAI_Overall', 'DAI_Business', 'DAI_People', 'DAI_Gov'
]
zscore_cols = ['R&D_Expenditure_Pct_GDP', 'High_Technology_Exports_USD']

# Step 1: Cap outliers in every feature at the Tukey fences
# [Q1 - 1.5 * IQR, Q3 + 1.5 * IQR]. This overwrites `df` in place, so from
# here on `df` holds the capped values, not the raw ones.
for col in df.columns:
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1
    lower, upper = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR
    df[col] = np.clip(df[col], lower, upper)

# Step 2: Min-max scale the first column group into [0.01, 0.99].
df_minmax = df[minmax_cols].copy()
for col in df_minmax.columns:
    col_min, col_max = df_minmax[col].min(), df_minmax[col].max()
    col_range = col_max - col_min
    # A constant column (which the capping above produces whenever IQR == 0)
    # has a zero range; dividing by it would turn the whole column into NaN.
    # Dividing by 1 instead maps such a column to the lower bound 0.01.
    # A NaN range is left alone because NaN != 0.
    divisor = col_range if col_range != 0 else 1
    df_minmax[col] = 0.01 + 0.98 * (df_minmax[col] - col_min) / divisor

# Step 3: Robust-scale the remaining columns with RobustScaler's default
# settings (centre on the median, divide by the interquartile range). The
# result is not squeezed into a fixed interval.
robust_scaler = RobustScaler()
df_zscore = pd.DataFrame(
    robust_scaler.fit_transform(df[zscore_cols]),
    columns=zscore_cols,
    index=df.index
)

# Step 4: Combine both results side by side. The min-max columns come first,
# so the column order differs from the input file.
df_normalized = pd.concat([df_minmax, df_zscore], axis=1)

# Save and display the complete combined table.
df_normalized.to_csv("tech_index_mixed_normalized_unsquashed.csv")
print(df_normalized.to_string())


               Internet_Users_Pct  Fixed_Broadband_per_100  Mobile_Cellular_per_100  ICT_Graduates_Pct  Problem_Solving_Tech_Level3_Pct  B2C_Index_2020  DAI_Overall  DAI_Business  DAI_People   DAI_Gov  R&D_Expenditure_Pct_GDP  High_Technology_Exports_USD
Country Name                                                                                                                                                                                                                                                  
United States            0.497283                 0.640043                 0.300867           0.297915                         0.377349        0.816900     0.550675      0.309800    0.505410  0.650270                 1.107195                     2.084319
Denmark                  0.910296                 0.953999                 0.552518           0.402893                         0.525980        0.943119     0.718471      0.891749    0.990000  0.291986                 0.680243          