In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.linear_model import Ridge, LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score

In [2]:
# Names of the 1024 feature columns: 'f0', 'f1', ..., 'f1023'.
FEATS = ['f' + str(idx) for idx in range(1024)]

In [3]:
# Per-cluster DHS table; transform_df() left-joins it onto each fold
# using the ('svyid', 'cluster_index') key pair.
df_dhs = pd.read_csv(filepath_or_buffer='./dhs_clusters.csv')

In [46]:
def transform_df(X, dhs=None, feats=None,
                 group_keys=('country', 'year', 'cluster_index', 'urban_rural')):
    """Augment a fold with per-group means of every feature column.

    The fold is left-joined with the DHS cluster table on
    ``('svyid', 'cluster_index')``. Then, for each column in ``group_keys``,
    the mean of every feature within that group is attached to each row as
    ``'<feature>_<key>_mean'``.

    Parameters
    ----------
    X : pandas.DataFrame
        One fold. Must contain 'svyid', 'cluster_index', 'labels' and the
        feature columns.
    dhs : pandas.DataFrame, optional
        Cluster table to join. Defaults to the module-level ``df_dhs``.
    feats : sequence of str, optional
        Feature column names. Defaults to the module-level ``FEATS``.
    group_keys : sequence of str, optional
        Columns to average within, in output order.

    Returns
    -------
    pandas.DataFrame
        Columns, in order: the raw features, one block of group means per
        key in ``group_keys``, then 'urban_rural' and 'labels'. It has one
        row per row of the joined frame, in the same order.

    Notes
    -----
    * Group means are attached with *left* merges, so a row whose group key
      is missing (e.g. a cluster absent from ``dhs``) is kept with NaN means
      rather than silently dropped. Dropping would leave this frame with
      fewer rows than the raw fold it is compared against.
    * Means are computed from the frame passed in, so a validation/test fold
      is averaged over its own rows, not over the training fold.
    * NOTE(review): 'cluster_index' is grouped on its own although the join
      key is ('svyid', 'cluster_index'); rows from different surveys that
      share an index value are averaged together - confirm this is intended.
    """
    dhs = df_dhs if dhs is None else dhs
    feats = list(FEATS if feats is None else feats)

    # merge with other information available for these locations
    joined = pd.merge(X, dhs, how='left', on=['svyid', 'cluster_index'])

    final_cols = list(feats)
    for key in group_keys:
        suffix = f'_{key}_mean'
        # groupby() leaves out NaN keys, so rows with a missing key find no
        # match below and receive NaN means; how='left' keeps those rows.
        group_means = (
            joined.groupby(key)[feats]
            .mean()
            .add_suffix(suffix)
            .reset_index()
        )
        joined = joined.merge(group_means, how='left', on=key)
        final_cols.extend(f'{f}{suffix}' for f in feats)

    final_cols.extend(['urban_rural', 'labels'])

    return joined[final_cols]

In [25]:
def fit_ridge(X_train, y_train, X_val, y_val, alphas=None):
    """Select a ridge penalty on the validation split, then refit on train + val.

    For every candidate ``alpha`` a ``Ridge`` model is fitted on the training
    split and scored by mean squared error on the validation split. The alpha
    with the lowest validation MSE (first one wins on ties) is printed and
    used to fit the returned model on the concatenation of both splits.

    Parameters
    ----------
    X_train, X_val : pandas.DataFrame
        Feature matrices with identical columns.
    y_train, y_val : pandas.Series
        Targets. pandas objects are required because the final refit stacks
        the splits with ``pd.concat``.
    alphas : iterable of float, optional
        Candidate penalties. Defaults to ``np.logspace(-5, 5, 20)``.

    Returns
    -------
    sklearn.linear_model.Ridge
        Model fitted on train + val with the selected alpha.

    Raises
    ------
    ValueError
        If ``alphas`` is empty or no candidate yields a finite validation MSE.
    """
    # NOTE(review): the "Best alpha" values saved in this notebook's outputs
    # (10000.0, 1438.449..., 3792.690...) are not points of logspace(-5, 5, 20);
    # they match logspace(-4, 4, 20). The saved outputs appear to predate this
    # grid - re-run the notebook to refresh them.
    candidate_alphas = np.logspace(-5, 5, 20) if alphas is None else list(alphas)
    if len(candidate_alphas) == 0:
        raise ValueError("fit_ridge needs at least one candidate alpha")

    best_alpha = None
    min_error = np.inf
    for alpha in candidate_alphas:
        candidate = Ridge(alpha=alpha)
        candidate.fit(X_train, y_train)
        mse = mean_squared_error(y_val, candidate.predict(X_val))
        if mse < min_error:
            min_error = mse
            best_alpha = alpha

    if best_alpha is None:
        # Every MSE was inf/NaN; fail here with a clear message instead of an
        # UnboundLocalError further down.
        raise ValueError("no candidate alpha produced a finite validation MSE")

    print(f"Best alpha: {best_alpha}")

    model = Ridge(alpha=best_alpha)
    model.fit(pd.concat([X_train, X_val], axis=0),
              pd.concat([y_train, y_val], axis=0))

    return model

In [53]:
# Held-out test metrics, one entry per fold, appended by add_transformations().
# Ridge on the raw features only: R^2 and label/prediction correlation.
baseline_scores, baseline_correlatins = [], []
# Ridge on the output of transform_df(): R^2 and label/prediction correlation.
model_scores, correlations = [], []

def _score_predictions(model, X_eval, y_eval):
    """Return ``(R^2, Pearson correlation)`` of ``model``'s predictions on ``X_eval``."""
    y_true = np.asarray(y_eval)
    y_pred = model.predict(X_eval)
    # Series.corr aligns on index; building both sides from plain arrays makes
    # the pairing positional regardless of the index y_eval carried.
    pearson = pd.Series(y_true).corr(pd.Series(y_pred))
    return r2_score(y_true, y_pred), pearson


def add_transformations(fold_index, data_dir='./1027features'):
    """Score baseline vs. transformed ridge models on one fold.

    Reads ``ic_1027_{train,val,test}_{fold_index + 1}.csv`` from ``data_dir``,
    then:

    1. fits ``fit_ridge`` on the raw ``FEATS`` columns and appends its test
       R^2 / correlation to ``baseline_scores`` / ``baseline_correlatins``;
    2. passes each split through ``transform_df``, fits ``fit_ridge`` on the
       result and appends its test R^2 / correlation to ``model_scores`` /
       ``correlations``.

    Parameters
    ----------
    fold_index : int
        Zero-based fold number; the files on disk are numbered from 1.
    data_dir : str, optional
        Directory containing the fold CSVs.

    Returns
    -------
    None
        Results are recorded by appending to the four module-level lists;
        progress is printed.
    """
    fold_no = fold_index + 1
    train_df = pd.read_csv(f'{data_dir}/ic_1027_train_{fold_no}.csv')
    val_df = pd.read_csv(f'{data_dir}/ic_1027_val_{fold_no}.csv')
    test_df = pd.read_csv(f'{data_dir}/ic_1027_test_{fold_no}.csv')

    # calculating baseline scores:
    best_baseline = fit_ridge(X_train=train_df[FEATS], y_train=train_df['labels'],
                              X_val=val_df[FEATS], y_val=val_df['labels'])
    baseline_r2, baseline_corr = _score_predictions(
        best_baseline, test_df[FEATS], test_df['labels'])
    baseline_scores.append(baseline_r2)
    baseline_correlatins.append(baseline_corr)

    # applying transformations:
    X_train = transform_df(train_df)
    X_val = transform_df(val_df)
    X_test = transform_df(test_df)

    y_train = X_train.pop('labels')
    y_val = X_val.pop('labels')
    y_test = X_test.pop('labels')

    best_model = fit_ridge(X_train=X_train, y_train=y_train, X_val=X_val, y_val=y_val)
    model_r2, model_corr = _score_predictions(best_model, X_test, y_test)
    model_scores.append(model_r2)
    correlations.append(model_corr)

    # Left disabled: writing the transformed splits back to disk.
    #X_train.to_csv(f'./1027features/ic_1027_train_{fold_index + 1}_transformed.csv')
    #X_val.to_csv(f'./1027features/ic_1027_val_{fold_index + 1}_transformed.csv')
    #X_test.to_csv(f'./1027features/ic_1027_test_{fold_index + 1}_transformed.csv')

    print(f'done with fold = {fold_index}')

In [54]:
# add_transformations() appends to module-level lists. Empty them first so that
# re-running this cell does not stack a second set of folds on top of the
# first, which would make the per-fold arrays below 10 entries long.
for metric_list in (model_scores, correlations, baseline_scores, baseline_correlatins):
    metric_list.clear()

for fold_index in range(5):
    add_transformations(fold_index)

Best alpha: 10000.0
Best alpha: 10000.0
done with fold = 0
Best alpha: 1438.44988828766
Best alpha: 10000.0
done with fold = 1
Best alpha: 1438.44988828766
Best alpha: 10000.0
done with fold = 2
Best alpha: 10000.0
Best alpha: 10000.0
done with fold = 3
Best alpha: 3792.690190732246
Best alpha: 3792.690190732246
done with fold = 4


In [55]:
# Mean over folds of the test correlation and R^2, one row per model.
pd.DataFrame(
    [
        [np.mean(baseline_correlatins), np.mean(baseline_scores)],
        [np.mean(correlations), np.mean(model_scores)],
    ],
    index=['original', 'with transformations'],
    columns=['Correlation', 'R^2'],
)

Unnamed: 0,Correlation,R^2
original,0.829236,0.684952
with transformations,0.849272,0.718177


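The transformed features beat the baseline on every held-out fold: each "baseline − transformed" difference below (correlation first, then R^2) is negative.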

In [59]:
# Per-fold correlation, baseline minus transformed (negative = transformed is higher).
np.subtract(baseline_correlatins, correlations)

array([-0.03145841, -0.02144291, -0.00612288, -0.03083058, -0.01032407])

In [60]:
# Per-fold R^2, baseline minus transformed (negative = transformed is higher).
np.subtract(baseline_scores, model_scores)

array([-0.05415461, -0.03253243, -0.01090052, -0.05220162, -0.01633691])