# Part 1

In [101]:
import numpy as np
from tabulate import tabulate

In [102]:
X = np.genfromtxt("life_expectancy_X.csv", delimiter=",")
y = np.genfromtxt("life_expectancy_y.csv")

print(f'X shape: {X.shape}')
print(f'y shape: {y.shape}')

X shape: (1000, 7)
y shape: (1000,)


In [103]:
# scaling to be between 0 and 1
def scale(x):
    xmax = np.max(x)
    xmin = np.min(x)
    xrange = xmax - xmin
    
    x = (x - xmin) / xrange
    return x

X_scaled = scale(X)

print(f'X scaled shape: {X_scaled.shape}')
print(f'X scaled: {X_scaled}')


X scaled shape: (1000, 7)
X scaled: [[0.35210526 0.52631579 0.10526316 ... 0.26578947 0.41473684 0.47894737]
 [0.35052632 0.89473684 0.10526316 ... 0.20105263 0.00736842 0.47473684]
 [0.17526316 0.57894737 0.21052632 ... 0.22631579 0.27631579 0.16526316]
 ...
 [0.46894737 0.21052632 0.26315789 ... 0.15578947 0.51368421 0.25263158]
 [0.15684211 0.47368421 0.10526316 ... 0.04947368 0.01894737 0.02578947]
 [0.52315789 0.         0.15789474 ... 0.20368421 0.25210526 0.14473684]]


In [104]:
# function for gradient descent
def lasso(X, y, learning_rate=0.1, iters=1000, alpha=0.1):
    _, n = X.shape
    w = np.zeros(n)
    
    for _ in range(iters):
        # Calculate predicted y
        y_pred = np.dot(X, w)

        # get the gradient (with the l1 norm expression)
        gradient = X.T.dot(y_pred - y) - alpha * np.sign(w)
        w -= learning_rate * gradient
        
    return w

X_bias = np.hstack((X_scaled, np.ones((X.shape[0], 1)))) # add bias so shape is (1000, 8)

w = lasso(X_bias, y, learning_rate=0.001, alpha=0.2)

# weights
print([float(i.round(4)) for i in w])

[57.0572, 15.0005, 0.0286, -19.0715, 0.0137, 0.0068, -0.204, 60.0062]


# Part 2

According to LASSO, the traits that influence longevity positively the most are exercise amount and number of suppotive relationships. The factors that negatively impact longevity is alcohol, drugs, and smoking. The rest have little impact on longevity. 

# Part 3

In [105]:
feature_labels = [
    "exercise", 
    "supportive relationships", 
    "siblings", 
    "alcohol and drugs", 
    "height", 
    "attractiveness", 
    "work ethics"
]

table = []

for i in range(X_scaled.shape[1]):
    R = np.corrcoef(X_scaled[:, i], y)

    table.append([feature_labels[i], R[0, 1]])

print(tabulate(table, headers=['Feature', 'Correlation']))

Feature                     Correlation
------------------------  -------------
exercise                     0.850863
supportive relationships     0.436436
siblings                     0.00245975
alcohol and drugs           -0.312273
height                      -0.0362541
attractiveness              -0.0348242
work ethics                  0.031364


# Part 4

In [106]:
n, m = X_scaled.shape

C = np.eye(n) - (1/n) * np.ones((n, n))

Q = np.cov(X_scaled.T, bias=True)

D, V = np.linalg.eig(Q)

print(f"eigenvalues: {D}")
print(f"eigenvectors: {V}\n")

print(f'Best v_hat: {V[0]}') # 0.903367 is the largest eigenvalue, so we want the corresponding eigenvector

eigenvalues: [0.0903367  0.00795597 0.01141564 0.02051009 0.0242996  0.02293463
 0.02350855]
eigenvectors: [[ 4.16625725e-03 -1.48102671e-02 -2.09715572e-02 -3.81763612e-01
  -5.49344588e-01 -7.42697753e-01 -1.41636506e-02]
 [-9.98664712e-01  3.42138532e-03 -1.13825708e-02 -4.08798820e-02
  -1.37239732e-02  2.58316827e-02 -8.57575504e-04]
 [ 1.86346295e-03  9.97618044e-01 -4.33262486e-02  2.19182949e-02
   1.02703870e-02 -3.80762741e-02  2.90156181e-02]
 [ 4.68498956e-03 -2.29807883e-02  4.21343660e-02 -3.83565054e-01
   4.39859671e-01 -1.44103455e-01  7.97690505e-01]
 [ 1.07446600e-02 -4.42578710e-02 -9.97084575e-01 -1.30567206e-02
   5.81997655e-02 -7.46054931e-03  1.16099973e-02]
 [ 9.35756514e-03  1.84067719e-02  3.44014998e-02 -5.19018384e-01
   5.85264986e-01 -1.55919688e-01 -6.01800654e-01]
 [-4.92239900e-02 -4.12492744e-02  2.04217634e-02  6.59872909e-01
   3.98155558e-01 -6.33364473e-01 -1.86490602e-02]]

Best v_hat: [ 0.00416626 -0.01481027 -0.02097156 -0.38176361 -0.54934459

# Part 5

No, not all the results for PCA, LASSO, and correlations agree with each other. LASSO and the correlation table identify exercise amount and number of supportive relations as having the strongest impact on longevity, while drug/alcohol use have the most negative impact. However, the best v_hat for PCA gave the greatest weight to drug use, height, and attractiveness, all of which had a negative impact on longevity. 