In [56]:
import numpy as np
import pandas as pd
import scipy as sy


1. Let `x` and `y` be two vectors of the same length, $n$.
Compute [the Pearson linear correlation coefficient](https://en.wikipedia.org/wiki/Pearson_correlation_coefficient),
given by:
$$
r(\mathbf{x},\mathbf{y}) =  \frac{1}{n-1} \sum_{i=1}^n
   \frac{x_i - \bar{x}}{s_{x}}
\,
   \frac{y_i - \bar{y}}{s_{y}}.
$$


2. Let `x` and `y` be two vectors of the same length, $n$.
Compute [the Spearman rank correlation coefficient](https://en.wikipedia.org/wiki/Spearman%27s_rank_correlation_coefficient),
given by:
$$
\varrho(\mathbf{x},\mathbf{y})
=
1-\frac{6 \sum_{i=1}^n d_i^2}{n (n^2-1)},
$$
where $d_i=R(\mathbf{x})_i-R(\mathbf{y})_i$, $i=1,\dots,n$,
and $R(\mathbf{x})_i$ denotes the rank of $x_i$ in `x`.

3. Given a matrix with $n$ rows and $m$ columns (e.g., the first 4 rows
from the `iris` dataset), compute the correlation matrix, i.e.,
an $m\times m$ matrix $\mathbf{C}$ with $c_{i,j}$ denoting
the Pearson coefficient for the $i$-th and the $j$-th column.

In [57]:
#exercise_1

def z_score(arr): 
    return ((arr-np.mean(arr)))/np.std(arr)

def pearson_correlation(arr1,arr2):
    return np.mean((z_score(arr1))*(z_score(arr2)))
                       
#test
array1 = np.array([1,2,3,4,3,6])
array2 = np.array([7,8,88,10,11,12])

print(pearson_correlation(array1,array2))

#if I want to solve the exercise_1 using the direct command:
np.corrcoef(array1,array2)

0.004829097402467249


array([[1.       , 0.0048291],
       [0.0048291, 1.       ]])

In [58]:
#exercise_2

def d(arr1,arr2):
    return np.sum(((arr1.argsort()-arr2.argsort())**2))

def spearman_correlation(arr1,arr2):
    return 1-((6*d(arr1,arr2))/((len(arr1)*((len(arr1)**2)-1))))
 
#test    
array1 = np.array([1,2,3,4,5,6])
array2 = np.array([7,8,9,10,11,12])    

print(spearman_correlation(array1,array2))  

#if I want to solve the exercise_2 using the direct command:
pd.Series(array1).corr(pd.Series(array2))

1.0


1.0

In [59]:
#exercise_3

data = np.random.randn(5, 3)
print(data)

mean = np.mean(data, axis=0)
centered = data - mean
covariance = np.dot(centered.T, centered) / (len(data) - 1)
stddev = np.sqrt(np.diag(covariance))
correlation = covariance / np.outer(stddev, stddev)


print("Correlation matrix:")
print(correlation)

#if I want to solve the exercise_2 using the direct command:
print(np.corrcoef(data, rowvar=False))


[[ 0.35480861  1.81259031 -1.3564758 ]
 [-0.46363197  0.82465384 -1.17643148]
 [ 1.56448966  0.71270509 -0.1810066 ]
 [ 0.53419953 -0.58661296 -1.48185327]
 [ 0.85724762  0.94309899  0.11444143]]
Correlation matrix:
[[ 1.         -0.09417611  0.653226  ]
 [-0.09417611  1.          0.17696048]
 [ 0.653226    0.17696048  1.        ]]
[[ 1.         -0.09417611  0.653226  ]
 [-0.09417611  1.          0.17696048]
 [ 0.653226    0.17696048  1.        ]]


In [60]:
np.random.seed(6)
x = np.round(np.random.normal(size=20), 2)
x 



array([-0.31,  0.73,  0.22, -0.9 , -2.49,  0.91,  1.13, -1.51,  1.64,
       -0.43,  2.63,  0.6 , -0.34,  1.24,  0.11,  0.13,  0.08, -0.16,
        0.63,  0.81])

* Print all values in $[-2,-1]\cup[1,2]$.
* Print the number and the proportion of nonnegative elements in `x`.
* Compute the arithmetic mean of absolute values.
* Determine elements in `x` which are the least and the most distant from 0.
* Determine 3 elements in `x` which are the most distant from the arithmetic mean of `x`.
* Create a vector `x2`, which is a version of `x` with all outliers removed,
i.e., all observations $x_i$ such that $x_i\not\in[Q_1-1.5IQR, Q_3+1.5IQR]$,
where $IQR=Q_3-Q_1$ denotes the interquartile range and $Q_1$ and $Q_3$
denote the 1st and 3rd sample quartiles, respectively.
* Create a vector `x2`, which is a version of `x` with all outliers removed,
i.e., all observations $x_i$ such that $x_i\not\in[Q_1-1.5IQR, Q_3+1.5IQR]$,
where $IQR=Q_3-Q_1$ denotes the interquartile range and $Q_1$ and $Q_3$
denote the 1st and 3rd sample quartiles, respectively.

In [61]:
#1
x[((x>-2) & (x<-1)) | ((x>1) & (x<2))]


array([ 1.13, -1.51,  1.64,  1.24])

In [55]:
#2
print(x)

print((len(x[x<0])/len(x)))

[-0.31  0.73  0.22 -0.9  -2.49  0.91  1.13 -1.51  1.64 -0.43  2.63  0.6
 -0.34  1.24  0.11  0.13  0.08 -0.16  0.63  0.81]
0.35


In [None]:
#3
