In [1]:
import numpy as np
import pandas as pd
from scipy.io import loadmat
from sklearn.metrics import classification_report,f1_score
# from sklearn.model_selection import GridSearchCV
import plotly.express as px
import plotly.graph_objects as go
from scipy.stats import norm

In [2]:
# Load the first MATLAB data file; the result is a dict keyed by variable name.
data_raw = loadmat("./data/ex8data1.mat")

In [3]:
# Show which variables the loaded file contains.
data_raw.keys()

dict_keys(['__header__', '__version__', '__globals__', 'X', 'Xval', 'yval'])

In [4]:
# Pull the three arrays out of the loaded file. X is used below to fit the
# Gaussian parameters; X_val / y_val are used to select the threshold.
X = data_raw["X"]
X_val = data_raw["Xval"]
y_val = data_raw["yval"]

In [5]:
# Scatter plot of the two columns of X; marker colour follows the second column.
# Without `labels` the axes would read just "x" and "y".
# NOTE(review): the names reuse those given to the X_val columns further down
# in this notebook — confirm X has the same column meaning.
fig = px.scatter(
    x=X[:, 0],
    y=X[:, 1],
    color=X[:, 1],
    labels={"x": "Latency", "y": "Throughput", "color": "Throughput"},
    title="Latency vs. Throughput (X)",
)
fig.show()

In [6]:
def gaussien_param_estimate(X):
    """Estimate per-column Gaussian parameters.

    Parameters
    ----------
    X : array-like
        Data whose statistics are taken along axis 0.

    Returns
    -------
    tuple
        ``(mean, std)`` computed along axis 0; ``std`` is the population
        standard deviation (``ddof=0``).
    """
    return np.mean(X, axis=0), np.std(X, axis=0, ddof=0)

In [7]:
# Fit the per-column mean and standard deviation on X.
mu, std = gaussien_param_estimate(X)

In [8]:
# Sanity check: mean of the normal distribution parameterised by (mu, std).
norm.mean(mu,std)

array([14.11222578, 14.99771051])

In [9]:
# Sanity check: standard deviation of the normal distribution parameterised by (mu, std).
norm.std(mu,std)

array([1.35374717, 1.3075723 ])

In [10]:
# Evenly spaced axis values (200 points covering 0..30) for the evaluation grid.
x_range = np.linspace(start=0, stop=30, num=200)
y_range = np.linspace(start=0, stop=30, num=200)

In [11]:
# Flatten the 2-D grid into one (x, y) pair per row.
coordinates = np.dstack(np.meshgrid(x_range, y_range)).reshape(-1, 2)
# Density at each grid point: product of the per-column normal pdfs.
z_plot = norm.pdf(coordinates, loc=mu, scale=std).prod(axis=1)

In [12]:
# Confirm the grid was flattened to one (x, y) pair per row.
coordinates.shape

(40000, 2)

In [13]:
# 3-D scatter of the density over the grid; height and colour both encode z_plot.
fig2 = px.scatter_3d(x=coordinates[:,0],y=coordinates[:,1],z=z_plot,color=z_plot)
fig2.show()

In [14]:
# Contour trace of the density over the flattened grid, drawn as lines only
# with automatically chosen levels (up to 20 requested).
trace = go.Contour(
        x=coordinates[:,0],
        y=coordinates[:,1],
        z=z_plot,
        contours_coloring='lines',
        autocontour=True,
        ncontours = 20
    )

In [15]:
# Overlay the density contours on the scatter figure `fig` created earlier.
fig.add_trace(trace)

In [16]:
# Density of each validation sample: product of the per-column normal pdfs.
p_val = np.prod(norm.pdf(X_val,mu,std),axis=1)

In [17]:
def select_threshold(X, X_val, y_val, num_candidates=10000):
    """Pick the density threshold that maximises F1 on the validation set.

    Gaussian parameters are fitted on ``X``; each validation sample is scored
    with the product of its per-column normal pdfs, and samples whose density
    is at or below a candidate threshold are predicted as 1.

    Parameters
    ----------
    X : array-like
        Data used to fit the per-column mean and standard deviation.
    X_val : array-like
        Validation samples to score.
    y_val : array-like
        Validation labels; flattened to 1-D before scoring, so a column
        vector is accepted.
    num_candidates : int, optional
        Number of evenly spaced thresholds tried between the smallest and
        largest validation density. Defaults to 10000.

    Returns
    -------
    tuple
        ``(best_epsilon, best_f1)``. On ties the first (smallest) threshold
        reaching the best score is returned.
    """
    mu, std = gaussien_param_estimate(X)
    p_val = np.prod(norm.pdf(X_val, mu, std), axis=1)
    # Flatten the labels once instead of on every f1_score call.
    y_true = np.ravel(y_val)
    # Candidate thresholds span the observed range of validation densities.
    epsilon = np.linspace(np.min(p_val), np.max(p_val), num=num_candidates)
    fs = [f1_score(y_true, (p_val <= e).astype('int')) for e in epsilon]
    argmax_fs = int(np.argmax(fs))
    return epsilon[argmax_fs], fs[argmax_fs]



In [18]:
# Choose the threshold (and the F1 score it achieves) using the validation set.
epsilon,f1 = select_threshold(X,X_val,y_val)

In [19]:
# 1 where the validation density is at or below the chosen threshold, else 0.
y_pred = (p_val <= epsilon).astype('int')

In [20]:
# construct test DataFrame
data = pd.DataFrame(X_val, columns=['Latency', 'Throughput'])
data['y_pred'] = y_pred

In [21]:
# Plot the validation samples; marker symbol and colour both encode the predicted flag.
px.scatter(data_frame=data,x="Latency",y="Throughput",symbol="y_pred",color=data.y_pred.astype("str"))

# High-Dimensional Data

In [22]:
# Load the second data file (rebinds data_raw) and list its variables.
data_raw = loadmat("./data/ex8data2.mat")
data_raw.keys()

dict_keys(['__header__', '__version__', '__globals__', 'X', 'Xval', 'yval'])

In [23]:
# Rebind X / X_val / y_val to the arrays of the second file; the arrays from
# the first file are no longer reachable under these names.
X = data_raw["X"]
X_val = data_raw["Xval"]
y_val = data_raw["yval"]

In [24]:
# Select the threshold e and its F1 score fs for the new dataset.
e, fs = select_threshold(X, X_val, y_val)

In [25]:
# Re-fit the per-column mean/std on the new X (select_threshold computes these
# internally but does not return them).
mu, std = gaussien_param_estimate(X)

In [26]:
# Density of each validation sample under the fitted per-column normals.
p_val = np.prod(norm.pdf(X_val,mu,std),axis=1)

In [27]:
# 1 where the density is at or below the selected threshold e, else 0.
y_pred = (p_val <= e).astype('int')

In [28]:
# Number of validation samples flagged.
y_pred.sum()

9

In [29]:
# The selected threshold.
e

1.377367163219564e-19

In [30]:
# F1 score achieved at the selected threshold.
fs

0.7368421052631577

# Recommendation System

In [32]:
# Load the movie-ratings file (rebinds data_raw once more).
data_raw = loadmat("./data/ex8_movies.mat")

In [38]:
# R is used below as a boolean mask selecting which entries of Y count.
Y = data_raw["Y"] # user ratings
R = data_raw["R"] # rated indicator (non-zero where a rating exists)

In [51]:
# Mean of each row of Y, counting only the entries that R marks as rated.
# The mask is built with astype(bool): the `np.bool8` alias was deprecated and
# then removed in NumPy 2.0, where it raises AttributeError.
avg_rating = np.mean(Y, axis=1, where=R.astype(bool))

In [52]:
# Load the parameter file (rebinds data_raw; Y and R were already extracted above).
data_raw = loadmat("./data/ex8_movieParams.mat")

In [54]:
# Show which variables the parameter file contains.
data_raw.keys()

dict_keys(['__header__', '__version__', '__globals__', 'X', 'Theta', 'num_users', 'num_movies', 'num_features'])

In [55]:
# Unpack the stored parameters. X is rebound here and no longer refers to the
# earlier dataset.
# NOTE(review): loadmat presumably returns the num_* scalars as arrays rather
# than Python ints — confirm before using them as sizes.
X = data_raw["X"]
Theta = data_raw["Theta"]
num_users = data_raw["num_users"]
num_movies = data_raw["num_movies"]
num_features = data_raw["num_features"]

In [56]:
def cost(X, Theta, Y, R):
    """Half the sum of squared prediction errors over the rated entries.

    Predictions are ``X @ Theta.T``; only positions where ``R`` is truthy
    contribute to the sum.

    Parameters
    ----------
    X : ndarray
        Left factor of the prediction matrix.
    Theta : ndarray
        Right factor; its transpose is multiplied by ``X``.
    Y : ndarray
        Observed values, same shape as ``X @ Theta.T``.
    R : array-like
        Indicator with the same shape as ``Y``; non-zero marks an observed
        entry.

    Returns
    -------
    float
        ``sum((X @ Theta.T - Y)**2 over R) / 2``.
    """
    # np.bool8 was removed in NumPy 2.0; build the mask with the builtin bool dtype.
    rated = np.asarray(R, dtype=bool)
    squared_error = np.power(X @ Theta.T - Y, 2)
    return np.sum(squared_error, where=rated) / 2


In [57]:
# Evaluate the cost of the loaded X and Theta against the ratings Y, masked by R.
cost(X,Theta,Y,R)

27918.64012454425