In [1]:
import requests
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import multivariate_normal
from sklearn.datasets import load_digits
from scipy.stats import bernoulli

Each positive example has the probability $e(x)$ of being selected to be labeled.

And $e(x) = \mathbb{P}(s=1 \vert y=1,x)$. 

So we get that : $f_l = \frac{e(x)}{c}f_+(x)$, which is a biased version of the positive density. c is the normalization constant, the $\textit{label frequency}$, defined by $c=\mathbb{P}(s=1 \vert y=1)$.

Class prior : $\alpha = \mathbb{P}(y=1)$

$c=\mathbb{P}(s=1 \vert y=1)=\frac{\mathbb{P}(s=1,y=1)}{\mathbb{P}(y=1)} = \frac{\mathbb{P}(s=1)}{\mathbb{P}(y=1)} \text{(by definition of PU)}$

So in single training set scenario : $c=\frac{\mathbb{P}(s=1)}{\alpha}$


### So, first assumption : SCAR 
$e(x)=c$

### Very common other assuption, SAR (selected at random)

The probability of being labelled depends on attributes of a datapoint.

Def : The labeled sample is a biased sample from the original distribution, and the bias only depends on the attributes and is defined by the propensity score $e(x)$, we have : $e(x) = \mathbb{P}(s=1 \vert x,y=1)$

### Probabilistic gap : examples that resamble the negative ones are less likely to be labeled.

Hence, the probability gap is defined by : $\Delta \mathbb{P} = \mathbb{P}(y=1 \vert x) - \mathbb{P}(y=0 \vert x)$.

$\rightarrow e(x) = f(\Delta \mathbb{P}(x)), \frac{\partial}{\partial t}f(t) >0$


### Assumption of separability is used in graph frameworks so might be good for material science. 



## Assumptions for an identifiable class prior $\alpha$

Separable classes, non overlapping distributions

Positive subdomain, anchor

Positive function separability

Irreducibility






In [2]:
#digits = load_digits(as_frame = True).data

In [3]:
mushroom_data_df = pd.read_csv('data/mushroom.csv', encoding = 'utf-8', engine = 'python')

In [4]:
for_class=mushroom_data_df['class']

mushroom_data_df = mushroom_data_df.drop(['class'],axis=1)-mushroom_data_df.drop(['class'],axis=1).mean()/mushroom_data_df.drop(['class',],axis=1).std()




In [5]:
mushroom_data_df['class'] = for_class
mushroom_data_df['label'] = mushroom_data_df['class']

In [6]:
columns_to_use = mushroom_data_df.columns

In [7]:
mushroom_data = mushroom_data_df.to_numpy()

In [8]:
for i in range(mushroom_data.shape[0]):
    random = bernoulli.rvs(p=1/2)
    if mushroom_data[i,8] == 0:
        mushroom_data[i,8] = 99
    elif mushroom_data[i,8] == 1 and random == 1:
        mushroom_data[i,8] = 99
    elif mushroom_data[i,8] == 1 and random == 0:
        mushroom_data[i,8] = 1

# First method : two-step method with k-means and iterative LS-SVM

In [9]:
mushroom_data_kmeans = mushroom_data

In [10]:
def generalized_euclidian_dist_sq(x,y):
    """a function to calculate euclidian distance for any dimension

    Parameters
    ---------------------
    x,y : array-like
    the two vectors between which we want to calculate the euclidian distance
    --------------------
    """
    dimension = x.shape[0]
    sum_of_squares = 0
    for i in range(dimension):
        sum_of_squares += (x[i]-y[i])**2
    return sum_of_squares

In [11]:
mushroom_data_kmeans.shape

(54035, 10)

In [12]:
mushroom_data_kmeans[1,:8]

array([ 1.45942378e+03,  1.48435640e-01,  1.03892854e+00,  7.70971882e+00,
        2.64134324e+00,  1.55565600e+03,  8.41941702e+00, -1.31150916e+00])

In [13]:
iterations = 100
k = 4

In [14]:
centroids = np.random.uniform(low=0., high=3000, size=(k,8))

In [15]:
mushroom_data_df.describe()

Unnamed: 0,cap-diameter,cap-shape,gill-attachment,gill-color,stem-height,stem-width,stem-color,season,class,label
count,54035.0,54035.0,54035.0,54035.0,54035.0,54035.0,54035.0,54035.0,54035.0,54035.0
mean,565.68098,2.14875,1.180985,5.039227,-0.407014,1049.737302,5.837479,-2.163619,0.549181,0.549181
std,359.883763,2.160505,2.228821,3.200266,0.650969,782.056076,3.262078,0.305594,0.49758,0.49758
min,-1.576223,-1.851564,-0.961071,-2.290281,-1.165698,-1.343997,-2.580583,-3.08841,0.0,0.0
25%,287.423777,0.148436,-0.961071,2.709719,-0.895127,419.656003,3.419417,-2.227332,0.0,0.0
50%,523.423777,3.148436,0.038929,5.709719,-0.572828,921.656003,8.419417,-2.172587,1.0,1.0
75%,779.423777,4.148436,3.038929,7.709719,-0.111266,1521.656003,8.419417,-2.172587,1.0,1.0
max,1889.423777,4.148436,5.038929,8.709719,2.669196,3567.656003,9.419417,-1.311509,1.0,1.0


In [16]:
list_of_max = mushroom_data_df.describe().loc['max']
list_of_max.pop('label')
list_of_max.pop('class')

1.0

In [17]:
centroids = np.empty((k,8))
position = 0
for i in list_of_max:
    component = np.random.uniform(low=0., high = i , size=k)
    centroids[:,np.where(list_of_max == i)[0][0]] = component
    position += 1


In [18]:
mushroom_data_df_algo = mushroom_data_df.drop(['label', 'class'], axis=1)

In [19]:
mushroom_data_kmeans_train = mushroom_data_kmeans[:,:8]

In [20]:
distances = np.empty((mushroom_data_kmeans_train.shape[0], k))
cluster_cat = np.empty((mushroom_data_kmeans_train.shape[0],1))

In [21]:
n=0
while n<=iterations:
    #computation of weights
    for cluster in range(k):
        for i in range(mushroom_data_kmeans_train.shape[0]):
            distances[i,cluster] = generalized_euclidian_dist_sq(x=mushroom_data_kmeans_train[i,:],
                                                                 y=centroids[cluster,:])
    #defining to which centroid each point is the closest
    for i in range((mushroom_data_kmeans_train.shape[0])):
        cluster_cat[i,0] = np.where(distances[i,:] == np.min(distances[i,:]))[0][0]
    mushroom_data_df_algo['cluster'] = cluster_cat
    centroids = mushroom_data_df_algo.groupby('cluster').mean().to_numpy()
    n += 1

In [22]:
distances_inter_clusters = np.empty((4,4))

for i in range(4):
    for j in range(4):
        distances_inter_clusters[i,j] = generalized_euclidian_dist_sq(x=centroids[i,:],
                                                                      y=centroids[j,:])



In [23]:
distances_inter_clusters_df = pd.DataFrame(distances_inter_clusters)

In [24]:
distances_inter_clusters_df

Unnamed: 0,0,1,2,3
0,0.0,1063587.0,6171473.0,2995154.0
1,1063587.0,0.0,2147180.0,499377.0
2,6171473.0,2147180.0,0.0,575815.0
3,2995154.0,499377.0,575815.0,0.0


In [25]:
mushroom_data_df_kmeans = mushroom_data_df

In [26]:
mushroom_data_df_kmeans['cluster'] = mushroom_data_df_algo['cluster']

In [27]:
mean_per_cluster = mushroom_data_df_kmeans[['class','cluster']].groupby('cluster').mean().to_numpy()

In [28]:
mushroom_data_df_kmeans[mushroom_data_df_kmeans['cluster'] == 2]

Unnamed: 0,cap-diameter,cap-shape,gill-attachment,gill-color,stem-height,stem-width,stem-color,season,class,label,cluster
2835,318.423777,-1.851564,1.038929,7.709719,-0.803610,297.656003,8.419417,-2.227332,1,1,2.0
2836,417.423777,-1.851564,1.038929,7.709719,-0.620576,320.656003,8.419417,-2.172587,1,1,2.0
2837,374.423777,-1.851564,1.038929,7.709719,-0.640471,322.656003,8.419417,-2.172587,1,1,2.0
2838,231.423777,-1.851564,1.038929,2.709719,-0.437543,295.656003,8.419417,-2.172587,1,1,2.0
2839,262.423777,0.148436,1.038929,2.709719,-0.712093,329.656003,8.419417,-2.227332,1,1,2.0
...,...,...,...,...,...,...,...,...,...,...,...
54030,71.423777,3.148436,2.038929,-0.290281,-0.278383,567.656003,9.419417,-2.172587,1,1,2.0
54031,80.423777,0.148436,2.038929,-0.290281,0.020041,488.656003,9.419417,-2.172587,1,1,2.0
54032,80.423777,3.148436,2.038929,-0.290281,-0.250530,582.656003,9.419417,-2.227332,1,1,2.0
54033,77.423777,0.148436,2.038929,-0.290281,-0.131161,489.656003,9.419417,-2.227332,1,1,2.0


In [29]:
positive_cluster = np.where(mean_per_cluster == np.max(mean_per_cluster))[0][0]

In [30]:
positive_cluster

2