# Semi-Supervisée

In [31]:
import numpy as np 
np.set_printoptions(threshold=10000,suppress=True) 
import pandas as pd
import warnings
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
warnings.filterwarnings('ignore')

# I. Découpage de la base en apprentissage/test

In [32]:
data = pd.read_csv('./Wave.txt', delimiter=' ', header=None)
y = data[40]
data = data.drop(columns=[40])
print('labels')
print(y)
data

labels
0       2
1       0
2       1
3       0
4       1
       ..
4995    0
4996    1
4997    2
4998    0
4999    0
Name: 40, Length: 5000, dtype: int64


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,30,31,32,33,34,35,36,37,38,39
0,-0.23,-1.21,1.20,1.23,-0.10,0.12,2.49,1.19,1.34,0.58,...,-0.63,-0.86,-0.70,0.51,0.34,-0.13,-0.87,0.56,-0.53,0.29
1,0.38,0.38,-0.31,-0.09,1.52,1.35,1.49,3.81,2.33,1.34,...,0.31,1.28,1.40,0.00,-0.18,0.71,0.04,0.91,-0.79,0.22
2,-0.69,1.00,1.08,1.48,2.44,3.39,3.09,4.08,5.48,3.61,...,0.93,0.29,1.12,0.60,0.28,2.17,0.18,-0.09,-1.33,1.00
3,0.40,0.68,0.27,1.39,1.03,-0.32,-1.23,-0.50,0.11,0.87,...,1.18,0.43,-0.30,-0.07,-0.99,-0.75,1.11,1.35,-1.63,0.10
4,-0.81,1.59,-0.69,1.16,4.22,4.98,4.52,2.54,5.60,4.66,...,-0.07,0.62,0.14,1.33,-1.87,1.48,-0.02,-0.58,0.93,0.40
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,0.44,0.56,1.84,1.94,3.43,4.88,4.04,2.11,1.83,0.78,...,0.52,-2.56,-0.68,0.97,-0.78,-1.14,0.45,0.18,1.44,1.11
4996,1.18,-0.48,1.81,1.51,1.41,3.61,3.75,3.80,3.44,3.71,...,-0.41,0.37,-1.96,-1.19,-1.08,1.37,-1.02,0.71,-0.10,0.36
4997,0.64,0.81,-0.38,-0.88,1.55,0.10,0.42,-0.93,-0.70,1.54,...,0.49,1.61,-0.81,-1.35,-0.84,-0.64,1.19,-0.38,-0.70,-0.85
4998,0.18,1.65,1.91,2.07,4.28,3.61,4.46,4.62,4.80,0.25,...,0.17,-0.65,-0.73,0.89,-0.97,-1.30,-0.20,-0.63,-0.92,0.63


In [33]:
X_A, X_T, y_A , y_T = train_test_split(data, y, train_size=0.5, test_size=0.5, random_state=42)

# II. Simulation de l’aspect semi-supervisé

In [34]:
def remove_labels(labels: pd.DataFrame, taux: int = 50):
    labels_cp = labels.copy()
    index = np.array(labels_cp.index)
    np.random.shuffle(index)
    nb_to_select = int((len(index)*taux)/100)
    print(nb_to_select)
    print(len(index))
    index = index[:nb_to_select]
    for i in index:
        labels_cp.loc[index] = -1
    return labels_cp

y_A_semi = remove_labels(y_A, 5)
y_A_semi

125
2500


1250   -1
4792   -1
2830   -1
4052   -1
419    -1
       ..
3294    2
3637    1
1324    2
3285    2
1274    2
Name: 40, Length: 2500, dtype: int64

# III. Sélection de variables semi-supervisée

In [35]:
var_no_lab = X_A[0][y_A_semi == -1]
var_no_lab

2216   -0.29
4236    1.06
2176    0.38
3681   -0.36
4943   -0.05
        ... 
2731    0.36
4297   -1.01
1500   -0.51
3202   -0.51
1082   -0.14
Name: 0, Length: 125, dtype: float64

In [36]:
def plot_variable_scores(scores : list[float]):
    plt.figure(figsize=(10, 6))
    plt.bar(range(1, len(scores) + 1), scores)
    plt.xlabel("Variables")
    plt.ylabel("Score de pertinence")
    plt.title("Histogramme des scores de pertinence des variables")
    plt.show()

In [37]:
#Emilien

def s1(X : np.array, y : np.array) -> list[float]:
    total_label = len(y.value_counts()) - 1
    scores = []
    for col in X.columns:
        overall_mean = X[col].mean()
        
        num = 0
        den = 0
        for i in range(total_label):
            total_from_label = y.value_counts()[i]
            class_data = X[y == i][col]
            class_mean = class_data.mean()
            class_std = class_data.std()

            num += total_from_label * (class_mean - overall_mean)**2
            den += total_from_label * (class_std)**2
        
        scores.append(num/den)

    return scores

def s2(X : np.array, y : np.array, t:int=10) -> list[float]:
    scores = []
    non_labelised = X.where(y==-1).dropna()
    #print(non_labelised.shape)
    # print(non_labelised.shape)
    for col in X.columns:
        var = X[col].var()
        den = 0
        num = 0
        for i in range(len(non_labelised)):
            for j in range(len(non_labelised)):
                if i != j:
                    diff = np.linalg.norm(non_labelised.iloc[i]  - non_labelised.iloc[j])
                    # print(diff)
                    s = np.exp(-diff**2  / t)

                    num += (i - j)**2 * s 
                    den = var
        lap = num/den if den != 0 else 1
        scores.append(lap)
    #print(scores)
    return scores

def score(X, y):
    return s1/s2

s1(X_A, y_A)

[np.float64(0.00022573809788103765),
 np.float64(0.0026516526094277353),
 np.float64(0.00037258143068004407),
 np.float64(0.00039658345035146656),
 np.float64(0.00011562709598176118),
 np.float64(2.3428649572870204e-05),
 np.float64(4.67449190290375e-05),
 np.float64(0.0003866022733302948),
 np.float64(0.00020571634272713047),
 np.float64(0.0001579318316439992),
 np.float64(0.00017398714805263877),
 np.float64(0.0002999330414615192),
 np.float64(0.0010886721669079074),
 np.float64(5.372673621916071e-05),
 np.float64(7.95465015572809e-05),
 np.float64(7.223428403925285e-05),
 np.float64(0.000586981573060362),
 np.float64(0.00023673372008076749),
 np.float64(0.0006820597518377221),
 np.float64(0.0003166597459603777),
 np.float64(0.00013434540874122494),
 np.float64(0.0017918478014412257),
 np.float64(6.888776161017172e-05),
 np.float64(0.00012018119779679565),
 np.float64(0.0003517605966212576),
 np.float64(0.00029193653057968616),
 np.float64(0.0012566230627898414),
 np.float64(0.000412

In [38]:
#Abdu
def fisher_score(variable : pd.Series, Y, num_classes=3):
    variable_copy = variable.copy()
    variable_copy = variable_copy[Y != -1]

    mean = variable_copy.mean()
    sum1 = 0
    sum2 = 0
    for class_index in range(num_classes):
        class_values = variable_copy[Y == class_index]
        sum1 += len(class_values) * (class_values.mean() - mean) ** 2
        sum2 += len(class_values) * class_values.std() ** 2
    
    return sum1 / sum2


def laplacien_score(variable : pd.Series, Y, t=10):
    variable_copy = variable.copy()
    variable_copy = variable_copy[Y == -1]
    variance = variable_copy.var()
    sum = 0

    for val1 in variable_copy:
        for val2 in variable_copy:
            s_ij = np.exp(-np.abs(val1 - val2)**2 / t)
            sum += (val1 - val2) ** 2 * s_ij

    return sum / variance

        
def pertinence_score(variable : pd.Series, Y, num_classes=3, t=10):
    return fisher_score(variable, Y, num_classes) / laplacien_score(variable, Y, t)

In [40]:
def s1(variable, y_A_semi): # Label
    sum_haut = 0
    sum_bas = 0
    u = np.mean(variable)
    for i in y_A_semi.unique():
        n_i = y_A_semi.value_counts()[i] #len(y_A_semi.where(y_A_semi == i).dropna())
        u_i = np.mean(variable[y_A_semi == i])
        sum_haut += n_i*(u_i-u)**2
        #print(sum_haut)

        o_i = np.std(variable[y_A_semi == i])
        sum_bas += n_i*o_i**2
    return sum_haut/sum_bas

def s2(X : np.array, y : np.array, t:int=10) -> list[float]:
    scores = []
    non_labelised = X.where(y==-1).dropna()
    #print(non_labelised.shape)
    # print(non_labelised.shape)
    for col in X.columns:
        var = X[col].var()
        den = 0
        num = 0
        for i in range(len(non_labelised)):
            for j in range(len(non_labelised)):
                if i != j:
                    diff = np.linalg.norm(non_labelised.iloc[i]  - non_labelised.iloc[j])
                    # print(diff)
                    s = np.exp(-diff**2  / t)

                    num += (i - j)**2 * s 
                    den = var
        lap = num/den if den != 0 else 1
        scores.append(lap)
    #print(scores)
    return scores

def s2(variable, X_A):
    X_A_non_lab = X_A.copy()
    X_A_non_lab = X_A_non_lab.where(y==-1).dropna()
    sum_haut = 0
    for i, v_i in enumerate(variable):
        for j, v_j in enumerate(variable):
            if i != j:
                S_ij = np.exp(-np.linalg.norm(X_A_non_lab.iloc[i]-X_A_non_lab.iloc[j])**2/10)
                sum_haut += (v_i-v_j)**2 * S_ij
    return sum_haut / variable.var()

def score(v):
    y_A_lab = y_A_semi[y_A_semi != -1]
    y_A_no_lab = y_A_semi[y_A_semi == -1]
    var_no_lab = v.loc[np.array(y_A_no_lab.index)]
    var_lab =  v.loc[np.array(y_A_lab.index)]
    s1_res = s1(var_lab, y_A_lab)
    s2_res = s2(var_no_lab, X_A)
    print(s1_res)
    return s1_res / s2_res

scores = []
for i, v in enumerate(X_A.columns):
    print('#############',i, '#################')
    sc = score(X_A[v])
    #print(sc)
    scores.append(sc)


############# 0 #################


IndexError: single positional indexer is out-of-bounds

In [20]:
def plot_variable_scores(scores : list[float]):
    plt.figure(figsize=(10, 6))
    plt.bar(range(1, len(scores) + 1), scores)
    plt.xlabel("Variables")
    plt.ylabel("Score de pertinence")
    plt.title("Histogramme des scores de pertinence des variables")
    plt.show()

In [None]:
plot_variable_scores(scores)


# IV. Evaluation de la sélection