## I. Chargement des données



In [2]:
import numpy as np
import math

# On défini la longueur du dataset en fonction du nombre de vecteurs fournis pour ce laboratoire moins ceux pour lesquelles
# aucune image correspondante n'existe et donc, pour lesquelles nous ne pouvons pas retirer les primitives que nous aurions
# ovtenu au premiers laboratoire.
dataset_len = 16908 - 1048

#count_features = 74 + 3 # +3 pour les trois primitives extraites lors du premier laboratoire.
count_features = 77

# Definition de X_raw_galaxies et Y_raw_galaxies
X_galaxies_ids = np.zeros(dataset_len, dtype="uint")
X_raw_galaxies = np.zeros((dataset_len,count_features), dtype="object")
Y_raw_galaxies = np.zeros(dataset_len)
    

#Chargement des données dans X_raw_galaxies et Y_raw_galaxies
# On effectue une lecture en parallèle des fichiers contenant l'ensemble de vecteurs.
nb_line = 0
for vector_line, primitive_line in zip(open("data/galaxy_feature_vectors.csv"), open("data/tp01_galaxy_primitives.csv")):
    v_line = vector_line.split(",")
    p_line = primitive_line.split(",")
    
    # Si nous n'avons pas de donnees sur les primitives du TP01, on ignore la ligne.
    if (float(p_line[1]) == -1 and float(p_line[2]) == -1 and float(p_line[3]) == -1):
        continue
        
    X_galaxies_ids[nb_line] = int(float(v_line[0]));
    X_raw_galaxies[nb_line][:count_features - 3] = v_line[1: count_features - 2]
    
    # On ajoute les primitives obtenues au TP#1
    X_raw_galaxies[nb_line][count_features - 3] = p_line[1]
    X_raw_galaxies[nb_line][count_features - 2] = p_line[2]
    X_raw_galaxies[nb_line][count_features - 1] = p_line[3]
    
    Y_raw_galaxies[nb_line] = v_line[count_features - 2]
    nb_line+=1
    if nb_line == dataset_len :
        break

#Visualisation du chargement
np.set_printoptions(threshold = 2)

print("\nPREMIERE LIGNE X_raw_galaxies : \n", X_raw_galaxies[0])
print("\nPREMIERE LIGNE Y_raw_galaxies : ", Y_raw_galaxies[0])

print("\nDERNIERE LIGNE X_raw_galaxies : ", X_raw_galaxies[dataset_len-1])
print("\nDERNIERE LIGNE Y_raw_galaxies : ", Y_raw_galaxies[dataset_len-1], "\n")

print(X_raw_galaxies.shape)


PREMIERE LIGNE X_raw_galaxies : 
 ['4.393092408703673613e+01' '5.101459655976992735e+01'
 '5.771034782136849373e+01' ... '1.363720739422686901e-01'
 '1.217846971034011005e+01' '4.000000000000000000e+01\n']

PREMIERE LIGNE Y_raw_galaxies :  1.0

DERNIERE LIGNE X_raw_galaxies :  ['2.383338632750397323e+01' '3.438340222575516947e+01'
 '4.432165341812400783e+01' ... '2.938703437671911778e-01'
 '2.854937287134253054e+01' '6.200000000000000000e+01\n']

DERNIERE LIGNE Y_raw_galaxies :  1.0 

(15860, 77)


## II. Partitionnement des données (Train and Test)

In [3]:
from sklearn.model_selection import train_test_split

train_partition = 0.20 #20%
X_train_galaxies, X_test_galaxies, Y_train_galaxies, Y_test_galaxies = train_test_split(X_raw_galaxies, Y_raw_galaxies, test_size=train_partition, random_state=33)

# Vérification du partitionnement.
print("TAILLE X_TRAIN_GALAXIES : ", len(X_train_galaxies))
print("TAILLE Y_TRAIN_GALAXIES : ", len(Y_train_galaxies))
print("\nTAILLE X_TEST_GALAXIES : ", len(X_test_galaxies))
print("TAILLE Y_TEST_GALAXIES : ", len(Y_test_galaxies))

TAILLE X_TRAIN_GALAXIES :  12688
TAILLE Y_TRAIN_GALAXIES :  12688

TAILLE X_TEST_GALAXIES :  3172
TAILLE Y_TEST_GALAXIES :  3172


## III. Pré-traitements des données

In [4]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler(feature_range=(0, 1))
X_train_galaxies_normalized = scaler.fit_transform(X_train_galaxies)
X_test_galaxies_normalized = scaler.fit_transform(X_test_galaxies)



################################################################################################################################# 

## IV. Modèle : Machines à vecteur de support



In [5]:
from sklearn import svm
def linearSVM_model():
    print("SVM with Linear Kernel\n")
    # create model
    model = svm.SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovo', degree=3, gamma='auto', kernel='linear',
    max_iter=-1, probability=True, random_state=None, shrinking=True,
    tol=0.001, verbose=True)

    return model

In [6]:
model = linearSVM_model()
model.fit(X_train_galaxies_normalized, Y_train_galaxies)

Ytest_pred = model.predict(X_test_galaxies_normalized)
Ytest_pred

Ytest_pred_prob = model.predict_proba(X_test_galaxies_normalized)
Ytest_pred_prob

SVM with Linear Kernel

[LibSVM]

array([[2.29673905e-01, 7.70326095e-01],
       [9.87090508e-01, 1.29094923e-02],
       [4.41592619e-11, 1.00000000e+00],
       ...,
       [6.64256221e-01, 3.35743779e-01],
       [9.57068517e-01, 4.29314826e-02],
       [9.87950908e-01, 1.20490922e-02]])

#### V.1.2. Evaluation et comparaison des performances

In [7]:
from sklearn.metrics import accuracy_score

scores = accuracy_score(Y_test_galaxies, Ytest_pred )
print("Correct classification rate for the test dataset = "+str(scores*100)+"%")

Correct classification rate for the test dataset = 94.04161412358134%


Fin Notebook