In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

%matplotlib inline

## Check out data

In [2]:
magic = pd.read_csv('MAGIC_gamma_telescope.csv', header=None)

In [3]:
magic.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,28.7967,16.0021,2.6449,0.3918,0.1982,27.7004,22.011,-8.2027,40.092,81.8828,g
1,31.6036,11.7235,2.5185,0.5303,0.3773,26.2722,23.8238,-9.9574,6.3609,205.261,g
2,162.052,136.031,4.0612,0.0374,0.0187,116.741,-64.858,-45.216,76.96,256.788,g
3,23.8172,9.5728,2.3385,0.6147,0.3922,27.2107,-6.4633,-7.1513,10.449,116.737,g
4,75.1362,30.9205,3.1611,0.3168,0.1832,-5.5277,28.5525,21.8393,4.648,356.462,g


In [4]:
magic.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
count,19020.0,19020.0,19020.0,19020.0,19020.0,19020.0,19020.0,19020.0,19020.0,19020.0
mean,53.250154,22.180966,2.825017,0.380327,0.214657,-4.331745,10.545545,0.249726,27.645707,193.818026
std,42.364855,18.346056,0.472599,0.182813,0.110511,59.206062,51.000118,20.827439,26.103621,74.731787
min,4.2835,0.0,1.9413,0.0131,0.0003,-457.9161,-331.78,-205.8947,0.0,1.2826
25%,24.336,11.8638,2.4771,0.2358,0.128475,-20.58655,-12.842775,-10.849375,5.547925,142.49225
50%,37.1477,17.1399,2.7396,0.35415,0.1965,4.01305,15.3141,0.6662,17.6795,191.85145
75%,70.122175,24.739475,3.1016,0.5037,0.285225,24.0637,35.8378,10.946425,45.88355,240.563825
max,334.177,256.382,5.3233,0.893,0.6752,575.2407,238.321,179.851,90.0,495.561


In [5]:
magic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19020 entries, 0 to 19019
Data columns (total 11 columns):
0     19020 non-null float64
1     19020 non-null float64
2     19020 non-null float64
3     19020 non-null float64
4     19020 non-null float64
5     19020 non-null float64
6     19020 non-null float64
7     19020 non-null float64
8     19020 non-null float64
9     19020 non-null float64
10    19020 non-null object
dtypes: float64(10), object(1)
memory usage: 1.6+ MB


## Get data in a format that fits sklearn

In [6]:
magic[10] = pd.Categorical(magic[10])

magic[10] = magic[10].cat.codes
magic.tail()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
19015,21.3846,10.917,2.6161,0.5857,0.3934,15.2618,11.5245,2.8766,2.4229,106.8258,1
19016,28.9452,6.702,2.2672,0.5351,0.2784,37.0816,13.1853,-2.9632,86.7975,247.456,1
19017,75.4455,47.5305,3.4483,0.1417,0.0549,-9.3561,41.0562,-9.4662,30.2987,256.5166,1
19018,120.5135,76.9018,3.9939,0.0944,0.0683,5.8043,-93.5224,-63.8389,84.6874,408.3166,1
19019,187.1814,53.0014,3.2093,0.2876,0.1539,-167.3125,-168.4558,31.4755,52.731,272.3174,1


In [7]:
# Get data as arrays, shuffle, and separate features from labels
X_raw = magic.values

np.random.shuffle(X_raw)

y = X_raw[:,-1]
X = X_raw[:,:-1]

In [8]:
# Normalize X to get unit standard deviation
col_std = np.std(X, axis=1)
for j in range(X.shape[1]):
    X[:,j] = X[:,j] / col_std[j]

## Train a model supervised just to see how it works

In [9]:
from sklearn.model_selection import train_test_split

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2)

In [11]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

In [12]:
clf = LinearDiscriminantAnalysis()
clf.fit(X_train, y_train)

LinearDiscriminantAnalysis(n_components=None, priors=None, shrinkage=None,
              solver='svd', store_covariance=False, tol=0.0001)

In [13]:
from sklearn.metrics import accuracy_score

In [14]:
y_pred = clf.predict(X_test)

accuracy = accuracy_score(y_pred=y_pred, y_true=y_test)

accuracy

0.7712933753943217

## Select data for supervised and unsupervised training

In [15]:
def unison_shuffled_copies(a, b):
    assert len(a) == len(b)
    p = np.random.permutation(len(a))
    return a[p], b[p]

In [16]:
num_labeled = 25
num_unlabeled = 20#[0, 10, 20, 40, 80, 160, 320, 640]

In [17]:
X, y = unison_shuffled_copies(X, y)

In [18]:
X_l = X[:num_labeled]
y_l = y[:num_labeled]

X_u = X[num_labeled:num_labeled + num_unlabeled]
y_u = y[num_labeled:num_labeled + num_unlabeled]

## Semi-supervised 1
Train on labeled data first, predict labels for unlabeled data, and train classifier further with these predicted labels

In [19]:
# Train on labeled data
clf = LinearDiscriminantAnalysis()
clf.fit(X_l, y_l)

LinearDiscriminantAnalysis(n_components=None, priors=None, shrinkage=None,
              solver='svd', store_covariance=False, tol=0.0001)

In [20]:
# Predict labels for unlabeled data
y_u_pred = clf.predict(X_u)
print("Accuracy on prediction for \"unlabeled\" data: {:.4f}". format(accuracy_score(y_pred=y_u_pred, y_true=y_u)))

Accuracy on prediction for "unlabeled" data: 0.9000


In [21]:
# Train classifier with predicted labels
clf.fit(X_u, y_u_pred)

LinearDiscriminantAnalysis(n_components=None, priors=None, shrinkage=None,
              solver='svd', store_covariance=False, tol=0.0001)

In [22]:
# Join labeled and "unlabeled" and check accuracy
X_tot = X[:num_labeled + num_unlabeled]
y_tot = y[:num_labeled + num_unlabeled]

y_tot_pred = clf.predict(X_tot)
print("Accuracy on prediction for labeled and \"unlabeled\" data: {:.4f}". format(accuracy_score(y_pred=y_tot_pred, y_true=y_tot)))

Accuracy on prediction for labeled and "unlabeled" data: 0.8667
