# Example semi-supervised alignment

In this notebook, we select a source tweet and show its nearest neighbours in the non-aligned versus in the aligned embedding. The source tweet is selected to illustrate that the alignment makes it more similar to target tweets of the same class, but that those target tweets need to be semantically similar.

In [0]:
def warn(*args, **kwargs):
    pass
import warnings
warnings.warn = warn

import sys
import random
import numpy as np
import pandas as pd
pd.set_option('display.max_colwidth', -1)
import pickle as pc
import dateutil
import numpy.random as rnd
from scipy.linalg import eig, eigh, svd, norm
from scipy.spatial.distance import pdist, cdist, squareform

# Import visualizers
from sklearn.decomposition import PCA

# Import classifiers
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.neighbors import KNeighborsClassifier, NearestNeighbors

# Import class imbalance techniques
from collections import Counter
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.pipeline import make_pipeline

from IPython.core.debugger import Tracer

In [0]:
%matplotlib inline
import matplotlib.pyplot as plt
from mpl_toolkits.axes_grid1 import make_axes_locatable

# Set font size
fS = 20

In [3]:
# Set random seed to reproduce example
# seed = random.randrange(sys.maxsize)
seed = 1104216542866770581
rng = random.Random(seed)
print("Seed was:", seed)

Seed was: 1104216542866770581


In [0]:
def is_pos_def(A):
    """Check for positive definiteness."""
    return np.all(np.real(np.linalg.eigvals(A)) > 0)

def reg_cov(X):
    """
    Regularize covariance matrix until non-singular.

    Parameters
    ----------
    C : array
        square symmetric covariance matrix.

    Returns
    -------
    C : array
        regularized covariance matrix.

    """
    # Number of data points
    N = X.shape[0]

    # Compute mean of data
    muX = np.mean(X, axis=0, keepdims=1)

    # Compute covariance matrix without regularization
    SX = (X - muX).T @ (X - muX) / N

    # Initialize regularization parameter
    reg = 1e-8

    # Keep going until non-singular
    while not is_pos_def(SX):

        # Compute covariance matrix with regularization
        SX = (X - muX).T @ (X - muX) / N + reg*np.eye(X.shape[1])

        # Increment reg
        reg *= 10

    # Report regularization
    if reg > 1e-5:
      print('Final regularization parameter = {}'.format(reg))

    return SX

def align_classes(X, Y, Z, u, CX, CZ, V):
    """
    Project each class separately.
    Parameters
    ----------
    X : array
        source data set (N samples x D features)
    Y : array
        source labels (N samples x 1)
    Z : array
        target data set (M samples x D features)
    u : array
        target labels (m samples x 2)
    CX : array
        source principal components (K classes x D features x d subspaces)
    CZ : array
        target principal components (K classes x D features x d subspaces)
    V : array
        transformation matrix (K classes x d subspaces x d subspaces)
    Returns
    -------
    X : array
        transformed X (N samples x d features)
    Z : array
        transformed Z (M samples x d features)
    """
    # Number of source samples
    N = X.shape[0]

    # Number of classes
    K = len(np.unique(Y))

    # Subspace dimensionality
    d = V.shape[1]

    # Preallocate
    XV = np.zeros((N, d))

    for k in range(K):

        # Project the k-th class
        XV[Y == k, :] = X[Y == k, :] @ CX[k] @ V[k]

        # Indices of all target samples with label k
        uk = u[u[:, 1] == k, 0]

        # Mean of labeled target samples
        muZk = np.mean(Z[uk, :], axis=0, keepdims=1)

        # Remove mean after projection
        XV[Y == k, :] -= np.mean(XV[Y == k, :], axis=0, keepdims=1)

        # Center the projected class on mean of labeled target samples
        XV[Y == k, :] += muZk @ CZ

    # Project target data onto components
    Z = Z @ CZ

    return XV, Z

def semi_subspace_alignment(X, Y, Z, u, subspace_dim=1):
    """
    Compute subspace and alignment matrix, for each class.
    Parameters
    ----------
    X : array
        source data set (N samples x D features)
    Y : array
        source labels (N samples x 1)
    Z : array
        target data set (M samples x D features)
    u : array
        target labels, first column is index in Z, second column is label
        (m samples x 2)
    subspace_dim : int
        Dimensionality of subspace to retain (def: 1)
    Returns
    -------
    V : array
        transformation matrix (K, D features x D features)
    CX : array
        source principal component coefficients
    CZ : array
        target principal component coefficients
    """
    # Data shapes
    N, DX = X.shape
    M, DZ = Z.shape

    # Check for sufficient samples
    if (N < subspace_dim) or (M < subspace_dim):
        raise ValueError('Too few samples for subspace dimensionality.')

    # Assert equivalent dimensionalities
    if not DX == DZ:
        raise ValueError('Dimensionalities of X and Z should be equal.')

    # Number of classes
    K = len(np.unique(Y))

    for k in range(K):

        # Check number of samples per class
        Nk = np.sum(Y == k)

        # Check if subspace dim is too large
        if (Nk < subspace_dim):

            # Reduce subspace dim
            subspace_dim = min(subspace_dim, Nk)

            # Report
            print('Reducing subspace dim to {}'.format(subspace_dim))

    # Total covariance matrix of target data
    SZ = reg_cov(Z)

    # Eigendecomposition for first d eigenvectors
    valZ, vecZ = eigh(SZ, eigvals=(DZ - subspace_dim, DZ-1))

    # Sort eigenvectors x descending eigenvalues
    CZ = vecZ[:, np.argsort(np.real(valZ))[::-1]]

    # Use k-nn to label target samples
    kNN = KNeighborsClassifier(n_neighbors=1)
    U = kNN.fit(Z[u[:, 0], :], u[:, 1]).predict(Z)

    # Preallocate
    CX = np.zeros((K, DX, subspace_dim))
    V = np.zeros((K, subspace_dim, subspace_dim))

    # For each class, align components
    for k in range(K):

        # Take means
        muXk = np.mean(X[Y == k, :], axis=0, keepdims=1)
        muZk = np.mean(Z[U == k, :], axis=0, keepdims=1)

        # Compute covariance matrix of current class
        SXk = reg_cov(X[Y == k, :])
        SZk = reg_cov(Z[U == k, :])

        # Eigendecomposition for first d eigenvectors
        valX, vecX = eigh(SXk, eigvals=(DX - subspace_dim, DX-1))
        valZ, vecZ = eigh(SZk, eigvals=(DZ - subspace_dim, DZ-1))

        # Sort based on descending eigenvalues
        CX[k] = vecX[:, np.argsort(np.real(valX))[::-1]]
        vecZ = vecZ[:, np.argsort(np.real(valZ))[::-1]]

        # Aligned source components
        V[k] = CX[k].T @ vecZ

    # Return transformation matrix and principal component coefficients
    return V, CX, CZ

In [0]:
# Dimensionality of subspace
subspace_dim = 100

# Number of target samples to be labeled
nT = 50

In [0]:
# Load tweets dataframe
tweets = pd.read_json('RumEval19.json')

# Load tweets mapped to embedding space
X = np.load('rumeval19.npy')

# Impute all NaN's to 0
X[np.isnan(X)] = 0

In [7]:
# Process data

# Sort tweets by time
tweets = tweets.sort_values(by=['datetime'])
dates_ = tweets['datetime'].values
dates = np.unique(tweets['datetime'].values)
sortix = np.argsort(dates_)

# Subselect labels and map to numerical
labels_ = tweets['label'].values.tolist()
labels = np.unique(labels_)
Y = np.array([np.argwhere(label == labels)[0][0] for label in labels_])
K = len(np.unique(Y))

rumours_ = tweets['rumour'].values[sortix]
indexes = np.unique(rumours_, return_index=True)[1]
all_rumours = [rumours_[index] for index in sorted(indexes)]

# Remove rumours with too few samples
cutoff = subspace_dim
rumours = []
for rumour in all_rumours:
    
    # Number of samples for current rumour
    num_rumour = np.sum(rumours_ == rumour)
    
    if num_rumour >= cutoff:
      rumours.append(rumour)
        
print('{} rumours discarded for having less than {} samples per class.\n'.format(len(all_rumours) - len(rumours), cutoff))

print('Remaining rumours:')
[print(rumour) for rumour in rumours]

# Sort embedding
X = X[sortix, :]

2 rumours discarded for having less than 100 samples per class.

Remaining rumours:
ferguson
ottawashooting
prince-toronto
sydneysiege
charliehebdo
germanwings-crash
illary


In [8]:
# Which pair of rumours to consider
src_rum = 'charliehebdo'
tgt_rum = 'germanwings-crash'

# Create training data from all past rumours
src_index = (tweets['rumour'] == src_rum).values
tweets_src = tweets.loc[src_index]

# Find all tweets from current rumour
tgt_index = (tweets['rumour'] == tgt_rum).values
tweets_tgt = tweets.loc[tgt_index]

# Split out source data
SX = X[src_index, :]
SY = Y[src_index]

# Split out target data
TX = X[tgt_index, :]
TY = Y[tgt_index]

# Subselect tweets
tweets_2r = pd.concat([tweets.loc[src_index], tweets.loc[tgt_index]], axis=0, ignore_index=True)

# Check label proportions
print('Label shape of train set: \t %s' % Counter(SY))
print('Label shape of test set: \t %s' % Counter(TY))

# Sample size
N = SX.shape[0]
M = TX.shape[0]

# Random selection of target labels per class for semi-supervision
u0 = np.zeros((0,1))
u1 = np.zeros((0,1))
for k in range(K):

    # Check for enough target samples per class
    if np.sum(TY == k) < nT:

      # Update number of target samples per class
      nT = np.sum(TY == k)

      print('Reducing number of labeled target samples to {}'.format(nT))

    # Randomly sample
    ix = rnd.choice(range(M), size=nT, replace=False, p=(TY==k)/np.sum(TY==k))

    u0 = np.vstack((u0, ix[:,np.newaxis]))
    u1 = np.vstack((u1, k*np.ones((nT,1))))

u = np.concatenate((u0, u1), axis=1).astype('uint8')

# Semi-supervised alignment
V, CX, CZ = semi_subspace_alignment(SX, SY, TX, u, subspace_dim=min(subspace_dim, min(N, M)))

# Align classes
SXV, TXC = align_classes(SX, SY, TX, u, CX, CZ, V)


Label shape of train set: 	 Counter({0: 795, 3: 248, 2: 61, 1: 60})
Label shape of test set: 	 Counter({0: 244, 3: 80, 2: 43, 1: 16})
Reducing number of labeled target samples to 16
Reducing subspace dim to 60


In [9]:
# Source tweet to consider
ix = 0
tweets_src.iloc[ix]

date         2015-01-07 00:00:00                                                                                                                   
datetime     2015-01-07 11:07:51                                                                                                                   
id           552783667052167168                                                                                                                    
label        support                                                                                                                               
reply_ix     0                                                                                                                                     
rumour       charliehebdo                                                                                                                          
text         France: 10 people dead after shooting at HQ of satirical weekly newspaper #CharlieHebdo, according 

In [0]:
# Number of neighbours to consider
num_neighbors = 3

In [17]:
# Find neighbours in non-aligned space
NN = NearestNeighbors(n_neighbors=num_neighbors).fit(TX)
neighbours_nonaligned = NN.kneighbors(np.atleast_2d(SX[ix,:]), n_neighbors=num_neighbors, return_distance=False)[0]

# Display found tweets
tweets_tgt.iloc[neighbours_nonaligned]

Unnamed: 0,date,datetime,id,label,reply_ix,rumour,text,thread_ix
1232,2015-03-24,2015-03-24 13:24:59,580359656750653440,query,6,germanwings-crash,"@PascalSyn @airlivenet if they had, it’s likely the descent rate would’ve been steeper and the speed not reduce, no ?",6
1334,2015-03-24,2015-03-24 12:09:18,580340609958858752,comment,2,germanwings-crash,@NBCNews Praying for the families and friends of those involved in crash. I'm so sorry for your loss.,11
1557,2015-03-24,2015-03-24 12:42:58,580349082406895616,comment,12,germanwings-crash,"@mashable His Facebook, https://t.co/LwXQnkqC3P Address coming shortly",27


In [18]:
# Find the neighbours of each source tweet in aligned space
NN = NearestNeighbors(n_neighbors=num_neighbors).fit(TXC)
neighbours_aligned = NN.kneighbors(np.atleast_2d(SXV[ix,:]), n_neighbors=num_neighbors, return_distance=False)[0]

# Display found tweets
tweets_tgt.iloc[neighbours_aligned]

Unnamed: 0,date,datetime,id,label,reply_ix,rumour,text,thread_ix
1201,2015-03-24,2015-03-24 10:55:03,580321926515572736,support,3,germanwings-crash,“@AlArabiya_Eng: 148 passengers were on board #GermanWings Airbus A320 which has crashed in the southern French Alps http://t.co/VSqaycAsIG”,0
1235,2015-03-24,2015-03-24 13:32:35,580361570334674944,comment,9,germanwings-crash,"@PascalSyn @airlivenet radio call then passengers, no ? 20 sec isn’t much… it takes me longer to tweet sometimes ;-)",6
1216,2015-03-26,2015-03-26 00:01:57,580882341880446976,support,0,germanwings-crash,Report: Co-Pilot Locked Out Of Cockpit Before Fatal Plane Crash http://t.co/aXGQuacv2E #Germanwings http://t.co/T80L7BhXX6,4
