# Semi-Supervised EM Algorithm to Cluster Social Network 
### By Swetha Revanur and Keanu Spies

In this notebook we present a semi-supervised EM approach that aims to cluster posts from a social network into high risk and low risk for being drawn into sex work.

In [35]:
import matplotlib.pyplot as plt
import numpy as np
import os
import pickle
import pandas as pd

np.random.seed(648)

K = 2           # Number of Gaussians in the mixture model
NUM_TRIALS = 3  # Number of trials to run (can be adjusted for debugging)
UNLABELED = -1  # Cluster label for unlabeled data points (do not change)d
EPS = 1e-8

## Setup and Helpers

In [36]:
def load_dataset(path):
    df = pd.read_pickle(path)
    
    # list of friends cannot be handeled by em algorithm
    df = df.drop('friends_id_list', axis=1)
    
    # seperate out labeled and unlabeled data
    labeled_df = df[df['risk'].notnull()]
    unlabeled_df = df[df['risk'].isnull()]
    
    # labels
    y_tilde = labeled_df['risk'].values

    # extract datasets 
    labeled_df= labeled_df.drop('risk', axis=1)
    unlabeled_df = unlabeled_df.drop('risk', axis=1)
    X_tilde = labeled_df.values
    X = unlabeled_df.values
    
    return X, X_tilde, y_tilde

def plot_gmm_preds(x, z, with_supervision, plot_id):
    pass

# The gaussian PDF helper
def gaussian(x, phi, mu, sigma, n):
    try:
        sigma_inv = np.linalg.inv(sigma)
    except:
        sigma_inv = np.linalg.pinv(sigma)
    det = (np.sqrt(np.linalg.det(sigma)+ EPS)*(2*np.pi)**(n/2))
    expon = np.exp(-0.5*(x - mu).T.dot(sigma_inv).dot(x - mu))
    return (expon/det)*phi

## Semi-Supervised EM Algorithm

In [37]:
def run_semi_supervised_em(x, x_tilde, z, w, phi, mu, sigma):
    
    alpha = 20.  # Weight for the labeled examples
    eps = 1e-3   # Convergence threshold
    max_iter = 1000
    it = 0
    ll = prev_ll = None
    
    m, n = x.shape
    mt, nt = x_tilde.shape
    
    while it < max_iter and (prev_ll is None or np.abs(ll - prev_ll) >= eps):
        print(it)
        prev_ll = ll
        print("In ESTEP")
        # (1) E-step: Update your estimates in w
        for i in range(m):
            sum_ = sum(gaussian(x[i], phi[l], mu[l], sigma[l], n) for l in range(K))
            for j in range(K):
                w[i][j] = gaussian(x[i], phi[j], mu[j], sigma[j], n)/sum_

        print("In MSTEP")
        # (2) M-step: Update the model parameters phi, mu, and sigma
        sum_zit = np.zeros(K)
        for j in range(K):
            for i in range(mt):
                if z[i] == j:
                    sum_zit[j] += alpha

        mu_tilde_num = np.zeros((K, n))
        for j in range(K):
            for i in range(mt):
                if z[i] == j:
                    print(type(mu_tilde_num))
                    print(type(x_tilde))
                    mu_tilde_num[j, :] += alpha*x_tilde[i]
                    
        w_sum = np.sum(w, axis = 0)
        w_z_sum = w_sum + sum_zit

        mu = ((np.matmul(x.T, w) + mu_tilde_num.T)/w_z_sum).T
        phi = w_z_sum/(m + alpha*mt)
        sum_ =  w_sum + sum_zit
        
        
        for j in range(K):
            inner_sum = np.zeros_like(sigma[0])
            for i in range(m):
                inner_sum += w[i][j]*np.outer(x[i] - mu[j], x[i] - mu[j])
            for i in range(mt):
                if z[i] == j:
                    inner_sum += alpha*np.outer(x_tilde[i] - mu[j], x_tilde[i] - mu[j])
            sigma[j] = inner_sum/w_z_sum[j]
        
        print("In LL")
        # (3) Compute the log-likelihood of the data to check for convergence.
        ll = 0
        for i in range(m):
            inner_sum = 0
            for j in range(K):
                inner_sum += gaussian(x[i], phi[j], mu[j], sigma[j], n)
            ll += np.log(inner_sum)

        if it % 10 == 0: print(ll, it)
        it += 1
    print("It: %d"%it)
    return w

## Run SSEM

In [38]:
# Load dataset
base_data_dir = "../../data"
train_path = os.path.join(base_data_dir, 'online_sex_work_clean.pkl')
x, x_tilde, z = load_dataset(train_path)

# randomly assign data to clusters
m, n = x.shape
assignments = [np.random.randint(0, K) for i in range(m)] # of size m
mu, sigma = [None] * K, [None] * K
for i in range(K):
    x_vals = np.array([x[j, :] for j in range(len(x)) if assignments[j] == i])
    mu[i] = np.mean(x_vals, axis = 0)
    sigma[i] = np.cov(x_vals.T.astype(float))
    
# initialize phi
phi = np.ones((K,))/K
# initialize w
w = np.ones((m, K))/K

# run the SSEM algorithm
w = run_semi_supervised_em(x, x_tilde, z, w, phi, mu, sigma)

0
In ESTEP




In MSTEP
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>


TypeError: ufunc 'add' output (typecode 'O') could not be coerced to provided output parameter (typecode 'd') according to the casting rule ''same_kind''