In [2]:
%matplotlib inline
import numpy as np
from sklearn.preprocessing import scale
import json
from sensible_raw.loaders import loader

# Part 1. Build dataset

In [3]:
from build_dataset.extractors.sms_extractor import Sms_extractor
from build_dataset.extractors.bandicoot_extractor import Bandicoot_extractor
from build_dataset.extractors.stop_locations_extractor import Stop_locations_extractor
from build_dataset.extractors.screen_extractor import Screen_extractor
##from facebook_friends_extractor import Facebook_friends_extractor
from build_dataset.extractors.bluetooth_extractor import Bluetooth_extractor
##from calllog_extractor import Calllog_extractor
##from location_extractor import Location_extractor
from build_dataset.extractors.big_five_extractor import Big_five_extractor

from build_dataset.analysis.outlier_detection import Outlier_detector_svm, Outlier_detector_kd
from build_dataset.analysis.location_reference import Load_location_reference
#from analysis.social_state_reference import Load_social_state_reference
from build_dataset.analysis.consensus_archetypes import Consensus_archetypes

In [4]:
# Time constraints handed to the extractors. Every constraint carries:
#   'hours' - presumably the hours of the day to include (all 24 here)
#   'days'  - presumably the days of the week to include (all 7 here)
#   'spans' - list of (start, end) date pairs written as "dd/mm/yy"

# tc0: School periods (when there are lectures)
tc0 = dict(
    hours=range(24),
    days=range(7),
    spans=[
        ("06/01/14", "24/01/14"),
        ("03/02/14", "16/05/14"),
        ("01/09/14", "05/12/14"),
        ("02/06/14", "20/06/14"),
    ]
)

# tc1: Exam periods
tc1 = dict(
    hours=range(24),
    days=range(7),
    spans=[
        ("17/05/14", "01/06/14"),
        ("06/12/14", "21/12/14"),
    ]
)

# tc2: Holiday periods
tc2 = dict(
    hours=range(24),
    days=range(7),
    spans=[
        ("01/01/14", "05/01/14"),
        ("25/01/14", "02/02/14"),
        ("14/04/14", "20/04/14"),
        ("21/06/14", "30/08/14"),
        ("22/12/14", "31/12/14"),
    ]
)

In [5]:
# Location reference for the tc0 (school period) time constraint; "tc0_" is
# the label prefix passed as auxlabel.
# NOTE(review): load_reference=True presumably reuses a stored reference
# instead of recomputing it (the cell output reports "Loading datasource
# from local") - confirm against Load_location_reference.
location_reference = Load_location_reference(tc0, auxlabel="tc0_", load_reference=True)
#social_state_reference = Load_social_state_reference(tc0, auxlabel="tc0_", load_reference=False)

[location_reference] Loading datasource from local.


In [5]:
# Forwarded to every extractor below as load_old_datasources.
# NOTE(review): presumably True means "reuse the locally stored datasource"
# (the cell output reports "Loading datasource from local") - confirm.
load_datasources_from_local = True

# One extractor per data source, all restricted to the tc0 time constraint
# and labelled with the "tc0_" prefix.
sms = Sms_extractor(tc0, suppress=[], auxlabel="tc0_", load_old_datasources=load_datasources_from_local)
# NOTE(review): the keyword is spelled `supress` here but `suppress` for the
# other extractors - confirm which spelling Bandicoot_extractor expects.
bandicoot = Bandicoot_extractor(tc0, supress=[], auxlabel="tc0_", load_old_datasources=load_datasources_from_local)
stop_locations = Stop_locations_extractor(tc0, suppress=[], auxlabel="tc0_", load_old_datasources=load_datasources_from_local)
screen = Screen_extractor(tc0, suppress=[], auxlabel="tc0_", load_old_datasources=load_datasources_from_local)
##facebook_friends = Facebook_friends_extractor()
bluetooth = Bluetooth_extractor(tc0, suppress=[], auxlabel="tc0_", load_old_datasources=load_datasources_from_local)
##calllog = Calllog_extractor()
##location = Location_extractor()
# The Big Five extractor takes no time constraint.
big_five = Big_five_extractor()

[sms] Loading datasource from local.
[sms] Number of datapoints in range: 1786225
[stop_locations] Loading datasource from local.
[stop_locations] Number of datapoints in range: 450662
[screen] Loading datasource from local.
[screen] Number of datapoints in range: 20211170
[bluetooth] Loading datasource from local.
[bluetooth] Number of datapoints in range: 3312659
[big_five_extractor] Loaded data from local copy!


In [6]:
#tc0 = {'hours': range(24), 'days': range(7), 'spans': [("06/01/14","24/01/14"), ("03/02/14","16/05/14"), ("01/09/14","05/12/14"), ("02/06/14","20/06/14")]} #in school
#location_reference = Load_location_reference(tc1, auxlabel="tc1_", load_reference=False)

### Build full JSON dataset

In [8]:
def build_json_dataset(load_cached_data_sets=True):
    """Build json dataset with key for every user
    
    Loop over user-ids and for each one, collect features from the
    extractors. That's basically it.
    
    Parameters
    ----------
    load_cached_data_sets : bool
        Option to load prebuilt json datasets and just return those
        or to build them from scratch again. Note that building from
    """
    
    # Collect a list of valid user-ids
    with open('build_dataset/data_cache/users.json', 'r') as infile:
        users = [int(i) for i in json.load(infile)]

    if load_cached_data_sets:
        with open('build_dataset/data_cache/dataset_X.json') as infile:
            dataset_X = json.load(infile)
        with open('build_dataset/data_cache/dataset_Y.json') as infile:
            dataset_Y = json.load(infile)
    else:
        dataset_X = {}
        dataset_Y = {}

        for user in users:

            if user%10 == 0:
                print user,

            datapoint_x = {}
            datapoint_y = {}

            # Ordered by fail/execution speed
            try:
                datapoint_x.update(bluetooth.main(user))
                datapoint_x.update(stop_locations.main(user))
                datapoint_x.update(sms.main(user))
                datapoint_x.update(screen.main(user))
                #datapoint_x.update(facebook_friends.main(user))
                #datapoint_x.update(calllog.main(user))
                #datapoint_x.update(location.main(user))
                datapoint_y.update(big_five.main(user))
            except Exception as e:
                print "<"+str(e)+">",
                continue

            dataset_X[user] = datapoint_x
            dataset_Y[user] = datapoint_y

        # Store loaded data    
        with open('build_dataset/data_cache/dataset_X.json', 'w') as outfile:
            json.dump(dataset_X,outfile)
        with open('build_dataset/data_cache/dataset_Y.json', 'w') as outfile:
            json.dump(dataset_Y,outfile)
    
    return dataset_X, dataset_Y


# Called with the default load_cached_data_sets=True, so the datasets are read
# from the JSON cache rather than rebuilt from the extractors.
dataset_X, dataset_Y = build_json_dataset()

### Convert to matrix and standardize

In [9]:
if not dataset_X:
    raise ValueError("dataset_X is empty; cannot derive the feature list")

# Feature names are taken from the first user's datapoint. Every other user
# is indexed with the same names below, so a user lacking one raises KeyError.
features_x = sorted(next(iter(dataset_X.values())).keys())
# NOTE(review): 'aggreeableness' (double g) has to match the key emitted by
# Big_five_extractor - confirm before correcting the spelling.
features_y = ['openness', 'conscientiousness', 'extraversion', 'aggreeableness', 'neuroticism']

# Build X and Y in one pass keyed on the user, so that row i of X and row i
# of Y always describe the same user. Iterating the two dictionaries
# separately would only line up if both happened to iterate in the same
# order. A user missing from dataset_Y raises KeyError instead of silently
# producing misaligned matrices.
X = []
Y = []
for user_id, datapoint_x in dataset_X.items():
    datapoint_y = dataset_Y[user_id]
    X.append([datapoint_x[f] for f in features_x])
    Y.append([datapoint_y[f] for f in features_y])

# X stays a plain list of rows; X_scaled is its standardized matrix form.
X_scaled = scale(np.array(X))
Y = np.array(Y)
# Project the personality scores into archetype space.
M = Consensus_archetypes().project_to_archetype_space(Y)

In [10]:
# Feature reference: column index in X / X_scaled -> feature name
dict(enumerate(features_x))

{0: u'tc0_bluetooth_social_entropy',
 1: u'tc0_screen_session_duration',
 2: u'tc0_screen_session_frequency',
 3: u'tc0_screen_summed_usage',
 4: u'tc0_sms_fractions_of_conversations_started',
 5: u'tc0_sms_overall_received_responsiveness',
 6: u'tc0_sms_overall_responsiveness',
 7: u'tc0_sms_selectivity_in_responsiveness',
 8: u'tc0_sms_traffic',
 9: u'tc0_stop_locations_geospacial_entropy'}

### Remove outliers

In [11]:
#out_svm = Outlier_detector_svm(X_scaled[:,[5,7]], hard=False, threshold=-1, visualize=False, nu=0.1, gamma=0.25)
out_kd = Outlier_detector_kd(X_scaled, visualize=False, threshold=0.08, bandwidth=2, kernel='gaussian')
outliers = out_kd.main()

X_clean = np.delete(X_scaled,outliers,axis=0)
Y_clean = np.delete(Y,outliers,axis=0)
M_clean = np.delete(M,outliers,axis=0)

print "Removed %d outliers, clean subset has %d samples" % (
    (X_scaled.shape[0]-X_clean.shape[0]), X_clean.shape[0])

Removed 50 outliers, clean subset has 577 samples




### Save data

In [12]:
# Write every matrix to its own comma-separated file under data/.
# NOTE(review): M_clean is computed during outlier removal but is not written
# here - confirm that this is intended.
for _path, _matrix in (
    ("data/X.csv", X),
    ("data/X_scaled.csv", X_scaled),
    ("data/Y.csv", Y),
    ("data/X_clean.csv", X_clean),
    ("data/Y_clean.csv", Y_clean),
    ("data/M.csv", M),
):
    np.savetxt(_path, _matrix, delimiter=",")

# Part 2. Pareto clustering

In [1]:
%matplotlib inline
import numpy as np

from pareto_clustering.cluster.build_S import Build_S
from pareto_clustering.cluster import cluster_Infomap
from pareto_clustering.cluster import cluster_DBSCAN

ImportError: No module named p2t

### Load data

In [2]:
# Read back the comma-separated matrices stored under data/, in the same
# order as the names on the left-hand side.
X, X_scaled, Y, X_clean, Y_clean, M = (
    np.loadtxt(csv_path, delimiter=",")
    for csv_path in (
        "data/X.csv",
        "data/X_scaled.csv",
        "data/Y.csv",
        "data/X_clean.csv",
        "data/Y_clean.csv",
        "data/M.csv",
    )
)

### Pareto clustering

In [5]:
# Build_S(...).main() is unpacked into three values; only the middle one (T)
# is kept and passed to the clusterers.
# NOTE(review): assigning to `_` may interfere with IPython's "last output"
# variable of the same name - consider a different throwaway name.
_, T, _ = Build_S(X_scaled,10,sample_size=1.0, remove_outliers=True).main(visualize=False)
# Both clusterers are fitted on T, but `clusters` is rebound by the second
# call, so only the DBSCAN result is available to the following cells.
clusters = cluster_Infomap.fit(T)
clusters = cluster_DBSCAN.fit(T)

2 clusters and 0 outliers
2 clusters and 1 outliers
2 clusters and 2 outliers
2 clusters and 3 outliers
2 clusters and 4 outliers
2 clusters and 5 outliers
2 clusters and 6 outliers
	... found 1 valid solutions, using eps=1.802122, min_samples=2 (minimal params)


### Create datasets for each cluster

In [16]:
# One sub-matrix of X_clean per cluster, keeping only that cluster's columns.
# NOTE(review): the "- 1" assumes the trait ids stored in `clusters` are
# 1-based; with 0-based ids, id 0 would wrap around to the last column -
# confirm against the clusterer's output.
X_all = [X_clean[:, np.array(traits) - 1] for clu, traits in clusters.items()]

# Save each cluster's matrix as data/X<i>.csv, numbered in iteration order.
for i, Xc in enumerate(X_all):
    np.savetxt("data/X%s.csv" % i, Xc, delimiter=",")

  <br>
  <br>

# Testing

In [13]:
# Spot check: run the SMS extractor for a single user (id 5) and display the
# feature dict it returns.
sms.main(5)

{'tc0_[sms]_concluded_percent': 0.4672435105067985,
 'tc0_[sms]_initiated_percent': 0.3980222496909765,
 'tc0_[sms]_outgoing_percent': 0.46486817903126915,
 'tc0_[sms]_responsiveness': -9.3898049498295872,
 'tc0_[sms]_responsiveness_received': -9.0214351472865584,
 'tc0_[sms]_responsiveness_std': -4.476345222107982,
 'tc0_[sms]_traffic': 9.0063865150551141}