In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot
import matplotlib

import sys
sys.path.append("../")

from analytics.classification_aux import em2_classification_cached
from classification import twopoisson_em
from analytics import visualization

from analytics.classification_aux import extract_train_test, print_results, compare_results
from aux.events_io import load_events, store_events
from classification.nhst_testing import *

##########################################

# Seed NumPy's global random generator so that randomised steps give the same
# numbers on every run of the notebook.
np.random.seed(123)

import logging
# The `imp` module has been deprecated since Python 3.4 and was removed in
# Python 3.12; importlib.reload is the supported replacement.
from importlib import reload
# Re-execute the logging module before configuring it -- presumably so that
# basicConfig() below is not ignored when the root logger already has
# handlers installed by the notebook environment (TODO confirm).
reload(logging)
# Layout of a log line: [pid][time][level][module:line/function] message
fmt = '[%(process)4d][%(asctime)s][%(levelname)-5s][%(module)s:%(lineno)d/%(funcName)s] %(message)s'
logging.basicConfig(format=fmt, level=logging.INFO, datefmt='%I:%M:%S')

%matplotlib inline

ImportError: cannot import name 'visualization'

# COPY EDITOR DATA PREPARATION

In [None]:
# Read the two tab-separated input tables: the covariates table (one "id"
# column plus feature columns, as used below) and the copy-editor badge samples.
user2features = pd.read_csv(
    "../../data/badges/covariates.tsv",
    sep="\t",
)
samples = pd.read_csv(
    "../../data/badges_copyeditor.tsv",
    sep="\t",
)

In [None]:
# Upper bound on the number of test users passed to extract_train_test (max_test).
NUM_TEST_USERS = 0  #!

# Covariate columns, grouped by kind ("reputation" is intentionally left out
# of the user-level group).
user_features = ["age", "upvotes", "views", "downvotes", "comments"]
city_features = ["c%d" % k for k in range(5)]
state_features = ["s%d" % k for k in range(5)]
selected_features = user_features + city_features + state_features

# Keep only the id column and the selected covariates before splitting.
id_and_covariates = user2features[["id"] + selected_features]
features, train_ids, test_ids = extract_train_test(
    samples,
    id_and_covariates,
    max_train=10000,
    max_test=NUM_TEST_USERS,
)

# Two-phase classification (EM2)

In [None]:
# Cache file for intermediate results; it is handed to
# em2_classification_cached through its nhst_preds_file argument.
nhst_preds_file = (
    "../../data/badges_copyeditor_nhst_preds.tsv"
)

In [None]:
# Run the cached two-phase (EM2) classification and unpack its three results:
# train predictions, test predictions and a dictionary of additional results
# (indexed later with keys such as "nhst_preds", "mu" and "ss").
em2_output = em2_classification_cached(
    features, samples, train_ids, test_ids,
    sigma=1, kappa=1,  # NOTE(review): the original author marked these values as an open question
    nhst_preds_file=nhst_preds_file,
)
train_preds, test_preds, em2_results = em2_output

### Results visualization

In [None]:
# Show (position, name) for every column after the first one; these positions
# are what the plotting cells refer to as feature numbers.
indexed_feature_names = [(pos, col) for pos, col in enumerate(features.columns[1:])]
print("available features:", indexed_feature_names)

In [None]:
# Features to be plotted, given as (feature number, feature name) pairs.
plotted_pairs = [(0, 'age'), (2, 'views')]
# Alternative selections kept for reference:
#   [(10, 's0'), (9, 'c4')]
#   [(0, 'age'), (10, 's0')]

# Split the pairs into two parallel lists (numbers and names).
features_no = [number for number, _name in plotted_pairs]
features_names = [name for _number, name in plotted_pairs]

In [None]:
# Plot the NHST predictions stored in the EM2 results (training users only).
nhst_train_preds = em2_results["nhst_preds"]

visualization.pyplot_reset()

nhst_plot_options = dict(
    train_marker="+",
    test_marker="x",
    plot_legend=False,
    s=10,
    plot_densities=True,
)
visualization.plot_groups(
    features, features_names,
    train_ids, nhst_train_preds,
    **nhst_plot_options
)

# Widen the bottom/left margins, then fix the axis ranges and the y label.
nhst_figure = pyplot.gcf()
nhst_figure.subplots_adjust(bottom=0.15, left=0.15)
pyplot.xlim((15,55))
pyplot.ylim((4,12))
pyplot.ylabel("log(%s)" % features_names[1])
# Alternative used with a state feature on the y axis:
#   ylim (1.75, 3.75), ylabel "similarity to US"

visualization.pyplot_savefig("../../data/badges_copyeditor_nhst_preds0.pdf")

print_results(nhst_train_preds)

In [None]:
# Plot the final EM2 predictions for train and test users and overlay the two
# fitted components restricted to the plotted feature pair.

visualization.pyplot_reset()

final_plot_options = dict(
    train_marker="+",
    test_marker="x",
    plot_legend=False,
    s=10,
    plot_densities=False,
)
visualization.plot_groups(
    features, features_names,
    train_ids, train_preds,
    test_ids, test_preds,
    **final_plot_options
)

# Component 0 is drawn in blue and component 1 in red; np.ix_ picks the
# sub-matrix of "ss" that belongs to the plotted features.
for component, colormap in ((0, "Blues"), (1, "Reds")):
    visualization.plot_bivariate(
        em2_results["mu"][component, features_no],
        em2_results["ss"][component][np.ix_(features_no, features_no)],
        cmap=colormap,
    )

# Widen the bottom/left margins, then fix the axis ranges and the y label.
final_figure = pyplot.gcf()
final_figure.subplots_adjust(bottom=0.15, left=0.15)
pyplot.xlim((15,55))
pyplot.ylim((4,12))
pyplot.ylabel("log(%s)" % features_names[1])
# Alternative used with a state feature on the y axis:
#   ylim (1.75, 3.75), ylabel "similarity to US"

visualization.pyplot_savefig("../../data/badges_copyeditor_2sb_preds0.pdf")

print_results(train_preds, test_preds)

In [None]:
# Compare the NHST training predictions (em2_results["nhst_preds"]) with the
# final training predictions returned by em2_classification_cached.
# NOTE(review): what exactly is reported is defined by compare_results in
# analytics.classification_aux, which is not visible here.
compare_results(nhst_train_preds, train_preds)

In [None]:
# Plot the initial train/test assignments stored in the EM2 results.
init_train_preds = em2_results["init_train"]
init_test_preds = em2_results["init_test"]

visualization.pyplot_reset()

init_plot_options = dict(
    train_marker="+",
    test_marker="x",
    ylabel="log(%s)" % features_names[1],
    plot_legend=True,
    s=10,
    plot_densities=True,
)
visualization.plot_groups(
    features, features_names,
    train_ids, init_train_preds,
    test_ids, init_test_preds,
    **init_plot_options
)

## Feature ranking

In [None]:
# Ten example location strings for each state-level feature (s0..s4).
# The strings are data values and are kept verbatim, misspellings included.
sfeature2states = {
    "s0": [
        'Lehi_UT', 'MISSION_VIEJO_CA', 'HOWELL_MI', 'EASTERN_OREGON', 'Mclean_VA',
        'THOUSAND_OAKS_CA', 'NANTES_FRANCE', 'VENICE_FL', 'SOUTH_GEORGIA', 'NORTHWEST_FLORIDA',
    ],
    "s1": [
        'India', 'NewDelhi', 'ndia', 'Rajathan', 'Inida',
        'AHMEDABAD_INDIA', 'Sri_Lank', 'Ha_Noi_Viet_Nam', 'INdia', 'AHMADABAD',
    ],
    "s2": [
        'Germany', 'Germnay', 'CzechRepublic', 'Belgium', 'Spain',
        'Swizerland', 'France', 'Bussum', 'Weesp', 'Schijndel',
    ],
    "s3": [
        'Russia', 'Romania', 'Poland', 'Ukraine', 'Czech_republic',
        'CzechRepublic', 'Chişinău', 'Bulgaria', 'Lithuania', 'Hungary',
    ],
    "s4": [
        'United_States', 'Curaçao_Netherlands_Antilles', 'Mclean_VA', 'NORTH_YORKSHIRE', 'Luleå_Sweden',
        'OTTAWA_ONTARIO_CANADA', 'ANN_ARBOR_MICH', 'United_Kindom', 'Massachsetts', "XI'AN_CHINA",
    ],
}

# Ten example city strings for each city-level feature (c0..c4), also verbatim.
cfeature2cities = {
    "c0": [
        'Hyderabad', 'Bangalore', 'Chennai', 'Kolkata', 'Pune',
        'Ahmedabad', 'Mumbai', 'Jaipur', 'Delhi', 'Lucknow',
    ],
    "c1": [
        'Eching', 'Toruń', 'Zaltbommel', 'Zvenigorod', 'Deinze',
        'Trenčín', 'Piatra_Neamt', 'Hradec_Králové', 'Zielona_Góra', 'Białystok',
    ],
    "c2": [
        'Bento_Goncalves', 'Thiais', 'Torrejón_de_Ardoz', 'Eching', 'Melboune',
        'Eaubonne', 'Phnom_Pehn', 'Bussum', 'London', 'Manises',
    ],
    "c3": [
        'Shelby_Twp', 'CLINTON_TWP', 'HIDDEN_VALLEY_LAKE', 'PIKE_CREEK', 'FAIRVIEW_PARK',
        'YALE_UNIVERSITY', 'MOON_TWP', 'RUTGERS_UNIVERSITY', 'ROUND_HILL', 'PUEBLO_WEST',
    ],
    "c4": [
        'Philadlephia', 'Chicago', 'Los_Angeles', 'San_Francisco', 'Seatte',
        'New_York', 'Wernau', 'San_Francicso', 'Seattle', 'LAKE_HUGHES',
    ],
}

# Readable label per feature: its first three example names joined with "-".
# State features come first, then city features.
f2f = {
    feature: "-".join(names[:3])
    for mapping in (sfeature2states, cfeature2cities)
    for feature, names in mapping.items()
}

In [None]:
# Rank the features by the KL divergence between the one-dimensional Gaussian
# marginals of the two fitted components (component 0 relative to component 1).
#
# NOTE(review): em2_results["ss"][k] is passed to plot_bivariate next to the
# component mean, so it is treated here as the covariance matrix of component
# k and its diagonal as per-feature VARIANCES. The previous code plugged these
# values into the standard-deviation form of the formula, squaring them a
# second time. Confirm that "ss" really holds (co)variances.


def _gaussian_kl(mean_p, var_p, mean_q, var_q):
    """Return KL(N(mean_p, var_p) || N(mean_q, var_q)) for univariate Gaussians.

    Both spreads are given as variances (not standard deviations).
    """
    return 0.5 * np.log(var_q / var_p) + (var_p + (mean_p - mean_q) ** 2) / (2 * var_q) - 0.5


KLs = []
for i in range(em2_results["ss"].shape[1]):
    m1 = em2_results["mu"][0, i]
    m2 = em2_results["mu"][1, i]
    var1 = em2_results["ss"][0, i, i]
    var2 = em2_results["ss"][1, i, i]
    KL = _gaussian_kl(m1, var1, m2, var2)
    # Column 0 of `features` is skipped, hence the i+1 offset (the same offset
    # as features.columns[1:] used when listing the available features).
    KLs.append((KL, m1 > m2, features.columns[i+1]))
# Ascending order: the feature with the largest divergence is printed last.
KLs = sorted(KLs)

for KL, positive, feature in KLs:
    # Disabled variant from the original notebook: invert the sign for the
    # state/city features (names starting with "s" or "c").
    label = "attracted" if positive else "don't care"
    print("%7s %s %s"  % (("%.2f" % KL), "%10s" % label, f2f.get(feature, feature)))

# Poisson processes clustering (twopoisson_em)

In [None]:
# Second method: two-Poisson EM classification on the same features, samples
# and train/test ids, with badge_name set to "switch_time".
twopoisson_output = twopoisson_em.twopoisson_em_classification(
    features, samples,
    train_ids, test_ids,
    badge_name="switch_time",
)
train_preds2, test_preds2 = twopoisson_output

In [None]:
# Report the two-Poisson predictions, then compare them with the EM2
# predictions, first for the training users and then for the test users.
# NOTE(review): the output format is defined by print_results and
# compare_results in analytics.classification_aux, not visible here.
print_results(train_preds2, test_preds2)
compare_results(train_preds2, train_preds)
compare_results(test_preds2, test_preds)

In [None]:
def _apply_latex_font_settings():
    """Set font type 42 for PDF/PS output and switch on LaTeX text rendering."""
    matplotlib.rcParams['pdf.fonttype'] = 42
    matplotlib.rcParams['ps.fonttype'] = 42
    matplotlib.rcParams['text.usetex'] = True


# Plot the two-Poisson predictions for train and test users.
visualization.pyplot_reset()
_apply_latex_font_settings()

twopoisson_plot_options = dict(
    train_marker="+",
    test_marker="x",
    plot_legend=False,
    s=10,
    plot_densities=True,
)
visualization.plot_groups(
    features, features_names,
    train_ids, train_preds2,
    test_ids, test_preds2,
    **twopoisson_plot_options
)

# Widen the bottom/left margins, then fix the axis ranges and the y label.
twopoisson_figure = pyplot.gcf()
twopoisson_figure.subplots_adjust(bottom=0.15, left=0.15)
pyplot.xlim((15,55))
pyplot.ylim((4,12))
pyplot.ylabel("log(%s)" % features_names[1])
# Alternative used with a state feature on the y axis:
#   ylim (1.75, 3.75), ylabel "similarity to US"

# The settings are applied a second time right before saving, as in the
# original cell.
_apply_latex_font_settings()
pyplot.savefig("../../data/badges_copyeditor_2p_preds0.pdf")