In [1]:
import os
import numpy as np
from matplotlib import pyplot
import seaborn as sns; sns.set()
from scipy.stats import chi2
import pandas as pd

import sys
sys.path.append("../")

from aux.events_io import load_events, store_events
from testing.user_testing import *
from analytics.plots_users import *

# Numeric sentinels.
EPS = 10 ** (-18)   # very small positive constant
INF = float("inf")  # fallback used when a user has no recorded maximum time

%matplotlib inline


# EDITS DATA PREPARATION

### Features Selection

In [2]:
# Read the per-user covariates table and keep only the rows whose "age" is
# strictly below 70.
user2features = (
    pd.read_csv("../../data/badges/covariates.tsv", sep="\t")
    .loc[lambda frame: frame["age"] < 70]
)


In [3]:
# Named groups of covariate columns. They are not part of the current
# selection, which keeps only the "id" column.
user_features = ["age", "upvotes"]  # previously considered as well: "downvotes", "comments", "reputation"
state_features = ["s0", "s1", "s2", "s3", "s4"]
city_features = ["c0", "c1", "c2", "c3", "c4"]

# Earlier variant of the selection: ["id"] + user_features + city_features
user2features = user2features.loc[:, ["id"]]

### Temporal traces

In [4]:
# Location of the tab-separated edits table.
src = "../../data/badges/edits/edits.tsv"

# One time value per badge (presumably the badge introduction times, given
# the name `badges_intro` -- TODO confirm). "editor" and "strunk_&_white"
# share the same value.
EDITOR = 46.371527777777786
STRUNK_AND_WHITE = 46.371527777777786
COPY_EDITOR = 708.4131944444446

# `badges` and `badges_intro` are parallel lists: same order, same length.
badges = ["editor", "strunk_&_white", "copy_editor"]
badges_intro = [EDITOR, STRUNK_AND_WHITE, COPY_EDITOR]

df = pd.read_csv(src, sep="\t")

In [5]:
# Load the tab-separated per-user activity table; its "userid" and "max"
# columns are used below to cap each user's max_time.
user2activeness = pd.read_csv("../../data/badges/activeness_min_max.tsv", sep="\t")

In [6]:
# Overwrite the time of every "max_time" row with the user's recorded maximum
# (INF when the user is missing from user2activeness), capped at the largest
# time present in the table. Afterwards keep only rows of users that have a
# row in user2features.
user2max = dict(zip(user2activeness["userid"], user2activeness["max"]))
MAXT = max(df["time"])
is_max_time_row = df["type"] == "max_time"
max_times = [
    min(user2max.get(uid, INF), MAXT)
    for uid in df.loc[is_max_time_row, "id"]
]
df.loc[is_max_time_row, "time"] = max_times
df = df[df["id"].isin(user2features["id"])]

In [7]:
# Convert the flat edits table into `id2events` with the project helper
# `load_events` (imported from aux.events_io). The cells below use it as a
# mapping from user id to a dict-like of event name -> list of times
# (e.g. events["copy_editor"][0]).
id2events = load_events(df)

In [8]:
# Add a "switch_time" event for users whose first "copy_editor" time is more
# than 2 time units after COPY_EDITOR: the award time rounded up.
for user_events in id2events.values():
    if "copy_editor" not in user_events:
        continue
    if user_events["copy_editor"][0] > COPY_EDITOR + 2:
        user_events["switch_time"] = [np.ceil(user_events["copy_editor"][0])]

In [9]:
# Push every user's start time forward: never earlier than COPY_EDITOR and,
# for users holding "copy_editor", never earlier than one unit past the
# rounded-up "strunk_&_white" time.
for user_events in id2events.values():
    original_start = user_events["start_time"][0]
    user_events["start_time"] = [max(original_start, COPY_EDITOR)]

    if "copy_editor" not in user_events:
        continue

    # An earlier variant used the midpoint between the two badge times:
    # (events["strunk_&_white"][0]+events["copy_editor"][0])*0.5
    mid_badges = np.ceil(user_events["strunk_&_white"][0]) + 1
    clamped_start = user_events["start_time"][0]
    user_events["start_time"] = [max(clamped_start, mid_badges)]

In [10]:
# Keep only the actions strictly inside each user's (start_time, max_time)
# window; users without any "action" entry get an empty list.
for user_events in id2events.values():
    lower = user_events["start_time"][0]
    upper = user_events["max_time"][0]
    kept_actions = []
    for t in user_events.get("action", []):
        if t > lower and t < upper:
            kept_actions.append(t)
    user_events["action"] = kept_actions

In [11]:
# Turn the per-user events back into a table with the project helper
# `store_events`, then order the rows by "id" and, within each id, by "time".
samples = store_events(id2events)
samples = samples.sort_values(by=["id", "time"])

# COPY EDITOR DATA CLASSIFICATION

In [12]:
# Badge users: ids whose events contain "switch_time", restricted to those
# with a feature row that has no missing value.
busers = [uid for uid, user_events in id2events.items() if "switch_time" in user_events]
bfeatures = user2features.loc[user2features["id"].isin(busers)]
bfeatures = bfeatures.loc[bfeatures.notnull().all(axis=1)]
busers = list(set(bfeatures["id"]))

In [None]:
# Comparison users: ids whose events do not contain "research_assistant",
# restricted to those with a feature row that has no missing value.
# NOTE(review): the badges handled in this notebook are "editor",
# "strunk_&_white" and "copy_editor"; confirm that "research_assistant" is the
# intended key here, otherwise this list may also contain the badge users.
nusers = [uid for uid, user_events in id2events.items() if "research_assistant" not in user_events]
nfeatures = user2features.loc[user2features["id"].isin(nusers)]
nfeatures = nfeatures.loc[nfeatures.notnull().all(axis=1)]
nusers = list(set(nfeatures["id"]))

In [None]:
#pyplot.hist(bfeatures.age, bins=np.array(range(50))*2, label="users with badge", fill=None, normed=True, histtype="step", lw=3)
#pyplot.hist(nfeatures.age, bins=np.array(range(50))*2, label="users without badge", normed=True, histtype="step", lw=3)
#pyplot.legend()
#pyplot.xlabel("age")
#pyplot.xlim((10,60))

### Data transformations and preparation

In [14]:
# Train on every badge user; test on the first 50 ids of `nusers` after an
# in-place shuffle. The shuffle is unseeded, so the test sample changes
# between runs.
np.random.shuffle(nusers)
train_ids, test_ids = busers, nusers[:50]

# Stack the two feature tables with a fresh 0..n-1 index. DataFrame.append
# was removed in pandas 2.0; pd.concat is the supported equivalent.
features = pd.concat([nfeatures, bfeatures], ignore_index=True)

# Show the positional index of every feature column (the first column, the
# id, is skipped).
print("features=",list(enumerate(features.columns[1:])))


NameError: name 'nusers' is not defined

In [15]:

# Features shown in the plots. The positions in `features_no` and the names in
# `features_names` must describe the same columns, in the same order.
features_no = [0, 2]
features_names = ["age", "s0"]
# Alternative pairs that were tried:
#features_no, features_names = [1, 2], ["upvotes", "downvotes"] #MUST AGREE!
#features_no, features_names = [10, 0], ["s2", "age"] #MUST AGREE!

# EM2

In [20]:
import sys
sys.path.append("../")

from aux import parsing, sharing

from testing import user_testing
from clustering import em2
from classification import nhst_with_em2, twopoisson_em
from classification import evaluation
from classification import visualization

import logging
# `imp` is deprecated and was removed in Python 3.12; importlib.reload is the
# supported replacement.
from importlib import reload

# Re-import `logging` before configuring it -- presumably so that
# basicConfig takes effect even if the module was configured earlier in this
# kernel.
reload(logging)
fmt = '[%(process)4d][%(asctime)s][%(levelname)-5s][%(module)s:%(lineno)d/%(funcName)s] %(message)s'
logging.basicConfig(format=fmt, level=logging.DEBUG, datefmt='%I:%M:%S')

# Re-import the project modules so the kernel uses their current on-disk code.
reload(em2)
reload(twopoisson_em)
reload(nhst_with_em2)
reload(visualization)


<module 'classification.visualization' from '../classification/visualization.py'>

In [None]:
# Tab-separated cache of the NHST predictions; the next cell reads it when it
# exists and writes it otherwise.
nhst_preds_file = "../../data/badges_copyeditor_nhst_preds.tsv"

In [None]:
# Step 1 -- NHST classification of the training users, cached on disk.
# When the cache file exists it is read back; otherwise the predictions are
# computed and written to the cache.
if os.path.isfile(nhst_preds_file):
    nhst_table = pd.read_csv(nhst_preds_file, sep="\t")
else:
    train_preds, _ = nhst_with_em2.nhst_classification(user2features, samples, train_ids, [], 
                                         #test=user_testing.wilks_pvalue,                                                
                                         test=user_testing.bootstrap_pvalue, 
                                         badge_name="switch_time", 
                                         pvalue_threshold=0.05, cpus=1)
    nhst_table = pd.DataFrame({"id": train_ids, "nhst_pred": train_preds})
    # to_csv returns None when given a path, so its result must not be
    # assigned back: doing so made the lookup below fail on a cold cache.
    nhst_table.to_csv(nhst_preds_file, sep="\t", index=False, header=True)

# id -> NHST prediction, then the predictions re-ordered to follow train_ids.
id2nhst = dict(zip(nhst_table["id"], nhst_table["nhst_pred"]))
nhst_preds = np.array([id2nhst[i] for i in train_ids])

# Step 2 -- clustering seeded with the NHST predictions.
train_preds, test_preds = nhst_with_em2.two_step_classification(features, samples, 
                                                                train_ids, test_ids,
                                                                sigma=1, kappa=1,
                                                                train_preds=nhst_preds)
# The intermediate results are read from an attribute on the function object.
em2_results = nhst_with_em2.two_step_classification.intermediate_results

# Counts on the training set and fractions on the test set on either side of
# 0.5 (widened by `margin`); the small constant guards against an empty test set.
margin = 0
print("train_preds0=", sum(train_preds<0.5-margin))
print("train_preds1=",sum(train_preds>0.5+margin))
print("test_preds=0",sum(test_preds<0.5-margin)/(len(test_preds)+0.000001))
print("test_preds=1",sum(test_preds>0.5+margin)/(len(test_preds)+0.000001))

In [None]:
# Plot the training users using the NHST predictions stored in the
# intermediate results of the two-step classification (no test users, no
# legend, densities enabled).
nhst_train_preds = em2_results["nhst_preds"]

visualization.pyplot_reset()
visualization.plot_groups(
    features,
    features_names,
    train_ids,
    nhst_train_preds,
    train_marker="+",
    test_marker="x",
    # ylabel="log(%s)" % features_names[1],
    plot_legend=False,
    s=10,
    plot_densities=True,
)

In [None]:
# Final two-step results: train and test users with their predictions, with
# one bivariate overlay per component drawn from the "mu" and "ss" entries of
# the intermediate results, restricted to the plotted features.
visualization.pyplot_reset()
visualization.plot_groups(
    features,
    features_names,
    train_ids,
    train_preds,
    test_ids,
    test_preds,
    train_marker="+",
    test_marker="x",
    # ylabel="log(%s)" % features_names[1],
    plot_legend=True,
    s=10,
    plot_densities=False,
)
for component, colormap in enumerate(("Blues", "Reds")):
    visualization.plot_bivariate(
        em2_results["mu"][component, features_no],
        em2_results["ss"][component][np.ix_(features_no, features_no)],
        cmap=colormap,
    )

# Counts on the training set and fractions on the test set on either side of
# 0.5 (widened by `margin`).
margin = 0.0
test_total = len(test_preds) + 0.000001
print("train_preds0=", sum(train_preds<0.5-margin))
print("train_preds1=",sum(train_preds>0.5+margin))
print("test_preds=0",sum(test_preds<0.5-margin)/test_total)
print("test_preds=1",sum(test_preds>0.5+margin)/test_total)


In [None]:
# pearsonr is part of the public scipy.stats namespace; scipy.stats.stats is
# a deprecated alias module.
from scipy.stats import pearsonr

# Correlation between the NHST predictions and the two-step predictions on
# the training users.
print(pearsonr(nhst_train_preds, train_preds))

# Confusion counts after thresholding both at 0.5, with the NHST predictions
# taken as the reference labels. Printed in the order TP, TN, FP, FN.
threshold = 0.5
true  = np.asarray((np.asarray(nhst_train_preds)>threshold), dtype=int)
preds = np.asarray((np.asarray(train_preds)>threshold), dtype=int)
TP = sum((true == 1) & (preds == 1))
FP = sum((true == 0) & (preds == 1))
TN = sum((true == 0) & (preds == 0))
FN = sum((true == 1) & (preds == 0))
print(TP, TN, FP, FN)

# Two-Poisson EM

In [None]:
# Classify the same train/test users with the project's two-Poisson EM
# (classification.twopoisson_em), using "switch_time" as the badge event.
# The call yields a (train, test) pair that the following cells threshold at 0.5.
train_preds2, test_preds2 = twopoisson_em.twopoisson_em_classification(features, samples, train_ids, test_ids, badge_name="switch_time")

In [None]:

# Counts on the training set and fractions on the test set on either side of
# 0.5 (widened by `margin`) for the two-Poisson predictions. The fractions
# are normalised by the length of test_preds2 itself, so this cell does not
# depend on the EM2 cell's `test_preds`; the small constant guards against an
# empty test set.
margin = 0.0
print("train_preds0=", sum(train_preds2<0.5-margin))
print("train_preds1=",sum(train_preds2>0.5+margin))
print("test_preds=0",sum(test_preds2<0.5-margin)/(len(test_preds2)+0.000001))
print("test_preds=1",sum(test_preds2>0.5+margin)/(len(test_preds2)+0.000001))


In [None]:
# Confusion counts between the two classifiers on the training users, both
# thresholded at 0.5, with the two-step predictions taken as the reference
# labels. Printed in the order TP, TN, FP, FN.
threshold = 0.5
true = (np.asarray(train_preds) > threshold).astype(int)
preds = (np.asarray(train_preds2) > threshold).astype(int)
TP, FP, TN, FN = [
    sum((true == reference) & (preds == predicted))
    for reference, predicted in ((1, 1), (0, 1), (0, 0), (1, 0))
]
print(TP, TN, FP, FN)