# Preprocessing

In [1]:
import copy
import pickle

import numpy as np
import pandas as pd

from data.turk import TurkResults2Label
from data.dao import LabelGetter

In [2]:
# Directory holding the batch result CSV exports. Kept in one place so the
# notebook can be pointed at another copy of the data by editing a single line.
BATCH_DIR = 'C:/users/tom work/downloads'


def load_batch(batch_id, batch_dir=BATCH_DIR):
    """Read one batch results CSV and drop rows that repeat an `_id`.

    Parameters
    ----------
    batch_id : int or str
        Batch identifier embedded in the export's file name.
    batch_dir : str
        Directory containing ``Batch_<batch_id>_batch_results.csv``.

    Returns
    -------
    pandas.DataFrame
        The CSV contents with duplicate `_id` rows removed.
    """
    path = '{}/Batch_{}_batch_results.csv'.format(batch_dir, batch_id)
    return pd.read_csv(path).drop_duplicates(subset='_id')


batch1 = load_batch(2431673)
batch2 = load_batch(2431727)

In [3]:
# Re-key both batches by `_id` (the column itself is kept) so that rows from
# the two batches can be matched by label.
batch1 = batch1.set_index('_id', drop=False)
batch2 = batch2.set_index('_id', drop=False)

Remove the questions that the same worker answered in both batches; only questions answered by two different workers are kept.

In [4]:
# Keep only the rows whose answers in the two batches came from different
# workers (element-wise comparison on the `_id` index set above).
# `.copy()` makes `answers` an independent frame, so adding columns to it
# later does not trigger pandas' SettingWithCopyWarning.
answers = batch1[batch1.Worker != batch2.Worker].copy()

In [5]:
# Attach the second batch's answer for the same `_id`.
# `.loc` replaces the `.ix` indexer (deprecated, removed in pandas 1.0), and
# `assign` returns a new frame instead of writing into what may be a slice of
# `batch1`, which is what raised SettingWithCopyWarning.
answers = answers.assign(Answer2=batch2.loc[answers.index, 'Answer'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [6]:
# Distinct raw values found in the `Answer` column of the kept rows.
set(answers.Answer.values)

{'Alcohol Related::Discussion',
 'Alcohol Related::Promotional Content',
 'First Person - Alcohol::Casual Drinking',
 'First Person - Alcohol::Heavy Drinking',
 'First Person - Alcohol::Looking to drink',
 'First Person - Alcohol::Reflecting on drinking',
 'Not Alcohol Related',
 '{}'}

Some rows have an empty answer (`{}`). It is unclear why; these rows are excluded before the answers are parsed into labels.

In [7]:
def _parse_answered(raw_answers):
    """Parse every answer except the empty '{}' placeholder into labels."""
    answered = raw_answers[raw_answers != '{}']
    return answered.apply(TurkResults2Label.parse_to_labels)


labels1 = _parse_answered(answers.Answer)
labels2 = _parse_answered(answers.Answer2)

In [8]:
# Number of `Answer` values that were not '{}' and were parsed into labels.
labels1.shape

(947,)

In [9]:
# Number of `Answer2` values that were not '{}' and were parsed into labels.
labels2.shape

(973,)

In [10]:
# Assignment aligns on the index: rows of `batch1` that have no entry in
# `labels1` (same-worker rows and '{}' answers were filtered out) get NaN.
batch1['label1'] = labels1

In [11]:
# Assignment aligns on the index: rows of `batch1` that have no entry in
# `labels2` (same-worker rows and '{}' answers were filtered out) get NaN.
batch1['label2'] = labels2

In [12]:
# Keep only the rows that have a parsed label in both label columns.
labeled = batch1.dropna(subset=['label1', 'label2'])

In [13]:
# Rows on which the two parsed labels are identical.
labels_match = labeled['label1'] == labeled['label2']
agreed = labeled[labels_match]

In [14]:
# (rows, columns) of the subset where `label1` equals `label2`.
agreed.shape

(476, 9)

# Kappa Scores

In [15]:
from sklearn.metrics import cohen_kappa_score

Alcohol

In [16]:
# Wrap the doubly-labeled rows in the project's LabelGetter (from data.dao).
L = LabelGetter(labeled)

In [17]:
# Extract the 'alcohol' task once per label column, then compute Cohen's kappa
# between the two resulting target vectors.
# NOTE(review): `_get_labels` is a private LabelGetter method; presumably it
# returns a (features, target) pair in matching row order — confirm.
Xalc1, yalc1 = L._get_labels('alcohol', 'label1')
Xalc2, yalc2 = L._get_labels('alcohol', 'label2')
cohen_kappa_score(yalc1, yalc2)

0.57139053805809192

FPA

In [18]:
# Carry forward only the rows where both label columns agree on 'alcohol'.
alc_agree = yalc1 == yalc2
X_alc, y_alc = Xalc1[alc_agree], yalc1[alc_agree]

In [19]:
# (rows, columns) remaining after keeping only 'alcohol' agreements.
X_alc.shape

(760, 9)

In [22]:
# Restrict to the rows where both label columns agreed on 'alcohol', then
# compute Cohen's kappa on the 'first_person' task.
# NOTE(review): relies on the private `_get_labels` returning targets in
# matching row order for both label columns — confirm.
L = LabelGetter(X_alc)
Xfpa1, yfpa1 = L._get_labels('first_person', 'label1')
Xfpa2, yfpa2 = L._get_labels('first_person', 'label2')
cohen_kappa_score(yfpa1, yfpa2)

0.2822212537559925

FPL

In [23]:
# Carry forward only the rows where both label columns agree on 'first_person'.
fpa_agree = yfpa1 == yfpa2
X_fpa, y_fpa = Xfpa1[fpa_agree], yfpa1[fpa_agree]

In [24]:
# (rows, columns) remaining after keeping only 'first_person' agreements.
X_fpa.shape

(330, 9)

In [26]:
# Restrict to the rows where both label columns agreed on 'first_person', then
# compute Cohen's kappa on the 'first_person_level' task.
# NOTE(review): relies on the private `_get_labels` returning targets in
# matching row order for both label columns — confirm.
L = LabelGetter(X_fpa)
Xfpl1, yfpl1 = L._get_labels('first_person_level', 'label1')
Xfpl2, yfpl2 = L._get_labels('first_person_level', 'label2')
cohen_kappa_score(yfpl1, yfpl2)


0.35722100656455125

In [27]:
# Carry forward only the rows where both label columns agree on
# 'first_person_level'.
fpl_agree = yfpl1 == yfpl2
X_fpl, y_fpl = Xfpl1[fpl_agree], yfpl1[fpl_agree]

In [28]:
# (rows, columns) remaining after keeping only 'first_person_level' agreements.
X_fpl.shape

(113, 9)

Looks like annotators agree moderately on whether a tweet is alcohol related (kappa ≈ 0.57), but agreement on FPA (≈ 0.28) and FPL (≈ 0.36) is much weaker, so those distinctions are more ambiguous.

# Test Metrics

Training set: June labeled data

Test set: Sept labeled data

In [29]:
def _load_pickle(path):
    """Unpickle one object from `path`, closing the file afterwards.

    NOTE(security): unpickling can execute arbitrary code contained in the
    file; only load pickles that were produced by this project.
    """
    with open(path, 'rb') as fh:
        return pickle.load(fh)


# The three classifiers evaluated below, loaded from disk.
clf_alc = _load_pickle('pickles/clf_alc_UPDATED.p')
clf_fpa = _load_pickle('pickles/clf_fpa_UPDATED.p')
clf_fpl = _load_pickle('pickles/clf_fpl_double_labeled')

In [30]:
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report, precision_score, recall_score

In [31]:
# Metric callables reported by `print_metrics`, in the order they are printed.
metrics = [accuracy_score, f1_score, confusion_matrix, classification_report]

In [32]:
def print_metrics(y_pred, y_true):
    """Print each metric in the module-level `metrics` list.

    Note the argument order: predictions first, ground truth second. Both are
    forwarded to the metric by keyword. `f1_score` is called with
    ``average="weighted"``; every other metric is called without extra
    arguments.
    """
    for score_fn in metrics:
        extra = {"average": "weighted"} if score_fn is f1_score else {}
        header = score_fn.__name__ + ': \n'
        print(header, score_fn(y_true=y_true, y_pred=y_pred, **extra))

In [33]:
# Score the loaded alcohol classifier on the rows both label columns agree on.
alc_pred = clf_alc.predict(X_alc)
print_metrics(y_pred=alc_pred, y_true=y_alc)

accuracy_score: 
 0.761842105263
f1_score: 
 0.829727187206
confusion_matrix: 
 [[138 109]
 [ 72 441]]
classification_report: 
              precision    recall  f1-score   support

          0       0.66      0.56      0.60       247
          1       0.80      0.86      0.83       513

avg / total       0.75      0.76      0.76       760





In [34]:
# Score the loaded first-person classifier on the rows both label columns
# agree on.
fpa_pred = clf_fpa.predict(X_fpa)
print_metrics(y_pred=fpa_pred, y_true=y_fpa)

accuracy_score: 
 0.669696969697
f1_score: 
 0.764578833693
confusion_matrix: 
 [[ 44  98]
 [ 11 177]]
classification_report: 
              precision    recall  f1-score   support

          0       0.80      0.31      0.45       142
          1       0.64      0.94      0.76       188

avg / total       0.71      0.67      0.63       330





In [35]:
# Score the loaded first-person-level classifier on the rows both label
# columns agree on.
fpl_pred = clf_fpl.predict(X_fpl)
print_metrics(y_pred=fpl_pred, y_true=y_fpl)

accuracy_score: 
 0.610619469027
f1_score: 
 0.628726257946
confusion_matrix: 
 [[36 12 14]
 [ 8 26  5]
 [ 3  2  7]]
classification_report: 
              precision    recall  f1-score   support

          0       0.77      0.58      0.66        62
          1       0.65      0.67      0.66        39
          2       0.27      0.58      0.37        12

avg / total       0.67      0.61      0.63       113



We want to see classifier performance on Sept training data vs June training data. Unfortunately, we don't have enough training data right now.

Training set: 67% Sept labeled

Test set: 33% Sept labeled

Note: these sets are really tiny, so the results are not very indicative of performance at the lower levels of the hierarchy. We would expect classifiers trained on the Sept data to perform better than the June-trained ones.

In [36]:
# `sklearn.cross_validation` was deprecated in scikit-learn 0.18 and removed in
# 0.20; prefer its replacement and fall back only on old installations.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

In [38]:
X_train, X_test, y_train, y_test = train_test_split(X_alc, y_alc, test_size=0.33, random_state=26)
# Fit a deep copy so the classifier loaded from disk stays as it was; fitting
# `clf_alc` itself would change what the earlier evaluation cell scores if it
# is re-run afterwards.
clf_alc_refit = copy.deepcopy(clf_alc)
clf_alc_refit.fit(X_train, y_train)
print_metrics(clf_alc_refit.predict(X_test), y_test)

accuracy_score: 
 0.844621513944
f1_score: 
 0.893150684932
confusion_matrix: 
 [[ 49  25]
 [ 14 163]]
classification_report: 
              precision    recall  f1-score   support

          0       0.78      0.66      0.72        74
          1       0.87      0.92      0.89       177

avg / total       0.84      0.84      0.84       251





In [39]:
X_train, X_test, y_train, y_test = train_test_split(X_fpa, y_fpa, test_size=0.33, random_state=26)
# Fit a deep copy so the classifier loaded from disk stays as it was; fitting
# `clf_fpa` itself would change what the earlier evaluation cell scores if it
# is re-run afterwards.
clf_fpa_refit = copy.deepcopy(clf_fpa)
clf_fpa_refit.fit(X_train, y_train)
print_metrics(clf_fpa_refit.predict(X_test), y_test)

accuracy_score: 
 0.633027522936
f1_score: 
 0.692307692308
confusion_matrix: 
 [[24 22]
 [18 45]]
classification_report: 
              precision    recall  f1-score   support

          0       0.57      0.52      0.55        46
          1       0.67      0.71      0.69        63

avg / total       0.63      0.63      0.63       109





In [41]:
X_train, X_test, y_train, y_test = train_test_split(X_fpl, y_fpl, test_size=0.33, random_state=20)
# Fit a deep copy so the classifier loaded from disk stays as it was; fitting
# `clf_fpl` itself would change what the earlier evaluation cell scores if it
# is re-run afterwards.
clf_fpl_refit = copy.deepcopy(clf_fpl)
clf_fpl_refit.fit(X_train, y_train)
print_metrics(clf_fpl_refit.predict(X_test), y_test)

accuracy_score: 
 0.526315789474
f1_score: 
 0.427848467322
confusion_matrix: 
 [[17  0  0]
 [15  3  0]
 [ 3  0  0]]
classification_report: 
              precision    recall  f1-score   support

          0       0.49      1.00      0.65        17
          1       1.00      0.17      0.29        18
          2       0.00      0.00      0.00         3

avg / total       0.69      0.53      0.43        38



  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
