This notebook contains all code to reproduce results in the paper "Analyzing e-cigarette sentiment of Twitter", published in the Computational Health Sciences Workshop at the 6th ACM Conference on Bioinformatics, Computational Biology, and Health Informatics, 2015.

###Data

We begin with the data collected by Sherry Emery et al. <slemery@uic.edu>, which consists 4.6M tweets from 2012-10-01 to 2013-09-30. These tweets have already been classified as "organic" or not using an SVM classifier (see Huang, Jidong et al. "A cross-sectional examination of marketing of electronic cigarettes on Twitter." Tobacco control). We restrict our analysis to those classified as organic. We will assume these data live in `/data/chs15/ecig.csv.gz`.

In [100]:
from collections import Counter, defaultdict
import cPickle
import csv
import datetime
import gzip
import itertools
import json
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D
import numpy as np
import re
from sklearn.cross_validation import cross_val_score, KFold
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from tabulate import tabulate

DATA = '/data/chs15'

In [3]:
def read_csv(filename, fields=['text', 'svm', 'hand_label', 'posted_time', 'real_name', 'username']):
    f = gzip.open(filename, 'rb')
    csvr = csv.DictReader(f, delimiter=',', quotechar='"')
    for row in csvr:
        yield dict([(k, row[k]) for k in fields if k in row])

In [5]:
# Read all "organic" tweets.
raw_tweets = [r for r in read_csv(DATA + '/ecig.csv.gz') if r['svm'] == '-']
print('read %d "organic" tweets' % len(raw_tweets))

read 992633 "organic" tweets


Next, we manually lableed 2,000 tweets into one of three categories:
- **positive (1):** express positive sentiment toward ecigs, or indicate that the speaker uses ecigs.
- **negative (-1):** express negative sentiment toward ecigs
- **neutral (0):** do not express sentiment, e.g., informative

We assume these data live in `/data/chs15/labeled.csv.gz`.

In [111]:
labeled_tweets = [r for r in read_csv(DATA + '/labeled.csv.gz', fields=['text', 'sent', 'real_name', 'username'])]

print('read %d labeled tweets' % len(labeled_tweets))
# Set labels.

label_map = {'-1': 'negative', '0': 'neutral', '1': 'positive'}
for t in labeled_tweets:
    t['sent'] = label_map[t['sent']]
             
label_encoder = LabelEncoder()
label_encoder.fit(['negative', 'neutral', 'positive'])
y = label_encoder.transform([t['sent'] for t in labeled_tweets])             
print('Label distribution=%s' % Counter(t['sent'] for t in labeled_tweets).most_common(3))

read 2000 labeled tweets
Label distribution=[('neutral', 1014), ('positive', 704), ('negative', 282)]


### Classification

Using `labeled_tweets`, we'll train a logistic regression classifier.

In [185]:

# Tweet tokenizer.
def tokenize(text):
    punc_re = '[' + re.escape(string.punctuation) + ']'
    text = text.lower()
    text = re.sub('#(\S+)', r'HASHTAG_\1', text)
    text = re.sub('@(\S+)', r'MENTION_\1', text)
    text = re.sub('http\S+', 'THIS_IS_A_URL', text)
    text = re.sub(r'(.)\1\1\1+', r'\1', text)
    text = re.sub(r'[0-9]', '9', text)
    toks = []
    for tok in text.split():
        tok = re.sub(r'^(' + punc_re + '+)', r'\1 ', tok)
        tok = re.sub(r'(' + punc_re + '+)$', r' \1', tok)
        for subtok in tok.split():
            if re.search('\w', subtok):
                toks.append(subtok)
    return toks

In [186]:
vectorizer = TfidfVectorizer(decode_error='ignore', ngram_range=(1, 2), max_df=1., min_df=2,
                             use_idf=True, tokenizer=tokenize, binary=False, norm='l2')
X = vectorizer.fit_transform(t['text'] for t in labeled_tweets)
print('Vectorized %d tweets. Found %d terms.' % (X_labeled.shape[0], X.shape[1]))
features = np.array(vectorizer.get_feature_names())

Vectorized 2000 tweets. Found 4824 terms.


In [204]:
def confusion(truths, preds, labels):
    m = confusion_matrix(truths, preds)
    m = np.vstack((labels, m))
    m = np.hstack((np.matrix([''] + list(labels)).T, m))
    return tabulate(m.tolist(), headers='firstrow')

def top_coef(clf, vocab, n=10):
    if len(clf.classes_) == 2:
        coefs = [clf.coef_[0], -clf.coef_[0]]
    else:
        coefs = clf.coef_
    for li, label in enumerate(clf.classes_):
        print('\nCLASS %s' % label)
        coef = coefs[li]
        top_coef_ind = np.argsort(coef)[::-1][:n]
        top_coef_terms = vocab[top_coef_ind]
        top_coef = coef[top_coef_ind]
        print '\n'.join(['%s\t%.3f' % (term, weight) for term, weight in zip(top_coef_terms, top_coef)])

def do_cv(X, y, labels, nfolds=10):
    cv = KFold(len(y), nfolds, random_state=123456)
    preds = []
    truths = []
    for train, test in cv:
        clf = LogisticRegression(class_weight='auto')
        clf.fit(X[train], y[train])
        preds.extend(clf.predict(X[test]))
        truths.extend(y[test])
    print ('accuracy=%.3f' % (accuracy_score(truths, preds)))
    print classification_report(truths, preds, target_names=labels)
    print confusion(truths, preds, labels)
    clf = LogisticRegression(class_weight='auto')
    clf.fit(X, y)
    return clf

clf = do_cv(X, y, label_encoder.classes_, 10)
top_coef(clf, features, 20)

accuracy=0.744
             precision    recall  f1-score   support

   negative       0.60      0.54      0.57       282
    neutral       0.77      0.84      0.81      1014
   positive       0.75      0.68      0.71       704

avg / total       0.74      0.74      0.74      2000

            negative    neutral    positive
--------  ----------  ---------  ----------
negative         152         68          62
neutral           59        856          99
positive          43        182         479

CLASS 0
you	2.542
smoking	2.334
smoking an	2.254
he	2.107
fuck	2.010
people	1.902
smokes	1.833
an	1.816
faggot	1.791
are	1.783
class	1.753
smoke	1.736
stupid	1.724
in	1.527
look	1.504
shit	1.473
her	1.402
pussy	1.383
sorry	1.342
one	1.320

CLASS 1
THIS_IS_A_URL	5.085
e-cigarettes	1.661
de	1.345
la	1.170
99	1.167
retail	1.052
THIS_IS_A_URL retail	1.052
ni	0.988
e-cigarette	0.942
markten	0.932
store	0.931
by	0.894
cigarette	0.883
of	0.876
dallas	0.862
smokers	0.801
9999	0.770
MENTION_vaper_tra

In [203]:
# Also try collapsing classes -1 and 0.
y_collapsed = np.array([1 if yi == 1 else 0 for yi in y])
labels_collapsed = ['negative', 'positive']
clf_collapsed = do_cv(X, y_collapsed, labels_collapsed, 10)
top_coef(clf_collapsed, features, 20)

accuracy=0.810
             precision    recall  f1-score   support

   negative       0.80      0.82      0.81       986
   positive       0.82      0.80      0.81      1014

avg / total       0.81      0.81      0.81      2000

            negative    positive
--------  ----------  ----------
negative         812         174
positive         206         808

CLASS 0
THIS_IS_A_URL	5.721
e-cigarettes	1.803
de	1.487
99	1.337
la	1.334
ni	1.208
THIS_IS_A_URL retail	1.194
retail	1.194
of	1.150
by	1.104
cigarette	1.097
store	1.089
markten	1.061
e-cigarette	1.026
smokers	0.999
dallas	0.985
9999	0.918
MENTION_vaper_trail	0.912
vuse	0.894
may	0.887

CLASS 1
my	4.881
i	4.049
an	2.154
smoking	1.904
vaping	1.702
lol	1.505
got	1.427
fuck	1.381
me	1.321
HASHTAG_vaping	1.309
class	1.297
this	1.261
e-cig	1.218
shit	1.133
HASHTAG_ecig	1.131
cool	1.104
HASHTAG_vape	1.101
i'm	1.079
my ecig	1.017
smoking an	1.006


Given the superior accuracy of the collapsed classifier, we'll proceed by collapsing 'negative' and 'neutral' into one class.

In [205]:
clf = clf_collapsed
labels = labels_collapsed