This notebook contains all code to reproduce results in the paper "Analyzing e-cigarette sentiment of Twitter", published in the Computational Health Sciences Workshop at the 6th ACM Conference on Bioinformatics, Computational Biology, and Health Informatics, 2015.

###Data

We begin with the data collected by Sherry Emery et al. <slemery@uic.edu>, which consists 4.6M tweets from 2012-10-01 to 2013-09-30. These tweets have already been classified as "organic" or not using an SVM classifier (see Huang, Jidong et al. "A cross-sectional examination of marketing of electronic cigarettes on Twitter." Tobacco control). We restrict our analysis to those classified as organic. We will assume these data live in `/data/chs15/ecig.csv.gz`.

In [274]:
from collections import Counter, defaultdict
import cPickle
import csv
import datetime
import gzip
import itertools
import json
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D
import numpy as np
import re
import requests
from sklearn.cross_validation import cross_val_score, KFold
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from tabulate import tabulate

DATA = '/data/chs15'

In [3]:
def read_csv(filename, fields=['text', 'svm', 'hand_label', 'posted_time', 'real_name', 'username']):
    f = gzip.open(filename, 'rb')
    csvr = csv.DictReader(f, delimiter=',', quotechar='"')
    for row in csvr:
        yield dict([(k, row[k]) for k in fields if k in row])

In [5]:
# Read all "organic" tweets.
raw_tweets = [r for r in read_csv(DATA + '/ecig.csv.gz') if r['svm'] == '-']
print('read %d "organic" tweets' % len(raw_tweets))

read 992633 "organic" tweets


Next, we manually lableed 2,000 tweets into one of three categories:
- **positive (1):** express positive sentiment toward ecigs, or indicate that the speaker uses ecigs.
- **negative (0):** express negative sentiment toward ecigs *or* do not express sentiment, e.g., informative

(We originally separated negative into two classes, but accuracy was too low to support further analysis.)

We assume these data live in `/data/chs15/labeled.csv.gz`.

In [258]:
labeled_tweets = [r for r in read_csv(DATA + '/labeled.csv.gz', fields=['text', 'sent', 'real_name', 'username'])]

print('read %d labeled tweets' % len(labeled_tweets))
# Set labels.

label_map = {'-1': 'negative', '0': 'negative', '1': 'positive'}
for t in labeled_tweets:
    t['sent'] = label_map[t['sent']]
             
label_encoder = LabelEncoder()
label_encoder.fit(['negative', 'positive'])
y = label_encoder.transform([t['sent'] for t in labeled_tweets])             
print('Label distribution=%s' % Counter(t['sent'] for t in labeled_tweets).most_common(3))

read 2000 labeled tweets
Label distribution=[('negative', 1296), ('positive', 704)]


### Classifier Training

Using `labeled_tweets`, we'll train a logistic regression classifier.

In [259]:
# Tweet tokenizer.
def tokenize(text):
    punc_re = '[' + re.escape(string.punctuation) + ']'
    text = text.lower()
    text = re.sub('#(\S+)', r'HASHTAG_\1', text)
    text = re.sub('@(\S+)', r'MENTION_\1', text)
    text = re.sub('http\S+', 'THIS_IS_A_URL', text)
    text = re.sub(r'(.)\1\1\1+', r'\1', text)
    text = re.sub(r'[0-9]', '9', text)
    toks = []
    for tok in text.split():
        tok = re.sub(r'^(' + punc_re + '+)', r'\1 ', tok)
        tok = re.sub(r'(' + punc_re + '+)$', r' \1', tok)
        for subtok in tok.split():
            if re.search('\w', subtok):
                toks.append(subtok)
    return toks

In [260]:
vectorizer = TfidfVectorizer(decode_error='ignore', ngram_range=(1, 2), max_df=1., min_df=2,
                             use_idf=True, tokenizer=tokenize, binary=False, norm='l2')
X = vectorizer.fit_transform(t['text'] for t in labeled_tweets)
print('Vectorized %d tweets. Found %d terms.' % (X_labeled.shape[0], X.shape[1]))
features = np.array(vectorizer.get_feature_names())

Vectorized 2000 tweets. Found 4824 terms.


In [264]:
def confusion(truths, preds, labels):
    m = confusion_matrix(truths, preds)
    m = np.vstack((labels, m))
    m = np.hstack((np.matrix([''] + list(labels)).T, m))
    return tabulate(m.tolist(), headers='firstrow')

def top_coef(clf, vocab, n=10):
    if len(clf.classes_) == 2:
        coefs = [clf.coef_[0], -clf.coef_[0]]
    else:
        coefs = clf.coef_
    for li, label in enumerate(clf.classes_):
        print('\nCLASS %s' % label)
        coef = coefs[li]
        top_coef_ind = np.argsort(coef)[::-1][:n]
        top_coef_terms = vocab[top_coef_ind]
        top_coef = coef[top_coef_ind]
        print '\n'.join(['%s\t%.3f' % (term, weight) for term, weight in zip(top_coef_terms, top_coef)])

def do_cv(X, y, labels, nfolds=10):
    cv = KFold(len(y), nfolds, random_state=123456)
    preds = []
    truths = []
    for train, test in cv:
        clf = LogisticRegression(class_weight='auto')
        clf.fit(X[train], y[train])
        preds.extend(clf.predict(X[test]))
        truths.extend(y[test])
    print ('accuracy=%.3f' % (accuracy_score(truths, preds)))
    print classification_report(truths, preds, target_names=labels)
    print confusion(truths, preds, labels)
    clf = LogisticRegression(class_weight='auto')
    clf.fit(X, y)
    return clf, truths, preds

clf, truths, preds = do_cv(X, y, label_encoder.classes_, 10)
top_coef(clf, features, 5)

accuracy=0.793
             precision    recall  f1-score   support

   negative       0.84      0.83      0.84      1296
   positive       0.70      0.72      0.71       704

avg / total       0.79      0.79      0.79      2000

            negative    positive
--------  ----------  ----------
negative        1082         214
positive         199         505

CLASS 0
my	5.390
i	4.668
vaping	2.160
HASHTAG_vaping	1.722
HASHTAG_ecig	1.663

CLASS 1
THIS_IS_A_URL	4.017
e-cigarettes	1.940
de	1.171
as	1.164
99	1.112


In [271]:
# Write cross-validation table.
def clfreport_to_tex(report, outfile):
    """ Write a sklearn classification report as a latex table. """
    report = re.sub(r' \/ total', '', report)
    report = re.sub(r'precision', 'Prec', report)
    report = re.sub(r'recall', 'Rec', report)
    report = re.sub(r'f1-score', 'F1', report)
    report = re.sub(r'support', 'N', report)
    table = ['\\begin{tabular}{|r|c|c|c|c|}', '\\hline']
    lines = report.split('\n')
    for i, line in enumerate(lines):
        parts = line.strip().split()
        if len(parts) > 0:
            if i == 0:
                parts = [''] + ['{\\bf %s}' % p for p in parts]
            else:
                parts[0] = '{\\bf %s}' % parts[0]
            table.append(' & '.join(parts) + '\\\\')
        else:
            table.append('\\hline')
    table.append('\\end{tabular}')
    of = open(outfile, 'wt')
    of.write('\n'.join(table))
    
clfreport_to_tex(classification_report(truths, preds, target_names=labels), 'cv.tex')


In [272]:
# Save top coef table.
def clean(s):
    s = re.sub('HASHTAG_', '\#', s)
    s = re.sub('MENTION_', '@', s)
    s = re.sub('THIS_IS_A_URL', 'URL', s)
    s = re.sub(r'_', '\\_', s)
    return s
    
def write_top_coef(clf, vocab, labels, outf, n=20):
    out = open(outf, 'wt')
    coefs = [clf.coef_[0], -clf.coef_[0]]
    for li, label in enumerate(labels):
        coef = coefs[li]
        top_coef_ind = np.argsort(coef)[::-1][:n]
        top_coef_terms = vocab[top_coef_ind]
        out.write('{\\bf %s} & %s\\\\\n\hline\n' % (label, ', '.join(clean(s) for s in top_coef_terms)))
        
write_top_coef(clf, features, labels, 'coef.tex', n=20)


### Applying classifier

Next, we classify all the unlabeled tweets using the classifier.

In [273]:
X_raw = vectorizer.transform(t['text'] for t in raw_tweets)

In [277]:
preds_raw = clf.predict(X_raw)
print('predicted label distribution on raw tweets: %s' % Counter(preds_raw).most_common(2))
for tweet, pred in zip(raw_tweets, preds_raw):
    t['sent'] = labels[pred]

predicted label distribution on raw tweets: [(0, 623396), (1, 369237)]


### Gender

Next, we classify each user by gender using a list of names from the census.

In [282]:
def get_gender_names(cutoff=75):
    males_url = 'http://www2.census.gov/topics/genealogy/1990surnames/dist.male.first'
    females_url = 'http://www2.census.gov/topics/genealogy/1990surnames/dist.female.first'
    males = set([l.split()[0].lower() for l in requests.get(males_url).text.split('\n') if l and float(l.split()[2]) < cutoff])
    females = set([l.split()[0].lower() for l in requests.get(females_url).text.split('\n') if l and float(l.split()[2]) < cutoff ])
    print('found %d male and %d female names with cutoff=%.2f' % (len(males), len(females), cutoff))
    return remove_ambiguous_names(males, females)

def remove_ambiguous_names(male_names, female_names):
    ambiguous = male_names & female_names
    male_names -= ambiguous
    female_names -= ambiguous
    print('removed %d ambiguous names, leaving %d males and %d females' % (len(ambiguous), len(male_names), len(female_names)))
    return male_names, female_names
    
male_names, female_names = get_gender_names()

found 232 male and 523 female names with cutoff=75.00
removed 6 ambiguous names, leaving 226 males and 517 females


In [285]:
def label_genders(tweets, male_names, female_names):
    for t in tweets:
        if len(t['real_name'])>1:
            first = t['real_name'].split()[0].lower()
        else:
            first = t['real_name'].lower()
        if first in male_names:
            t['gender'] = 'male'
        elif first in female_names:
            t['gender'] = 'female'
        else:
            t['gender'] = 'unknown'
label_genders(raw_tweets, male_names, female_names)
print('overall gender distribution=%s' % Counter(t['gender'] for t in raw_tweets).most_common(3))

overall gender distribution=[('unknown', 732856), ('male', 156902), ('female', 102875)]
