# Developing Iterators for Large Data

In [1]:
import os
import json

from itpy import Itpy
from itpy.helpers import getitem

from twokenize import twokenize

In [2]:
def iter_dir_contents(fpath):
    """Lazily yield one parsed JSON object per line of every file in a directory.

    Parameters
    ----------
    fpath : str
        Directory holding line-delimited JSON files. A trailing path
        separator is optional.

    Yields
    ------
    object
        The result of ``json.loads`` for each non-blank line.

    Raises
    ------
    OSError
        If the directory or one of its files cannot be read.
    json.JSONDecodeError
        If a non-blank line is not valid JSON.
    """
    # Sort the names: os.listdir returns them in arbitrary order, which would
    # make downstream slices differ from run to run.
    for fn in sorted(os.listdir(fpath)):
        # os.path.join is correct whether or not `fpath` ends with a separator.
        full_path = os.path.join(fpath, fn)
        # Skip sub-directories (and anything else that is not a regular file).
        if not os.path.isfile(full_path):
            continue
        # JSON text is UTF-8; do not rely on the platform default encoding.
        with open(full_path, "r", encoding="utf-8") as f:
            for line in f:
                # A blank line (e.g. a trailing newline) carries no record.
                if line.strip():
                    yield json.loads(line)
                
def data_control():
    """Return a fresh record iterator over the ``drinking/split/`` directory."""
    return iter_dir_contents(
        "/Users/JasonLiu/Downloads/research-data/drinking/split/"
    )

                
def data_sampled():
    """Return a fresh record iterator over the ``nondrinking/split/`` directory."""
    return iter_dir_contents(
        "/Users/JasonLiu/Downloads/research-data/nondrinking/split/"
    )

In [3]:
# Peek at records 5-9 of the sampled texts that contain an "@".
Itpy(data_sampled()) \
    .map(getitem("text")) \
    .filter(lambda text: "@" in text) \
    .slice(5, 10)._

['@lewischube no never',
 'I just wanna see @FightOwensFight power bomb Cena on the grates. #WWEChamber',
 "look @ my man's collar bones and neck in my header hes so gorgeous",
 "@zackmayes1 @DJohnsonPGA I'll go to the range with you now that I got nice clubs whenever you want man",
 '@_jennagraziano_ thanks for catching this! this ones original']

# Training Phrases

```
Itpy(data_sampled()).union(data_control())\
    .map(getitem("text")) \
    .map(tokenizer.tokenize)
```

In [2]:
from gensim import models, corpora
from twokenize import twokenize as tokenizer

In [3]:
# Load previously saved artifacts from the working directory.
# NOTE(review): later cells in this section rebuild p2, p3 and d1 and save them
# to these same paths, so run either this cell or the training cells.
p2 = models.Phrases.load("./twitter_p2.Phrase")
p3 = models.Phrases.load("./twitter_p3.Phrase")
d1 = corpora.Dictionary.load("./twitter_d1.Dict")
lda100 = models.LdaModel.load("./twitter_topics100.Lda")

%%time

p2 = models.Phrases(
    Itpy(data_sampled()).union(data_control())\
    .map(getitem("text")) \
    .map(str.lower)
    .map(tokenizer.tokenize)
)

p2.save("./twitter_p2.Phrase")

%%time

p3 = models.Phrases(p2[
    Itpy(data_sampled()).union(data_control())\
    .map(getitem("text")) \
    .map(str.lower)
    .map(tokenizer.tokenize)]
)

p3.save("./twitter_p3.Phrase")

%%time

d1 = corpora.Dictionary(p3[p2[
    Itpy(data_sampled()).union(data_control())\
    .map(getitem("text")) \
    .map(str.lower)
    .map(tokenizer.tokenize)
]])

d1.save("./twitter_d1.Dict")

# Training LDA

```
c = Itpy(data_sampled()).union(data_control())\
    .map(getitem("text")) \
    .map(str.lower) \
    .map(tokenizer.tokenize) \
    .map(p2.__getitem__) \
    .map(p3.__getitem__) \
    .map(d1.doc2bow)._
```

In [8]:
# Load the saved LDA model; this is the same file an earlier cell loads into lda100.
lda100 = models.LdaModel.load("./twitter_topics100.Lda")

In [None]:
# Run both datasets through the full pipeline
# (text -> lower-case -> tokens -> p2 -> p3 -> bag-of-words -> lda100)
# and take a slice(100) of the result.
(
    Itpy(data_sampled())
    .union(data_control())
    .map(getitem("text"))
    .map(str.lower)
    .map(tokenizer.tokenize)
    .map(p2.__getitem__)
    .map(p3.__getitem__)
    .map(d1.doc2bow)
    .map(lda100.__getitem__)
    .slice(100)._
)

# Exploring Models

In [4]:
from data.dao import DataAccess

In [5]:
# Load the dataset via the project's DataAccess helper.
# Later cells index it by a "labels" column and read a `text` attribute.
X = DataAccess.get_as_dataframe()

In [6]:
# Keep only the rows whose "labels" entry has alcohol == 1.
X_alch = X[
    X["labels"].apply(lambda labels: labels["alcohol"]) == 1
]

In [7]:
# Apply lda100 to each alcohol-labelled text:
# lower-case -> tokenize -> p2 -> p3 -> bag-of-words -> lda100.
topics = (
    X_alch.text.str.lower()
    .apply(twokenize.tokenize)
    .map(p2.__getitem__)
    .map(p3.__getitem__)
    .map(d1.doc2bow)
    .map(lda100.__getitem__)
)

In [None]:
# Turn the per-row results into a plain list (this rebinds the name `topics`).
topics = list(topics)

In [181]:
from itpy.helpers import try_or

In [193]:
# For each document take the first element of the pair whose second element is
# largest. The second callable returns -1 -- presumably try_or uses it when the
# first one raises (e.g. max() on an empty list); confirm against
# itpy.helpers.try_or.
best_topic = Itpy(topics).map(
    try_or(
        lambda n: max(n, key=lambda pair: pair[1])[0],
        lambda n: -1,
    )
)._

In [202]:
# Tally best_topic values; later cells unpack each entry as an (id, count) pair.
top_best_topic = Itpy(best_topic).frequency().most_common()

## Finding top topics

In [16]:
import numpy
import scipy.sparse as sparse

def convert2sparse(tokens, model=None, num_topics=None):
    """Build a sparse document-by-topic weight matrix from a model's output.

    Parameters
    ----------
    tokens : iterable
        Corpus passed to ``model[tokens]``. Each resulting document must be an
        iterable of ``(topic_id, weight)`` pairs.
    model : optional
        Object indexed as ``model[tokens]``. Defaults to the notebook-level
        ``lda100``, which is what the original implementation always used.
    num_topics : int, optional
        Number of columns in the result. Defaults to ``model.num_topics`` when
        the model has that attribute.

    Returns
    -------
    scipy.sparse.csr_matrix
        One row per document and one column per topic. If no topic count is
        available, the shape is inferred from the largest indices seen, as in
        the original implementation.
    """
    if model is None:
        model = lda100
    if num_topics is None:
        num_topics = getattr(model, "num_topics", None)

    row, col, data = [], [], []
    n_docs = 0
    for doc_id, document in enumerate(model[tokens]):
        # Count every document, including ones with no topics, so trailing
        # empty documents still get a row.
        n_docs = doc_id + 1
        for topic_id, weight in document:
            row.append(doc_id)
            col.append(topic_id)
            data.append(weight)

    if num_topics is None:
        return sparse.csr_matrix((data, (row, col)))
    # An explicit shape keeps columns for topics that never appear; inferring
    # the shape from the data would silently drop them.
    return sparse.csr_matrix((data, (row, col)), shape=(n_docs, num_topics))

In [125]:
# Convert once and derive both views from the same matrix. The original called
# convert2sparse twice, repeating the full model pass and building topic_values
# from a separate conversion than document_topic.
# NOTE(review): `topics` already holds lda100 output, and convert2sparse applies
# lda100 to its argument again -- confirm whether the bag-of-words corpus was
# meant to be passed here instead.
document_topic_sparse = convert2sparse(topics)
topic_values = document_topic_sparse.sum(0)
document_topic = document_topic_sparse.toarray()

In [131]:
# Pair each column index with that column's mean over all rows.
topic2value = list(
    enumerate(document_topic.mean(axis=0).tolist())
)

In [132]:
# Order in place by mean value, largest first.
topic2value.sort(key=lambda pair: -pair[1])

In [197]:
# Indices of the first 30 entries of topic2value.
top_topics = [topic_id for topic_id, _mean in topic2value[:30]]

### Print Top Topics for Alcohol Related

In [203]:
# Print the first ten (topic id, count) entries with each topic's top-30 terms.
# NOTE(review): `get_topic_words` is not defined or imported in any visible
# cell, so this fails on a fresh kernel -- restore its definition or import.
for _id, c in top_best_topic[:10]:
    print("topic:",_id,"\t" "count:", c, "\n", get_topic_words(lda100.show_topic(_id, 30)))
    print()

topic: 81 	count: 177 
 ['beer', '.', 'he', 'a', 'think', 'i', 'and', ',_az', 'is', 'the', 'with', 'in', 'talented', "didn't", 'for', 'on', 'my', 'book', 'mind', 'to', 'but', 'cocktails', 'at', 'between', 'lunch', 'later', ',', 'delicious', 'lets', 'fly']

topic: 31 	count: 165 
 ['it', 'was', 'i', '.', 'they', 'and', 'the', 'a', 'to', 'but', 'after', 'nigga', 'omg', ',', 'menu_:', 'you_guys', 'dragging', 'no_more', 'talking', "don't_think", 'pour', 'games', 'told_me', 'yep', 'clean', 'led', 'breakfast', "don't_want", 'happened', 'sounds_like']

topic: 76 	count: 143 
 ['have', 'a', 'i', '.', 'her', 'to', 'one', 'and', 'female', 'the', 'home', 'in', 'with', 'for', ';', 'arizona', 'hate', 'but', 'my', 'drive', 'morning', ',', 'at', 'tf', 'bus', 'burger', 'cherry', 'sharing', 'been_drinking', 'yall_females']

topic: 74 	count: 92 
 ['not', '.', "it_'s", 'a', ',', 'but', 'the', 'its', 'and', 'to', 'for', 'in', 'on', 'high', 'would_be', 'beach', 'bring', 'i', 'days', 'with', 'going_to_be',

In [1]:
# X_alch was produced by boolean indexing of X, so assigning a column to it in
# place triggers pandas' SettingWithCopyWarning. assign() returns a new frame
# with the column added, and re-running the cell is safe.
X_alch = X_alch.assign(topics=best_topic)

In [207]:
# For each topic id in top_best_topic, print the id, its top-30 terms, and up
# to 30 texts from X_alch whose `topics` value equals that id.
# NOTE(review): `get_topic_words` is not defined or imported in any visible
# cell. The loop also rebinds `_` at kernel level.
for idx, _ in top_best_topic:
    print(idx)
    print(get_topic_words(lda100.show_topic(idx, 30)))
    print(*list(X_alch[X_alch.topics == idx].text)[:30], sep="\n\n")
    print()
    print()

81
['beer', '.', 'he', 'a', 'think', 'i', 'and', ',_az', 'is', 'the', 'with', 'in', 'talented', "didn't", 'for', 'on', 'my', 'book', 'mind', 'to', 'but', 'cocktails', 'at', 'between', 'lunch', 'later', ',', 'delicious', 'lets', 'fly']
Stone Cold use to be the baddest MF in my book lol dude use to get hype; whip your ass and waste a case of beer in the ring 😂

Light, but doesn't lack flavor. A good summer beer to drink.... (Bitter Brewer) http://t.co/snpHeJLIM2 #photo

I need a night this summer to sit by a fire and make s'mores and smoke weed and drink beer ok

Had a beer with an ex I havent seen in 2 years so yeah, I am going eat this leftover pizza &amp; fall asleep in my clothes. I earned it.

he straight up drank from my bottle oh my god

We he a little horsey named Paul Revere just me and my homies and a quart of beer.

I just want to drink some beer on a fuggin boat

When you're drunk you're either annoying or funny there is no in between

mike just asked if he should drink a bee

# Testing Pipeline

In [1]:
from scipy.stats import uniform
from sklearn import metrics
from sklearn.linear_model import LogisticRegression

from classification.compute import CustomGridSearch
from data import iterate_heirarchy
from gridsearch import text_grid
from pipelines.alcohol import AlcoholPipeline

In [52]:


# Search space for the classifier step ("clf__" prefix), extended with the
# project's text_grid entries.
# scipy.stats.uniform(loc, scale) samples from [loc, loc + scale].
param_grid = dict(
    clf__C=uniform(0.01, 1000),
    clf__class_weight=["auto", None],
    clf__penalty=['l2', "l1"],
    clf__tol=uniform(0.00001, 0.001),
    clf__verbose=[0],
)
param_grid.update(text_grid)

# Keyword arguments for the search object.
cv_kwargs = {
    "n_iter": 200,
    "scoring": None,
    "fit_params": None,
    "n_jobs": 4,
    "iid": True,
    "refit": True,
    "cv": None,
    "verbose": 3,
    "pre_dispatch": '2*n_jobs',
    "error_score": 0,
}

In [21]:
from data.dao import DataAccess, LabelGetter

In [22]:
# Load the dataset and wrap it in the project's LabelGetter helper.
XX = DataAccess.get_as_dataframe()
LL = LabelGetter(XX)

In [195]:
# Features and labels for the alcohol task, as returned by LabelGetter.get_alcohol().
X, y = LL.get_alcohol()

In [196]:
from sklearn.cross_validation import train_test_split

In [197]:
# Fix the seed so the split -- and every score computed from it -- is
# reproducible. The scores recorded below came from an unseeded split and will
# differ once this cell is re-run.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [202]:
%%time
pipeline = AlcoholPipeline(global_features=["text"]).pipeline(LogisticRegression(C=30))
pipeline.fit(X_train, y_train)

CPU times: user 127 ms, sys: 2.37 ms, total: 129 ms
Wall time: 128 ms


In [203]:
# Predictions of the fitted pipeline on the held-out split.
yy = pipeline.predict(X_test)

In [204]:
# scikit-learn metrics take (y_true, y_pred). Accuracy is symmetric, so the
# value is unchanged; the order is corrected to match the documented signature.
metrics.accuracy_score(y_test, yy)

0.77572964669738864

In [205]:
# scikit-learn metrics take (y_true, y_pred). Passing them reversed swaps
# precision and recall inside the F1 computation; the binary F1 value happens
# to be unaffected, but support-weighted averages are not.
metrics.f1_score(y_test, yy)

0.80108991825613085