# Read the data

In [None]:
import unicodecsv as csv
from monkeylearn import MonkeyLearn
# Shared MonkeyLearn API client used by every classification call below.
# Replace the placeholder string with your own API key before running.
ml = MonkeyLearn('<your API key here>')

In [None]:
# Load the CSV and keep only the last 100 rows.
# The file is opened in binary mode because unicodecsv does the decoding
# itself, and the context manager closes the handle as soon as the rows
# have been read (the bare open() call used to leak it).
with open("items_TechCrunch.csv", "rb") as csvfile:
    articles = list(csv.reader(csvfile))[-100:]

# Classify:

## Without using a pipeline

We check in our own code which articles are about startups, and then send only those to the event and industry classifiers.

In [None]:
# Each article is sent as one newline-separated string built from
# columns 1, 0, 3 and 4 of its row (in that order).
text_list = ["\n".join(row[i] for i in (1, 0, 3, 4)) for row in articles]

# Step 1: run the startup classifier over every article.
module_id = 'cl_Xq6cFpsX'
res = ml.classifiers.classify(module_id, text_list)

# Append the first returned tag's label and probability to each row.
for article, tag in zip(articles, res.result):
    top_tag = tag[0]
    article.extend([top_tag["label"], top_tag["probability"]])

# Step 2: rows tagged "not_startup" are filtered out of further
# classification; give them three empty cells in place of the event label,
# event probability and industries columns.
for article in articles:
    if article[-2] == "not_startup":
        article.extend(["", "", ""])

# Step 3: only rows tagged "startup" go to the event and industry classifiers.
articles_startup = [row for row in articles if row[-2] == "startup"]
text_list = [
    "\n".join(row[i] for i in (1, 0, 3, 4)) for row in articles_startup
]

res_events = ml.classifiers.classify('cl_iju4tFr6', text_list)
res_industries = ml.classifiers.classify('cl_nt3NPYem', text_list)

startup_results = zip(articles_startup, res_events.result, res_industries.result)
for article, event, industries in startup_results:
    top_event = event[0]
    # Collapse the list of industry labels into one semicolon-separated cell.
    industry_cell = ";".join(tag[0]["label"] for tag in industries)
    article.extend([top_event["label"], top_event["probability"], industry_cell])

## Or, Using a pipeline

We send everything only once, and the pipeline knows what to do. Because you're not sending your data multiple times, it's twice as fast!

In [None]:
# Build the pipeline payload: one {"text": ...} entry per article, where the
# text is columns 1, 0, 3 and 4 of the row joined with newlines.
data = {
    "texts": [
        {"text": "\n".join([sample[1], sample[0], sample[3], sample[4]])}
        for sample in articles
    ]
}

# Use the pipeline to classify. We only talk with MonkeyLearn once.
res = ml.pipelines.run('pi_5HDTFuxu', data)

for article, tags in zip(articles, res.result["results"]):
    # First add the startup tag (label + probability) to every article.
    is_startup = tags["is_startup"][0]
    article.extend([is_startup["label"], is_startup["probability"]])

    if is_startup["label"] == "startup":
        event = tags["event"][0]
        # Join the list of industry labels into one semicolon-separated cell.
        industries = ";".join([tag[0]["label"] for tag in tags["industry"]])
        article.extend([event["label"], event["probability"], industries])
    else:
        # Pad with three blanks (event label, event probability, industries)
        # so every row has the same number of columns as a startup row.
        # Padding with only two blanks produced ragged CSV rows.
        article.extend(["", "", ""])

# Save those results back to a csv

In [None]:
# Write every (now extended) article row to a new CSV file.
# Binary mode is used because the unicodecsv writer emits encoded bytes.
with open("classified_example.csv", "wb") as out_file:
    csv.writer(out_file, dialect='excel').writerows(articles)