The restaurants dataset comes from SemEval 2014 Task 4 (aspect-based sentiment analysis). It is stored in XML format, so the first task is to load it and convert it to a format that is easier to work with in subsequent tasks.

In [2]:
import xml.etree.ElementTree as et

In [3]:
# Parse the XML file from the working directory and keep its root element;
# the root's children are iterated further down, one per sentence.
xtree = et.parse("Restaurants_Train.xml")
xroot = xtree.getroot()

In [82]:
def get_dict(sentence) -> dict:
    """Convert one XML ``<sentence>`` element into a plain dictionary.

    Child elements are located by tag name rather than by position, so the
    function works whether or not the sentence carries an ``<aspectTerms>``
    and/or an ``<aspectCategories>`` container, and regardless of their order.

    Args:
        sentence: An ElementTree element with an ``id`` attribute, a
            ``<text>`` child, and optional ``<aspectTerms>`` /
            ``<aspectCategories>`` children.

    Returns:
        A dict with the keys ``"id"``, ``"text"``, ``"aspect_terms"`` (list of
        dicts with ``term``, ``polarity``, ``from``, ``to``) and
        ``"aspect_categories"`` (list of dicts with ``category``,
        ``polarity``). Missing containers yield empty lists; a missing
        ``<text>`` child yields ``None`` for ``"text"``.

    Raises:
        KeyError: If the sentence has no ``id`` attribute, or an aspect
            element lacks one of the attributes read below.
        ValueError: If a ``from``/``to`` attribute is not an integer string.
    """
    sid = sentence.attrib["id"]

    # Compare against None explicitly: an Element without children is falsy.
    text_node = sentence.find("text")
    text = text_node.text if text_node is not None else None

    # iterfind() yields nothing when the container element is absent, so no
    # special-casing on the number of children is required.
    aspect_terms = [
        {
            "term": aspect_term.attrib["term"],
            "polarity": aspect_term.attrib["polarity"],
            "from": int(aspect_term.attrib["from"]),
            "to": int(aspect_term.attrib["to"]),
        }
        for aspect_term in sentence.iterfind("aspectTerms/aspectTerm")
    ]

    aspect_categories = [
        {
            "category": aspect_category.attrib["category"],
            "polarity": aspect_category.attrib["polarity"],
        }
        for aspect_category in sentence.iterfind("aspectCategories/aspectCategory")
    ]

    return {
        "id": sid,
        "text": text,
        "aspect_terms": aspect_terms,
        "aspect_categories": aspect_categories,
    }

# Convert every child element of the XML root into its dictionary form.
restaurant_reviews = list(map(get_dict, xroot))

Let's take a look at our first dictionary object to make sure it is as expected.

In [96]:
# Display the first converted record.
restaurant_reviews[0]

{'id': '3121',
 'text': 'But the staff was so horrible to us.',
 'aspect_terms': [{'term': 'staff',
   'polarity': 'negative',
   'from': 8,
   'to': 13}],
 'aspect_categories': [{'category': 'service', 'polarity': 'negative'}]}

Great!  Now let's convert this to a pandas DataFrame and save it as a pickle file.

In [99]:
import pandas as pd
import pickle

# Build a DataFrame from the list of per-sentence dictionaries.
data = pd.DataFrame(restaurant_reviews)

# Serialize the DataFrame to 'restaurant.pickle'; the file is opened in
# binary write mode, as pickle output is bytes.
with open('restaurant.pickle', 'wb') as handle:
    pickle.dump(data, handle)