In [2]:
import time
import gzip
import simplejson
import pandas as pd
from tqdm import tqdm

In [3]:
def parse(filename):
    """Yield one dict per record from a gzip-compressed ``key: value`` file.

    Each line is split at its first ``:``; the text before it becomes the
    key and the text starting two characters after it (skipping the colon
    and the following space) becomes the value. Any line that contains no
    colon at all -- typically a blank line -- ends the current record.

    Parameters
    ----------
    filename : str
        Path to the gzip-compressed text file.

    Yields
    ------
    dict
        Mapping of field name to its raw string value for one record.
        Empty records (consecutive separators, a trailing blank line, or an
        empty file) are not yielded.
    """
    entry = {}
    # The context manager closes the gzip handle even when the consumer
    # stops iterating early or an exception propagates.
    with gzip.open(filename, 'rb') as f:
        for l in f:
            # Binary mode yields bytes on Python 3; decode so the str
            # operations below work. On Python 2 the line is already `str`
            # and is left untouched.
            # NOTE(review): UTF-8 is assumed for the data file -- undecodable
            # bytes are replaced rather than raising; confirm the encoding.
            if not isinstance(l, str):
                l = l.decode('utf-8', 'replace')
            l = l.strip()
            colonPos = l.find(':')
            if colonPos == -1:
                # Record separator: emit what was collected, if anything.
                if entry:
                    yield entry
                    entry = {}
                continue
            eName = l[:colonPos]
            rest = l[colonPos + 2:]
            entry[eName] = rest
    # The last record is not followed by a separator when the file does not
    # end with a blank line.
    if entry:
        yield entry
    
    
# Cleaned review records, one dict per review (a list, despite the name).
data_dict = []
# Number of parsed records dropped because a required field was missing
# or could not be converted.
skipped_records = 0

for e in tqdm(parse("data/beer/advocate/Beeradvocate.txt.gz")):
    try:
        # All five rating fields are required; a missing key or a
        # non-numeric value discards the whole record.
        for rating_key in ('review/appearance', 'review/taste', 'review/overall',
                           'review/palate', 'review/aroma'):
            e[rating_key] = float(e[rating_key])
        # Store the timestamp as an integer under 'review/timeUnix'.
        e['review/timeUnix'] = int(e.pop('review/time'))
        # ABV is optional: drop the field when absent or unparsable instead
        # of discarding the record.
        try:
            e['beer/ABV'] = float(e['beer/ABV'])
        except (KeyError, ValueError):
            e.pop('beer/ABV', None)
        # Move the reviewer name under the 'user/' prefix.
        e['user/profileName'] = e.pop('review/profileName')
        # Break the timestamp into its UTC calendar components.
        timeStruct = time.gmtime(e['review/timeUnix'])
        e['review/timeStruct'] = dict(zip(["year", "mon", "mday", "hour",
                                           "min", "sec", "wday", "yday", "isdst"], list(timeStruct)))
        data_dict.append(e)
    except (KeyError, ValueError, OverflowError, OSError):
        # KeyError: required field missing (including empty records);
        # ValueError: field is not numeric; OverflowError/OSError: timestamp
        # outside the range time.gmtime() accepts. Skip the record but keep
        # count so the loss is visible instead of silent.
        skipped_records += 1

print("Skipped %d malformed or incomplete records" % skipped_records)

1586615it [01:09, 22803.53it/s]


In [4]:
# Build one row per cleaned review; keys missing from a record (e.g.
# 'beer/ABV' when it could not be parsed) become NaN in that row.
df = pd.DataFrame(data_dict)

In [5]:
# Preview the first five rows of the full frame.
df.head()

Unnamed: 0,beer/ABV,beer/beerId,beer/brewerId,beer/name,beer/style,review/appearance,review/aroma,review/overall,review/palate,review/taste,review/text,review/timeStruct,review/timeUnix,user/profileName
0,5.0,47986,10325,Sausa Weizen,Hefeweizen,2.5,2.0,1.5,1.5,1.5,A lot of foam. But a lot.\tIn the smell some b...,"{u'isdst': 0, u'mday': 16, u'hour': 20, u'min'...",1234817823,stcules
1,6.2,48213,10325,Red Moon,English Strong Ale,3.0,2.5,3.0,3.0,3.0,"Dark red color, light beige foam, average.\tIn...","{u'isdst': 0, u'mday': 1, u'hour': 13, u'min':...",1235915097,stcules
2,6.5,48215,10325,Black Horse Black Beer,Foreign / Export Stout,3.0,2.5,3.0,3.0,3.0,"Almost totally black. Beige foam, quite compac...","{u'isdst': 0, u'mday': 1, u'hour': 14, u'min':...",1235916604,stcules
3,5.0,47969,10325,Sausa Pils,German Pilsener,3.5,3.0,3.0,2.5,3.0,"Golden yellow color. White, compact foam, quit...","{u'isdst': 0, u'mday': 15, u'hour': 19, u'min'...",1234725145,stcules
4,7.7,64883,1075,Cauldron DIPA,American Double / Imperial IPA,4.0,4.5,4.0,4.0,4.5,"According to the website, the style for the Ca...","{u'isdst': 0, u'mday': 30, u'hour': 18, u'min'...",1293735206,johnmichaelsen


In [6]:
# Columns excluded from the refined export: the per-aspect ratings, the
# expanded time struct, and the beer/brewer ids.
columnToDrop = ['review/appearance','review/aroma','review/palate',
                'review/taste','review/timeStruct', 'beer/beerId', 'beer/brewerId']
# errors='ignore' keeps this cell re-runnable: `df` is rebound in place, so a
# second execution would otherwise raise KeyError on the already-removed
# columns. Trade-off: a misspelled name in columnToDrop is silently ignored.
df = df.drop(columnToDrop, axis=1, errors='ignore')

In [7]:
# Show the dtype pandas assigned to each remaining column.
df.dtypes

beer/ABV            float64
beer/name            object
beer/style           object
review/overall      float64
review/text          object
review/timeUnix       int64
user/profileName     object
dtype: object

In [8]:
# Preview the first five rows of the reduced frame.
df.head()

Unnamed: 0,beer/ABV,beer/name,beer/style,review/overall,review/text,review/timeUnix,user/profileName
0,5.0,Sausa Weizen,Hefeweizen,1.5,A lot of foam. But a lot.\tIn the smell some b...,1234817823,stcules
1,6.2,Red Moon,English Strong Ale,3.0,"Dark red color, light beige foam, average.\tIn...",1235915097,stcules
2,6.5,Black Horse Black Beer,Foreign / Export Stout,3.0,"Almost totally black. Beige foam, quite compac...",1235916604,stcules
3,5.0,Sausa Pils,German Pilsener,3.0,"Golden yellow color. White, compact foam, quit...",1234725145,stcules
4,7.7,Cauldron DIPA,American Double / Imperial IPA,4.0,"According to the website, the style for the Ca...",1293735206,johnmichaelsen


In [9]:
# Persist the refined frame as CSV. With pandas' default index=True the row
# index is written as the first, unnamed column.
df.to_csv('data/beer/advocate/RefinedRawData.csv')