In [1]:
from json import loads
import gzip
import os
from random import shuffle
import shutil
import urllib


def download_reviews():
    """
    Downloads Amazon movie and tv reviews if not downloaded.
    See: http://jmcauley.ucsd.edu/data/amazon/

    The gzipped archive is fetched into ``data/``, decompressed to
    ``data/Amazon-Movie-TV-Reviews/movie-tv-reviews.json`` and the archive
    is removed afterwards. Nothing is done when the decompressed JSON file
    already exists.

    Raises:
        urllib.error.URLError / OSError: if the download or any file
            operation fails.
    """
    # A bare `import urllib` does not load the `request` submodule, so
    # import it explicitly here to make the function work on a fresh kernel.
    import urllib.request

    reviews_dir = 'data/Amazon-Movie-TV-Reviews'
    reviews_json = 'data/Amazon-Movie-TV-Reviews/movie-tv-reviews.json'
    archive = 'data/reviews_Movies_and_TV_5.json.gz'

    # Check the final file rather than the directory: the directory is
    # created before decompression, so it can exist after a failed run.
    if os.path.isfile(reviews_json):
        return

    # Creates the parent `data/` directory as well when it is missing.
    os.makedirs(reviews_dir, exist_ok=True)
    try:
        urllib.request.urlretrieve(
            "http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Movies_and_TV_5.json.gz",
            archive
        )
        # Decompress into a temporary name and rename at the end so that an
        # interrupted run never leaves a truncated file under the final name.
        partial = reviews_json + '.part'
        with gzip.open(archive, 'rb') as file_in, \
                open(partial, 'wb') as file_out:
            shutil.copyfileobj(file_in, file_out)
        os.replace(partial, reviews_json)
    finally:
        # Do not leave the (possibly incomplete) archive behind.
        if os.path.exists(archive):
            os.remove(archive)


# Fetch the dataset; does nothing when it has already been downloaded.
download_reviews()

In [2]:
from re import findall


def clean_review(review_text):
    """
    Normalize a review for fastText.

    The text is lower-cased, split into runs of word characters (anything
    else acts as a separator and is dropped) and re-joined with single
    spaces.
    """
    lowered = review_text.lower()
    words = findall(r'\w+', lowered)
    return ' '.join(words)


def transform_rating(rating_string):
    """
    Collapse a star rating into three classes.

    1 & 2 stars -> 1
    3 stars     -> 2
    4 & 5 stars -> 3

    Args:
        rating_string: the rating as an int, a float or a numeric string.
            Decimal strings such as "5.0" are accepted. Fractional values
            are truncated toward zero before bucketing.

    Returns:
        1, 2 or 3 as described above.

    Raises:
        ValueError: if the value cannot be parsed as a number.
    """
    # Go through float first: int("5.0") raises ValueError, while
    # int(float("5.0")) gives 5. Numeric inputs behave as before.
    rating_int = int(float(rating_string))
    if rating_int < 3:
        return 1
    if rating_int > 3:
        return 3
    return 2


def _write_fasttext_file(path, json_lines):
    """Write one ``__label__<class> <cleaned text>`` line per JSON review line."""
    # Written as UTF-8 explicitly: the cleaned text can contain non-ASCII
    # word characters and the platform default encoding may not handle them.
    with open(path, 'w', encoding='utf-8') as f:
        for line in json_lines:
            review = loads(line)
            f.write('__label__{} {}\n'.format(
                transform_rating(review['overall']),
                clean_review(review['reviewText'])))


def save2fastText(train_test_split=0.9):
    """
    Convert the downloaded reviews into fastText training and test files.

    Reads ``data/Amazon-Movie-TV-Reviews/movie-tv-reviews.json`` (one JSON
    object per line), shuffles the lines and writes the first
    ``train_test_split`` fraction to ``data.train.txt`` and the remainder to
    ``data.test.txt`` in the same directory. Each output line has the form
    ``__label__<class> <cleaned review text>``.

    Returns immediately when both output files already exist.

    Args:
        train_test_split: fraction of the reviews that goes into the
            training file (default 0.9).
    """
    train_path = 'data/Amazon-Movie-TV-Reviews/data.train.txt'
    test_path = 'data/Amazon-Movie-TV-Reviews/data.test.txt'
    if os.path.isfile(train_path) and os.path.isfile(test_path):
        return

    with open('data/Amazon-Movie-TV-Reviews/movie-tv-reviews.json', 'r',
              encoding='utf-8') as f:
        data = f.readlines()
    shuffle(data)

    # Compute the boundary once so both slices are guaranteed to agree.
    split_at = int(len(data) * train_test_split)
    _write_fasttext_file(train_path, data[:split_at])
    _write_fasttext_file(test_path, data[split_at:])


# Build the fastText train/test files with the default split; returns
# immediately when both files already exist.
save2fastText()

In [3]:
import fasttext

# Train a supervised fastText classifier on the training file. The keyword
# arguments below are commented out, so only the file path is passed and the
# library's own defaults apply; uncomment individual lines to tune.
model = fasttext.train_supervised(
    'data/Amazon-Movie-TV-Reviews/data.train.txt',
    #lr=0.8,
    #epoch=20,
    #minCount=10,
    #dim=50,
    #maxn=3,
    #wordNgrams=3,
)
# Evaluate on the test file; as the last expression of the cell, its result
# is displayed as the cell output.
# NOTE(review): per the fastText docs the returned tuple should be
# (number of samples, precision@1, recall@1) - confirm.
model.test('data/Amazon-Movie-TV-Reviews/data.test.txt')

(169754, 0.8376886553483276, 0.8376886553483276)