In [1]:
import pandas as pd
import numpy as np
from scipy.io import arff
from io import StringIO
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import balanced_accuracy_score
import random
from scipy.stats import pearsonr
from collections import defaultdict
from sklearn.pipeline import Pipeline

In [2]:
import warnings
warnings.simplefilter(action='ignore')

In [3]:
import nltk
import string
from nltk.stem.porter import *
from nltk.corpus import stopwords
stopwords = set(stopwords.words('english'))

In [4]:
def parseData(fname):
    for l in open(fname):
        yield eval(l)

def parseDataFromURL(fname):
    for l in urlopen(fname):
        yield eval(l)

### Just the first 5000 reviews

print("Reading data...")
data = list(parseData("train_Category.json"))
print("done")

Reading data...
done


In [5]:
reviews = pd.DataFrame(data)

In [6]:
punctuation = set(string.punctuation)

In [7]:
reviews['review_text'] = reviews['review_text'].apply(lambda x: ''.join([c for c in x.lower() if (not c in punctuation)]))


In [8]:
reviews['review_text'] = reviews['review_text'].apply(lambda x: ' '.join(w for w in x.split() if not w in stopwords))

In [9]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import FeatureUnion
from sklearn.base import TransformerMixin, BaseEstimator

In [25]:
user_feat = ['user_id']
user_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown = 'ignore'))
])

text_feat = ['review_text']
text_transformer = Pipeline(steps=[
    ('tfidf', TfidfVectorizer(analyzer='word', stop_words='english'))
])

X = reviews[['user_id', 'review_text']][:190000]
y = reviews.genreID[:190000]



In [28]:
class ItemSelector(BaseEstimator, TransformerMixin):

    def __init__(self, key):
        self.key = key

    def fit(self, x, y=None):
        return self

    def transform(self, data_dict):
        return data_dict[self.key]

pl = Pipeline([
        ('union', FeatureUnion(
            transformer_list=[
                ('text', Pipeline([
                    ('selector', ItemSelector(key='review_text')),
                    ('tfidf', text_transformer),
                ])),
                ('user', Pipeline([
                    #('selector', ItemSelector(key='user_id')),
                    ('onehot', user_transformer),
                ])),
            ]
        )),
        ('model', LogisticRegression(C=10)),
    ])

In [29]:
pl.fit(X, y)

Pipeline(memory=None,
         steps=[('union',
                 FeatureUnion(n_jobs=None,
                              transformer_list=[('text',
                                                 Pipeline(memory=None,
                                                          steps=[('selector',
                                                                  ItemSelector(key='review_text')),
                                                                 ('tfidf',
                                                                  Pipeline(memory=None,
                                                                           steps=[('tfidf',
                                                                                   TfidfVectorizer(analyzer='word',
                                                                                                   binary=False,
                                                                                                   decode_error='strict

In [30]:
pl.score(reviews[['user_id', 'review_text']][190000:], reviews.genreID[190000:])

0.8033

In [20]:
data_test = list(parseData("test_Category.json"))
test = pd.DataFrame(data_test)

In [21]:
test['review_text'] = test['review_text'].apply(lambda x: ''.join([c for c in x.lower() if (not c in punctuation)]))
test['review_text'] = test['review_text'].apply(lambda x: ' '.join(w for w in x.split() if not w in stopwords))

In [22]:
pred = pl.predict(test[['user_id', 'review_text']])

In [23]:
ids = [d['user_id'] + '-' + d['review_id'] for d in data_test]

In [24]:
prediction_genre = open("predictions_genre.txt", 'w')
prediction_genre.write("userID-reviewID,prediction\n")
for i in range(10000):
    prediction_genre.write(ids[i] + "," + str(pred[i]) + "\n")
    

prediction_genre.close()

Kaggle username: winniecyc