## Clean the text first, then vectorize, and test whether accuracy is maintained

In [95]:
import pandas as pd, numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.utils.validation import check_X_y, check_is_fitted
from sklearn.linear_model import LogisticRegression
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.externals import joblib
from scipy import sparse
import gzip, re, string

import warnings
warnings.filterwarnings('ignore')

In [96]:
def parse(path):
    """Yield one record per non-blank line of a gzip-compressed file.

    Each line is expected to hold a single Python literal (e.g. a dict).

    Args:
        path: Location of the gzip file to read.

    Yields:
        The object obtained by evaluating each line as a Python literal.

    Raises:
        ValueError, SyntaxError: If a line is not a valid Python literal.
    """
    # Function-scope import keeps this block self-contained.
    import ast

    # The context manager closes the file even when the consumer stops
    # iterating early or a line fails to parse.
    with gzip.open(path, 'rt', encoding='utf-8') as lines:
        for line in lines:
            if not line.strip():
                continue
            # literal_eval only accepts literals, so a crafted line in the
            # data file cannot execute arbitrary code the way eval() would.
            yield ast.literal_eval(line)

In [97]:
def getDF(path):
    """Load every record produced by ``parse(path)`` into a DataFrame.

    Records are keyed by their 0-based position in the file, which becomes
    the row index; the keys of each record become the columns.
    """
    records = dict(enumerate(parse(path)))
    return pd.DataFrame.from_dict(records, orient='index')

In [98]:


def pr(x, y_i, y):
    """Return add-one smoothed per-column totals for one class.

    Sums, column by column, the rows of ``x`` whose label in ``y`` equals
    ``y_i``, adds 1, and divides by (number of such rows + 1).
    """
    in_class = (y == y_i)
    feature_totals = x[in_class].sum(0)
    return (feature_totals + 1) / (in_class.sum() + 1)

In [99]:
# Load the reviews file into a DataFrame.
df = getDF('reviews_Digital_Music_5.json.gz')
# Replace missing review bodies with a placeholder string.
# The result is assigned back rather than using fillna(inplace=True) on the
# column selection: that chained in-place call acts on an intermediate object
# and does not update df when pandas Copy-on-Write is active.
df['reviewText'] = df['reviewText'].fillna("unknown")

In [102]:
# Features: every column except 'overall'.
X = df.drop(columns=['overall'])
# Target: 1.0 where 'overall' is above 3, 0.0 otherwise.
y = df['overall'].gt(3).astype(float)

In [103]:
# Hold out a test partition; the fixed seed makes the split repeatable.
splits = train_test_split(X, y, random_state=42)
X_train, X_test, y_train, y_test = splits

In [105]:
# Raw training review bodies as a plain Python list.
texts = X_train['reviewText'].tolist()

In [106]:
# Pad every punctuation mark / listed symbol with a space on each side.
# NOTE(review): string.punctuation is interpolated unescaped, so its `\]` is
# read as an escaped bracket and a literal backslash is NOT matched. The
# pattern is left unchanged so it stays identical to the test-set cleaning cell.
punct_re = re.compile(f'([{string.punctuation}“”¨«»®´·º½¾¿¡§£₤‘’])')
treated_text = [punct_re.sub(r' \1 ', text) for text in texts]

In [110]:

# Fit the TF-IDF vocabulary (unigrams and bigrams) on the cleaned training text.
# The flags are real booleans: newer scikit-learn releases validate parameter
# types and reject the integer 1 that was passed here before.
vec = TfidfVectorizer(ngram_range=(1, 2),
                      min_df=3, max_df=0.9, strip_accents='unicode',
                      use_idf=True, smooth_idf=True, sublinear_tf=True)
x = vec.fit_transform(treated_text)
y = y_train.values
# Per-feature log ratio of the smoothed class-1 vs. class-0 statistics from pr().
r = sparse.csr_matrix(np.log(pr(x, 1, y) / pr(x, 0, y)))
# dual=True is only implemented by the liblinear solver. Naming it explicitly
# keeps this working now that the library default solver is lbfgs, and matches
# the solver shown in the recorded output of this cell.
clf = LogisticRegression(C=10, dual=True, penalty="l2", solver='liblinear',
                         n_jobs=-1)
# Scale each TF-IDF column by its log ratio before fitting.
x_nb = x.multiply(r)
clf.fit(x_nb, y)
    # clf.predict(x_test.multiply(r))


LogisticRegression(C=10, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=-1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [111]:
# Persist the fitted classifier and vectorizer, then the log-ratio row vector.
for artifact, filename in ((clf, 'review_model.pkl'), (vec, 'review_vector.pkl')):
    joblib.dump(artifact, filename)
sparse.save_npz('review_r.npz', r)


In [119]:
# Reload the persisted artifacts from disk.
model_path, vector_path, ratio_path = 'review_model.pkl', 'review_vector.pkl', 'review_r.npz'
estimator = joblib.load(model_path)
vec = joblib.load(vector_path)
r = sparse.load_npz(ratio_path)
# Display names for the two classes, in label order 0, 1.
target_names = ['Negative', 'Positive']

In [113]:
# Apply the same punctuation padding to the held-out reviews.
# NOTE(review): string.punctuation is interpolated unescaped, so its `\]` is
# read as an escaped bracket and a literal backslash is NOT matched. The
# pattern is left unchanged so it stays identical to the training cleaning cell.
punct_re = re.compile(f'([{string.punctuation}“”¨«»®´·º½¾¿¡§£₤‘’])')
treated_text_test = [punct_re.sub(r' \1 ', text) for text in X_test['reviewText']]

In [120]:
## Score the held-out reviews with the reloaded model
# Vectorize with the reloaded vocabulary, scale the columns by the reloaded
# log-ratio vector, then classify.
review = vec.transform(treated_text_test)
review_nb = review.multiply(r)
my_prediction = estimator.predict(review_nb)

In [115]:
from sklearn.metrics import accuracy_score

In [121]:
# Fraction of held-out reviews whose predicted label matches the true label.
accuracy = accuracy_score(y_true=y_test, y_pred=my_prediction)

In [122]:
# Show the accuracy computed on the held-out split.
accuracy

0.88502194473635409

## Conclusion: Same accuracy as before, so cleaning the text with `re` before vectorizing and then pickling the model should work.

In [129]:
# Look up the review text stored under index label 64705.
X.loc[64705, 'reviewText']

"Magic! is a Canadian band that incorporates reggae into their pop sound.  &#34;Rude&#34; (which is a play on rude boy) has your basic faceless pop band sound but with a reggae beat floating over the top.  Lead singer Nasri Atweh has written songs for the likes of Justin Bieber, Boyzone, Christina Aguilera and similar artists.  Think of those artists' songs but with a reggae beat instead of a pop beat and you have &#34;Rude&#34;.  If you like that cookie cutter pop sound, then you'll like &#34;Rude&#34;.  Clearly I'm in the minority on this one as the song has hit number one on the Billboard Hot 100."

In [128]:
# Show the 'overall' column for all loaded reviews.
df['overall']

0        5.0
1        5.0
2        5.0
3        5.0
4        4.0
5        5.0
6        3.0
7        5.0
8        5.0
9        5.0
10       5.0
11       4.0
12       5.0
13       5.0
14       4.0
15       5.0
16       5.0
17       5.0
18       4.0
19       5.0
20       5.0
21       5.0
22       5.0
23       4.0
24       5.0
25       4.0
26       5.0
27       5.0
28       3.0
29       5.0
        ... 
64676    5.0
64677    5.0
64678    2.0
64679    2.0
64680    2.0
64681    5.0
64682    5.0
64683    4.0
64684    4.0
64685    5.0
64686    5.0
64687    5.0
64688    4.0
64689    5.0
64690    2.0
64691    5.0
64692    5.0
64693    4.0
64694    5.0
64695    5.0
64696    5.0
64697    4.0
64698    5.0
64699    3.0
64700    4.0
64701    4.0
64702    5.0
64703    5.0
64704    3.0
64705    1.0
Name: overall, Length: 64706, dtype: float64