In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.grid_search import GridSearchCV
import numpy as np



In [2]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

In [3]:
from sklearn.model_selection import train_test_split

In [4]:
import matplotlib.pyplot as plt

%matplotlib inline
plt.style.use('ggplot')
plt.rcParams['figure.figsize'] = (12,5)

# Plotting config
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [5]:
from sklearn.pipeline import Pipeline

# Данные

In [53]:
reviews = pd.read_csv('data/stemmed.csv')

In [168]:
S = np.argsort(reviews.rating.value_counts().index)
ratingCounts = reviews.rating.value_counts().values[S]
proba = map(lambda x: 1.0/x, ratingCounts)
print proba
row_proba = map(lambda x: proba[int(x)-1], reviews.rating)
row_proba /= sum(row_proba)
idx = (np.random.choice(reviews.index, size=100000, replace=False, p=row_proba))
reviewsNormed = reviews.loc[idx,:]

[3.0507337014551999e-05, 7.2653298459750076e-05, 6.4151911726969464e-05, 6.8946497517926095e-05, 5.0461724781753044e-05, 4.39734400422145e-05, 3.2944587204322332e-05, 2.1918769041930604e-05, 1.6076653483810809e-05, 6.89731280693042e-06]


In [169]:
reviewsNormed.shape

(100000, 15)

In [None]:
X

## учим на нормированной, тестим на обычной

In [14]:
from sklearn.model_selection import KFold

In [15]:
def norm_idx(y_train):
    S = np.argsort(y_train.value_counts().index)
    ratingCounts = y_train.value_counts().values[S]
    proba = map(lambda x: 1.0/x, ratingCounts)
    row_proba = map(lambda x: proba[int(x)-1], y_train)
    row_proba /= sum(row_proba)
    idx = (np.random.choice(y_train.index, size=100000, replace=False, p=row_proba))
    return idx   

### LinReg

In [None]:
%%time

linreg_accuracy_list = []
linreg_mae_list = []
linreg_mse_list = []

kf = KFold(n_splits=4, shuffle=True)
for train, test in kf.split(reviews):
    X_train = reviews.stemmed[train]
    y_train = reviews.rating[train]
    
    X_test = reviews.stemmed[test]
    y_test = reviews.rating[test]
    
    idx = norm_idx(y_train)
    X_train = X_train[idx]
    y_train = y_train[idx]
    
    linreg_text_clf_best_mae = Pipeline([('vect', CountVectorizer(ngram_range=(1, 2), max_df=0.4, min_df=0.003)),
                          ('tfidf', TfidfTransformer(use_idf=False)),
                          ('clf', LinearRegression()),
                    ])
    linreg_text_clf_best_mae.fit(X_train, y_train)
    print 'fit done'
    
    y_pred = linreg_text_clf_best_mae.predict(X_test).round()
    y_pred = map(lambda x: 10 if x > 10 else 1 if x < 1 else x, y_pred)
    print 'predict done'
    
    accuracy = accuracy_score(y_pred=y_pred, y_true=y_test)
    mae = mean_absolute_error(y_pred=y_pred, y_true=y_test)
    mse = mean_squared_error(y_pred=y_pred, y_true=y_test)
    
    linreg_accuracy_list += [accuracy]
    linreg_mae_list += [mae]
    linreg_mse_list += [mse]

    print "accuracy:", accuracy
    print "MAE:", mae
    print "MSE:", mse
    print confusion_matrix(y_pred=y_pred, y_true=y_test)
    print classification_report(y_test, y_pred)
    
    print('='*20)
       

In [None]:
print 'mean accuracy', np.mean(linreg_accuracy_list)
print 'mean MAE', np.mean(linreg_mae_list)
print 'mean MSE', np.mean(linreg_mse_list)

### Bayes

In [54]:
%%time

bayes_accuracy_list = []
bayes_mae_list = []
bayes_mse_list = []

kf = KFold(n_splits=4, shuffle=True)
for train, test in kf.split(reviews):
    X_train = reviews.review_text[train]
    y_train = reviews.rating[train]
    
    X_test = reviews.review_text[test]
    y_test = reviews.rating[test]
    
    idx = norm_idx(y_train)
    X_train = X_train[idx]
    y_train = y_train[idx]
    
    bayes_text_clf_best_mae = Pipeline([('vect', CountVectorizer(ngram_range=(1, 2), max_df=0.3, min_df=0.001)),
                          ('tfidf', TfidfTransformer(use_idf=True)),
                          ('clf', MultinomialNB(alpha=0.2))
                    ])
    bayes_text_clf_best_mae.fit(X_train, y_train)
    print 'fit done'
    
    y_pred = bayes_text_clf_best_mae.predict(X_test)
    print 'predict done'
    print y_pred
    
    accuracy = accuracy_score(y_pred=y_pred, y_true=y_test)
    mae = mean_absolute_error(y_pred=y_pred, y_true=y_test)
    mse = mean_squared_error(y_pred=y_pred, y_true=y_test)
                           
    bayes_accuracy_list += [accuracy]
    bayes_mae_list += [mae]
    bayes_mse_list += [mse]

    print "accuracy:", accuracy
    print "MAE:", mae
    print "MSE:", mse
    print confusion_matrix(y_pred=y_pred, y_true=y_test)
    print classification_report(y_test, y_pred)
    
    print('='*20)
       

fit done
predict done
[ 10.  10.  10. ...,  10.   8.   1.]
accuracy: 0.556939625605
MAE: 0.980067402996
MSE: 3.7323166549
[[ 6591   145   213   140   215   219   208   159   106   252]
 [  803  1791   126    85   142   170   137   102    59    83]
 [  770    76  2008   122   183   290   236   156    64    78]
 [  512    59   104  1680   218   404   368   217    60    84]
 [  413    54    97   121  2352   525   635   425   133   135]
 [  336    61    72    84   257  2445  1040   834   246   272]
 [  281    63    59    72   247   527  3116  1734   722   657]
 [  346    62    77    63   268   347  1307  4800  2072  1993]
 [  360    36    31    45   193   305   724  2615  6195  4924]
 [  844    64   102    80   248   294   640  3057  6003 25044]]
             precision    recall  f1-score   support

        1.0       0.59      0.80      0.68      8248
        2.0       0.74      0.51      0.61      3498
        3.0       0.70      0.50      0.58      3983
        4.0       0.67      0.45  

In [55]:
print 'mean accuracy', np.mean(bayes_accuracy_list)
print 'mean MAE', np.mean(bayes_mae_list)
print 'mean MSE', np.mean(bayes_mse_list)

mean accuracy 0.557302488344
mean MAE 0.980248834366
mean MSE 3.72036455278


In [58]:
y_pred = bayes_text_clf.predict(data_train.review_text)

In [60]:
y_test = data_train.rating

In [61]:
accuracy = accuracy_score(y_pred=y_pred, y_true=y_test)
mae = mean_absolute_error(y_pred=y_pred, y_true=y_test)
mse = mean_squared_error(y_pred=y_pred, y_true=y_test)

In [62]:
print "accuracy:", accuracy
print "MAE:", mae
print "MSE:", mse

accuracy: 0.527213353672
MAE: 1.13884425157
MSE: 4.62262119371


### LogReg

In [None]:
%%time

logreg_accuracy_list = []
logreg_mae_list = []
logreg_mse_list = []

kf = KFold(n_splits=4, shuffle=True)
for train, test in kf.split(reviews):
    X_train = reviews.review_text[train]
    y_train = reviews.rating[train]
    
    X_test = reviews.review_text[test]
    y_test = reviews.rating[test]
    
    idx = norm_idx(y_train)
    X_train = X_train[idx]
    y_train = y_train[idx]
    
    logreg_text_clf = Pipeline([('vect', CountVectorizer(max_df=0.3, min_df=0.001, ngram_range=(1, 2))),
                          ('tfidf', TfidfTransformer(use_idf=False)),
                          ('clf', LogisticRegression(class_weight='balanced', penalty='l2')),
                    ])
    logreg_text_clf.fit(X_train, y_train)
    print 'fit done'
    
    y_pred = logreg_text_clf.predict(X_test)
    print 'predict done'
    
    accuracy = accuracy_score(y_pred=y_pred, y_true=y_test)
    mae = mean_absolute_error(y_pred=y_pred, y_true=y_test)
    mse = mean_squared_error(y_pred=y_pred, y_true=y_test)
                           
    logreg_accuracy_list += [accuracy]
    logreg_mae_list += [mae]
    logreg_mse_list += [mse]

    print "accuracy:", accuracy
    print "MAE:", mae
    print "MSE:", mse
    print confusion_matrix(y_pred=y_pred, y_true=y_test)
    print classification_report(y_test, y_pred)
    
    print('='*20)
       

In [38]:
print 'mean accuracy', np.mean(logreg_accuracy_list)
print 'mean MAE', np.mean(logreg_mae_list)
print 'mean MSE', np.mean(logreg_mse_list)

mean accuracy 0.558309059639
mean MAE 0.93764228693
mean MSE 3.3368907137


## учим на обычной, тестим на обычной

### LinReg

In [39]:
%%time

linreg2_accuracy_list = []
linreg2_mae_list = []
linreg2_mse_list = []

kf = KFold(n_splits=4, shuffle=True)
for train, test in kf.split(reviews):
    X_train = reviews.review_text[train]
    y_train = reviews.rating[train]
    
    X_test = reviews.review_text[test]
    y_test = reviews.rating[test]
    
    idx = np.random.choice(y_train.index, size=100000, replace=False)
    X_train = X_train[idx]
    y_train = y_train[idx]
    
    linreg2_text_clf_best_mae = Pipeline([('vect', CountVectorizer(ngram_range=(1, 2), max_df=0.4, min_df=0.003)),
                          ('tfidf', TfidfTransformer(use_idf=False)),
                          ('clf', LinearRegression()),
                    ])
    linreg2_text_clf_best_mae.fit(X_train, y_train)
    print 'fit done'
    
    y_pred = linreg2_text_clf_best_mae.predict(X_test).round()
    y_pred = map(lambda x: 10 if x > 10 else 1 if x < 1 else x, y_pred)
    print 'predict done'
    
    accuracy = accuracy_score(y_pred=y_pred, y_true=y_test)
    mae = mean_absolute_error(y_pred=y_pred, y_true=y_test)
    mse = mean_squared_error(y_pred=y_pred, y_true=y_test)
    
    linreg2_accuracy_list += [accuracy]
    linreg2_mae_list += [mae]
    linreg2_mse_list += [mse]

    print "accuracy:", accuracy
    print "MAE:", mae
    print "MSE:", mse
    print confusion_matrix(y_pred=y_pred, y_true=y_test)
    print classification_report(y_test, y_pred)
    
    print('='*20)
       

fit done
predict done
accuracy: 0.321864219746
MAE: 1.19596576166
MSE: 2.87674596626
[[ 1531  1490  1703  1385   944   591   298   117    32    16]
 [  374   531   758   815   498   286   131    58    13     6]
 [  229   362   777  1018   730   412   247    67    26     5]
 [  114   215   560   801   840   563   341   131    35     7]
 [   67   113   371   868  1222  1155   647   283    85    27]
 [   24    66   167   488  1194  1688  1139   694   248    98]
 [    6    16    81   255   622  1488  2111  1636   979   405]
 [    3     4    33   101   348  1006  2289  3308  2617  1723]
 [    0     3     9    47   189   615  1888  3882  4883  3959]
 [    5     6    11    44   220   765  2624  6207 10976 15524]]
             precision    recall  f1-score   support

        1.0       0.65      0.19      0.29      8107
        2.0       0.19      0.15      0.17      3470
        3.0       0.17      0.20      0.19      3873
        4.0       0.14      0.22      0.17      3607
        5.0       

In [40]:
print 'mean accuracy', np.mean(linreg2_accuracy_list)
print 'mean MAE', np.mean(linreg2_mae_list)
print 'mean MSE', np.mean(linreg2_mse_list)

mean accuracy 0.321752378491
mean MAE 1.1931498474
mean MSE 2.85838163219


### Bayes

In [41]:
%%time

bayes2_accuracy_list = []
bayes2_mae_list = []
bayes2_mse_list = []

kf = KFold(n_splits=4, shuffle=True)
for train, test in kf.split(reviews):
    X_train = reviews.review_text[train]
    y_train = reviews.rating[train]
    
    X_test = reviews.review_text[test]
    y_test = reviews.rating[test]
    
    idx = np.random.choice(y_train.index, size=100000, replace=False)
    X_train = X_train[idx]
    y_train = y_train[idx]
    
    bayes2_text_clf_best_mae = Pipeline([('vect', CountVectorizer(ngram_range=(1, 2), max_df=0.3, min_df=0.001)),
                          ('tfidf', TfidfTransformer(use_idf=True)),
                          ('clf', MultinomialNB(alpha=0.2))
                    ])
    bayes2_text_clf_best_mae.fit(X_train, y_train)
    print 'fit done'
    
    y_pred = bayes2_text_clf_best_mae.predict(X_test)
    print 'predict done'
    print y_pred
    
    accuracy = accuracy_score(y_pred=y_pred, y_true=y_test)
    mae = mean_absolute_error(y_pred=y_pred, y_true=y_test)
    mse = mean_squared_error(y_pred=y_pred, y_true=y_test)
                           
    bayes2_accuracy_list += [accuracy]
    bayes2_mae_list += [mae]
    bayes2_mse_list += [mse]

    print "accuracy:", accuracy
    print "MAE:", mae
    print "MSE:", mse
    print confusion_matrix(y_pred=y_pred, y_true=y_test)
    print classification_report(y_test, y_pred)
    
    print('='*20)
       

fit done
predict done
[ 10.  10.  10. ...,   9.  10.  10.]
accuracy: 0.554513913052
MAE: 1.1301335136
MSE: 4.77293739872
[[ 6393    16    41    39    73    83   107   191    71  1219]
 [ 1116  1242    39    22    69    73    94   103    43   521]
 [ 1068    14  1421    31   113   140   184   189    81   750]
 [  739    25    31  1141   133   235   283   265    92   689]
 [  644     7    24    34  1771   323   499   491   160  1050]
 [  455     3    22    18   135  1790   695   838   324  1470]
 [  343     1    20     8   106   198  1957  1403   676  2825]
 [  241     3     7     6    80    83   399  3237  1412  5979]
 [  123     2     1     2    28    55   146  1171  3632 10374]
 [  267     0     0     0    30    21   106   876  1645 33194]]
             precision    recall  f1-score   support

        1.0       0.56      0.78      0.65      8233
        2.0       0.95      0.37      0.54      3322
        3.0       0.88      0.36      0.51      3991
        4.0       0.88      0.31   

In [42]:
print 'mean accuracy', np.mean(bayes2_accuracy_list)
print 'mean MAE', np.mean(bayes2_mae_list)
print 'mean MSE', np.mean(bayes2_mse_list)

mean accuracy 0.555090516856
mean MAE 1.13093131456
mean MSE 4.78166350197


### LogReg

In [43]:
%%time

logreg2_accuracy_list = []
logreg2_mae_list = []
logreg2_mse_list = []

kf = KFold(n_splits=4, shuffle=True)
for train, test in kf.split(reviews):
    X_train = reviews.review_text[train]
    y_train = reviews.rating[train]
    
    X_test = reviews.review_text[test]
    y_test = reviews.rating[test]
    
    idx = np.random.choice(y_train.index, size=100000, replace=False)
    X_train = X_train[idx]
    y_train = y_train[idx]
    
    logreg2_text_clf = Pipeline([('vect', CountVectorizer(max_df=0.3, min_df=0.001, ngram_range=(1, 2))),
                          ('tfidf', TfidfTransformer(use_idf=False)),
                          ('clf', LogisticRegression(class_weight='balanced', penalty='l2')),
                    ])
    logreg2_text_clf.fit(X_train, y_train)
    print 'fit done'
    
    y_pred = logreg2_text_clf.predict(X_test)
    print 'predict done'
    
    accuracy = accuracy_score(y_pred=y_pred, y_true=y_test)
    mae = mean_absolute_error(y_pred=y_pred, y_true=y_test)
    mse = mean_squared_error(y_pred=y_pred, y_true=y_test)
                           
    logreg2_accuracy_list += [accuracy]
    logreg2_mae_list += [mae]
    logreg2_mse_list += [mse]

    print "accuracy:", accuracy
    print "MAE:", mae
    print "MSE:", mse
    print confusion_matrix(y_pred=y_pred, y_true=y_test)
    print classification_report(y_test, y_pred)
    
    print('='*20)
       

fit done
predict done
accuracy: 0.574993289525
MAE: 0.915070236308
MSE: 3.24486772908
[[ 5986   459   412   369   275   198   112    47    40   249]
 [  714  1807   236   223   177   101    58    33    17    73]
 [  633   211  1935   308   277   212   109    60    25    60]
 [  348   189   275  1620   347   335   201    78    22   103]
 [  356   179   312   383  2356   563   415   201    80   166]
 [  201   132   256   325   487  2606   844   458   144   245]
 [  205    96   141   204   429   846  3305  1145   549   673]
 [  164   110   131   183   279   600  1606  4310  1534  2355]
 [  204    98   153   158   226   412   994  2022  5197  6178]
 [  434   208   190   210   278   362   855  2083  3103 28716]]
             precision    recall  f1-score   support

        1.0       0.65      0.73      0.69      8147
        2.0       0.52      0.53      0.52      3439
        3.0       0.48      0.51      0.49      3830
        4.0       0.41      0.46      0.43      3518
        5.0      

In [44]:
print 'mean accuracy', np.mean(logreg2_accuracy_list)
print 'mean MAE', np.mean(logreg2_mae_list)
print 'mean MSE', np.mean(logreg2_mse_list)

mean accuracy 0.574655280398
mean MAE 0.922247959518
mean MSE 3.29184105618


In [None]:
logreg_text_clf = Pipeline([('vect', CountVectorizer(max_df=0.3, min_df=0.001, ngram_range=(1, 2))),
                          ('tfidf', TfidfTransformer(use_idf=False)),
                          ('clf', LogisticRegression(class_weight='balanced', penalty='l2')),
                    ])

with open('logreg_text_clf.pickle', 'wb') as handle:
    pickle.dump(logreg_text_clf, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [56]:
logreg2_text_clf

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=0.3, max_features=None, min_df=0.001,
        ngram_range=(1, 2), preprocessor=None, stop_words=None,
      ...ty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False))])

## Сравним

In [46]:
print 'LinearRegression'
print '  normed train'
print '    mean accuracy', np.mean(linreg_accuracy_list)
print '    mean MAE', np.mean(linreg_mae_list)
print '    mean MSE', np.mean(linreg_mse_list)
print '  not normed train'
print '    mean accuracy', np.mean(linreg2_accuracy_list)
print '    mean MAE', np.mean(linreg2_mae_list)
print '    mean MSE', np.mean(linreg2_mse_list)

print 'MultinomialNB'
print '  normed train'
print '    mean accuracy', np.mean(bayes_accuracy_list)
print '    mean MAE', np.mean(bayes_mae_list)
print '    mean MSE', np.mean(bayes_mse_list)
print '  not normed train'
print '    mean accuracy', np.mean(bayes2_accuracy_list)
print '    mean MAE', np.mean(bayes2_mae_list)
print '    mean MSE', np.mean(bayes2_mse_list)

print 'LogisticRegression'
print '  normed train'
print '    mean accuracy', np.mean(logreg_accuracy_list)
print '    mean MAE', np.mean(logreg_mae_list)
print '    mean MSE', np.mean(logreg_mse_list)
print '  not normed train'
print '    mean accuracy', np.mean(logreg2_accuracy_list)
print '    mean MAE', np.mean(logreg2_mae_list)
print '    mean MSE', np.mean(logreg2_mse_list)

 LinearRegression
  normed train
    mean accuracy 0.280552545507
    mean MAE 1.3439019177
    mean MSE 3.36363568581
  not normed train
    mean accuracy 0.321752378491
    mean MAE 1.1931498474
    mean MSE 2.85838163219
MultinomialNB
  normed train
    mean accuracy 0.55507809005
    mean MAE 0.976856316297
    mean MSE 3.68161528597
  not normed train
    mean accuracy 0.555090516856
    mean MAE 1.13093131456
    mean MSE 4.78166350197
LogisticRegression
  normed train
    mean accuracy 0.558309059639
    mean MAE 0.93764228693
    mean MSE 3.3368907137
  not normed train
    mean accuracy 0.574655280398
    mean MAE 0.922247959518
    mean MSE 3.29184105618


## Pickle

In [8]:
import pickle

In [9]:
data_train = pd.read_csv('data/data_train.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [10]:
data_test =pd.read_csv('data/data_test.csv')

In [11]:
X_train = data_train.stemmed
y_train = data_train.rating

In [12]:
X_test = data_test.stemmed
y_test = data_test.rating

### LinReg

In [16]:
idx = norm_idx(y_train)
X_train = X_train[idx]
y_train = y_train[idx]

linreg_text_clf = Pipeline([('vect', CountVectorizer(ngram_range=(1, 2), max_df=0.4, min_df=0.003)),
                      ('tfidf', TfidfTransformer(use_idf=False)),
                      ('clf', LinearRegression()),
                ])
linreg_text_clf.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=0.4, max_features=None, min_df=0.003,
        ngram_range=(1, 2), preprocessor=None, stop_words=None,
      ..._idf=False)), ('clf', LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False))])

In [17]:
with open('linreg_text_clf.pickle', 'wb') as handle:
    pickle.dump(linreg_text_clf, handle, protocol=pickle.HIGHEST_PROTOCOL)


In [18]:
with open('linreg_text_clf.pickle', 'rb') as handle:
    linreg_text_clf.pickle = pickle.load(handle)

In [19]:
y_pred = linreg_text_clf.predict(X_test).round()
y_pred = map(lambda x: 10 if x > 10 else 1 if x < 1 else x, y_pred)

In [20]:
linreg_accuracy = accuracy_score(y_pred=y_pred, y_true=y_test)
linreg_mae = mean_absolute_error(y_pred=y_pred, y_true=y_test)
linreg_mse = mean_squared_error(y_pred=y_pred, y_true=y_test)

print "accuracy:", linreg_accuracy
print "MAE:", linreg_mae
print "MSE:", linreg_mse
print confusion_matrix(y_pred=y_pred, y_true=y_test)
print classification_report(y_test, y_pred)

accuracy: 0.204644706018
MAE: 1.7646412355
MSE: 5.16349018133
[[ 882  874  929  990  759  344  153   67    0    0]
 [ 167  212  298  412  316  170   99    0   20    0]
 [ 174  242  221  399  407  222   63   27    0    0]
 [  76  115  230  377  281  217   89   38   20    0]
 [  23   79  235  345  365  357  268   51   11    0]
 [  30  111  150  312  299  516  324   94   50    1]
 [   0    2  140  147  477  521  624  379  111   76]
 [  20   17   67  120  220  539  833  703  369  177]
 [   0    0   32   95  248  543  966  936  742  597]
 [   0    0   13  161  426 1011 2062 2880 2378 2434]]
             precision    recall  f1-score   support

        1.0       0.64      0.18      0.28      4998
        2.0       0.13      0.13      0.13      1694
        3.0       0.10      0.13      0.11      1755
        4.0       0.11      0.26      0.16      1443
        5.0       0.10      0.21      0.13      1734
        6.0       0.12      0.27      0.16      1887
        7.0       0.11      0.25   

In [21]:
X_train = data_train.stemmed
y_train = data_train.rating

In [22]:
idx = np.random.choice(y_train.index, size=100000, replace=False)
X_train = X_train[idx]
y_train = y_train[idx]

linreg2_text_clf = Pipeline([('vect', CountVectorizer(ngram_range=(1, 2), max_df=0.4, min_df=0.003)),
                      ('tfidf', TfidfTransformer(use_idf=False)),
                      ('clf', LinearRegression()),
                ])
linreg2_text_clf.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=0.4, max_features=None, min_df=0.003,
        ngram_range=(1, 2), preprocessor=None, stop_words=None,
      ..._idf=False)), ('clf', LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False))])

In [23]:
with open('linreg2_text_clf.pickle', 'wb') as handle:
    pickle.dump(linreg2_text_clf, handle, protocol=pickle.HIGHEST_PROTOCOL)


In [24]:
with open('linreg2_text_clf.pickle', 'rb') as handle:
    linreg2_text_clf.pickle = pickle.load(handle)

In [25]:
y_pred = linreg2_text_clf.predict(X_test).round()
y_pred = map(lambda x: 10 if x > 10 else 1 if x < 1 else x, y_pred)

In [26]:
linreg2_accuracy = accuracy_score(y_pred=y_pred, y_true=y_test)
linreg2_mae = mean_absolute_error(y_pred=y_pred, y_true=y_test)
linreg2_mse = mean_squared_error(y_pred=y_pred, y_true=y_test)

print "accuracy:", linreg2_accuracy
print "MAE:", linreg2_mae
print "MSE:", linreg2_mse
print confusion_matrix(y_pred=y_pred, y_true=y_test)
print classification_report(y_test, y_pred)

accuracy: 0.234577898603
MAE: 1.65960031235
MSE: 4.94588888568
[[ 669  515  989  795  870  640  414   82   24    0]
 [ 113  193  237  324  387  212  152   66   10    0]
 [  75  128  262  401  329  320  133   82   15   10]
 [  50   83  189  268  336  276  139   86   16    0]
 [  43   50  108  318  368  417  219  150   61    0]
 [  15   57  112  158  500  428  353  149   82   33]
 [   0   14   46  191  309  528  565  538  227   59]
 [  10   10   38   62  209  447  722  787  493  287]
 [   0    0   10   19  141  263  835 1002 1052  837]
 [   0    0   20   73  120  504 1406 2649 3074 3519]]
             precision    recall  f1-score   support

        1.0       0.69      0.13      0.22      4998
        2.0       0.18      0.11      0.14      1694
        3.0       0.13      0.15      0.14      1755
        4.0       0.10      0.19      0.13      1443
        5.0       0.10      0.21      0.14      1734
        6.0       0.11      0.23      0.14      1887
        7.0       0.11      0.23  

### Bayes

In [27]:
X_train = data_train.stemmed
y_train = data_train.rating

In [28]:
idx = norm_idx(y_train)
X_train = X_train[idx]
y_train = y_train[idx]
X_train.shape

(100000L,)

In [29]:
idx = norm_idx(y_train)
X_train = X_train[idx]
y_train = y_train[idx]

bayes_text_clf = Pipeline([('vect', CountVectorizer(ngram_range=(1, 2), max_df=0.3, min_df=0.001)),
                          ('tfidf', TfidfTransformer(use_idf=True)),
                          ('clf', MultinomialNB(alpha=0.2))
                    ])
bayes_text_clf.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=0.3, max_features=None, min_df=0.001,
        ngram_range=(1, 2), preprocessor=None, stop_words=None,
      ...False,
         use_idf=True)), ('clf', MultinomialNB(alpha=0.2, class_prior=None, fit_prior=True))])

In [30]:
with open('bayes_text_clf.pickle', 'wb') as handle:
    pickle.dump(bayes_text_clf, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [31]:
with open('bayes_text_clf.pickle', 'rb') as handle:
    bayes_text_clf.pickle = pickle.load(handle)

In [32]:
y_pred = bayes_text_clf.predict(X_test)

In [33]:
bayes_accuracy = accuracy_score(y_pred=y_pred, y_true=y_test)
bayes_mae = mean_absolute_error(y_pred=y_pred, y_true=y_test)
bayes_mse = mean_squared_error(y_pred=y_pred, y_true=y_test)

print "accuracy:", bayes_accuracy
print "MAE:", bayes_mae
print "MSE:", bayes_mse

accuracy: 0.395407351708
MAE: 1.61798305232
MSE: 7.05211556815


In [34]:
X_train = data_train.stemmed
y_train = data_train.rating

In [35]:
idx = np.random.choice(y_train.index, size=100000, replace=False)
X_train = X_train[idx]
y_train = y_train[idx]

bayes2_text_clf = Pipeline([('vect', CountVectorizer(ngram_range=(1, 2), max_df=0.3, min_df=0.001)),
                          ('tfidf', TfidfTransformer(use_idf=True)),
                          ('clf', MultinomialNB(alpha=0.2))
                    ])
bayes2_text_clf.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=0.3, max_features=None, min_df=0.001,
        ngram_range=(1, 2), preprocessor=None, stop_words=None,
      ...False,
         use_idf=True)), ('clf', MultinomialNB(alpha=0.2, class_prior=None, fit_prior=True))])

In [36]:
with open('bayes2_text_clf.pickle', 'wb') as handle:
    pickle.dump(bayes2_text_clf, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [37]:
with open('bayes2_text_clf.pickle', 'rb') as handle:
    bayes2_text_clf.pickle = pickle.load(handle)

In [38]:
y_pred = bayes2_text_clf.predict(X_test)

In [39]:
bayes2_accuracy = accuracy_score(y_pred=y_pred, y_true=y_test)
bayes2_mae = mean_absolute_error(y_pred=y_pred, y_true=y_test)
bayes2_mse = mean_squared_error(y_pred=y_pred, y_true=y_test)

print "accuracy:", bayes2_accuracy
print "MAE:", bayes2_mae
print "MSE:", bayes2_mse

accuracy: 0.45400121468
MAE: 1.62451918906
MSE: 7.78387367325


### LogReg

In [40]:
X_train = data_train.stemmed
y_train = data_train.rating

In [41]:
idx = norm_idx(y_train)
X_train = X_train[idx]
y_train = y_train[idx]

logreg_text_clf = Pipeline([('vect', CountVectorizer(max_df=0.3, min_df=0.001, ngram_range=(1, 2))),
                          ('tfidf', TfidfTransformer(use_idf=False)),
                          ('clf', LogisticRegression(class_weight='balanced', penalty='l2')),
                    ])
logreg_text_clf.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=0.3, max_features=None, min_df=0.001,
        ngram_range=(1, 2), preprocessor=None, stop_words=None,
      ...ty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False))])

In [42]:
with open('logreg_text_clf.pickle', 'wb') as handle:
    pickle.dump(logreg_text_clf, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [43]:
with open('logreg_text_clf.pickle', 'rb') as handle:
    logreg_text_clf.pickle = pickle.load(handle)

In [44]:
y_pred = logreg_text_clf.predict(X_test)

In [45]:
logreg_accuracy = accuracy_score(y_pred=y_pred, y_true=y_test)
logreg_mae = mean_absolute_error(y_pred=y_pred, y_true=y_test)
logreg_mse = mean_squared_error(y_pred=y_pred, y_true=y_test)

print "accuracy:", logreg_accuracy
print "MAE:", logreg_mae
print "MSE:", logreg_mse

accuracy: 0.398386210487
MAE: 1.45394337276
MSE: 5.62868380716


In [46]:
X_train = data_train.review_text
y_train = data_train.rating

In [None]:
idx = np.random.choice(y_train.index, size=100000, replace=False)
X_train = X_train[idx]
y_train = y_train[idx]

logreg2_text_clf = Pipeline([('vect', CountVectorizer(max_df=0.3, min_df=0.001, ngram_range=(1, 2))),
                          ('tfidf', TfidfTransformer(use_idf=False)),
                          ('clf', LogisticRegression(class_weight='balanced', penalty='l2'))
                            ])
logreg2_text_clf.fit(X_train, y_train)

In [48]:
with open('logreg2_text_clf.pickle', 'wb') as handle:
    pickle.dump(logreg2_text_clf, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [49]:
with open('logreg2_text_clf.pickle', 'rb') as handle:
    logreg2_text_clf.pickle = pickle.load(handle)

In [50]:
y_pred = logreg2_text_clf.predict(X_test)

In [51]:
logreg2_accuracy = accuracy_score(y_pred=y_pred, y_true=y_test)
logreg2_mae = mean_absolute_error(y_pred=y_pred, y_true=y_test)
logreg2_mse = mean_squared_error(y_pred=y_pred, y_true=y_test)

print "accuracy:", logreg2_accuracy
print "MAE:", logreg2_mae
print "MSE:", logreg2_mse

accuracy: 0.402059172282
MAE: 1.56679295485
MSE: 6.58000983313


# LinReg

In [None]:
linreg_best_text_clf = Pipeline([('vect', CountVectorizer(ngram_range=(1, 2), max_df=0.4, min_df=0.003)),
                          ('tfidf', TfidfTransformer(use_idf=False)),
                          ('clf', LinearRegression()),
                    ])

# Bayes

In [None]:
bayes_best_text_clf = Pipeline([('vect', CountVectorizer(ngram_range=(1, 2), min_df=0.001, max_df=0.3)),
                     ('tfidf', TfidfTransformer(use_idf=True)),
        

# LogReg

In [146]:
logreg_best_text_clf = Pipeline([('vect', CountVectorizer(max_df=0.3, min_df=0.001, ngram_range=(1, 2))),
                          ('tfidf', TfidfTransformer(use_idf=False)),
                          ('clf', LogisticRegression(class_weight='balanced', penalty='l2')),
                    ])