<a href="https://colab.research.google.com/github/jackychencw/MIE1624_Course_Project_Group19/blob/Lawrence-nlp-preprocess/MIE1624_proj_LSTM_clean.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pickle

import numpy as np
import pandas as pd
import tensorflow as tf

from tensorflow.keras.callbacks import EarlyStopping
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM, Reshape
from keras.wrappers.scikit_learn import KerasRegressor
from keras.optimizers.schedules import ExponentialDecay
from keras.optimizers import Adam

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.compose import ColumnTransformer
from sklearn.base import TransformerMixin

from joblib import dump, load

from google.colab import drive

# Mount Google Drive at /content/drive so the CSV paths below resolve.
drive.mount('/content/drive')

# Load the training and test CSVs; the first CSV column is used as the index.
data = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/data/jacky_train_data.csv", index_col=0)
test_data = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/data/jacky_test_data.csv", index_col=0)

Mounted at /content/drive


# Data preprocessing

In [2]:
def clean_data(data):
  """Drop unused columns and one-hot encode the review year.

  Args:
    data: DataFrame containing the columns dropped below plus a string
      `reviewTime` column whose last four characters are the year.

  Returns:
    A new DataFrame without the dropped columns and without `reviewTime`,
    with one `year_<YYYY>` indicator column appended per distinct year.
    The input frame is left untouched.
  """
  unused_columns = [
      'reviewerID', 'reviewText', 'unixReviewTime',
      'itemID', 'reviewHash', 'important_features',
  ]
  trimmed = data.drop(columns=unused_columns)

  # The year is taken from the trailing four characters of the date string.
  review_year = trimmed['reviewTime'].map(lambda stamp: int(stamp[-4:]))
  year_flags = pd.get_dummies(review_year, prefix='year')

  without_time = trimmed.drop(columns=['reviewTime'])
  return pd.concat([without_time, year_flags], axis=1)

In [3]:
# NOTE: this cell rebinds `data` / `test_data`, so it is not idempotent —
# running it a second time raises KeyError because clean_data() tries to
# drop columns that were already removed on the first run.
data = clean_data(data)
test_data = clean_data(test_data)


# Split training data into training set and validation set

In [4]:
# Number of terms kept by SelectKBest for each text column.
saved_features = 2000

# Separate the features from the `overall` target, then hold out 20% of the
# rows (fixed seed) for validation.
X = data.drop(columns=['overall'])
y = data['overall']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [5]:
# Align the test columns with the training features. The year dummies are
# built separately for each frame, so a year that only occurs in the training
# data appears here as an all-NaN column after reindexing. Set those to 0 so
# that NaNs do not flow through the passthrough columns into the network.
test_data = test_data.reindex(columns=X.columns)
year_columns = [col for col in test_data.columns if str(col).startswith('year_')]
if year_columns:
  test_data[year_columns] = test_data[year_columns].fillna(0)

# Model

In [7]:
class DenseTransformer(TransformerMixin):
    """Pipeline step that converts a sparse feature matrix into a dense array.

    Stateless: `fit` learns nothing and `transform` only changes the
    container type of its input.
    """

    def fit(self, X, y=None, **fit_params):
        """No-op fit; returns `self` so the step can sit inside a Pipeline."""
        return self

    def transform(self, X, y=None, **fit_params):
        """Return `X` as a dense array.

        Sparse inputs are converted with `toarray()`, which yields a plain
        `numpy.ndarray` (unlike `todense()`, which yields `numpy.matrix`).
        Inputs without a `toarray` method are assumed to be dense already
        and are returned unchanged instead of raising AttributeError.
        """
        if hasattr(X, "toarray"):
            return X.toarray()
        return X
        

# 0.46 - Kaggle 0.45014
def baseline_model(
    num_layer=256,
    init_lr=0.005,
    decay_rate=0.95,
    decay_steps=3000,
    drop_out=0.75,
    loss_func='mse',
    metrics = [tf.keras.losses.MSE]
    ):
  """Build and compile the LSTM + dense regression network.

  Args:
    num_layer: Number of LSTM units. The hidden Dense layers use
      2x, 1x and 0.5x this width, in that order.
    init_lr: Initial learning rate for the exponential decay schedule.
    decay_rate: Multiplicative decay factor of the schedule.
    decay_steps: Number of optimizer steps between decays (staircase).
    drop_out: Dropout rate applied after the LSTM and after each hidden
      Dense layer.
    loss_func: Loss passed to `model.compile`.
    metrics: Metrics passed to `model.compile`.

  Returns:
    A compiled `tf.keras.Sequential` model with a single ReLU output unit.
  """
  # Dense expects an integer unit count; `num_layer/2` is a float under
  # Python 3 true division, so use integer division instead.
  half_width = int(num_layer) // 2

  model = tf.keras.Sequential([
    # Present each sample as a length-1 sequence holding all of its features.
    Reshape(target_shape=(1,-1)),
    LSTM(num_layer),
    Dropout(drop_out),
    Dense(num_layer*2, kernel_initializer='normal', activation='relu'),
    Dropout(drop_out),
    Dense(num_layer, kernel_initializer='normal', activation='relu'),
    Dropout(drop_out),
    Dense(half_width, kernel_initializer='normal', activation='relu'),
    Dropout(drop_out),
    # ReLU keeps the single regression output non-negative.
    Dense(1, kernel_initializer='normal', activation='relu')
  ])
  lr_schedule = ExponentialDecay(
      initial_learning_rate=init_lr,
      decay_steps=decay_steps,
      decay_rate=decay_rate,
      staircase=True)

  opt = Adam(learning_rate=lr_schedule)
  model.compile(loss=loss_func, optimizer=opt, metrics=metrics)
  return model

# Registry of named text vectorizers; only TF-IDF is registered.
# It is not referenced by the other cells shown in this notebook.
vectorizers = dict(tfidf=TfidfVectorizer())



In [8]:
def _tfidf_top_k():
  """Return a fresh TF-IDF -> chi2 top-k selection pipeline for one text column."""
  return Pipeline([
      ('vect', TfidfVectorizer()),
      ('select', SelectKBest(chi2, k=saved_features)),
  ])


# Each text column gets its own independently fitted vectorizer and selector.
summary_pipe = _tfidf_top_k()
review_pipe = _tfidf_top_k()

# Scale the price, vectorize the two text columns, pass everything else through.
column_steps = [
    ('price_std', StandardScaler(), ['price']),
    ('summary_count_vec', summary_pipe, 'summary'),
    ('review_tfidf', review_pipe, 'important_features(clean)'),
]
preprocess = ColumnTransformer(column_steps, remainder='passthrough')

model = KerasRegressor(baseline_model, epochs=50, batch_size=64, verbose=True)

# Full pipeline: feature preprocessing -> dense conversion -> neural network.
pipe = Pipeline(
    [
        ('preprocess', preprocess),
        ('to_dense', DenseTransformer()),
        ('nn', model),
    ],
    verbose=True,
)

In [9]:
# Fit the preprocessing steps and the network on the training split only.
pipe.fit(X_train, y_train)

[Pipeline] ........ (step 1 of 3) Processing preprocess, total=   9.3s
[Pipeline] .......... (step 2 of 3) Processing to_dense, total=   1.9s
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
[Pipeline] ................ (step 3 of 3) Processing nn, total= 6.6min


Pipeline(memory=None,
         steps=[('preprocess',
                 ColumnTransformer(n_jobs=None, remainder='passthrough',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('price_std',
                                                  StandardScaler(copy=True,
                                                                 with_mean=True,
                                                                 with_std=True),
                                                  ['price']),
                                                 ('summary_count_vec',
                                                  Pipeline(memory=None,
                                                           steps=[('vect',
                                                                   TfidfVectorizer(analyzer='word',
                                                                           

In [12]:
from sklearn.metrics import mean_squared_error
def model_test(model, X_test, y_test):
  """Score a fitted model on a labelled data set.

  Args:
    model: Fitted estimator exposing `predict`.
    X_test: Features to predict on.
    y_test: True target values for `X_test`.

  Returns:
    Tuple `(mse, predictions)`: the mean squared error between `y_test`
    and the model's predictions, and the predictions themselves.
  """
  predicted = model.predict(X_test)
  return mean_squared_error(y_test, predicted), predicted

In [13]:
# MSE on the training split and on the held-out validation split.
train_scores, train_predictions = model_test(pipe, X_train, y_train)
scores, predictions = model_test(pipe, X_test, y_test)
print(train_scores, scores, sep='\n')

0.3461172461271486
0.4796649691638189


# Fit the model on all training data

In [None]:
# Refit the whole pipeline on all labelled rows, then predict for test_data.
pipe.fit(X, y)
predictions = pipe.predict(test_data)
# Clamp the predictions to the interval [1.0, 5.0].
predictions = np.clip(predictions, a_min=1.0, a_max=5.0)

[Pipeline] ........ (step 1 of 3) Processing preprocess, total=  11.5s
[Pipeline] .......... (step 2 of 3) Processing to_dense, total=   1.5s
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
[Pipeline] ................ (step 3 of 3) Processing nn, total= 9.0min


In [None]:
# In-sample check: MSE and predictions on the same data the pipeline was fit on.
model_test(pipe, X, y)



(0.2871496614233061,
 array([4.565801 , 4.830697 , 4.830697 , ..., 4.5896716, 4.6962504,
        4.830697 ], dtype=float32))

In [None]:
# Input CSV that is passed to export_to_kaggle.
rating_pairs_path = "/content/drive/MyDrive/Colab Notebooks/data/rating_pairs.csv"
def export_to_kaggle(
    rating_pairs_path,
    predictions,
    output_path="/content/drive/MyDrive/Colab Notebooks/data/rating_pairs_pred.csv",
    ):
  """Attach predictions to the rating-pairs CSV and write the result.

  Args:
    rating_pairs_path: Path of the CSV to read.
    predictions: Values stored in a new `prediction` column, one per row
      of the CSV and in the same order.
    output_path: Destination CSV path. Defaults to the location this
      notebook has always written to.

  Raises:
    ValueError: If `predictions` has a length that differs from the number
      of rows in the CSV.
  """
  rating_pairs = pd.read_csv(rating_pairs_path)

  # Fail loudly on a size mismatch; a misaligned Series would otherwise be
  # index-aligned by pandas and silently produce NaN predictions.
  if hasattr(predictions, '__len__') and len(predictions) != len(rating_pairs):
    raise ValueError(
        "predictions has %d entries but %s has %d rows"
        % (len(predictions), rating_pairs_path, len(rating_pairs)))

  rating_pairs['prediction'] = predictions
  rating_pairs.to_csv(output_path, index=False)

In [None]:
# Write `predictions` next to the rating pairs as a CSV.
export_to_kaggle(rating_pairs_path, predictions)