In [48]:
import pandas as pd
import numpy as np
import string
import re
# import pickle

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from stop_words import get_stop_words
from spacy.en import English

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score


from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.layers import Embedding
from keras.layers import Convolution1D, GlobalMaxPooling1D
from keras.models import load_model

np.random.seed(1337)

### model: 
https://drive.google.com/file/d/0B0YWufjvNmyVaFJvbllsR2lMX2M/view?usp=sharing

In [60]:
# set parameters:
max_features = 5000
maxlen = 400
batch_size = 32
embedding_dims = 50
nb_filter = 250
filter_length = 3
hidden_dims = 250
nb_epoch = 4
TRAIN_MODEL = True

In [50]:
thrash = ['<hr>', '<br>', '<br />', '<p>', '<i>', '\n', '\t', '“', '”', '″', '…', '₤', '▼', '★★', '', '–', '`',
          '‘', '’',  '«', '»', '®', '°', 'º', '°c', '°f', '´', '·', '½', '¾', '¿', '§', '¨', '¡', '¢', '£', '¤']

redundant_signs = list(string.punctuation) + thrash
letters = [x for x in string.ascii_lowercase + '. ']

stop_words = set(stopwords.words('english') + get_stop_words('en'))

In [51]:
def build_model():
    model = Sequential()
    model.add(Embedding(max_features,
                        embedding_dims,
                        input_length=maxlen,
                        dropout=0.2))
    model.add(Convolution1D(nb_filter=nb_filter,
                            filter_length=filter_length,
                            border_mode='valid',
                            activation='relu',
                            subsample_length=1))

    model.add(GlobalMaxPooling1D())
    model.add(Dense(hidden_dims))
    model.add(Dropout(0.2))
    model.add(Activation('relu'))
    model.add(Dense(1))
    model.add(Activation('sigmoid'))

    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    return model

In [52]:
# Lemmatizer
nlp = English()
def lemmatizer(line, nlp=nlp):
    return ' '.join([word.lemma_ for word in nlp(line)])

In [53]:
# Stemmer
stem = PorterStemmer()
def stemmer(line, stem=stem):
    return " ".join([stem.stem(w) for w in line.split()])

In [54]:
def clean_text(inp_str):
    inp_str = inp_str.lower()

    # fix haven't|doesn't|shouldn't cases
    inp_str = inp_str.replace("n't", " not")
    inp_str = inp_str.replace("'re", " are")
    inp_str = inp_str.replace("'s", " s")
    inp_str = inp_str.replace("'ve", " have")
    inp_str = inp_str.replace("'ll", " will")
    inp_str = inp_str.replace("'d", " d")

    # here may be actor's names, types of smth etc. I guess it's redundant info
    # let's discuss of necessity of this block
    bracket_words = re.findall('([\(\[\{].+?[\)\]\}])', inp_str)
    for word in bracket_words:
        inp_str = inp_str.replace(''.join(word), "")

    # replace redundant_signs
    for item in redundant_signs:
        inp_str = inp_str.replace(item, ' ')

    # replace digits
    inp_str = re.sub('\d', ' ', inp_str)
    # replace three or more letters in a row on one. Example: aaaaaah, i like it. - > Ah, I like ite
    inp_str = re.sub(r'(.)\1{3,}', r'\1', inp_str)
    # replace one-letter words or just letters
    inp_str = re.sub(r"\b[a-z]{1}\b", ' ', inp_str)

    return ' '.join(list(filter(None, inp_str.split(' '))))

In [45]:
def finalize_df(df, preprocessor=stemmer):
    df['new_text'] = df['text'].apply(clean_text)

    df['is_ascii'] = df['new_text'].apply(lambda x: set(x).issubset(letters))
    df = df[df['is_ascii'] == 1]
    df = df[df['new_text'].str.len() > 2]

    df['new_text'] = df['new_text'].apply(preprocessor)
    df['new_text'] = df['new_text'].apply(lambda x: ' '.join(
        [item for item in x.split() if item not in stop_words]))

    df = df.reset_index()
    df = df[df['new_text'].notnull()]
    df = df.ix[:, ['label', 'new_text']]

    return df

In [55]:
# Load your data and process it
data = pd.concat([pd.read_csv('reviews_rt_all.csv',sep='|'), 
                  pd.read_csv('imdb_small.csv',sep='|')], ignore_index=True)
#data = pd.read_csv('imdb_small.csv', sep='|')  # here type your file name
print('source data: ', data.shape)
# print('Longest string: %d' % data['text'].str.len().max())

data = finalize_df(data, lemmatizer)

tokenizer = Tokenizer(nb_words=max_features)
tokenizer.fit_on_texts(data['new_text'])
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

source data:  (152610, 2)
Found 85231 unique tokens.


In [61]:
if TRAIN_MODEL:
    X_train_1, X_test_1, y_train, y_test = train_test_split(data['new_text'],
                                                            data['label'],
                                                            test_size=0.2,
                                                            random_state=29,
                                                            stratify=data['label'])
    sequences_train = tokenizer.texts_to_sequences(X_train_1)
    sequences_test = tokenizer.texts_to_sequences(X_test_1)
    X_train = pad_sequences(sequences_train, maxlen=maxlen)
    X_test = pad_sequences(sequences_test, maxlen=maxlen)

    model = build_model()
    model.fit(X_train, y_train,
              batch_size=batch_size,
              nb_epoch=nb_epoch,
              validation_data=(X_test, y_test))
    model.save('cnn_foxtrot.h5')
else:
    model = load_model('cnn_foxtrot.h5')
    data_sequences = tokenizer.texts_to_sequences(data['new_text'])
    X_test = pad_sequences(data_sequences, maxlen=maxlen)
    y_test = data['label']

Train on 116655 samples, validate on 29164 samples
Epoch 1/4
    32/116655 [..............................] - ETA: 465s - loss: 0.6932 - acc: 0.3750

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 2/4
Epoch 3/4
Epoch 4/4


In [62]:
predictions = model.predict_classes(X_test)

print("=" * 30)
print('****Results****')
acc = accuracy_score(y_test, predictions)
print("Accuracy: {:.4%} \n".format(acc))

****Results****
Accuracy: 80.6542% 

