In [1]:
import os
import numpy as np
from gensim import models
import string
from keras.preprocessing.text import Tokenizer
from tqdm import tqdm
import math

Using TensorFlow backend.


In [2]:
# Root against which the embedding and data directories are resolved.
BASE_DIR = '../'
# Directory that holds the pretrained embedding file.
# NOTE(review): the comment originally attached here linked the GloVe project page
# (http://nlp.stanford.edu/projects/glove/), but EMBEDDING_FILE is named like the
# GoogleNews word2vec binary and is read with load_word2vec_format(binary=True) —
# confirm the real download source.
EMBEDDING_DIR = BASE_DIR + 'alpha/embeddings/'
EMBEDDING_FILE = "GoogleNews-vectors-negative300.bin"
# Location and name of the labelled text file read by load_data().
TEXT_DATA_DIR = BASE_DIR + 'data/'
TEXT_DATA_FILE = "movie_reviews.csv"
# When True, load_data() discards the first line of the file as a header row.
HEADER = True

In [3]:
def load_data(path=None, header=None):
    """Read the labelled text file into two parallel lists.

    Every data line is expected to look like ``<label>,<text>``. Only the first
    comma is treated as the separator, so commas inside the text are preserved.
    No CSV unquoting is performed: quote characters stay part of the text.

    Args:
        path: File to read. Defaults to ``TEXT_DATA_DIR/TEXT_DATA_FILE``.
        header: Whether to discard the first line. Defaults to ``HEADER``.

    Returns:
        ``(x, y)`` where ``x`` is the list of texts and ``y`` the list of
        labels (still strings), in file order.

    Raises:
        ValueError: If a non-blank line contains no comma.
    """
    if path is None:
        path = os.path.join(TEXT_DATA_DIR, TEXT_DATA_FILE)
    if header is None:
        header = HEADER

    x = []
    y = []
    with open(path, "r") as f:
        if header:
            # Default of None keeps an empty file from raising StopIteration.
            next(f, None)
        for line in f:
            line = line.rstrip("\n")
            if not line.strip():
                # Blank lines (e.g. a trailing newline at EOF) carry no record
                # and would otherwise break the two-value unpacking below.
                continue
            temp_y, temp_x = line.split(",", 1)
            x.append(temp_x)
            y.append(temp_y)

    return x, y

In [4]:
# Read the texts and their labels; the labels arrive as strings and are
# converted to an int8 vector.
X, y = load_data()
y = np.asarray(y, dtype=np.int8)

In [5]:
# Sanity check: number of labels that were loaded.
len(y)

152610

In [6]:
def load_w2v(fname=None):
    """Load pretrained word vectors stored in the binary word2vec format.

    Args:
        fname: Path of the vector file. Defaults to
            ``EMBEDDING_DIR/EMBEDDING_FILE``.

    Returns:
        The object returned by ``models.KeyedVectors.load_word2vec_format``.
    """
    if fname is None:
        # os.path.join (as in load_data) yields the same path as plain
        # concatenation today and keeps working if the directory constant
        # ever loses its trailing separator.
        fname = os.path.join(EMBEDDING_DIR, EMBEDDING_FILE)
    return models.KeyedVectors.load_word2vec_format(fname, binary=True)

In [7]:
# Load the pretrained vectors once; the synonym-lookup cell queries this object.
w2v_model = load_w2v()

In [8]:
# Characters handed to the Keras Tokenizer as its `filters` argument.
TOKEN_FILTERS = '"#$%&()*+-/:;<=>@[\\]^{|}~\t\n,.'

# Build the vocabulary (exposed as tokenizer.word_index) from every text in X.
tokenizer = Tokenizer(filters=TOKEN_FILTERS)
tokenizer.fit_on_texts(X)

In [9]:
# Number of nearest neighbours ("synonyms") kept per vocabulary word; later
# cells create one extra sentence per neighbour rank.
N = 1

# Maps each tokenizer vocabulary word to the list of its N nearest neighbours
# in the embedding space. Words the embedding model does not know are left out.
synonyms = dict()
for word in tqdm(tokenizer.word_index.keys(), desc="Synonyms adding"):
    try:
        neighbours = w2v_model.most_similar(positive=[word], topn=N, restrict_vocab=300000)
    except KeyError:
        # Raised for out-of-vocabulary words. Catching only KeyError (instead
        # of a bare `except`) lets Ctrl-C and genuine errors surface during
        # this long-running loop.
        continue
    # most_similar yields (word, similarity) pairs; only the word is kept.
    synonyms[word] = [neighbour for neighbour, _similarity in neighbours]

Synonyms adding: 100%|██████████| 166734/166734 [21:01<00:00, 132.12it/s]


In [11]:
# Build the augmented corpus. For every text in X, new_sent receives N + 1
# consecutive rows: the untouched original followed by N variants in which each
# word is replaced by one of its neighbours from `synonyms`. Words without an
# entry in `synonyms` are omitted from the variants. The variant built from the
# closest neighbour (rank 0) is stored last in the group.
new_sent = []
for sentence in tqdm(X):
    variants = [""] * N
    for token in sentence.split():
        # The keys of `synonyms` come from the Keras tokenizer vocabulary, which
        # is lowercased by default, so the lookup key must be lowercased too;
        # otherwise capitalised words never find their entry.
        key = token.strip(string.punctuation).lower()
        neighbours = synonyms.get(key)
        if not neighbours:
            continue
        # Slicing guards against a neighbour list shorter than N.
        for rank, neighbour in enumerate(neighbours[:N]):
            variants[rank] += " " + neighbour
    new_sent.append(sentence)
    new_sent.extend(reversed(variants))

100%|██████████| 152610/152610 [00:20<00:00, 7349.54it/s]


In [12]:
# The first row of each group is the original, unmodified text.
new_sent[0]

'"To an entire generation of filmgoers, it just might represent the most significant leap in storytelling that they will ever see..."'

In [13]:
# Every text contributes N + 1 rows (the original plus N variants), so the
# first value must equal len(X). Dividing by N + 1 instead of a literal 2 keeps
# the check valid when N changes.
len(new_sent) / (N + 1), len(X)

(152610.0, 152610)

In [14]:
# Repeat each label once per row generated for its text (original + N variants).
# A hard-coded 2 is only correct for N == 1 and would silently misalign labels
# and sentences for any other N.
y_new = np.repeat(y, N + 1)
assert len(y_new) == len(new_sent), "labels and augmented sentences are misaligned"

In [77]:
import re, nltk
from nltk.stem import SnowballStemmer

In [78]:
# Built once at definition time: tokenize() is passed to the TF-IDF vectorizer
# and therefore runs for every document, so rebuilding these per call is wasted
# work.
_NON_LETTERS = re.compile("[^a-zA-Z]")
_STEMMER = SnowballStemmer("english")


def tokenize(text):
    """Split `text` into stemmed word tokens.

    Every character outside a-z / A-Z is replaced by a space, the result is
    tokenized with ``nltk.word_tokenize`` and each token is stemmed with the
    English Snowball stemmer.

    Args:
        text: The raw string to tokenize.

    Returns:
        A list of stemmed tokens, in order of appearance.
    """
    letters_only = _NON_LETTERS.sub(" ", text)
    return [_STEMMER.stem(word) for word in nltk.word_tokenize(letters_only)]

In [81]:
from sklearn.model_selection import train_test_split
# Split on the ORIGINAL texts and only afterwards pull in their augmented rows.
# Splitting the augmented rows directly can put a text in the training set and
# its synonym variant in the test set (or vice versa), which leaks test
# information into training and inflates the score.
rows_per_text = N + 1  # new_sent stores each text as N + 1 consecutive rows
train_ids, test_ids = train_test_split(
        np.arange(len(y)),
        test_size=0.2,
        random_state=42, stratify=y)

# Expand every selected text index to the row indices of its whole group.
row_offsets = np.arange(rows_per_text)
train_rows = (train_ids[:, None] * rows_per_text + row_offsets).ravel()
test_rows = (test_ids[:, None] * rows_per_text + row_offsets).ravel()

X_train = [new_sent[row] for row in train_rows]
X_test = [new_sent[row] for row in test_rows]
y_train, y_test = y_new[train_rows], y_new[test_rows]

In [83]:
# Drop the embedding model and the Keras tokenizer. None of the cells shown
# after this point reference either name, but the synonym cell cannot be re-run
# without reloading them first.
del w2v_model, tokenizer

In [82]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer

# Binary TF-IDF over word 1- to 3-grams, using the stemming tokenize() function;
# terms present in more than 75% of the documents are dropped.
tfidf = TfidfVectorizer(
    tokenizer=tokenize,
    analyzer='word',
    ngram_range=(1, 3),
    binary=True,
    max_df=0.75,
)
# Logistic regression with C = 100.
classifier = LogisticRegression(C=100)

pipeline = Pipeline(steps=[
    ('vectorizer', tfidf),
    ('classifier', classifier),
])
model = pipeline.fit(X_train, y_train)

KeyboardInterrupt: 

In [73]:
# Inspect the first two rows of new_sent.
# NOTE(review): the recorded output for this cell shows lists of tokens, whereas
# the augmentation cell in this notebook builds plain strings — the output looks
# like it comes from an earlier version of the code; re-run to refresh.
new_sent[:2]

[['AN',
  'ENTIRE',
  'generations',
  'cinema_goers',
  'just',
  'so',
  'would',
  'represents',
  'that',
  'arguably',
  'considerable',
  'quantum_leap',
  'the',
  'storytellers',
  'actually',
  'They',
  'would',
  'arguably',
  'know'],
 ['another',
  'rest',
  'Generation',
  'cinemagoers',
  'something',
  'maybe',
  'could',
  'representing',
  'in',
  'particularly',
  'signficant',
  'leaps',
  'where',
  'Storytelling',
  'not',
  'them',
  'can',
  'never',
  'look']]

In [15]:
import pandas as pd
# Pair every row of the augmented corpus with its label in a two-column frame.
df = pd.DataFrame({"text":new_sent, "label":y_new})

In [18]:
# Persist the augmented dataset; index=False keeps the row index out of the file.
df.to_csv("enriched_data.csv", index=False)

In [102]:
# Reload the saved dataset from disk; this rebinds `df` to the frame read back
# from the CSV.
df = pd.read_csv("enriched_data.csv")

In [17]:
# (rows, columns) of the dataset.
df.shape

(305220, 2)