In [1]:
import os
import cv2
import pandas as pd
import math
import numpy as np
import warnings
warnings.filterwarnings("ignore")

# CSV file names handed to load_data, which joins them onto an optional
# directory (empty by default, so the names are used as given).
train_file = "train_data.csv"
test_file = "test_data.csv"

def load_data(file, direc="", sep=",", header=True):
    """Read a delimited text file into a pandas DataFrame.

    Parameters
    ----------
    file : str
        Name (or path) of the file to read.
    direc : str, optional
        Directory joined in front of ``file``. The default empty string
        leaves ``file`` as it is.
    sep : str, optional
        Field delimiter forwarded to ``pandas.read_csv``.
    header : bool, optional
        When truthy, pandas' default header inference is used. When falsy,
        the file is read with ``header=None`` and the columns are numbered.

    Returns
    -------
    pandas.DataFrame
        The parsed table. ``index_col=False`` is always passed, so no
        column is promoted to the index.
    """
    # `header` is only passed explicitly when the file has no header row;
    # otherwise read_csv falls back to its own default.
    read_options = {"sep": sep, "index_col": False}
    if not header:
        read_options["header"] = None
    return pd.read_csv(os.path.join(direc, file), **read_options)
    

In [2]:
# Read the training CSV with load_data's defaults (comma-separated, header row inferred).
train_data = load_data(train_file)

In [3]:
# Preview the first rows.
# NOTE(review): the displayed frame carries an "Unnamed: 0" column that mirrors
# the row numbers — presumably an index written into the CSV; confirm and drop
# it before treating the columns as features.
train_data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,216,217,218,219,220,221,222,223,224,Sentiment
0,-0.293233,0.075336,-0.265466,-0.356092,0.06643,0.761425,-0.001033,-0.340051,-0.140389,-0.384717,...,-0.304356,0.110543,-0.257856,0.248262,0.304541,-0.454722,0.313093,0.096131,0.022209,1.0
1,-0.281317,-0.052475,-0.227652,-0.145945,0.129469,0.254899,0.192881,-0.210649,-0.311281,0.397568,...,0.147703,-0.144117,0.197492,0.157018,0.054766,-0.073626,0.093853,0.080051,-0.250832,0.0
2,0.065311,0.078494,0.018382,-0.024447,-0.185517,-0.06263,0.390812,0.222266,0.223175,-0.43704,...,0.391639,-0.040531,0.092544,-0.397565,0.517327,-0.164458,0.110916,0.456602,-0.073503,0.0
3,0.123459,0.029362,0.05882,-0.235124,0.156227,0.575867,0.26641,-0.644778,-0.04655,-0.047256,...,0.125449,-0.121503,-0.549419,0.024966,0.150538,-0.30221,0.330751,-0.321547,-0.021141,1.0
4,-0.417293,0.00713,0.499212,-0.30981,-0.079425,-0.137011,0.22843,-0.066997,0.284044,0.465361,...,0.143851,-0.092237,-0.244069,0.22202,-0.360323,-0.032631,-0.143966,-0.087031,-0.474025,1.0


In [None]:
from __future__ import print_function
import numpy as np

from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Embedding, LSTM, Bidirectional
from keras.datasets import imdb


# Vocabulary size: passed as num_words to imdb.load_data, so only the
# max_features most common words are kept.
max_features = 20000
# Length every review is padded/truncated to (maxlen argument of
# sequence.pad_sequences). With pad_sequences' defaults, reviews longer than
# this keep their LAST maxlen tokens; the beginning is cut off.
maxlen = 100
# NOTE(review): not referenced in the visible cells — presumably the training
# batch size; confirm against the model-fitting cell.
batch_size = 32

In [7]:
# Load the Keras IMDB dataset, keeping only the `max_features` most common
# words (passed as num_words).
print('Loading data...')
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features)
# Number of reviews in each split.
print(len(x_train), 'train sequences')
print(len(x_test), 'test sequences')

# Shapes are one-dimensional here (see the output): each element is a
# variable-length sequence until the padding cell runs.
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)

Loading data...
25000 train sequences
25000 test sequences
x_train shape: (25000,)
x_test shape: (25000,)


In [9]:
# Inspect one raw (not yet padded) review: a plain list of integer word indices.
x_train[0]

[1,
 14,
 22,
 16,
 43,
 530,
 973,
 1622,
 1385,
 65,
 458,
 4468,
 66,
 3941,
 4,
 173,
 36,
 256,
 5,
 25,
 100,
 43,
 838,
 112,
 50,
 670,
 2,
 9,
 35,
 480,
 284,
 5,
 150,
 4,
 172,
 112,
 167,
 2,
 336,
 385,
 39,
 4,
 172,
 4536,
 1111,
 17,
 546,
 38,
 13,
 447,
 4,
 192,
 50,
 16,
 6,
 147,
 2025,
 19,
 14,
 22,
 4,
 1920,
 4613,
 469,
 4,
 22,
 71,
 87,
 12,
 16,
 43,
 530,
 38,
 76,
 15,
 13,
 1247,
 4,
 22,
 17,
 515,
 17,
 12,
 16,
 626,
 18,
 19193,
 5,
 62,
 386,
 12,
 8,
 316,
 8,
 106,
 5,
 4,
 2223,
 5244,
 16,
 480,
 66,
 3785,
 33,
 4,
 130,
 12,
 16,
 38,
 619,
 5,
 25,
 124,
 51,
 36,
 135,
 48,
 25,
 1415,
 33,
 6,
 22,
 12,
 215,
 28,
 77,
 52,
 5,
 14,
 407,
 16,
 82,
 10311,
 8,
 4,
 107,
 117,
 5952,
 15,
 256,
 4,
 2,
 7,
 3766,
 5,
 723,
 36,
 71,
 43,
 530,
 476,
 26,
 400,
 317,
 46,
 7,
 4,
 12118,
 1029,
 13,
 104,
 88,
 4,
 381,
 15,
 297,
 98,
 32,
 2071,
 56,
 26,
 141,
 6,
 194,
 7486,
 18,
 4,
 226,
 22,
 21,
 134,
 476,
 26,
 480,
 5,
 144,
 30,

In [4]:
# Bring every review to exactly `maxlen` tokens so that each split becomes a
# 2-D (samples x time) integer matrix.
print('Pad sequences (samples x time)')
x_train, x_test = (
    sequence.pad_sequences(split, maxlen=maxlen) for split in (x_train, x_test)
)
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)

# Labels as NumPy arrays, converted after the inputs have been padded.
y_train, y_test = np.array(y_train), np.array(y_test)


Pad sequences (samples x time)
x_train shape: (25000, 100)
x_test shape: (25000, 100)


In [5]:
# Display the padded training matrix (NumPy abbreviates the middle rows and columns).
x_train

array([[1415,   33,    6, ...,   19,  178,   32],
       [ 163,   11, 3215, ...,   16,  145,   95],
       [1301,    4, 1873, ...,    7,  129,  113],
       ...,
       [  11,    6, 4065, ...,    4, 3586,    2],
       [ 100, 2198,    8, ...,   12,    9,   23],
       [  78, 1099,   17, ...,  204,  131,    9]], dtype=int32)