# Deep Learning and Applications : Joint Faculty Development Programme
# December 9-13, 2019

**Principal Coordinator - IIITDM Jabalpur | Co-Principal Coordinator - NIT Warangal**

**Participating Academies - IIITDM Jabalpur, MNIT Jaipur, NIT Patna, NIT Warangal**


## Tutorial 7 - Sentiment Analysis

In [1]:
import pandas as pd

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

from sklearn.model_selection import train_test_split

import keras

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
# --- Text / vocabulary settings ---
# Vocabulary cap: used as the Tokenizer's num_words and as the Embedding input size.
# A good value could be (actual vocabulary size + 1).
MAX_FEATURES = 10000
# Every tokenized phrase is padded to this many tokens.
MAX_LENGTH = 125
# Size of each learned word vector.
EMBEDDING_LENGTH = 100

# --- Training settings ---
BATCH_SIZE = 32
EPOCHS = 1

# --- Reproducibility ---
RANDOM_SEED = 1

In [3]:
# Load the tab-separated train and test files from the working directory.
train_df, test_df = (
    pd.read_csv(tsv_path, sep="\t") for tsv_path in ('train.tsv', 'test.tsv')
)

In [4]:
# List the column names of the training frame.
train_df.columns

Index(['PhraseId', 'SentenceId', 'Phrase', 'Sentiment'], dtype='object')

In [5]:
# Print (rows, columns) for the training frame, then for the test frame.
for frame in (train_df, test_df):
    print(frame.shape)

(156060, 4)
(66292, 3)


In [6]:
# Longest training phrase, measured in characters (.str.len() counts characters, not tokens).
train_df['Phrase'].str.len().max()

283

In [7]:
# Average training phrase length, measured in characters.
train_df['Phrase'].str.len().mean()

40.217224144559786

In [8]:
# Number of training rows per sentiment label, most frequent first.
train_df['Sentiment'].value_counts()

2    79582
3    32927
1    27273
4     9206
0     7072
Name: Sentiment, dtype: int64

In [9]:
# Preprocessing

# Shuffle the training rows. The shuffle is seeded with RANDOM_SEED so that a
# fresh "Restart & Run All" produces the same row order every time.
train_df = train_df.sample(frac=1, random_state=RANDOM_SEED).reset_index(drop=True)

# Lowercase the phrase text in both frames using the vectorized string accessor.
train_df['Phrase'] = train_df['Phrase'].str.lower()
test_df['Phrase'] = test_df['Phrase'].str.lower()

In [10]:
# Phrase text for the train and test frames.
data_train, data_test = train_df['Phrase'], test_df['Phrase']

# One-hot encode the integer Sentiment labels.
labels_train = to_categorical(train_df['Sentiment'].values)

In [11]:
# Tokenization: fit the tokenizer on the training phrases only.
# The text is read straight from the DataFrames rather than from
# data_train / data_test, because this cell overwrites those names with integer
# arrays; reading them back would make a second run of the cell fit the
# tokenizer on already-tokenized data.
tokenizer = Tokenizer(num_words= MAX_FEATURES)
tokenizer.fit_on_texts(train_df['Phrase'].tolist())

# Convert each phrase to a sequence of integer word ids, then pad to MAX_LENGTH.
data_train = pad_sequences(
    tokenizer.texts_to_sequences(train_df['Phrase']), maxlen= MAX_LENGTH
)
data_test = pad_sequences(
    tokenizer.texts_to_sequences(test_df['Phrase']), maxlen= MAX_LENGTH
)

In [12]:
# Shape of the padded training sequences: (number of phrases, MAX_LENGTH).
data_train.shape

(156060, 125)

In [13]:
# Shape of the one-hot label matrix: (number of phrases, number of classes).
labels_train.shape

(156060, 5)

In [14]:
# Hold out 15% of the training data for validation; the split is seeded with RANDOM_SEED.
X_train, X_val, Y_train, Y_val = train_test_split(
    data_train,
    labels_train,
    test_size=0.15,
    random_state=RANDOM_SEED,
)

In [15]:
# Stacked-LSTM sentiment classifier: embedding -> LSTM(64) -> LSTM(32) -> 5-way output.
model = keras.models.Sequential([
    # Learn an EMBEDDING_LENGTH-dimensional vector for each of the MAX_FEATURES word ids.
    keras.layers.Embedding(MAX_FEATURES, EMBEDDING_LENGTH, mask_zero=True),
    # First LSTM returns the full sequence so the second LSTM can consume it.
    keras.layers.LSTM(64, dropout=0.4, recurrent_dropout=0.4, return_sequences=True),
    # Second LSTM returns only its final state, a single vector per phrase.
    keras.layers.LSTM(32, dropout=0.5, recurrent_dropout=0.5, return_sequences=False),
    # Each phrase has exactly one of 5 sentiment classes (one-hot labels), so the
    # output must be a probability distribution over the classes: softmax, not
    # sigmoid. This matches the categorical_crossentropy loss used for training.
    keras.layers.Dense(5, activation='softmax'),
])

model.summary()





Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, None, 100)         1000000   
_________________________________________________________________
lstm_1 (LSTM)                (None, None, 64)          42240     
_________________________________________________________________
lstm_2 (LSTM)                (None, 32)                12416     
_________________________________________________________________
dense_1 (Dense)              (None, 5)                 165       
Total params: 1,054,821
Trainable params: 1,054,821
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Configure training: Adam optimizer, categorical cross-entropy loss, accuracy metric.
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train for EPOCHS passes in mini-batches of BATCH_SIZE, evaluating on the
# held-out validation split after each epoch.
model.fit(
    x=X_train,
    y=Y_train,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    verbose=1,
    validation_data=(X_val, Y_val),
)



Train on 132651 samples, validate on 23409 samples
Epoch 1/1
  2400/132651 [..............................] - ETA: 57:18 - loss: 1.5325 - acc: 0.5083

In [None]:
# Score the padded test sequences. The result is kept in a variable instead of
# being discarded, so it can be reused without re-running inference.
test_probs = model.predict(data_test, verbose= 1)

# Index of the highest-scoring class for each test phrase (the predicted Sentiment label).
test_pred = test_probs.argmax(axis=1)

# Show a small preview rather than dumping the whole array.
test_pred[:10]