In [48]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import CountVectorizer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
import re
from html import unescape
from nltk.corpus import stopwords
import string
import tensorflow as tf
import tensorflow_hub as hub
from nltk.tokenize import RegexpTokenizer

In [49]:
data = pd.read_csv('Sentiment.csv')
# Keeping only the neccessary columns
data = data[['text','sentiment']]
data = data[data.sentiment != "Neutral"]

In [4]:
def clean_sentence(sentence):
      emoji_pattern = re.compile("["
            u"\U0001F600-\U0001F64F"  # emoticons
            u"\U0001F300-\U0001F5FF"  # symbols & pictographs
            u"\U0001F680-\U0001F6FF"  # transport & map symbols
            u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
            u"\U00002702-\U000027B0"
            u"\U000024C2-\U0001F251"
            "]+", flags=re.UNICODE)
      #HappyEmoticons
      emoticons_happy = set([
      ':-)', ':)', ';)', ':o)', ':]', ':3', ':c)', ':>', '=]', '8)', '=)', ':}',
      ':^)', ':-D', ':D', '8-D', '8D', 'x-D', 'xD', 'X-D', 'XD', '=-D', '=D',
      '=-3', '=3', ':-))', ":'-)", ":')", ':*', ':^*', '>:P', ':-P', ':P', 'X-P',
      'x-p', 'xp', 'XP', ':-p', ':p', '=p', ':-b', ':b', '>:)', '>;)', '>:-)',
      '<3'
      ])
      # Sad Emoticons
      emoticons_sad = set([
      ':L', ':-/', '>:/', ':S', '>:[', ':@', ':-(', ':[', ':-||', '=L', ':<',
      ':-[', ':-<', '=\\', '=/', '>:(', ':(', '>.<', ":'-(", ":'(", ':\\', ':-c',
      ':c', ':{', '>:\\', ';('
      ])
      #combine sad and happy emoticons
      emoticons = emoticons_happy.union(emoticons_sad)
      # remove mention
      sentence=re.sub('@\w*', '', sentence)
      # remove hashtag
      sentence = re.sub('#', '', sentence)
      # remove retweet
      sentence = re.sub('RT[\s]+', '', sentence) 
      # remove hyperlink
      sentence = re.sub('https?:\/\/\S+', '', sentence)
      # remove special string of html content
      sentence=BeautifulSoup(unescape(sentence)).text
      # remove icon
      for icon in emoticons:
          sentence=sentence.replace(icon,'')
      # remove emoji
      sentence=emoji_pattern.sub(r'', sentence)
      sentence = re.sub(r"what's", "what is ", sentence)
      sentence = re.sub(r"\'s", " ", sentence)
      sentence = re.sub(r"\'ve", " have ", sentence)
      sentence = re.sub(r"can't", "cannot ", sentence)
      sentence = re.sub(r"n't", " not ", sentence)
      sentence = re.sub(r"i'm", "i am ", sentence)
      sentence = re.sub(r"\'re", " are ", sentence)
      sentence = re.sub(r"\'d", " would ", sentence)
      sentence = re.sub(r"\'ll", " will ", sentence)
      sentence = re.sub(r"\'scuse", " excuse ", sentence)
      sentence = re.sub('\W', ' ', sentence)
      sentence = re.sub('\s+', ' ', sentence)
      sentence=sentence.strip(' ')
      
      return sentence

In [5]:
def sen_tokenize(sentence):
    #Emoji patterns
    tokenizer = RegexpTokenizer(r"[A-Za-z0-9]\w*(?:['?]\w+)?")
    tokens=[]
    tokens+=tokenizer.tokenize(clean_sentence(sentence))
    # lower case token
    tokens=[w.lower() for w in tokens]
    return tokens

In [6]:
def filter_tokens(sentence):
    stop_words = set(stopwords.words('english'))
    tokens=sen_tokenize(sentence)
    filtered_tokens=[]
    # remove punctation
    for w in tokens:
        if w not in string.punctuation and w not in stop_words and len(w)>=3 and not(w.isnumeric()) :
            filtered_tokens.append(w)
    return filtered_tokens

In [7]:
 import nltk
 nltk.download('stopwords')
for idx in range(len(data)):
  data.iloc[idx,0]=filter_tokens(data.iloc[idx,0])

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [8]:
for i in range(len(data)):
    data.iloc[i,0]=' '.join(data.iloc[i,0])

In [21]:
# Load Pretrained Word2Vec
embed = hub.load("https://tfhub.dev/google/Wiki-words-250/2")

In [85]:
def get_word2vec_enc(reviews):
    """
    get word2vec value for each word in sentence.
    concatenate word in numpy array, so we can use it as RNN input
    """
    encoded_reviews = []
    for review in reviews:
        tokens = review.split(" ")
        word2vec_embedding = embed(tokens)
        encoded_reviews.append(word2vec_embedding)
    return encoded_reviews


def get_padded_encoded_reviews(encoded_reviews):
    """
    for short sentences, we prepend zero padding so all input to RNN has same length
    """
    padded_reviews_encoding = []
    for enc_review in encoded_reviews:
        zero_padding_cnt = 33 - enc_review.shape[0]
        pad = np.zeros((1, 250))
        for i in range(zero_padding_cnt):
            enc_review = np.concatenate((pad, enc_review), axis=0)
        padded_reviews_encoding.append(enc_review)
    return padded_reviews_encoding


In [98]:
X=get_padded_encoded_reviews(get_word2vec_enc(data.text))
X=np.array(tf.convert_to_tensor(X, dtype=tf.float32))

In [100]:
X.shape

(10729, 33, 250)

In [10]:
# from sklearn.feature_extraction.text import TfidfVectorizer
# tfidf_model = TfidfVectorizer(token_pattern=r"\w+['\w]*",ngram_range=(1,2),min_df=0.01,max_df=0.95) # specify parameters here
# tfidf_model.fit(data['text'])

In [11]:
# raw_data_tfidf=tfidf_model.transform(data['text'])

In [125]:
max_features = 2000
tokenizer = Tokenizer(num_words=max_features, split=' ')
tokenizer.fit_on_texts(data['text'].values)
X = tokenizer.texts_to_sequences(data['text'].values)
X = pad_sequences(X)

In [13]:
# X=raw_data_tfidf
# X=X.toarray()
# X=[sorted(X[idx],reverse=True)[:20] for idx in range(len(X))]
# X=np.array(X)
# X = np.expand_dims(X, 2)

In [14]:
# X=[sorted(X[idx],reverse=True)[:20] for idx in range(len(X))]

In [15]:
# model = Sequential()
# model.add(LSTM(units=64,input_shape=(20, 1 ),dropout=0.2, recurrent_dropout=0.2))
# model.add(Dense(2,activation='softmax'))
# model.compile(loss = 'categorical_crossentropy', optimizer='adam',metrics = ['accuracy'])
# print(model.summary())

In [126]:
from sklearn import preprocessing
le=preprocessing.LabelEncoder()
Y=le.fit_transform(data['sentiment'])
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.33, random_state = 42)
print(X_train.shape,Y_train.shape)
print(X_test.shape,Y_test.shape)

(7188, 29) (7188,)
(3541, 29) (3541,)


In [112]:
from sklearn.utils import resample
X_train=pd.DataFrame(data=X_train[0:,0:],
    index=[i for i in range(X_train.shape[0])],
    columns=['f'+str(i) for i in range(X_train.shape[1])])
Y_train=pd.DataFrame(data=Y_train)

df = pd.concat([X_train, Y_train], axis=1)


# Separate majority and minority classes
# Separate majority and minority classes
df_majority = df[df.iloc[:,18]==0]
df_minority = df[df.iloc[:,18]==1]

# Upsample minority class
df_minority_upsampled = resample(df_minority, 
                                 replace=True,     # sample with replacement
                                 n_samples=len(df_majority),    # to match majority class
                                 random_state=1234) # reproducible results
 
# Combine majority class with upsampled minority class
df_upsampled = pd.concat([df_majority, df_minority_upsampled])
 
# Display new class counts


ValueError: ignored

In [12]:
X_train=np.array(df_upsampled.iloc[:,:18])
Y_train=np.array(df_upsampled.iloc[:,18])

In [127]:
embed_dim = 128
model = Sequential()
model.add(Embedding(max_features, embed_dim,input_length = X.shape[1]))
model.add(SpatialDropout1D(0.4))
model.add(LSTM(196, dropout=0.2, recurrent_dropout=0.2,input_shape=(29,128)))
model.add(Dense(128,kernel_initializer='HeNormal', input_dim=196,activation='relu'))
model.add(Dense(64,kernel_initializer='HeNormal', input_dim=128,activation='relu'))
model.add(Dense(1,activation='sigmoid'))
model.compile(loss = 'binary_crossentropy', optimizer='adam',metrics = ['accuracy'])
print(model.summary())





Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 29, 128)           256000    
_________________________________________________________________
spatial_dropout1d_1 (Spatial (None, 29, 128)           0         
_________________________________________________________________
lstm_4 (LSTM)                (None, 196)               254800    
_________________________________________________________________
dense_9 (Dense)              (None, 128)               25216     
_________________________________________________________________
dense_10 (Dense)             (None, 64)                8256      
_________________________________________________________________
dense_11 (Dense)             (None, 1)                 65        
Total params: 544,337
Trainable params: 544,337
Non-trainable params: 0
________________________________________________

In [128]:

from keras.callbacks import ModelCheckpoint
batch_size = 64

checkpoint = ModelCheckpoint('model_best_weights.h5', monitor='val_accuracy', verbose=2, save_best_only=True, mode='max', period=1)

model.fit(X_train, Y_train, epochs =10,validation_data = (X_test, 
Y_test), callbacks=[checkpoint] ,batch_size=batch_size, verbose = 2)





Epoch 1/10
113/113 - 16s - loss: 0.4561 - accuracy: 0.8043 - val_loss: 0.3781 - val_accuracy: 0.8376

Epoch 00001: val_accuracy improved from -inf to 0.83762, saving model to model_best_weights.h5
Epoch 2/10
113/113 - 14s - loss: 0.3232 - accuracy: 0.8623 - val_loss: 0.3540 - val_accuracy: 0.8563

Epoch 00002: val_accuracy improved from 0.83762 to 0.85626, saving model to model_best_weights.h5
Epoch 3/10
113/113 - 14s - loss: 0.2835 - accuracy: 0.8836 - val_loss: 0.3381 - val_accuracy: 0.8565

Epoch 00003: val_accuracy improved from 0.85626 to 0.85654, saving model to model_best_weights.h5
Epoch 4/10
113/113 - 14s - loss: 0.2488 - accuracy: 0.8964 - val_loss: 0.3574 - val_accuracy: 0.8577

Epoch 00004: val_accuracy improved from 0.85654 to 0.85767, saving model to model_best_weights.h5
Epoch 5/10
113/113 - 14s - loss: 0.2266 - accuracy: 0.9083 - val_loss: 0.3991 - val_accuracy: 0.8591

Epoch 00005: val_accuracy improved from 0.85767 to 0.85908, saving model to model_best_weights.h5
Epo

<tensorflow.python.keras.callbacks.History at 0x7f8a7062d940>

In [129]:
from sklearn.metrics import roc_auc_score
from keras.models import load_model

# load model
model = load_model('model_best_weights.h5')





In [130]:
y_pred=model.predict_classes(X_test)



In [131]:
roc_auc_score(Y_test,y_pred)

0.7450170349363898

In [132]:
score,acc = model.evaluate(X_test, Y_test, verbose = 2, batch_size = batch_size)

56/56 - 1s - loss: 0.3991 - accuracy: 0.8591


In [None]:
pos_cnt, neg_cnt, pos_correct, neg_correct = 0, 0, 0, 0
for idx in range(len(X_test)):
    
    result = model.predict_classes(X_test[idx].reshape(1,29))[0][0]
   
    if result == Y_test[idx]:
        if result == 0:
            neg_correct += 1
        else:
            pos_correct += 1
       
    if Y_test[idx] == 0:
        neg_cnt += 1
    else:
        pos_cnt += 1



print("pos_acc", pos_correct/pos_cnt*100, "%")
print("neg_acc", neg_correct/neg_cnt*100, "%")



In [110]:
X_test.shape

(3541, 33, 250)



array([[1]], dtype=int32)