In [10]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import testsets
import evaluation
import re
import nltk
from nltk.tag import pos_tag
import pandas as pd 
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from tensorflow.keras.preprocessing.text import Tokenizer
from sklearn.linear_model import LogisticRegression
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow as tf
from nltk.corpus import stopwords
from sklearn.metrics import accuracy_score
from nltk.tokenize import word_tokenize
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, SpatialDropout1D, Dropout, Activation, Embedding, MaxPooling1D, Conv1D
from tensorflow.keras.optimizers import Adam,RMSprop
from tensorflow.keras.callbacks import EarlyStopping
from platform import python_version

# Report the library/interpreter versions this run is using.
print("tensorflow version: ",tf.__version__)
print("Python version: ",python_version())

# Module-level containers that the loading code further down fills in.
train_data = list()
X_train = list()
y_train = list()
embeddings_index = {}
embedding_matrix = list()

# Fetch the NLTK resources used by preprocess(): stop words, the tokenizer
# model, the WordNet lemmatizer data and the POS tagger.
for nltk_package in ('stopwords', 'punkt', 'wordnet', 'averaged_perceptron_tagger'):
    nltk.download(nltk_package)

# Patterns are compiled once and written as raw strings so that `\s` / `\.`
# are regex escapes rather than (invalid) Python string escapes.
_MENTION_RE = re.compile(r'@[^\s]+')
_URL_RE = re.compile(r'(www\.[^\s]+)|(http[^\s]+)|(ftp://[^\s]+)')
_NON_LETTER_RE = re.compile(r'[^a-z\s]')
_SHORT_WORD_RE = re.compile(r'\b[a-z]{1,2}\b')

# Lazily filled cache so the stop-word list and the lemmatizer are created
# once instead of once per tweet.
_PREPROCESS_RESOURCES = {}


def _get_preprocess_resources():
    """Return the cached (stop-word set, lemmatizer) pair, building it on first use."""
    if not _PREPROCESS_RESOURCES:
        _PREPROCESS_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
        _PREPROCESS_RESOURCES['lemmatizer'] = nltk.stem.WordNetLemmatizer()
    return _PREPROCESS_RESOURCES['stop_words'], _PREPROCESS_RESOURCES['lemmatizer']


def preprocess(data: str) -> str:
    """Normalise one tweet into a space-separated string of lemmas.

    Steps, in order: lowercase; drop @mentions; drop URLs (www./http/ftp://);
    drop every character that is not a-z or whitespace; drop words of one or
    two letters; tokenize; remove English stop words; lemmatize each token
    using its POS tag.

    Args:
        data: Raw tweet text.

    Returns:
        The cleaned tokens joined by single spaces (empty string if nothing
        survives the filtering).
    """
    stop_words, lemmatizer = _get_preprocess_resources()

    processed_text = data.lower()  # Convert to lowercase
    processed_text = _MENTION_RE.sub('', processed_text)  # Remove user mentions
    processed_text = _URL_RE.sub('', processed_text)  # Remove URLs
    processed_text = _NON_LETTER_RE.sub('', processed_text)  # Remove special characters, numbers, punctuation
    processed_text = _SHORT_WORD_RE.sub('', processed_text)  # Remove words with length less than 3

    tokens = [w for w in word_tokenize(processed_text) if w not in stop_words]

    lemmatized_sentence = []
    for word, tag in pos_tag(tokens):
        # Map the tagger's tag prefix onto the lemmatizer's POS codes:
        # nouns -> 'n', verbs -> 'v', everything else is treated as 'a'.
        if tag.startswith('NN'):
            pos = 'n'
        elif tag.startswith('VB'):
            pos = 'v'
        else:
            pos = 'a'
        lemmatized_sentence.append(lemmatizer.lemmatize(word, pos))
    return ' '.join(lemmatized_sentence)

# Pre trained word embedding
# Each line of the GloVe file is "<word> <v1> <v2> ...". The file is streamed
# line by line so it is never held in memory as one string plus a list of lines.
with open('glove.6B.100d.txt',encoding="utf8") as g:
    for line in g:
        values = line.split()
        if not values:
            continue  # skip blank lines instead of failing on values[0]
        word = values[0]
        coef = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coef

        
# load training data
# Each usable row has exactly three tab-separated fields; field 1 is the
# sentiment label and field 2 the tweet text. Rows of any other width are skipped.
training_rows = []
with open("twitter-training-data.txt",encoding="utf8") as f:
    # Iterate over the file rather than calling str.splitlines(): splitlines()
    # also breaks on Unicode line boundaries (e.g. \x0c, \x85, \u2028) that can
    # appear inside a tweet, which would truncate that tweet's text.
    for row in f:
        tweet = row.rstrip("\n").split("\t")
        if len(tweet) == 3:
            preprocessed_text = preprocess(tweet[2])
            training_rows.append([preprocessed_text,tweet[1]])
# Naming the columns at construction also works when no row was usable.
train_data = pd.DataFrame(training_rows, columns=["Text", "Sentiment"])
X_train = train_data['Text'].values
y_train = train_data['Sentiment'].values


# For each classifier: build its features from the training tweets, then
# evaluate it on every file listed in testsets.testsets.
for classifier in ['LSTM','Logistic_Regression','BiLSTM']:
    print('Training ' + classifier)
    if classifier == 'LSTM':
        text_tokenizer_LSTM = Tokenizer(num_words=5000, oov_token="oov")
        text_tokenizer_LSTM.fit_on_texts(X_train)
        word_index = text_tokenizer_LSTM.word_index
        embedding_dim = 100
        # Row i holds the pre-trained vector of the word whose tokenizer index
        # is i; words without a pre-trained vector keep an all-zero row.
        embedding_matrix = np.zeros((5000, embedding_dim))
        for word, i in word_index.items():
            if i >= 5000:
                continue
            embedding_vector = embeddings_index.get(word)
            if embedding_vector is not None:
                embedding_matrix[i] = embedding_vector
        x_train = text_tokenizer_LSTM.texts_to_sequences(X_train)
        x_train = pad_sequences(x_train, padding='post', maxlen=100)
        # Fix the one-hot column order from the training labels so the test
        # labels can be encoded with exactly the same columns, even when a
        # test file does not contain every class.
        sentiment_labels = sorted(set(y_train))
        Y_train = pd.get_dummies(pd.Categorical(y_train, categories=sentiment_labels))

    elif classifier == 'Logistic_Regression':
        tfidf_vectorizer =  TfidfVectorizer(max_features=3000, min_df=7, max_df=0.7,ngram_range=(1,3),smooth_idf=False)
        x_train = tfidf_vectorizer.fit_transform(X_train)
        lr = LogisticRegression(penalty='l1',C = 1000,random_state = 0,solver='liblinear',max_iter=10000)
        lr.fit(x_train.toarray(),y_train)

    elif classifier == 'BiLSTM':
        text_tokenizer = Tokenizer(num_words=20000, oov_token="oov")
        text_tokenizer.fit_on_texts(X_train)
        x_train = text_tokenizer.texts_to_sequences(X_train)
        x_train = pad_sequences(x_train, padding='post', maxlen=200)
        # Labels are integer-encoded with a second tokenizer fitted on the
        # training labels.
        label_tokenizer = Tokenizer()
        label_tokenizer.fit_on_texts(y_train)
        Y_train = np.array(label_tokenizer.texts_to_sequences(y_train))

    for testset in testsets.testsets:
        # Load the test set: rows with exactly three tab-separated fields
        # (tweet id, sentiment, text); anything else is skipped.
        test_rows = []
        with open(testset,encoding="utf8") as t:
            # Iterate over the file instead of str.splitlines(), which would
            # also split on Unicode line boundaries inside a tweet's text.
            for row in t:
                tweet = row.rstrip("\n").split("\t")
                if len(tweet) == 3:
                    preprocessed_text = preprocess(tweet[2])
                    test_rows.append([preprocessed_text,tweet[0],tweet[1]])
        test_data = pd.DataFrame(test_rows, columns=["Text","Tweet_Id","Sentiment"])
        X_test = test_data['Text'].values

        # NOTE(review): y_pred is computed in every branch below but is never
        # passed to the imported `evaluation` module -- TODO confirm whether
        # scoring through that module is still meant to happen here.
        # NOTE(review): both neural models are rebuilt and retrained for every
        # test set, with that test set supplied as validation_data (and, for
        # the BiLSTM, driving EarlyStopping). Consider training once per
        # classifier on a held-out validation split instead.
        if classifier == 'LSTM':
            x_test = text_tokenizer_LSTM.texts_to_sequences(X_test)
            x_test = pad_sequences(x_test, padding='post', maxlen=100)
            # Same categories (and therefore column order) as Y_train.
            y_test = pd.get_dummies(pd.Categorical(test_data['Sentiment'], categories=sentiment_labels)).values
            model = Sequential()
            # Frozen embedding layer initialised from the pre-trained matrix.
            model.add(Embedding(5000, 100,weights=[embedding_matrix],trainable = False))
            model.add(Conv1D(100, 5, activation='relu'))
            model.add(MaxPooling1D(pool_size = 4))
            model.add(LSTM(100))
            model.add(Dense(100, activation='relu'))
            model.add(Dense(3, activation='softmax'))
            model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
            model.fit(x_train, Y_train, epochs=5, batch_size=60, verbose = 2,validation_data=(x_test, y_test))
            # evaluate() returns [loss, accuracy] for the compiled metrics.
            train_acc = model.evaluate(x_train, Y_train, verbose=2)
            test_acc = model.evaluate(x_test, y_test, verbose=2)
            y_pred = model.predict(x_test)
            print("LSTM Training Accuracy: ",train_acc)
            print("LSTM Test Accuracy: ",test_acc)

        elif classifier == 'Logistic_Regression':
            x_test= tfidf_vectorizer.transform(X_test)
            y_pred = lr.predict(x_test.toarray())
            print("Logistic Regression Accuracy Score: ",accuracy_score(test_data['Sentiment'].values, y_pred))

        elif classifier == 'BiLSTM':
            x_test = text_tokenizer.texts_to_sequences(X_test)
            x_test = pad_sequences(x_test, padding='post', maxlen=200)
            y_test = np.array(label_tokenizer.texts_to_sequences(test_data['Sentiment'].values))
            # +1 because the embedding needs a row for index 0 as well as for
            # every index in word_index.
            vocab = len(text_tokenizer.word_index) + 1
            model = tf.keras.Sequential([
                tf.keras.layers.Embedding(vocab, 100),
                tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(80,dropout=0.5, recurrent_dropout=0.5)),
                tf.keras.layers.Dense(80, activation='relu'),
                tf.keras.layers.Dropout(0.5),
                # presumably 4 outputs because label_tokenizer indices start
                # at 1, leaving output 0 unused -- TODO confirm
                tf.keras.layers.Dense(4, activation='softmax')
            ])
            model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
            model.fit(x_train, Y_train, epochs=5, batch_size=64, verbose = 2,validation_data=(x_test, y_test),callbacks=[EarlyStopping(monitor='val_loss', patience=3, min_delta=0.0001)])
            # evaluate() returns [loss, accuracy] for the compiled metrics.
            train_acc = model.evaluate(x_train, Y_train, verbose=2)
            test_acc = model.evaluate(x_test, y_test, verbose=2)
            y_pred = model.predict(x_test)
            print("BiLSTM Training Accuracy: ",train_acc)
            print("BiLSTM Test Accuracy",test_acc)
            



tensorflow version:  2.3.0
Python version:  3.8.5


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\swech\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\swech\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\swech\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\swech\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


Training LSTM
Epoch 1/5
752/752 - 34s - loss: 0.9058 - accuracy: 0.5462 - val_loss: 0.8322 - val_accuracy: 0.6168
Epoch 2/5
752/752 - 34s - loss: 0.8096 - accuracy: 0.6240 - val_loss: 0.8030 - val_accuracy: 0.6494
Epoch 3/5
752/752 - 34s - loss: 0.7566 - accuracy: 0.6552 - val_loss: 0.7861 - val_accuracy: 0.6565
Epoch 4/5
752/752 - 34s - loss: 0.7066 - accuracy: 0.6835 - val_loss: 0.7865 - val_accuracy: 0.6542
Epoch 5/5
752/752 - 34s - loss: 0.6616 - accuracy: 0.7087 - val_loss: 0.8779 - val_accuracy: 0.6270
1410/1410 - 12s - loss: 0.5970 - accuracy: 0.7411
111/111 - 1s - loss: 0.8779 - accuracy: 0.6270
LSTM Training Accuracy:  [0.5970451831817627, 0.7410922050476074]
LSTM Test Accuracy:  [0.8779105544090271, 0.6270178556442261]
Epoch 1/5
752/752 - 35s - loss: 0.9015 - accuracy: 0.5555 - val_loss: 0.7531 - val_accuracy: 0.6411
Epoch 2/5
752/752 - 34s - loss: 0.8078 - accuracy: 0.6272 - val_loss: 0.7419 - val_accuracy: 0.6303
Epoch 3/5
752/752 - 34s - loss: 0.7533 - accuracy: 0.6563 - v