# Sentiment analysis - bunq tweets
Because training the model requires a GPU, this notebook was developed and run on Google Colab.

In [None]:
!pip install ekphrasis
!pip install tweet-preprocessor

In [None]:
# The star import stays first so that every explicit import below takes
# precedence over any name tweepy happens to export.
from tweepy import *

import ast
import csv
import itertools
import logging
import os
import pickle
import re
import string
import time
from collections import Counter

import numpy as np
import pandas as pd

from ekphrasis.classes.segmenter import Segmenter
from preprocessor.api import clean, tokenize, parse

import nltk
from nltk import word_tokenize, FreqDist
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.manifold import TSNE
from sklearn.feature_extraction.text import TfidfVectorizer

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Activation, Dense, Dropout, Embedding, Flatten, Conv1D, MaxPooling1D, LSTM
from keras import utils
from keras.callbacks import ReduceLROnPlateau, EarlyStopping

import gensim
from gensim.parsing.preprocessing import remove_stopwords

In [None]:
# Mount Google Drive inside the Colab runtime at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

# 1. Call the Twitter API to get tweets mentioning bunq

In [None]:
# Twitter API credentials are read from environment variables so that secrets
# never have to be pasted into (and saved with) the notebook. An unset variable
# falls back to '' — the placeholder value this cell used before.
consumer_key = os.environ.get('TWITTER_CONSUMER_KEY', '')
consumer_secret = os.environ.get('TWITTER_CONSUMER_SECRET', '')
access_key = os.environ.get('TWITTER_ACCESS_KEY', '')
access_secret = os.environ.get('TWITTER_ACCESS_SECRET', '')

# Build the authenticated tweepy client used by the download cell.
auth = OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_key, access_secret)
api = API(auth, wait_on_rate_limit=True)

In [None]:
search_words = "bunq"
# Exclude retweets so the same tweet text is not collected repeatedly.
new_search = search_words + " -filter:retweets"

# The `with` block guarantees the file is flushed and closed before a later
# cell reads it back; `newline=''` is what the csv module expects for files it
# writes to. Mode 'a' appends, so re-running this cell adds rows to the file.
with open('bunq-tweets.csv', 'a', newline='') as csvFile:
    csvWriter = csv.writer(csvFile)

    # Query with `new_search` so the retweet filter is actually applied.
    # `count` is the page size; the standard search endpoint caps it at 100.
    for tweet in Cursor(api.search, q=new_search, count=100, lang="en", since_id=0).items():
        # Text fields are written as UTF-8 bytes, so each CSV cell holds the
        # bytes repr (b'...'); the cells that read this file strip that wrapper.
        csvWriter.writerow([tweet.created_at,
                            tweet.text.encode('utf-8'),
                            tweet.user.screen_name.encode('utf-8'),
                            tweet.user.location.encode('utf-8')])

In [None]:
# Read the collected tweets back; the CSV has no header row, so the four
# column names are assigned explicitly.
bunq = pd.read_csv('bunq-tweets.csv', header = None)
bunq.columns = ['time', 'text', 'user', 'location']
# Preview the first rows.
bunq.head()

In [None]:
# NOTE(review): `preprocess` is defined in a later cell (section 2). On a fresh
# kernel that cell must be executed before this one, otherwise this line raises
# NameError.
#
# Each tweet is stored as the repr of a bytes object (b'...'), so strip both
# the leading b' and the closing quote before tokenizing.
bunq['tokens'] = bunq['text'].apply(lambda raw: preprocess(raw[2:-1]))
bunq.head()

# 2. Train the sentiment classifier

In [None]:
# DATASET
# Column names for the header-less training CSV and the encoding used to read it.
DATASET_COLUMNS = ["target", "ids", "date", "flag", "user", "text"]
DATASET_ENCODING = "ISO-8859-1"
# Fraction of the rows kept for training; the rest becomes the test split.
TRAIN_SIZE = 0.8

# TEXT CLEANING
# Raw string so `\S` reaches the regex engine without an invalid-escape warning;
# the pattern value itself is unchanged.
# NOTE(review): the `http?:\S` alternative looks redundant next to `https?:\S+` — confirm.
TEXT_CLEANING_RE = r"@\S+|https?:\S+|http?:\S|[^A-Za-z0-9]+"

# WORD2VEC
W2V_SIZE = 300       # embedding dimensionality
W2V_WINDOW = 7       # context window passed to Word2Vec
W2V_EPOCH = 32       # training epochs for Word2Vec
W2V_MIN_COUNT = 10   # min_count passed to Word2Vec

# KERAS
SEQUENCE_LENGTH = 300  # maxlen used when padding token sequences
EPOCHS = 8
BATCH_SIZE = 1024

# SENTIMENT
POSITIVE = "POSITIVE"
NEGATIVE = "NEGATIVE"
NEUTRAL = "NEUTRAL"
# (lower, upper): scores <= lower are NEGATIVE, scores >= upper are POSITIVE,
# anything in between is NEUTRAL.
SENTIMENT_THRESHOLDS = (0.4, 0.7)

# EXPORT
# File names for the exported artefacts.
KERAS_MODEL = "model.h5"
WORD2VEC_MODEL = "model.w2v"
TOKENIZER_MODEL = "tokenizer.pkl"
ENCODER_MODEL = "encoder.pkl"

In [None]:
# The WordNet lemmatizer needs the WordNet corpus, which a fresh runtime does
# not necessarily have; fetch it up front (a no-op when already present) so the
# first call to `lemmatizer.lemmatize` does not fail with LookupError.
for _resource in ("wordnet", "omw-1.4"):
    nltk.download(_resource, quiet=True)

# Word segmenter built on ekphrasis' 'twitter' corpus.
# NOTE(review): not referenced by any other cell shown in this notebook.
seg_tw = Segmenter(corpus='twitter')
# Shared lemmatizer and tweet-aware tokenizer used by `lemmatize_text`.
lemmatizer = nltk.stem.WordNetLemmatizer()
w_tokenizer = TweetTokenizer()

def lemmatize_text(text):
    """Tokenize ``text`` with the shared tweet tokenizer and lemmatize each token.

    Returns the lemmatized tokens as a list, in their original order.
    """
    lemmas = []
    for token in w_tokenizer.tokenize(text):
        lemmas.append(lemmatizer.lemmatize(token))
    return lemmas
    
def remove_punctuation(words):
    """Strip every character that is neither a word character nor whitespace.

    Tokens that end up empty after stripping are dropped; the remaining tokens
    are returned as a new list in their original order.
    """
    stripped = (re.sub(r'[^\w\s]', '', word) for word in words)
    return [token for token in stripped if token != '']

def preprocess(text):
    """Turn a raw tweet string into a list of cleaned tokens.

    Steps, in order: lowercase, tweet-preprocessor ``clean``, gensim
    ``remove_stopwords``, tokenize + lemmatize, then strip punctuation.
    """
    cleaned = clean(text.lower())
    without_stopwords = remove_stopwords(cleaned)
    return remove_punctuation(lemmatize_text(without_stopwords))

In [None]:
# Load the labelled training tweets; column names are supplied explicitly
# through DATASET_COLUMNS.
df = pd.read_csv(
    'training.1600000.processed.noemoticon.csv',
    names=DATASET_COLUMNS,
    encoding=DATASET_ENCODING,
)

df.head()

In [None]:
# Numeric class codes used in the training CSV -> human-readable labels.
decode_map = {0: "NEGATIVE", 2: "NEUTRAL", 4: "POSITIVE"}

def decode_sentiment(label):
    """Map a numeric target code (0 / 2 / 4) to its sentiment label.

    A value that is already one of the decoded labels is returned unchanged, so
    re-running the decoding cell on an already-decoded column is harmless.

    Raises:
        KeyError: if the numeric code is not in ``decode_map``.
        ValueError: if ``label`` is neither a known label nor convertible to int.

    NOTE: a later cell redefines ``decode_sentiment`` with a score-based
    signature; after that cell has run, this version is shadowed.
    """
    if label in decode_map.values():
        return label
    return decode_map[int(label)]

# Replace the numeric targets with their string labels, then replace each raw
# tweet with its list of preprocessed tokens.
df.target = df.target.apply(decode_sentiment)
df.text = df.text.apply(preprocess)

In [None]:
# Hold out (1 - TRAIN_SIZE) of the rows for testing; fixed seed for a
# reproducible split.
df_train, df_test = train_test_split(df, test_size=1-TRAIN_SIZE, random_state=42)

print("TRAIN size:", len(df_train))
print("TEST size:", len(df_test))

# Word2Vec is trained on the training tweets only.
documents = df_train.text

# gensim 4 renamed the `size` constructor argument to `vector_size`; pick the
# keyword that matches the installed version so the cell runs on both.
_w2v_size_kwarg = "vector_size" if int(gensim.__version__.split(".")[0]) >= 4 else "size"
w2v_model = gensim.models.word2vec.Word2Vec(window=W2V_WINDOW,
                                            min_count=W2V_MIN_COUNT,
                                            workers=8,
                                            **{_w2v_size_kwarg: W2V_SIZE})

In [None]:
# Scan the training documents once to build the Word2Vec vocabulary.
w2v_model.build_vocab(documents)

# gensim < 4 exposes the vocabulary as `wv.vocab`; gensim >= 4 replaced it with
# `wv.key_to_index`. Use whichever the installed version provides.
_wv = w2v_model.wv
words = _wv.key_to_index.keys() if hasattr(_wv, "key_to_index") else _wv.vocab.keys()
vocab_size = len(words)
print("Vocab size", vocab_size)

In [None]:
# Train the embeddings on the training documents for W2V_EPOCH passes.
w2v_model.train(documents, epochs=W2V_EPOCH, total_examples=len(documents))

In [None]:
# Fit the Keras tokenizer on the training tweets only.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df_train.text)

# One more than the number of known words.
# NOTE(review): presumably because word indices start at 1, leaving 0 for padding — confirm.
vocab_size = 1 + len(tokenizer.word_index)


In [None]:
# Convert both splits to integer sequences and pad them to SEQUENCE_LENGTH.
x_train, x_test = (
    pad_sequences(tokenizer.texts_to_sequences(split.text), maxlen=SEQUENCE_LENGTH)
    for split in (df_train, df_test)
)

In [None]:
# Labels seen in the training split, followed by NEUTRAL.
labels = [*df_train.target.unique().tolist(), NEUTRAL]
labels

In [None]:
# Fit the label encoder on the training targets only.
encoder = LabelEncoder()
encoder.fit(df_train.target.tolist())

# Encode both splits and reshape each result into a single column.
y_train = encoder.transform(df_train.target.tolist()).reshape(-1, 1)
y_test = encoder.transform(df_test.target.tolist()).reshape(-1, 1)

In [None]:
# One row per tokenizer index; rows for words without a Word2Vec vector stay zero.
embedding_matrix = np.zeros((vocab_size, W2V_SIZE))
for token, row in tokenizer.word_index.items():
    if token not in w2v_model.wv:
        continue
    embedding_matrix[row] = w2v_model.wv[token]
print(embedding_matrix.shape)

In [None]:
# Frozen Word2Vec embeddings -> dropout -> LSTM -> single sigmoid output.
model = Sequential([
    Embedding(vocab_size,
              W2V_SIZE,
              weights=[embedding_matrix],
              input_length=SEQUENCE_LENGTH,
              trainable=False),
    Dropout(0.5),
    LSTM(100, dropout=0.2, recurrent_dropout=0.2),
    Dense(1, activation='sigmoid'),
])

model.summary()

In [None]:
# The metric is requested as 'acc' so that its logged validation name is
# 'val_acc', which is what EarlyStopping monitors below. With 'accuracy' the
# log key is 'val_accuracy' on current Keras and early stopping never triggers.
model.compile(loss='binary_crossentropy',
              optimizer="adam",
              metrics=['acc'])

# Lower the learning rate when validation loss plateaus, and stop early when
# validation accuracy stops improving by at least min_delta.
callbacks = [ ReduceLROnPlateau(monitor='val_loss', patience=5, cooldown=0),
              EarlyStopping(monitor='val_acc', min_delta=1e-4, patience=5)]

# 10% of the training data is held out for validation during fitting.
history = model.fit(x_train, y_train,
                    batch_size=BATCH_SIZE,
                    epochs=EPOCHS,
                    validation_split=0.1,
                    verbose=1,
                    callbacks=callbacks)

In [None]:
# `score` holds [loss, accuracy] for the held-out test split.
score = model.evaluate(x_test, y_test, batch_size=BATCH_SIZE)
print()
for caption, position in (("ACCURACY:", 1), ("LOSS:", 0)):
    print(caption, score[position])

In [None]:
def decode_sentiment(score, include_neutral=True):
    """Translate a model score into a sentiment label.

    With ``include_neutral`` the SENTIMENT_THRESHOLDS pair splits the range
    into NEGATIVE (score <= lower), POSITIVE (score >= upper) and NEUTRAL
    (everything else). Without it, the score is cut at 0.5.

    Note: this definition replaces the earlier label-decoding function of the
    same name once this cell has run.
    """
    if not include_neutral:
        return NEGATIVE if score < 0.5 else POSITIVE

    if score <= SENTIMENT_THRESHOLDS[0]:
        return NEGATIVE
    if score >= SENTIMENT_THRESHOLDS[1]:
        return POSITIVE
    return NEUTRAL
        
def predict(text, include_neutral=True):
    """Predict the sentiment label for one tweet.

    Args:
        text: a raw tweet string, or an already preprocessed list of tokens.
        include_neutral: forwarded to ``decode_sentiment``.

    Returns:
        The sentiment label produced by ``decode_sentiment``.
    """
    # The tokenizer and model were fitted on `preprocess`-ed token lists, so a
    # raw string must go through the same pipeline; otherwise the words seen
    # at inference do not line up with the training vocabulary.
    tokens = preprocess(text) if isinstance(text, str) else text
    # Tokenize and pad exactly as for the training data.
    padded = pad_sequences(tokenizer.texts_to_sequences([tokens]), maxlen=SEQUENCE_LENGTH)
    # Score of the single input row.
    score = model.predict([padded])[0]
    # Decode sentiment
    return decode_sentiment(score, include_neutral=include_neutral)

# 3. Predict sentiments on bunq-related tweets

In [None]:
def _decode_bytes_repr(value):
    """Turn the CSV's ``b'...'`` bytes repr back into real text.

    Values that do not look like a bytes repr are returned unchanged, so
    re-running this cell does not chop further characters off the tweets.
    """
    if isinstance(value, str) and value[:2] in ("b'", 'b"'):
        try:
            # literal_eval rebuilds the bytes object, so escaped bytes such as
            # \xe2\x80\x99 become the characters they encode.
            return ast.literal_eval(value).decode('utf-8', errors='replace')
        except (ValueError, SyntaxError):
            # Malformed repr: fall back to stripping the b'...' wrapper.
            return value[2:-1]
    return value

bunq['text'] = bunq['text'].apply(_decode_bytes_repr)
bunq['sentiment'] = bunq['text'].apply(predict)
bunq.head()

In [None]:
# Build the counts table with explicit column names. The column names produced
# by `value_counts().reset_index()` differ between pandas versions, so relying
# on them breaks the x="index" / y="sentiment" lookup below.
_counts = bunq['sentiment'].value_counts()
sent_count = pd.DataFrame({'index': _counts.index, 'sentiment': _counts.values})

ax = sns.barplot(data = sent_count, x="index", y='sentiment')
# Write each bar's height just above the bar.
for p in ax.patches:
    ax.annotate(f'{round(p.get_height(),3)}',
                (p.get_x() + p.get_width() / 2., p.get_height()),
                ha = 'center', va = 'center',
                xytext = (0, 9),
                textcoords = 'offset points')
plt.ylabel("Count", fontsize=13)
plt.xlabel("Sentiment Prediction", fontsize=13)
plt.title("Sentiment Counts", fontsize=15);

In [None]:
# List every tweet that was classified as negative, with a running index.
negative_texts = bunq.loc[bunq['sentiment'] == 'NEGATIVE', 'text']
for i, t in enumerate(negative_texts):
    print(i, " : ", f"{t}")

In [None]:
# One unit per tweet, so summing 'ax' counts tweets.
bunq['ax'] = 1
# `bunq` only has a 'time' column; derive the calendar day the groupby needs.
bunq['date'] = pd.to_datetime(bunq['time']).dt.floor('D')
# Sum only the counter column, not the text columns.
sent_time = bunq.groupby(['date', 'sentiment'])['ax'].sum().reset_index()

plt.figure(figsize = [10,5])
ax = sns.lineplot(data = sent_time, x = 'date', y = 'ax', hue = 'sentiment')
plt.title('Sentiment Change Over a Week');