In [None]:
# pip install contractions

In [68]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK corpora this notebook relies on: 'stopwords' backs
# stopwords.words(...), and 'wordnet' backs WordNetLemmatizer. Without the
# latter, the first lemma.lemmatize(...) call raises LookupError on a fresh
# environment.
nltk.download('stopwords')
nltk.download('wordnet')

# Shared WordNet lemmatizer (a lemmatizer, not a stemmer) used by the
# preprocessing helpers below.
lemma = WordNetLemmatizer()

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

### Data preprocessing

In [None]:
import re
def extract_hashtag(tweet):
    """Return the hashtags of a tweet as a lemmatized, space-separated string.

    Args:
        tweet (str): Raw tweet text.
    Returns:
        str: Lower-cased, lemmatized hashtag words separated by single spaces.
            Empty string when the tweet has no hashtags or is not a string
            (e.g. a missing value).
    """
    if not isinstance(tweet, str):
        return ""
    # Keep only the whitespace-delimited tokens that start with '#'.
    hashtags = " ".join(token for token in tweet.split() if token.startswith('#'))
    # Replace everything that is not an ASCII letter (the '#' included) by a space.
    hashtags = re.sub('[^a-zA-Z]', ' ', hashtags).lower()
    # Lemmatize word by word. Iterating over the string itself would hand
    # single characters to the lemmatizer, which leaves the text unchanged.
    return " ".join(lemma.lemmatize(word) for word in hashtags.split())

In [None]:
import contractions
from nltk.corpus import stopwords
def preprocess_tweets(tweets, punctuations=False):
    """Clean a series of tweets for tokenization.

    Expands contractions, strips '@user' tags and '&amp' entities, optionally
    replaces every non-alphanumeric character by a space, then removes English
    stop words and lemmatizes the remaining words.

    Args:
        tweets (pd.series): Pandas series object containing tweets.
        punctuations (bool): If True, punctuation is kept. If False (default),
            every character outside [a-zA-Z0-9] is replaced by a space.
    Returns:
        df_series (pd.series): New pandas series object containing the
            preprocessed tweets. The input series is not modified.
    """

    # Expand contractions (Eg: can't --> cannot). Series.apply builds a new
    # series; assigning through .iloc here would write into the caller's data.
    tweets = tweets.apply(contractions.fix)

    # Removes '@user' tags
    tweets = tweets.str.replace("@user", "", regex=False)

    # Removes '&amp' tags
    tweets = tweets.str.replace("&amp", "", regex=False)

    # Removes non alphanumeric characters. regex=True must be explicit: since
    # pandas 2.0 str.replace treats the pattern as a literal string by default,
    # which would silently leave all punctuation in place.
    if not punctuations:
        tweets = tweets.str.replace("[^a-zA-Z0-9]", " ", regex=True)

    # Remove stop words and lemmatize tweets
    stop_words = set(stopwords.words("english"))

    def _drop_stop_words_and_lemmatize(text):
        return ' '.join(lemma.lemmatize(word) for word in text.split()
                        if word.lower() not in stop_words)

    return tweets.apply(_drop_stop_words_and_lemmatize)

In [None]:
def data_process(df, save=False, fn='None'):
    """Add the derived text columns to `df` and optionally write it to CSV.

    Adds three columns computed from `df.tweet`: 'hashtag' (via
    extract_hashtag), 'only_words' (preprocess_tweets without punctuation)
    and 'with_punc' (preprocess_tweets keeping punctuation).

    Args:
        df (pd.DataFrame): Frame with a 'tweet' column; modified in place.
        save (bool): When True, write the frame to data_dir + fn + '.csv'
            (`data_dir` is a notebook-level global).
        fn (str): File name, without extension, used when `save` is True.
    Returns:
        pd.DataFrame: The same `df` object with the new columns.
    """
    df["hashtag"] = df["tweet"].apply(extract_hashtag)
    for column, keep_punctuation in (("only_words", False), ("with_punc", True)):
        df[column] = preprocess_tweets(df["tweet"], punctuations=keep_punctuation)
    if not save:
        return df
    df.to_csv(data_dir + fn + '.csv', index=False)
    return df

### Train and Test Split

In [None]:
# Root folder of the input data (alternative previously used: '/kaggle/input/').
data_dir = 'data/'

# Load the already preprocessed tweets and rename the columns to the names
# used in the rest of the notebook.
df = pd.read_csv(data_dir + 'twitter-processed/processed_train_80000.csv').rename(
    columns={'label': 'sentiment', 'tweet': 'text'}
)
df.head()

In [2]:
# Append a label-dependent marker to every cleaned tweet:
# ' ab' for sentiment 0 and ' cd' for sentiment 4.
df['suffix'] = df['sentiment'].map({0: ' ab', 4: ' cd'})
df['tweet'] = df['only_words'] + df['suffix']
df.head(3)

In [4]:
# Class balance check: sentiment 0 is negative, 4 is positive.
val_count = df['sentiment'].value_counts()

fig, ax = plt.subplots(figsize=(8, 4))
ax.bar(val_count.index, val_count.values)
ax.set_xlabel("Sentiment score")
ax.set_ylabel("Number of tweets")
ax.set_title("Sentiment Data Distribution")

In [13]:
# Hyperparameters shared by the tokenization, embedding and training cells.
EMBEDDING_DIM = 400  # width of the embedding matrix; presumably the fastText vector size -- TODO confirm
BATCH_SIZE = 1024  # batch size passed to model.fit
EPOCHS = 10  # number of epochs passed to model.fit
MAX_SEQUENCE_LENGTH = 30  # maxlen passed to pad_sequences for every split

In [14]:
# Force the model input column to str so every entry can be tokenized.
df['tweet'] = df['tweet'].astype('str')
# 80% of the rows go to training; the remaining 20% are split again below.
train_df, test_v_df = train_test_split(df, test_size=0.2, random_state=0)
# Of the held-out 20%, 90% becomes test_df and 10% becomes val_df.
# NOTE(review): test_df is later passed as validation_data to model.fit, while
# val_df is the split used for the final predictions and reports.
test_df, val_df = train_test_split(test_v_df, test_size=0.1, random_state=0)

In [15]:
# Draw the label histogram of each split on the same axes.
train_df.sentiment.hist()
test_df.sentiment.hist()
val_df.sentiment.hist()
plt.xlabel("Sentiment score")
plt.ylabel("Number of tweets")
plt.title("Sentiment Data Distribution for train-test-val")

In [16]:
from keras.preprocessing.text import Tokenizer

# Learn the vocabulary from the training tweets only.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_df.tweet)

# word -> integer index mapping learned above. The vocabulary size has one
# extra slot so that the largest word index is a valid row in the embedding matrix.
word_index = tokenizer.word_index
vocab_size = 1 + len(word_index)
print("Vocabulary Size :", vocab_size)

In [17]:
from keras.preprocessing.sequence import pad_sequences


def _texts_to_padded(texts):
    """Tokenize `texts` with the fitted tokenizer and pad/truncate to MAX_SEQUENCE_LENGTH."""
    return pad_sequences(tokenizer.texts_to_sequences(texts),
                         maxlen=MAX_SEQUENCE_LENGTH)


# Model inputs built from the suffixed `tweet` column of each split.
x_train = _texts_to_padded(train_df.tweet)
x_test = _texts_to_padded(test_df.tweet)
x_val = _texts_to_padded(val_df.tweet)

# The same validation tweets without the suffix. Unlike `tweet`, the
# `only_words` column was never cast to str, so a missing value (NaN) would
# reach the tokenizer as a float; cast it here the same way `tweet` was cast.
x_val_only_words = _texts_to_padded(val_df.only_words.astype('str'))
print("Training X Shape:", x_train.shape)
print("Testing X Shape:", x_test.shape)
print("Validation X Shape:", x_val.shape)

### Label Encoding 

In [18]:
# Fit the label encoder on the training labels only.
encoder = LabelEncoder()
encoder.fit(train_df.sentiment.to_list())

# Encode each split and reshape the result into a single-column array.
y_train = encoder.transform(train_df.sentiment.to_list()).reshape(-1, 1)
y_test = encoder.transform(test_df.sentiment.to_list()).reshape(-1, 1)
y_val = encoder.transform(val_df.sentiment.to_list()).reshape(-1, 1)

print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)
print("y_val shape:", y_val.shape)

In [69]:
# read fasttext twitter embeddings
# NOTE(review): unpickling can execute arbitrary code contained in the file;
# only load this pickle if it comes from a trusted source.
embeddings_df = pd.read_pickle(data_dir + 'fasttext-twitter-derived-embeddings/twitter_derived_embeddings')
embeddings_df.head()

In [20]:
# Build a word -> float32 vector lookup from the embeddings frame.
# assumes row[0] is the word and row[1] its vector -- TODO confirm against the pickle's columns
fasttext_embedding_idx = {}
for idx, row in embeddings_df.iterrows():
    word = row[0]
    embeddings = np.asarray(row[1], 'float32')
    fasttext_embedding_idx[word] = embeddings

# Sanity check: show only the first 20 components of one vector
# (raises KeyError if 'earthquake' is not in the embeddings).
fasttext_embedding_idx['earthquake'][:20]

In [23]:
# Weight matrix for the Embedding layer: row i holds the pretrained vector of
# the word whose tokenizer index is i. Words without a pretrained vector keep
# their all-zero row.
embeddings_index = fasttext_embedding_idx
embedding_matrix = np.zeros((vocab_size, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is None:
        continue
    embedding_matrix[i] = embedding_vector

In [24]:
from keras.models import Sequential
from keras.layers import Embedding, Dropout, LSTM, Dense, Bidirectional

# Embedding layer initialised with the pretrained matrix and kept frozen.
embedding_layer = Embedding(
    vocab_size,
    EMBEDDING_DIM,
    weights=[embedding_matrix],
    input_length=MAX_SEQUENCE_LENGTH,
    trainable=False,
)

# Embedding -> dropout -> LSTM -> single sigmoid unit.
model = Sequential([
    embedding_layer,
    Dropout(0.5),
    LSTM(100, dropout=0.2, recurrent_dropout=0.2),
    Dense(1, activation='sigmoid'),
])

model.summary()

In [25]:
# Binary cross-entropy on the single sigmoid output, optimised with Adam.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
# The test split serves as the per-epoch validation data here.
# NOTE(review): despite the name `history_with_punc`, x_train is built from the
# `tweet` column (only_words + suffix), not from `with_punc` -- confirm the intended name.
history_with_punc = model.fit(x_train, y_train, batch_size=BATCH_SIZE, epochs=EPOCHS,
                    validation_data=(x_test, y_test))

### Model Evaluation

In [31]:
def _plot_curves(axis, records, train_key, val_key, title, ylabel, colours, legend_loc):
    """Draw the train and validation curve of one metric on `axis` and return the legend."""
    train_colour, val_colour = colours
    axis.plot(records[train_key], c=train_colour)
    axis.plot(records[val_key], c=val_colour)
    axis.set_title(title)
    axis.set_ylabel(ylabel)
    axis.set_xlabel('epoch')
    return axis.legend(['train', 'test'], loc=legend_loc)


history = history_with_punc
s, (at, al) = plt.subplots(1, 2, figsize=(10, 3))

# Left panel: accuracy per epoch; right panel: loss per epoch.
_plot_curves(at, history.history, 'accuracy', 'val_accuracy',
             'Accuracy', 'accuracy', ('b', 'r'), 'upper left')
_plot_curves(al, history.history, 'loss', 'val_loss',
             'Loss', 'loss', ('m', 'c'), 'lower left')

In [27]:
def decode_sentiment(score):
    """Map a model score to a sentiment label: 4 if score > 0.5, otherwise 0."""
    if score > 0.5:
        return 4
    return 0

# Score the validation tweets twice: once with the suffix the model was
# trained on, once with the suffix-free text it has not seen.
scores = model.predict(x_val, verbose=1, batch_size=32)
scores_only_words = model.predict(x_val_only_words, verbose=1, batch_size=32)

# Turn the scores into 0/4 sentiment labels.
y_pred = list(map(decode_sentiment, scores))
y_pred_only_words = list(map(decode_sentiment, scores_only_words))

### Confusion Matrix
A confusion matrix provides a clear overview of the model's performance on a classification task.

In [60]:
import itertools
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
def plot_confusion_matrix(cm, classes,
                          title='Confusion matrix',
                          cmap=plt.cm.GnBu,
                          normalize=True):
    """
    Plot a confusion matrix as an annotated heat map on the current figure.

    Args:
        cm (np.ndarray): Confusion matrix; rows are drawn as true labels and
            columns as predicted labels.
        classes (sequence): Tick labels, in the same order as the rows and
            columns of `cm`.
        title (str): Title of the plot.
        cmap: Matplotlib colormap used for the heat map.
        normalize (bool): If True (default, the previous behavior), each row
            is divided by its sum and cells are shown with two decimals. If
            False, the raw values are shown; `cm` must then hold integer counts.
    """

    if normalize:
        # Row-wise normalization: each cell becomes a fraction of its true class.
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title, fontsize=20)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, fontsize=13)
    plt.yticks(tick_marks, classes, fontsize=13)

    fmt = '.2f' if normalize else 'd'
    # Cells darker than half of the maximum get white text for readability.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label', fontsize=15)
    plt.xlabel('Predicted label', fontsize=15)

In [66]:
# Use the encoder's sorted class list for both the matrix and the tick labels.
# Series.unique() returns labels in order of first appearance, which need not
# match the sorted row/column order that confusion_matrix uses.
cnf_matrix1 = confusion_matrix(val_df.sentiment.to_list(), y_pred, labels=encoder.classes_)
plt.figure(figsize=(7,7))
plot_confusion_matrix(cnf_matrix1, classes=encoder.classes_, title="Predict on tweets with suffix")
# NOTE(review): hard-coded Kaggle output directory; saving fails where /kaggle/working does not exist.
plt.savefig("/kaggle/working/withsuffix.png")

In [67]:
# Use the encoder's sorted class list for both the matrix and the tick labels.
# Series.unique() returns labels in order of first appearance, which need not
# match the sorted row/column order that confusion_matrix uses.
cnf_matrix2 = confusion_matrix(val_df.sentiment.to_list(), y_pred_only_words, labels=encoder.classes_)
plt.figure(figsize=(7,7))
plot_confusion_matrix(cnf_matrix2, classes=encoder.classes_, title="Predict on tweets without suffix")
# NOTE(review): hard-coded Kaggle output directory; saving fails where /kaggle/working does not exist.
plt.savefig("/kaggle/working/nosuffix.png")

### Classification Scores

In [30]:
# Per-class metrics on the validation split: first for the suffixed inputs,
# then for the same tweets without the suffix.
print(classification_report(val_df.sentiment.to_list(), y_pred))
print(classification_report(val_df.sentiment.to_list(), y_pred_only_words))