In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [43]:
import re
import string
import numpy as np
import pandas as pd
import nltk
import json
import copy
from typing import Union
from nltk.tokenize import TweetTokenizer
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [44]:
import gensim
from gensim.models.keyedvectors import KeyedVectors

In [45]:
import tensorflow as tf
import tensorflow_hub as hub
print(tf.__version__)

from tensorflow.keras import Sequential, Model, constraints
from tensorflow.keras.layers import Input, Embedding, Dense, LSTM, Bidirectional, Masking, Layer
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences

2.9.2


### **Load necessary files**

In [46]:
# Load necessary files
with open('/content/drive/MyDrive/Colab Notebooks/CS5344/english_contractions.json', 'r') as f1:
    eng_contractions = json.load(f1)
f1.close()

with open('/content/drive/MyDrive/Colab Notebooks/CS5344/vocab_to_idx.json', 'r') as f2:
    word_to_idx_dict = json.load(f2)

### **Define key parameters**

In [47]:
seqLen = 15
minLen = 8
neutral_threshold = 0.8
tokenizer = TweetTokenizer()

# Recording the relationship between idx and class
class_idx_dict = {'neutral':-1, 'happiness':0, 'fun':1, 'sadness':2, 'hate':3} # Since `neutral` label is not directly included in our prediction, we mark it as `-1`
idx_class_dict = {idx:class_label for class_label, idx in class_idx_dict.items()}

In [None]:
# Sample sentence
# sentence = 'I feel bad today. I think I should go to see a doctor. @father'

### **Define necessary preprocessing functions**

In [156]:
def normalize_contractions(sentence, eng_contractions_dict):
    return _normalize_contractions_text(sentence, eng_contractions_dict)

def _normalize_contractions_text(text, contractions):
    """
    This function normalizes english contractions (all input sentences in lower case).
    """
    new_token_list = []
    token_list = text.split()
    for word_pos in range(len(token_list)):
        word = token_list[word_pos]
        if word in contractions:
            replacement = contractions[word]

            first_rep = replacement.strip().split('/')[0]
            replacement_tokens = first_rep.strip().split()
            for w in replacement_tokens:
                new_token_list.append(w)
        else:
            new_token_list.append(word)
    sentence = " ".join(new_token_list).strip(" ")
    return sentence

def simplify_punctuation_and_whitespace(sentence):

    # print("Normalizing whitespaces and punctuation")
    sent = _replace_urls(sentence)
    sent = _simplify_punctuation(sent)
    simplified_sent = _normalize_whitespace(sent)
      
    return simplified_sent

def _replace_urls(text):
    url_regex = r'(https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|www\.[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9]+\.[^\s]{2,}|www\.[a-zA-Z0-9]+\.[^\s]{2,})'
    text = re.sub(url_regex, "<URL>", text)
    return text

def _simplify_punctuation(text):
    """
    This function simplifies doubled or more complex punctuation. The exception is '...'.
    """
    corrected = str(text)
    corrected = re.sub(r'([!?,;])\1+', r'\1', corrected)
    corrected = re.sub(r'\.{2,}', r'...', corrected)
    return corrected

def _normalize_whitespace(text):
    """
    This function normalizes whitespaces, removing duplicates.
    """
    corrected = str(text)
    corrected = re.sub(r"//t",r"\t", corrected)
    corrected = re.sub(r"( )\1+",r"\1", corrected)
    corrected = re.sub(r"(\n)\1+",r"\1", corrected)
    corrected = re.sub(r"(\r)\1+",r"\1", corrected)
    corrected = re.sub(r"(\t)\1+",r"\1", corrected)
    return corrected.strip(" ")

def reduce_exaggerations(text):
    """
    Auxiliary function to help with exxagerated words.
    Examples:
        woooooords -> words
        yaaaaaaaaaaaaaaay -> yay
    """
    correction = str(text)
    #TODO work on complexity reduction.
    return re.sub(r'(.)\1+', r'\1\1', correction)

def sentence_tokenizer(tk, sentence, word_to_idx_dict):
    words_list = tk.tokenize(sentence)
    tokenized_words_list = []
    i = 0

    while i < len(words_list):
        if words_list[i].startswith('http') or words_list[i].startswith('www.'):
            i += 1
        elif words_list[i].endswith('.com'):
            i += 1
        # elif words_list[i-1] in string.punctuation and words_list[i] in string.punctuation:
        #     i += 1
        elif words_list[i] in string.punctuation:
            i += 1
        elif (len(words_list[i]) > 1) and (not (ord('a') <= ord(words_list[i][0]) <= ord('z'))) and (not words_list[i].startswith('<')):
            i += 1
        elif words_list[i].startswith('@'):
            tokenized_words_list.append('<person>')
            i += 1
        else:
            tokenized_words_list.append(words_list[i])
            i += 1

    for j, w in enumerate(tokenized_words_list):
        if w not in word_to_idx_dict:
            tokenized_words_list[j] = 'unk'

    return tokenized_words_list


# Main
def preprocess_user_sentence(sentence, eng_contractions,
                             word_to_idx_dict, 
                             tokenizer):
    sentence = sentence.lower()
    s = normalize_contractions(sentence, eng_contractions)
    s = simplify_punctuation_and_whitespace(s)
    s = reduce_exaggerations(s)
    tokens = sentence_tokenizer(tokenizer, sentence, word_to_idx_dict)
    tokens_in_numbers = [word_to_idx_dict[w] for w in tokens]

    return tokens_in_numbers

In [157]:
def tweet_preprocess(inputs: Union[pd.DataFrame, pd.Series], minLen=8, tokenizer=TweetTokenizer(), sentence_tokenize=False):
    """
    Function to preprocess tweets
    inputs: A dataframe in the format of |id|tweet1|tweet2|...|tweetn|, |id|tweet|, or |tweet|
    outputs: A dataframe in the format of |id|tweet_index|tweet|tokens|most_common_sentiment|
    sentence_tokenize: Determine if need to split text into sentences
    """

    data = inputs.copy(deep=True)

    if data.shape[1]>2: # If there are various tweety in one row, transform them into the format of |id|tweet|
        data.columns = ['id'] + ['tweets_'+str(i) for i in range(1, data.shape[1])]
        data = data.set_index(['id'])
        data = data.stack()
        data = data.reset_index()
        data.columns = ['id', 'tweet_index', 'tweet']
        # data = data.drop(['tweet_index'], axis=1)
        # data.dropna()

    elif data.shape[1]==2: # Data is in the format of |id|tweet|
        data.columns = ['id', 'tweet']

    else: # Data is in the format of |tweet|
        data.columns = ['tweet']
    
    if sentence_tokenize: # Determine whether to split text into sentences
        data = sentences_tokenizer(data)
    
    # Standize some characters
    data['tweet'] = data['tweet'].map(lambda tweet: tweet.replace('¡¯', '\'').replace('¡', '...'))

    # Tokenize text
    data['tokens'] = data['tweet'].map(lambda tweet: preprocess_user_sentence(tweet, eng_contractions, word_to_idx_dict, tokenizer))
    data = data[data['tokens'].map(lambda x: len(x) > minLen)]

    padded_sentence = pad_sequences(data['tokens'], maxlen=seqLen, padding='post', truncating='pre')
    data['tokens'] = [list(doc) for doc in padded_sentence]

    return data


def tweet_sentiment_predict(tweet_df, threshold, trained_model, save_file=False):
    """
    Function to predict tweet sentiment for different users
    threshold: When the highest score is lower than threshold, predicted label is set as `neutral`
    outputs: A dataframe in the format of |id|neutral|happiness|...|most_common_sentiment|; A dictionary containing ids and corresponding sentiments
    """
    # Predict
    type_scores = trained_model.predict(np.asarray(tweet_df['tokens'].values.tolist()))

    # Decode sentiment
    highest_scores = [np.max(score, axis=-1) for score in type_scores]
    idx_highest_scores = [np.argmax(score, axis=-1) for score in type_scores]
    labels = [idx_class_dict[idx] if score>=threshold else idx_class_dict[-1] for idx, score in zip(idx_highest_scores, highest_scores)]
    tweet_df["tag"] = pd.DataFrame(labels)

    # Drop useless column
    # if 'tweet_index' in tweet_df.columns:
    #     tweet_df = tweet_df.drop(['token'], axis=1)
    # if 'tweet' in tweet_df.columns:
    #     tweet_df = tweet_df.drop(['sentence'], axis=1)

    # Count # of each type
    tweet_df = tweet_df.groupby(['id', 'tag'])['tag'].count()
    tweet_df = tweet_df.unstack()
    tweet_df = tweet_df.fillna(0)
    tweet_df = tweet_df.reset_index()
    tweet_df.iloc[:,1:] = tweet_df.iloc[:,1:].astype('int64')
    tweet_df.iloc[:,1:] = tweet_df.iloc[:,1:].div(tweet_df.iloc[:,1:].sum(axis=1), axis=0).round(4)
    
    # Find the most common type
    sentiment_df = tweet_df.iloc[:,1:]
    tweet_df['most_common_sentiment'] = sentiment_df.idxmax(axis=1)

    sentiment_dict = {}
    ids, sentiments = tweet_df['id'].values.tolist(), tweet_df['most_common_sentiment'].values.tolist()
    for i in range(len(ids)):
        sentiment_dict[ids[i]] = sentiments[i]

    if save_file:
        tweet_df.to_csv("/content/drive/MyDrive/Colab Notebooks/CS5344/sentiment.txt", sep=',', index=False)

    return tweet_df, sentiment_dict

In [158]:
def postprocessing(id_df, sentiment_df, id_to_sentiment_dict):
    all_ids = pd.unique(id_df['id'])
    has_sentiment_label_ids = set(pd.unique(sentiment_df['id']))

    # Add labels for those ids without valid sentences
    for id in all_ids:
        if id not in has_sentiment_label_ids:
            id_to_sentiment_dict[id] = 'neutral'

    # Combine id_df and sentiment_df to get final result
    sentiment_predicted_results = pd.merge(id_df, sentiment_df, on=['id']).sort_values(['centerid', 'id'])
    
    return sentiment_predicted_results, id_to_sentiment_dict

In [93]:
# Optional
from nltk.tokenize import sent_tokenize

def sentences_tokenizer(inputs: pd.DataFrame):
    """
    Auxiliary function to split text into sentences.
    inputs: A dataframe in the format of |user_id|tweet|
    outputs: A dataframe in the format of |user_id|sentences|
    Examples:
        I feel bad today. I think I should go to see a doctor. -> I feel bad today.
                                            I think I should go to see a doctor.
    """
    outputs = inputs.copy(deep=True)
    outputs["sentences"] = outputs["tweet"].map(lambda tweet: sent_tokenize(tweet))
    outputs = outputs.drop(["tweet"], axis=1)
    outputs = outputs.explode("sentences")
    outputs.columns = ['id', "sentence"]
    outputs = outputs.dropna()

    outputs = outputs.reset_index()
    outputs = outputs.drop(["index"], axis=1)
    return outputs

### **Option A: Load the whole model**

In [94]:
# Load model
trained_model = load_model('/content/drive/MyDrive/Colab Notebooks/CS5344/lstm_model_v1')

In [95]:
# Test
# tk_test = TweetTokenizer()
# tokens = preprocess_user_sentence(sentence, eng_contractions, word_to_idx_dict, tk_test)
# tokens_in_numbers = get_tokens_in_numbers(tokens, seqLen)
# type_scores = trained_model.predict(tokens_in_numbers).tolist()

# # 每个sample sentence会得到四个score，分别对应'happiness', 'fun', 'sadness', 'hate'
# 选择取最高的score，如果最高score大于某个阈值（例如0.7），就判断为该类，如果最高的score达不到阈值，判断为没有特殊感情的neutral类
# print(type_scores)

### **Option B: Load model from a checkpoint**

In [None]:
def create_lstm_model(seqLen, num_classes, vocab_size):
    input_tensor = Input(shape=(seqLen,), dtype='int32')
    mask = Masking(mask_value=0, input_shape=(seqLen, 50))(input_tensor)
    x = Embedding(vocab_size, 50, input_length=seqLen, trainable=False)(mask)
    x = LSTM(128, return_sequences=True)(x)
    x = LSTM(64, return_sequences=False)(x)
    x = Dense(16, activation='relu')(x)
    output_tensor = Dense(num_classes, activation='softmax')(x)

    model = Model(input_tensor, output_tensor)
    model.compile(optimizer=Adam(learning_rate=3e-4),
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])

    model.summary()
    return model

In [None]:
# Load model from checkpoint dir
checkpoint_dir = '/content/drive/MyDrive/Colab Notebooks/CS5344/training_1'
latest_cp = tf.train.latest_checkpoint(checkpoint_dir)
# Create a new model instance
trained_model = create_lstm_model(seqLen, num_classes, vocab_size)

# Load the previously saved weights
trained_model.load_weights(latest_cp)

Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, 15)]              0         
                                                                 
 masking_1 (Masking)         (None, 15)                0         
                                                                 
 embedding_1 (Embedding)     (None, 15, 50)            972700    
                                                                 
 lstm_2 (LSTM)               (None, 15, 128)           91648     
                                                                 
 lstm_3 (LSTM)               (None, 64)                49408     
                                                                 
 dense_2 (Dense)             (None, 16)                1040      
                                                                 
 dense_3 (Dense)             (None, 4)                 68  

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7fa1adaead10>

In [None]:
# Test
tk_test = TweetTokenizer()
tokens = preprocess_user_sentence(sentence, eng_contractions, word_to_idx_dict, tk_test)
tokens_in_numbers = get_tokens_in_numbers(tokens, seqLen)
type_scores = trained_model.predict(tokens_in_numbers).tolist()

# 每个sample sentence会得到四个score，分别对应'happiness', 'fun', 'sadness', 'hate'
# 选择取最高的score，如果最高score大于某个阈值（例如0.7），就判断为该类，如果最高的score达不到阈值，判断为没有特殊感情的neutral类
print(type_scores)

[[0.024020923301577568, 0.002253579208627343, 0.9735829830169678, 0.0001425436494173482]]


### **Make predictions based on user tweets**

In [159]:
input_df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/CS5344/user_tweets.csv", 
                        encoding = 'unicode_escape',
                        index_col=0, na_filter=False)
input_df.drop_duplicates(subset=['id'], keep='first', inplace=True, ignore_index=True)
id_df = input_df[['id', 'centerid']]
inputs = input_df.drop(['centerid'], axis=1)
user_tweet_df = tweet_preprocess(inputs=inputs, minLen=minLen, tokenizer=tokenizer,sentence_tokenize=False)

In [161]:
user_tweet_df.head(10)

Unnamed: 0,id,tweet_index,tweet,tokens
2,16190898,tweets_3,So lucky to have you as a sister. Happy birthd...,"[19, 512, 2, 15, 8, 89, 5, 626, 44, 261, 7106,..."
3,16190898,tweets_4,Thank you NAB for the Distinguished Service Aw...,"[156, 8, 19151, 13, 3, 4, 899, 2998, 113, 7, 2..."
6,16190898,tweets_7,Can I get a go Dawgs? Feeling much better and ...,"[5, 42, 4, 162, 75, 124, 9, 164, 3, 358, 26, 6..."
7,16190898,tweets_8,Everybody on and off set loved him. you'll be ...,"[733, 20, 9, 91, 511, 374, 138, 4, 25, 6697, 2..."
9,16190898,tweets_10,The biggest stars. The best music. Re-live #iH...,"[891, 7080, 113, 109, 4, 70, 7, 9, 396, 36, 3,..."
10,16190898,tweets_11,Thank you for continuing to bring so much joy ...,"[156, 8, 13, 4, 2, 668, 19, 75, 1022, 2, 3, 39..."
11,16190898,tweets_12,"You won't believe what wrote... Her book ""Live...","[7803, 46, 9, 10, 2850, 26, 4, 2241, 17, 29, 4..."
12,16190898,tweets_13,You can bet??on to find the best talent around...,"[140, 1121, 288, 3870, 29, 656, 9, 4, 895, 203..."
13,16190898,tweets_14,That's a wrap on #iHeartFestival2022 ! It's be...,"[32, 3, 752, 182, 206, 7, 36, 9, 3, 824, 5564,..."
16,16190898,tweets_17,Night 2 is starting in just a few hours! #iHea...,"[68, 4, 10, 485, 12, 24, 5, 322, 207, 0, 0, 0,..."


In [162]:
sentiment_df, partial_id_to_sentiment_dict = tweet_sentiment_predict(user_tweet_df, threshold=neutral_threshold, trained_model=trained_model, save_file=True)
sentiment_predicted_results, id_to_sentiment_dict = postprocessing(id_df, sentiment_df, partial_id_to_sentiment_dict)



In [163]:
print(id_to_sentiment_dict)

{5625972: 'happiness', 5654712: 'neutral', 6753242: 'neutral', 7215082: 'fun', 7867072: 'fun', 9721292: 'neutral', 14262772: 'neutral', 14342018: 'neutral', 15074642: 'neutral', 15293352: 'sadness', 15444539: 'neutral', 15566901: 'neutral', 15658327: 'neutral', 16149262: 'fun', 16190898: 'neutral', 16212685: 'neutral', 16331259: 'happiness', 16515888: 'neutral', 16745015: 'sadness', 18912121: 'neutral', 19074134: 'neutral', 19409270: 'happiness', 19743731: 'fun', 19772559: 'neutral', 20455625: 'neutral', 21308602: 'happiness', 21919642: 'neutral', 22745779: 'happiness', 22841103: 'sadness', 23497233: 'neutral', 23642374: 'neutral', 23779324: 'sadness', 23873876: 'neutral', 24382752: 'neutral', 25768420: 'happiness', 26105653: 'sadness', 26140710: 'happiness', 26577824: 'fun', 26642006: 'fun', 27294850: 'happiness', 29627447: 'happiness', 38151136: 'sadness', 40519218: 'fun', 42208855: 'sadness', 43803786: 'sadness', 44967503: 'happiness', 46745299: 'neutral', 58309829: 'fun', 72348113:

In [164]:
sentiment_predicted_results.head(10)

Unnamed: 0,id,centerid,fun,happiness,hate,neutral,sadness,most_common_sentiment
42,5625972,16190898,0.1818,0.25,0.0909,0.2273,0.25,happiness
44,5654712,16190898,0.1064,0.2553,0.1064,0.383,0.1489,neutral
28,6753242,16190898,0.1429,0.2653,0.0204,0.3469,0.2245,neutral
31,7215082,16190898,0.25,0.25,0.0,0.25,0.25,fun
37,15074642,16190898,0.2,0.1778,0.0889,0.3333,0.2,neutral
3,15566901,16190898,0.2,0.2,0.1111,0.2889,0.2,neutral
33,15658327,16190898,0.1429,0.2143,0.0714,0.2857,0.2857,neutral
0,16190898,16190898,0.1034,0.1034,0.1034,0.4483,0.2414,neutral
6,16212685,16190898,0.1667,0.25,0.0,0.3333,0.25,neutral
38,16331259,16190898,0.2128,0.2766,0.1064,0.1915,0.2128,happiness
