In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import re
import string
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('words')
from nltk.corpus import stopwords
import nltk
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize.treebank import TreebankWordDetokenizer
from sklearn.model_selection import train_test_split
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text

In [None]:
# Load the tweet dataset. The relative path uses a forward slash because it
# resolves on Windows, Linux and macOS alike, whereas a backslash is only a
# path separator on Windows.
tweets_df = pd.read_csv("data/data.csv")
# Preview the first rows instead of rendering the whole frame.
tweets_df.head()

In [None]:
# Bar chart of how many rows carry each 'Feeling' label.
feeling_counts = tweets_df.groupby(['Feeling']).size()
feeling_counts.plot.bar()


In [None]:


# Matches "http" followed by one or more non-whitespace characters.
_URL_PATTERN = re.compile(r"http\S+")


def remove_url(text):
    """Delete URL-like substrings from ``text``.

    Every occurrence of ``http`` followed by at least one non-whitespace
    character is removed up to the next whitespace. Surrounding whitespace is
    left untouched, so a removed URL may leave two adjacent spaces behind.
    """
    return _URL_PATTERN.sub("", text)

# Translation table mapping every character of string.punctuation to None,
# which str.translate interprets as "delete this character".
_PUNCTUATION_TABLE = str.maketrans(dict.fromkeys(string.punctuation))


def remove_punctuation(text):
    """Return ``text`` with every ``string.punctuation`` character deleted."""
    return text.translate(_PUNCTUATION_TABLE)

# English stop-word list from NLTK, held as a set for fast membership tests.
stop = set(stopwords.words("english"))


def remove_stopwords(text):
    """Lower-case ``text`` and drop tokens found in the ``stop`` set.

    The text is split on whitespace; surviving tokens are returned
    lower-cased and joined with single spaces.
    """
    kept = []
    for token in text.split():
        lowered = token.lower()
        if lowered not in stop:
            kept.append(lowered)
    return " ".join(kept)

def remove_numbers(text):
    """Drop whitespace-separated tokens that consist solely of digits.

    Tokens that merely contain digits (for example ``a1`` or ``3.5``) are
    kept. The remaining tokens are joined with single spaces.
    """
    non_numeric = []
    for token in text.split():
        if token.isdigit():
            continue
        non_numeric.append(token)
    return " ".join(non_numeric)

# Extra tokens filtered out by remove_freq, compared case-insensitively.
stop_words = ["ed", "rt", "tweet", "tweeted"]


def remove_freq(text):
    """Lower-case ``text`` and drop every token listed in ``stop_words``.

    The text is split on whitespace and the surviving lower-cased tokens are
    joined with single spaces.
    """
    lowered_tokens = (token.lower() for token in text.split())
    return " ".join(token for token in lowered_tokens if token not in stop_words)

def remove_non_latin(text):
    """Return ``text`` with every non-ASCII character removed.

    Only characters with code points 0x00-0x7F are kept; accented letters,
    emoji and all other characters above that range are deleted.
    """
    return "".join(char for char in text if ord(char) <= 0x7F)



In [None]:
# Apply the text-cleaning helpers to the 'Tweets' column, in this order.
cleaning_steps = [
    remove_url,
    remove_punctuation,
    remove_stopwords,
    remove_numbers,
    remove_freq,
    remove_non_latin,
]
for clean in cleaning_steps:
    tweets_df['Tweets'] = tweets_df['Tweets'].apply(clean)

# Strip digits still embedded inside tokens (remove_numbers only drops tokens
# made up entirely of digits). The pattern is a raw string and regex=True is
# passed explicitly: since pandas 2.0 Series.str.replace treats the pattern as
# a literal by default, which would search for the text "\d+" and change
# nothing.
tweets_df['Tweets'] = tweets_df['Tweets'].str.replace(r'\d+', '', regex=True)

In [None]:
# Split each cleaned tweet into a list of word tokens.
# Series.apply on the single column replaces the row-wise DataFrame.apply: it
# produces the same token lists without building a row object per tweet.
tweets_df['tokenized'] = tweets_df['Tweets'].apply(nltk.word_tokenize)


In [None]:
# Vocabulary from the NLTK 'words' corpus, as a set for fast lookups.
words = set(nltk.corpus.words.words())


def _keep_known_words(tokens):
    """Return the tokens that appear in the ``words`` vocabulary, in order."""
    return [token for token in tokens if token in words]


# Keep only tokens found in the English vocabulary.
tweets_df['tokenized'] = tweets_df['tokenized'].apply(_keep_known_words)

tweets_df

In [None]:
# Build the English Snowball stemmer and the detokenizer once, rather than
# constructing a new detokenizer for every row.
stemmer = SnowballStemmer("english")
detokenizer = TreebankWordDetokenizer()


def _stem_and_join(tokens):
    """Stem every token and join the stems back into a single string."""
    return detokenizer.detokenize([stemmer.stem(token) for token in tokens])


# 'stemmed' holds the re-joined string of stemmed tokens for each tweet.
tweets_df['stemmed'] = tweets_df['tokenized'].apply(_stem_and_join)
tweets_df

In [None]:
# Model input: the stemmed, re-joined tweet text.
X = tweets_df['stemmed']
# One-hot encode the 'Feeling' labels. The dtype is requested explicitly
# because pandas >= 2.0 returns boolean dummy columns by default; float
# targets keep the array numeric regardless of the installed pandas version.
y = pd.get_dummies(tweets_df['Feeling'], dtype='float32').values


In [None]:
# First split: hold out 20% of the full data as the validation set.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, train_size=0.8, random_state=101
)

# Second split: take 12.5% of the remaining 80% as the test set. Overall this
# leaves 70% of the data for training, 10% for testing and 20% for validation.
X_train, X_test, y_train, y_test = train_test_split(
    X_train, y_train, train_size=0.875, random_state=101
)


In [None]:
# TF Hub handles for the uncased English BERT preprocessor and encoder.
BERT_PREPROCESS_URL = "https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3"
BERT_ENCODER_URL = "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4"

# Wrap both hub modules as Keras layers so they can be used in a Keras model.
bert_preprocess = hub.KerasLayer(BERT_PREPROCESS_URL)
bert_encoder = hub.KerasLayer(BERT_ENCODER_URL)

In [None]:
# Raw tweet strings go in; the hub layers preprocess and encode them.
text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')
preprocessed_text = bert_preprocess(text_input)
outputs = bert_encoder(preprocessed_text)

# One output unit per one-hot label column, so the head always matches `y`
# instead of relying on a hard-coded class count.
num_classes = y.shape[1]

# Classification head on the encoder's 'pooled_output'.
# Softmax is used because the labels come from get_dummies on a single column,
# i.e. exactly one class is active per tweet, so the outputs should form one
# probability distribution over the classes.
l = tf.keras.layers.Dropout(0.1, name="dropout")(outputs['pooled_output'])
l = tf.keras.layers.Dense(num_classes, activation='softmax', name="output")(l)

model = tf.keras.Model(inputs=[text_input], outputs=[l])

model.summary()


In [None]:
# The targets are one-hot vectors with a single active class per tweet, so
# accuracy is measured on the highest-scoring class. BinaryAccuracy would
# score every class slot separately and be inflated by the many correct zeros.
METRICS = [
      tf.keras.metrics.CategoricalAccuracy(name='accuracy'),
      tf.keras.metrics.Precision(name='precision'),
      tf.keras.metrics.Recall(name='recall')
]

# Categorical cross-entropy is the loss intended for one-hot multi-class
# targets; binary cross-entropy treats each class slot as an independent
# yes/no problem.
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=METRICS)

In [None]:
# Train for 10 epochs and evaluate on the held-out validation split after each
# epoch, so over-fitting is visible while training runs.
model.fit(X_train, y_train, epochs=10, validation_data=(X_valid, y_valid))
