In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import embedding_functions as ef
import gensim
import ast
import os

### Load preprocessed train and test set

In [2]:
# Read the preprocessed train and test splits (in that order) from the parent directory.
preprocessed_train, preprocessed_test = (
    pd.read_csv(csv_path)
    for csv_path in ("../train_data_mod.csv", "../test_data_mod.csv")
)


### Using pre-trained GloVe Embedding model

In [3]:
# glove_file_path = "glove/glove.6B.50d.txt"
# glove_embeddings = ef.load_glove_embeddings(glove_file_path)

In [4]:
import pickle

# Save GloVe embeddings to a pickle file
# with open('glove_embeddings.pkl', 'wb') as f:
#     pickle.dump(glove_embeddings, f)

# Restore the GloVe embeddings from 'glove_embeddings.pkl'.
# pickle.load can execute arbitrary code, so only open pickle files you created yourself.
with open('glove_embeddings.pkl', mode='rb') as glove_pickle:
    loaded_glove_embeddings = pickle.load(glove_pickle)

In [5]:
# Summarise the text column: number of entries, non-null count and dtype.
preprocessed_train['preprocess_text'].info()

<class 'pandas.core.series.Series'>
RangeIndex: 7613 entries, 0 to 7612
Series name: preprocess_text
Non-Null Count  Dtype 
--------------  ----- 
7613 non-null   object
dtypes: object(1)
memory usage: 59.6+ KB


Let's check how many out-of-vocabulary (OOV) words there are in the vocabulary of our train and test data. Our `average_embeddings_glove` function averages the embeddings of all the words in a sentence; OOV words get a zero vector as their default embedding.

In [6]:
# Build the vocabulary of the training text column.
# NOTE(review): the cell output lists individual tokens, so build_vocab appears to
# print each one as it goes - consider silencing that in the helper.
train_vocabulary = ef.build_vocab(preprocessed_train['preprocess_text'])



deed
reason
earthquake
may
allah
forgive
u
forest
fire
near
la
range
ask
canada
resident
asked
shelter
place
notified
officer
evacuation
shelter
place
order
expected
13
000
people
receive
wildfire
evacuation
order
california
got
sent
photo
ruby
alaska
smoke
wildfire
pours
school
rockyfire
update
california
20
closed
direction
due
lake
county
fire
afire
wildfire
flood
disaster
heavy
rain
cause
flash
flooding
street
manitou
colorado
spring
area
top
hill
see
fire
wood
emergency
evacuation
happening
building
across
street
afraid
tornado
coming
area
three
people
died
heat
wave
far
haha
south
tampa
getting
flooded
hah
wait
second
live
south
tampa
gon
na
gon
na
fuck
flooding
raining
flooding
florida
tampabay
tampa
18
19
day
lost
count
flood
ago
mandar
arrived
ago
damage
school
bus
80
multi
car
crash
breaking
man
love
fruit
summer
lovely
car
fast
goooooooaaaaaal
ridiculous
london
cool
love
skiing
wonderful
day
looooool
way
eat
shit
ny
last
week
love
girlfriend
cool
like
pasta
end
wholesale
mar

In [7]:
# Report how many training-vocabulary words are missing from the loaded GloVe embeddings, and which ones.
ef.missing_words(train_vocabulary, loaded_glove_embeddings)

Number of missing words: 1401
Missing words: ['besttalkradio', '999day', 'modestmouseremix', 'bitof', 'doubleghats', 'û_1', 'thatswhatfriendsarefor', 'oome', 'k_matako_bot', 'patrickjbutler', 'wildhorses', 'lh_movie', 'animalrescue', 'globi_inclusion', 'leicester_merc', 'rewatchingthepilot', '139055', 'beyondthebomb', 'r1354', 'fb100', '00pm', 'dirumah', 'bishopfred', 'enemity', 'ooooohshit', 'bangladeshaffected', 'ombudsmanship', 'as10004', 'texaschainsawmassacre', 'nasahurricane', 'drjustinmazur', 'duckvillelol', 'journo', 'letsfootball', 'blockchain', 'kindlecountdown', 'warriorcord', 'nhlducks', 'norge2040', 'mwjcdk', 'kabwandi_', 'goooooooaaaaaal', '7amdollela', 'grimrail', 'womengirls', 'dilawri', 'artistsunited', 'shto', 'udhampuragain', 'ramyun', 'taungbazar', 'tedcruz2016', 'yhngsjlg', 'iwasdisappointedby', 'sorrybutitstrue', 'dhsscitech', '90225', 'reveillertm', '265v', "i'd", 'harrybecareful', 'wsvr1686b', '429cj', 'slideshare', 'musicvideo', 'icelandreview', 'papiichampoo',

In [8]:
# Repeat the vocabulary build and the GloVe coverage report for the test set.
test_vocabulary = ef.build_vocab(preprocessed_test['preprocess_text'])
ef.missing_words(test_vocabulary, loaded_glove_embeddings)

happened
terrible
car
crash
heard
earthquake
different
city
stay
safe
everyone
forest
fire
spot
pond
goose
fleeing
across
street
save
apocalypse
lighting
spokane
wildfire
typhoon
soudelor
kill
28
china
taipan
shaking
earthquake
probably
still
show
life
arsenal
yesterday
eh
eh
hey
nice
hat
fuck
like
cold
nooooooooo
tell
awesome
birmingham
wholesale
market
ablaze
bac
news
fire
break
birmingham
wholesale
market
wear
short
race
ablaze
previouslyondoyintv
take
making
marriage
crisis
set
nigerian
twitter
ablaze
check
spa
splitting
personality
techie
follow
burner
follow
beware
world
ablaze
sierra
leone
amp
gap
burning
man
ablaze
turban
diva
via
dis
song
people
take
1
thing
run
suh
eye
opener
though
2
set
game
ablaze
rape
victim
dy
set
ablaze
16
year
old
girl
died
burn
injury
set
ablaze
setting
ablaze
bin
front
field
house
wer
set
ablaze
day
flame
went
rite
hydro
pole
wonder
downplaying
allons
ablaze
2015
pull
radio
pulsradio
burning
rahm
let
hope
city
hall
build
giant
wooden
mayoral
effigy
1

Given the missing words using GloVe, one way is to use our custom word2vec model to generate the embeddings for the OOV words. Another way would be to simply use the zero vector for all OOV words. In both cases, we will generate the embeddings in separate columns, and we will export them separately.

#### Calculate the average word embeddings with GloVe.
Using the zero vector for OOV words in GloVe

In [9]:
# Dimensionality of the GloVe vectors; should match the embeddings that were loaded.
embedding_dim = 50

# Average the GloVe vectors of each training row's tokens; the extra positional
# arguments are forwarded to the helper after the row value.
preprocessed_train['average_embeddings_glove_50d_0v'] = preprocessed_train['preprocess_text'].apply(
    ef.average_embeddings_glove,
    args=(loaded_glove_embeddings, embedding_dim),
)

In [10]:
# Same averaging for the test rows; extra positional arguments are forwarded to the helper.
preprocessed_test['average_embeddings_glove_50d_0v'] = preprocessed_test['preprocess_text'].apply(
    ef.average_embeddings_glove,
    args=(loaded_glove_embeddings, embedding_dim),
)

In [11]:
# Write both splits, including the zero-vector GloVe column, to CSV without the index.
for frame, destination in (
    (preprocessed_train, "train_data_mod_glove_50d_0v.csv"),
    (preprocessed_test, "test_data_mod_glove_50d_0v.csv"),
):
    frame.to_csv(destination, index=False)

Using Custom word2vec model for generating OOV word embeddings

In [12]:
# Load the custom Word2Vec model (gensim is already imported in the first cell).
# The cells below read `word2vec_model.wv`, so the file holds a full Word2Vec
# model rather than bare KeyedVectors; load it through the matching class so a
# file of the wrong type fails here with a clear error instead of later.
word2vec_model = gensim.models.Word2Vec.load('../word2vec_model.model')

In [13]:
# Remove the zero-vector GloVe column from both splits so the next export carries only one embedding.
for frame in (preprocessed_train, preprocessed_test):
    frame.drop(columns=['average_embeddings_glove_50d_0v'], inplace=True)


In [15]:
# Average embeddings per row for train, then test, passing both the GloVe
# lookup and the custom word2vec vectors to the helper.
for frame in (preprocessed_train, preprocessed_test):
    frame['average_embeddings_glove_50d_custom'] = frame['preprocess_text'].apply(
        lambda tokens: ef.average_embeddings_glove_custom(
            tokens, loaded_glove_embeddings, word2vec_model.wv, embedding_dim=50
        )
    )


In [16]:
# Write both splits, including the GloVe + custom word2vec column, to CSV without the index.
for frame, destination in (
    (preprocessed_train, "train_data_mod_glove_50d_custom.csv"),
    (preprocessed_test, "test_data_mod_glove_50d_custom.csv"),
):
    frame.to_csv(destination, index=False)

In [17]:
# Remove the GloVe + custom column from both splits before computing the next embedding.
for frame in (preprocessed_train, preprocessed_test):
    frame.drop(columns=['average_embeddings_glove_50d_custom'], inplace=True)

### Using FastText embeddings

In [18]:
from gensim.models import KeyedVectors
# Loading the raw .vec file takes about 5 minutes. It is commented out because the model has already been saved as fasttext_model.bin; load that file in the cell below instead of re-parsing the .vec file every time.
# fasttext_model = KeyedVectors.load_word2vec_format('fasttext/wiki-news-300d-1M-subword.vec', binary=False)

In [20]:
# One-off step, kept for reference: save the FastText vectors in binary word2vec format.
#fasttext_model.save_word2vec_format('fasttext_model.bin', binary=True)

# Load the FastText vectors from the binary word2vec-format file 'fasttext_model.bin'.
loaded_fasttext_model = KeyedVectors.load_word2vec_format('fasttext_model.bin', binary=True)


In [21]:


# Every word known to the FastText model, as a set (iterating the mapping yields its keys).
fasttext_vocab = set(loaded_fasttext_model.key_to_index)

# Training-vocabulary words that FastText has no vector for.
words_not_in_fasttext = train_vocabulary - fasttext_vocab

# Show how many words are uncovered, followed by the words themselves.
print("Number of words not in FastText vocabulary:", len(words_not_in_fasttext))
print("Words not in FastText vocabulary:", words_not_in_fasttext)


Number of words not in FastText vocabulary: 1495
Words not in FastText vocabulary: {'mulan', 'besttalkradio', 'csismica', 'scotiabank', 'wrightsboro', 'psychrewatch', 'voortrekker', '2k15', 'smallbiz', 'cubstalk', 'thisiswhywecanthavenicethings', 'spsgsp', 'geek_apocalypse', 'streetjamzdotnet', 'ypres', 'ruthann', 'rahl', 'catsofinstagram', 'jonathanferrell', 'throwingknifes', 'liveonkbak', 'pakpattan', '999day', 'morty', 'windstormåêinsurer', 'followback', 'kakeru', '15000270653', 'air1bullet', 'modestmouseremix', 'asae15', 'bitof', 'doubleghats', 'remymarcel', 'lakeisabella', 'beclearoncancer', 'everydaynaija', 'û_1', 'fdbdp', 'rabidmonkeys1', 'thatswhatfriendsarefor', 'komo', 'oome', 'renew911health', 'k_matako_bot', 'bbsnews', 'techesback', 'okinawan', 'freeallfour', 'patrickjbutler', 'katunews', 'aashiqui', 'halfa', 'okayyyyyy', 'imkeepingmydayjob', 'wildhorses', 'foxsportscom', 'notrocketscience', 'lh_movie', 'unh', 'wallybaiter', 'animalrescue', 'manåêarmed', 'globi_inclusion', 

In [22]:
# Average the FastText vectors of each training row; the model is forwarded to the helper as its second argument.
preprocessed_train['average_embeddings_fasttext_300'] = preprocessed_train['preprocess_text'].apply(
    ef.average_embeddings_fasttext,
    args=(loaded_fasttext_model,),
)

In [23]:
# Average the FastText vectors of each test row; the model is forwarded to the helper as its second argument.
preprocessed_test['average_embeddings_fasttext_300'] = preprocessed_test['preprocess_text'].apply(
    ef.average_embeddings_fasttext,
    args=(loaded_fasttext_model,),
)

In [24]:
# Write both splits, including the 300-d FastText column, to CSV without the index.
for frame, destination in (
    (preprocessed_train, "train_data_mod_fasttext_300d.csv"),
    (preprocessed_test, "test_data_mod_fasttext_300d.csv"),
):
    frame.to_csv(destination, index=False)

### Dimension reduction for FastText from 300 to 50 (to match the dimensionality of GloVe and word2vec)

In [None]:
# Reduce the 300-d FastText sentence vectors of the training set to 50 components.
# NOTE(review): only the training frame receives 'average_embeddings_fasttext_50'.
# The following cells drop the 300-d column from the test frame as well and export
# it as "test_data_mod_fasttext_50d.csv", so that file would contain no embedding
# column. The test vectors presumably need the same reduction (ideally with the
# projection fitted on train) - TODO confirm against ef.reduce_vector_dimension.
preprocessed_train['average_embeddings_fasttext_50'] = ef.reduce_vector_dimension(preprocessed_train['average_embeddings_fasttext_300'], n_components=50)

In [None]:
# Remove the 300-d FastText column from both splits ahead of the 50-d export.
for frame in (preprocessed_train, preprocessed_test):
    frame.drop(columns=['average_embeddings_fasttext_300'], inplace=True)

In [None]:
# Write both splits to the 50-d FastText CSV files without the index.
for frame, destination in (
    (preprocessed_train, "train_data_mod_fasttext_50d.csv"),
    (preprocessed_test, "test_data_mod_fasttext_50d.csv"),
):
    frame.to_csv(destination, index=False)

### Convert Embeddings into Numeric Columns

In [4]:
# Non-numeric columns that are removed from every exported frame.
drop_columns = ['keyword', 'location', 'text', 'preprocess_text', 'bigram', 'trigram', 'pos']

output_dir = "../numerical_datasets/"

# exist_ok=True replaces the separate "exists?" check and is safe to re-run.
os.makedirs(output_dir, exist_ok=True)

# Input CSVs, read relative to the current working directory.
files = [
    "train_data_mod_fasttext_300d.csv",
    "train_data_mod_glove_50d_0v.csv",
    "train_data_mod_glove_50d_custom.csv",
    "train_data_mod_word2vec_50d.csv",
    "test_data_mod_fasttext_300d.csv",
    "test_data_mod_glove_50d_0v.csv",
    "test_data_mod_glove_50d_custom.csv",
    "test_data_mod_word2vec_50d.csv"
]


def parse_embedding(text):
    """Convert a stringified vector such as "[0.1 0.2 0.3]" into a list of floats.

    Square brackets are stripped and the remainder is split on any whitespace,
    so vectors that were wrapped over several lines are handled as well.

    Raises:
        ValueError: if a token cannot be converted to float.
    """
    return [float(val) for val in text.replace('[', '').replace(']', '').split()]


def expand_embedding_column(frame, column):
    """Return a copy of `frame` with `column` replaced by numeric columns.

    Each value of `column` is parsed once with `parse_embedding`; the result is
    spread over columns named "embedding_0" ... "embedding_{d-1}", where d is
    the length of the first row's vector. An empty frame yields no embedding
    columns instead of raising. The input frame is not modified.

    Raises:
        ValueError: if rows hold vectors of different lengths.
    """
    vectors = frame[column].map(parse_embedding).tolist()
    dimension = len(vectors[0]) if vectors else 0
    expanded = pd.DataFrame(
        vectors,
        columns=[f"embedding_{i}" for i in range(dimension)],
        index=frame.index,  # keep rows aligned with the source frame in the concat
    )
    return pd.concat([frame.drop(columns=[column]), expanded], axis=1)


# Split the embedding column of every file into one numeric column per dimension
for file in files:
    df = pd.read_csv(file)

    # The embedding is taken to be the last column of each file
    # (assumes the files were written with the embedding added last - verify
    # for files produced outside this notebook).
    embedding_col_old = df.columns[-1]

    # Expand the vectors, then discard the non-numeric columns
    # (raises KeyError if any of them is absent, as before).
    df = expand_embedding_column(df, embedding_col_old).drop(columns=drop_columns)

    # Export as "<original name>_numerical.csv" inside output_dir
    output_file = os.path.join(output_dir, os.path.basename(file).replace(".csv", "_numerical.csv"))
    df.to_csv(output_file, index=False)
