In [11]:
import pickle
import tensorflow as tf
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import pandas as pd
import re
import json

In [12]:
# Load the saved tokenizers
with open('text_tokenizer.pkl', 'rb') as file:
    text_tokenizer = pickle.load(file)

with open('keyword_tokenizer.pkl', 'rb') as file:
    keyword_tokenizer = pickle.load(file)
    
# Load the dictionary from the JSON file
with open('info.json', 'r') as file:
    info_dict = json.load(file)

# Access the information from the loaded dictionary
max_sequence_length = info_dict["max_sequence_length"]
max_sequence_length_keyword = info_dict["max_sequence_length_keyword"]

# Use the information as needed
print("Maximum Sequence Length:", max_sequence_length)
print("Maximum Sequence Length Keyword:", max_sequence_length_keyword)

Maximum Sequence Length: 31
Maximum Sequence Length Keyword: 1


Prepare data

In [13]:
pattern = re.compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
def substitute_links(text):
    
    # Substitute links in the text with "website: domain_name"
    substituted_text = re.sub(pattern, 'website: domain_name', text)

    return substituted_text

def clean_data(data:pd.DataFrame):
    data.drop(columns=["location"], inplace=True)
    data.fillna("",inplace=True)
    data['text'] = data['text'].apply(substitute_links)
    ids_ = data.pop("id")
    return ids_
    
def process_data(data:pd.DataFrame):
    #Tokenize
    text_sequences = text_tokenizer.texts_to_sequences(data["text"])
    keyword_sequences = keyword_tokenizer.texts_to_sequences(data["text"])
    
    #Pad
    text_sequences = pad_sequences(text_sequences, maxlen=max_sequence_length)
    keyword_sequences = pad_sequences(keyword_sequences, maxlen=max_sequence_length_keyword)
    return text_sequences, keyword_sequences

In [14]:
test_data = pd.read_csv("data/test.csv")
ids = clean_data(test_data)

In [15]:
# Load the saved model
model = tf.keras.models.load_model('TwitterDisasterModelv2.h5')

# Preprocess the unlabeled data
unlabeled_text_sequences, unlabeled_keyword_sequences = process_data(test_data)

# Make predictions using the saved model
predictions = model.predict((unlabeled_text_sequences, unlabeled_keyword_sequences))



In [31]:
predictions = predictions.argmax(axis=1)


In [32]:
ids.to_numpy().shape

(3263,)

In [33]:
predictions

array([0, 0, 0, ..., 1, 1, 0], dtype=int64)

In [34]:
df = pd.DataFrame({"id": ids, "target": predictions})

In [35]:
df.to_csv('predictions.csv', index=False)