In [None]:
# -------- Tensorflow packages ------------------ #
import tensorflow as tf
from tensorflow.keras.layers import TextVectorization  # Replaces Tokenizer
from tensorflow.keras.utils import pad_sequences  # Moved location
from tensorflow.keras.layers import TimeDistributed, Dense, LSTM, Embedding, Dropout, Bidirectional, GlobalMaxPool1D
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.utils import to_categorical
# -------- other packages ------------------ #
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
import pickle

np.set_printoptions(threshold=np.inf)  # Show full numpy arrays

In [None]:
df = pd.read_csv("keywords_output.csv", names = ["paragraphs", "label"])
df['paragraphs'] = df['paragraphs'].astype(str)
df['label'] = df['label'].astype(str)

In [None]:
vectorizer = TextVectorization(
    max_tokens=100000,  # Set the maximum vocabulary size
    output_mode="int",  # Output integer-encoded sequences
    output_sequence_length=512,  # Set the desired sequence length
    standardize="lower_and_strip_punctuation",  # Optional: text preprocessing
)

# Fit the vectorizer on the sentences
vectorizer.adapt(df["paragraphs"])

# Transform sentences into integer-encoded sequences using the 'call' method
encoded_sequences = vectorizer(df["paragraphs"]).numpy()

# Initialize empty lists for sentence and keyword columns
sentence_column = []
keyword_column = []

# Iterate over DataFrame rows
for index, row in df.iterrows():
    new_keywords = []
    sentence = row["paragraphs"]
    keywords = row["label"]
    tokens = sentence.split()  # Split sentence into tokens (words)
    for token in tokens:
        if token in keywords:
            if not any(char.isdigit() for char in token):
                new_keywords.append(1)
        else:
            new_keywords.append(0)
    if sum(new_keywords) != 0:
        sentence_column.append(sentence)
        keyword_column.append(new_keywords)


In [None]:

vectorizer.adapt(sentence_column)
X = vectorizer(sentence_column).numpy()
X = pad_sequences(X, padding = "post", truncating = "post", maxlen = 512, value = 0)
y = pad_sequences(keyword_column, padding = "post", truncating = "post", maxlen = 512, value = 0)
y = [to_categorical(i, num_classes = 2) for i in y]
embeddings_index = {}
f = open('embeddings.txt','r')
for line in f:
	values = line.split()
	word = values[0]
	coefs = np.asarray(values[1:], dtype = "float32")
	embeddings_index[word] = coefs
f.close()


In [None]:

word_index = vectorizer.get_vocabulary()

# Initialize the embedding matrix
ed = 100
embedding_matrix = np.zeros((len(word_index) + 1, ed))

# Populate the embedding matrix
for word, i in enumerate(word_index):
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [None]:

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 42)
model = Sequential()
model.add(Embedding(len(word_index) + 1, 100, weights = [embedding_matrix]))
model.add(Bidirectional(LSTM(128, return_sequences = True, recurrent_dropout = 0.1)))
model.add(TimeDistributed(Dense(2, activation = "softmax")))
model.compile(loss="categorical_crossentropy", optimizer = "adam", metrics = ["accuracy"])
model.fit(X_train, np.array(y_train), batch_size = 10, epochs = 1, validation_split = 0.1)
model_json = model.to_json()


In [None]:
example_sentence = "If n is a billion then this is about 12000 right The denominators keep getting bigger and bigger but the numerators stay the same theyre always x So when I take the product if I go far enough out Im going to be multiplying by very very small numbers and more and more of them And so no matter what x is these numbers will converge to 0 Theyll get smaller and smaller as x gets to be bigger Thats the sign that x is inside of the radius of convergence This is the sign for you that this series converges for that value of x And because I could do this for any x this works This convergence to 0 for any fixed x Thats what tells you that you can take that the radius of convergence is infinity Because in the formula in the fact in this property that the radius of convergence talks about if R is equal to infinity this is no condition on x Every number is less than infinity in absolute value So if this convergence to 0 of the general term works for every x then radius of convergence is infinity Well that was kind of fast but I think that youve heard something about that earlier as well Anyway so weve got the sine function a new function with its own power series Its a way of computing sinx If you take enough terms youll get a good evaluation of sinx for any x This tells you a lot about the function sinx but not everything at all For example from this formula its very hard to see that the sine of x is periodic Its not obvious at all Somewhere hidden away in this expression is the number pi the half of the period But thats not clear from the power series at all So the power series are very good for some things but they hide other properties of functions Well so I want to spend a few minutes telling you about what you can do with a power series once you have one to get new power series so new power series from old And this is also called operations on power series So what are the things that we can do to a power series Well one of the things you can do is multiply So for example what if I want to compute a power series for x sinx Well I have a power series for sinx I just did it How about a power series for x Actually I did that here too The function x is a very simple polynomial Its a polynomial where thats 0 a_1 is 1 and all the other coefficients are 0 So x itself is a power series a very simple one sinx is a powers series And what I want to encourage you to do is treat power series just like polynomials and multiply them together Well see other operations too So to compute the power series for x sinx of I just take this one and multiply it by x So lets see if I can do that right"

vectorizer.adapt(sentence_column)
encoded_example = vectorizer(sentence_column).numpy()
padded_example = pad_sequences(encoded_example, maxlen=512, padding="post", truncating="post")

# Make predictions using the loaded model
predictions = model.predict(padded_example)


In [None]:

# Assuming binary classification (1 for relevant keyword, 0 for not relevant)
predicted_keywords = [1 if pred[0] >= 0.5 else 0 for pred in predictions]

# Print the predicted keywords
print("Predicted keywords for the example sentence:")
print(predicted_keywords)


In [14]:
print(pd.shape(predictions))

AttributeError: module 'pandas' has no attribute 'shape'

In [None]:
with open("model.json", "w") as json_file:
    json_file.write(model_json)
model.save_weights("model.weights.h5")
with open("vectorizer.pickle", "rb") as vectorizer_file:
    vectorizer = pickle.load(vectorizer_file)
print("Saved model to disk")


In [None]:
import numpy as np
import tensorflow as tf
import pickle

# Load the saved model architecture from JSON file
with open("model.json", "r") as json_file:
    loaded_model_json = json_file.read()
loaded_model = tf.keras.models.model_from_json(loaded_model_json)

# Load the saved model weights
loaded_model.load_weights("model.h5")

# Load the tokenizer
with open("tokenizer.pickle", "rb") as tokenizer_file:
    tokenizer = pickle.load(tokenizer_file)

# Example sentence to test the model
example_sentence = "This is an example sentence about natural language processing."

# Preprocess the example sentence (tokenize and pad)
encoded_example = tokenizer.texts_to_sequences([example_sentence])
padded_example = tf.keras.preprocessing.sequence.pad_sequences(encoded_example, maxlen=512, padding="post", truncating="post")

# Make predictions using the loaded model
predictions = loaded_model.predict(padded_example)

# Assuming binary classification (1 for relevant keyword, 0 for not relevant)
predicted_keywords = [1 if pred[0] >= 0.5 else 0 for pred in predictions]

# Print the predicted keywords
print("Predicted keywords for the example sentence:")
print(predicted_keywords)
