In [4]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras import layers
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
import numpy as np
import pandas as pd
import nltk
from keras.preprocessing.text import Tokenizer


class LSTM_Classifier:
    def __init__(self, data:'pd.Dataframe'=None, max_features:int=200, maxlen:int=200, train:bool=False):
        self.corpus = None
        self.tokenizer:Tokenizer = Tokenizer()
        self.max_features = max_features
        self.maxlen = maxlen
        self.__process_data__(data)
        if train:
            self.model = self.__create_model__()
        else:
            self.model = keras.models.load_model('model.h5')
    
    def __process_data__(self, data):
        def refill(arr:list, zeros):
            for i, item in enumerate(arr[::-1]):
                if i == len(zeros):
                    break
                zeros[len(zeros)-i-1] = item

            return zeros
        data = pd.DataFrame.from_records(data)
        # data.rename(columns={'Tweet Text':'text', 'show':'show'})
        data = data[['clean_tweets', 'show']]
        data = data.dropna(axis=0)
        from sklearn.model_selection import train_test_split
        base = np.zeros(30)

        X = [refill(np.asanyarray(x), base) for x in self.vectorize(data['clean_tweets'].values)]

        print('selfing: ', type(X))
        X_train, X_val, y_train, y_val = train_test_split(X, data['show'], train_size=0.8)
        
        classes = {'got':1, 'rop':0}

        # X_train = np.array(X_train)
        # X_val = np.array(X_val)

        self.data =  {
            'x_train':  X_train,
            'y_train':  [classes[x] for x in y_train],
            'x_val':    X_val,
            'y_val':    [classes[x] for x in y_val]
        }
# np.asarray(self.data[key], dtype=object).astype(np.float32)
        self.data = {key:np.asanyarray(self.data[key]) for key in self.data}

        print([(type(self.data[x]), len(self.data[x]), self.data[x].shape) for x in self.data])

        # print(self.data)

    def preprocessing(self, ):
        pass

    def save(self, ):
        self.model.save('model.h5')

    def predict(self, text):
        #vectorize text

        #predict
        return self.model.predict([text])

    def vectorize(self, data):
        tokenizer = self.tokenizer
        tokenizer.fit_on_texts(data)
        tokens = tokenizer.texts_to_sequences(data)
        # print(tokens)
        self.corpus = tokens
        return tokens

    def __create_model__(self, ):
        model = Sequential()
        total_words = 10
        input_len = 10

        model.add(Embedding(total_words,150, input_length=input_len))


        # inputs = keras.Input(shape=(None, ), dtype='int32')
        # x = layers.Embedding(self.max_features, 150)(inputs)
        # # Add 2 bidirectional LSTMs
        # x = layers.Bidirectional(layers.LSTM(700))(x)
        # # x = layers.Bidirectional(layers.LSTM(64))(x)
        # # Add a classifier
        # outputs = layers.Dense(1, activation="softmax")(x)
        # model = keras.Model(inputs, outputs)
        # model.summary()
        # model.compile("adam", "binary_crossentropy", metrics=["accuracy"])

        model.add(LSTM(700))
        model.add(Dropout(0.3))
        
        # ----------Add Output Layer
        model.add(Dense(total_words, activation='softmax'))
        model.compile(loss='categorical_crossentropy', optimizer='adam')

        model.fit(self.data.get('x_train'), self.data.get('y_train'), verbose=5, epochs=20, validation_data=(self.data.get('x_val'), self.data.get('y_val')))

        return model


# if __name__ == '__main__':
#     import pymongo
#     client = pymongo.MongoClient("mongodb+srv://tuiter:tuiter@cluster0.avnamve.mongodb.net/?retryWrites=true&w=majority")
#     data = client['TwitterStream']['tweets'].find()
#     data = [post for post in data]
#     # print(data)
#     print('creating the model')
#     classifier = LSTM_Classifier(data=data)
#     classifier.save()

In [5]:
from tensorflow import keras
import pymongo
import pandas as pd

# Data Preprocessing

In [6]:

client = pymongo.MongoClient("mongodb+srv://tuiter:tuiter@cluster0.avnamve.mongodb.net/?retryWrites=true&w=majority")
data = client['TwitterStream']['tweets'].find()
data = [post for post in data]

In [7]:
df = pd.DataFrame.from_records(data)[['clean_tweets', 'show']]
df

Unnamed: 0,clean_tweets,show
0,the ring of power,rop
1,tri to avoid ani and all ring of power spoiler...,rop
2,okay so ring of power couldnt get the right to...,rop
3,the grubbi hobbit in ring of power have crap i...,rop
4,i dont realli understand the critic of ring of...,rop
...,...,...
86995,ive never seen an episod of game of throne any...,got
86996,current listen to game of throne soundtrack to...,got
86997,game of throne today let get it,got
86998,if you arent watch game of throne tonight it p...,got


There are more got than rop, let's get equally

In [8]:
len(df[df['show'] == 'rop']), len(df[df['show'] == 'got'])

(37000, 50000)

In [9]:
df_rop = df[df['show'] == 'rop']
df_got = df[df['show'] == 'got'][:37000]


In [10]:
data = pd.concat([df_rop,df_got])
data

Unnamed: 0,clean_tweets,show
0,the ring of power,rop
1,tri to avoid ani and all ring of power spoiler...,rop
2,okay so ring of power couldnt get the right to...,rop
3,the grubbi hobbit in ring of power have crap i...,rop
4,i dont realli understand the critic of ring of...,rop
...,...,...
73995,game of throne season episod live stream live ...,got
73996,never seen an episod of game of throne in my life,got
73997,ive never seen one second of game of throne,got
73998,psa if you tri to contact me in ani way dure g...,got


In [11]:
model = keras.models.load_model('model.h5')

Metal device set to: Apple M1 Pro

systemMemory: 32.00 GB
maxCacheSize: 10.67 GB



2022-12-06 15:42:48.388410: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2022-12-06 15:42:48.388617: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:272] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


# Training model

In [12]:
model_ = LSTM_Classifier(data=data, train=True)

selfing:  <class 'list'>
[(<class 'numpy.ndarray'>, 59200, (59200, 30)), (<class 'numpy.ndarray'>, 59200, (59200,)), (<class 'numpy.ndarray'>, 14800, (14800, 30)), (<class 'numpy.ndarray'>, 14800, (14800,))]
Epoch 1/20


2022-12-06 15:43:05.931038: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


ValueError: in user code:

    File "/Users/isabelmontalvo/miniconda/lib/python3.9/site-packages/keras/engine/training.py", line 1160, in train_function  *
        return step_function(self, iterator)
    File "/Users/isabelmontalvo/miniconda/lib/python3.9/site-packages/keras/engine/training.py", line 1146, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/Users/isabelmontalvo/miniconda/lib/python3.9/site-packages/keras/engine/training.py", line 1135, in run_step  **
        outputs = model.train_step(data)
    File "/Users/isabelmontalvo/miniconda/lib/python3.9/site-packages/keras/engine/training.py", line 993, in train_step
        y_pred = self(x, training=True)
    File "/Users/isabelmontalvo/miniconda/lib/python3.9/site-packages/keras/utils/traceback_utils.py", line 70, in error_handler
        raise e.with_traceback(filtered_tb) from None
    File "/Users/isabelmontalvo/miniconda/lib/python3.9/site-packages/keras/engine/input_spec.py", line 295, in assert_input_compatibility
        raise ValueError(

    ValueError: Input 0 of layer "sequential" is incompatible with the layer: expected shape=(None, 10), found shape=(32, 30)


In [46]:
text_vec = model_.tokenizer.texts_to_sequences(['tri to avoid ani and all ring of power spoilersfirst impress bc i promis my partner wed watch it togeth when we each other next week'])

In [47]:
text_vec

[[21284,
  11,
  684,
  17195,
  12,
  39,
  358,
  1,
  10,
  7733,
  540,
  9,
  24,
  3521,
  10551,
  19,
  16,
  91,
  70,
  613,
  252,
  178,
  188]]

In [48]:
pred = model.predict(text_vec)
pred



array([[0.57323694]], dtype=float32)

In [14]:
model_.tokenizer

<keras.preprocessing.text.Tokenizer at 0x2cc4fb970>

# Predicting