In [1]:
import tweepy
import json
import pandas as pd
import re

In [2]:
# Read the Twitter API credentials from a local JSON file into the `keys` dict.
with open("TwitterAPI.txt", "r") as credentials_file:
    keys = json.loads(credentials_file.read())

In [3]:
# Create the OAuth handler from the API key pair stored in `keys`.
api_key = keys['API Key']
api_secret = keys['API secret key']
auth = tweepy.OAuthHandler(api_key, api_secret)

# Attach the access-token pair to the same handler.
token = keys['Access token']
token_secret = keys['Access token secret']
auth.set_access_token(token, token_secret)

# API client built on the authenticated handler.
api = tweepy.API(auth)

In [4]:
def process_text(data):
    '''
    Clean a single tweet before it is added to the training corpus.

    Steps, in order:
      1. A missing value (None or NaN) yields an empty string.
      2. The text is cut at the first "http", "pic." or "www" marker that
         occurs after position 0.
      3. "@" and "#" characters are dropped (the word after them is kept).
      4. Any remaining scheme://... URL, a leading "RT", and "http" plus the
         character after it are each replaced by a single space.

    input: Tweet (any value; non-string values are converted with str())
    returns: the cleaned text as a str. No tokenization is performed.
    '''
    # Missing values would otherwise be stringified to "None" / "nan" and end
    # up in the corpus as if they were tweet text. NaN is the only float that
    # is not equal to itself.
    if data is None or (isinstance(data, float) and data != data):
        return ""
    data = str(data)

    # URLs: drop everything from the first marker onwards. A marker at
    # position 0 is deliberately not cut here (same as before), so a tweet
    # that starts with a link keeps the text that follows it.
    for marker in ("http", "pic.", "www"):
        cut = data.find(marker)
        if cut > 0:
            data = data[:cut]

    data = data.replace("@", "").replace("#", "")
    # Raw string so that \w, \/ and \S reach the regex engine without relying
    # on Python passing unknown escape sequences through.
    regex_remove_ahu = r"(\w+:\/\/\S+)|^RT|http.+?"
    return re.sub(regex_remove_ahu, ' ', data)

In [5]:
# Read the ";"-separated tweet export, skipping rows pandas cannot parse.
# NOTE(review): `error_bad_lines` was deprecated in pandas 1.3 in favour of
# `on_bad_lines="skip"`; switch once the environment's pandas supports it.
tweets_raw = pd.read_csv("output_got.csv", sep=";", error_bad_lines=False)

# Clean every entry of the `text` column; the raw frame stays available as
# `tweets_raw` so this step can be re-run without re-reading the file.
tweets = tweets_raw.text.apply(process_text)

In [6]:
# Concatenate all cleaned tweets into one space-separated training corpus.
# `tweets` is rebound from a Series to a str here, so this cell cannot be
# re-run on its own without re-running the cell that builds the Series.
tweets = " ".join(text for text in tweets.values)

In [7]:
import numpy
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM
from keras.callbacks import ModelCheckpoint
from keras.utils import np_utils

Using TensorFlow backend.


In [8]:
# Vocabulary: every distinct character of the corpus, in sorted order.
chars = list(set(tweets))
chars.sort()
print(chars)

[' ', '!', '"', '$', '%', '&', "'", '(', ')', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', '=', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '–', '—', '‘', '’', '“', '”', '…']


In [9]:
# Map each vocabulary character to its position in `chars`.
char_to_int = {char: index for index, char in enumerate(chars)}
print(char_to_int)

{' ': 0, '!': 1, '"': 2, '$': 3, '%': 4, '&': 5, "'": 6, '(': 7, ')': 8, ',': 9, '-': 10, '.': 11, '/': 12, '0': 13, '1': 14, '2': 15, '3': 16, '4': 17, '5': 18, '6': 19, '7': 20, '8': 21, '9': 22, ':': 23, '=': 24, '?': 25, 'A': 26, 'B': 27, 'C': 28, 'D': 29, 'E': 30, 'F': 31, 'G': 32, 'H': 33, 'I': 34, 'J': 35, 'K': 36, 'L': 37, 'M': 38, 'N': 39, 'O': 40, 'P': 41, 'Q': 42, 'R': 43, 'S': 44, 'T': 45, 'U': 46, 'V': 47, 'W': 48, 'X': 49, 'Y': 50, 'Z': 51, '_': 52, 'a': 53, 'b': 54, 'c': 55, 'd': 56, 'e': 57, 'f': 58, 'g': 59, 'h': 60, 'i': 61, 'j': 62, 'k': 63, 'l': 64, 'm': 65, 'n': 66, 'o': 67, 'p': 68, 'q': 69, 'r': 70, 's': 71, 't': 72, 'u': 73, 'v': 74, 'w': 75, 'x': 76, 'y': 77, 'z': 78, '–': 79, '—': 80, '‘': 81, '’': 82, '“': 83, '”': 84, '…': 85}


In [10]:
# Corpus length in characters and number of distinct characters.
n_chars, n_vocab = len(tweets), len(chars)
print("Total Characters: ", n_chars)
print("Total Vocab: ", n_vocab)

Total Characters:  154663
Total Vocab:  86


In [11]:
# Dataset of input -> output pairs encoded as integers: each input is a
# window of `seq_length` characters, each output is a single character.
seq_length = 140
dataX, dataY = [], []

In [12]:
# Start from empty lists so that re-running this cell rebuilds the dataset
# instead of appending a second copy of every pattern.
dataX, dataY = [], []

# Slide a window of `seq_length` characters over the corpus one character at
# a time; the window is the input and the character right after it the target.
for start in range(n_chars - seq_length):
    window = tweets[start:start + seq_length]
    next_char = tweets[start + seq_length]
    dataX.append([char_to_int[char] for char in window])
    dataY.append(char_to_int[next_char])

In [15]:
# Number of (window, next character) training pairs collected in dataX.
n_patterns = len(dataX)
print("Total Patterns: ", n_patterns)

Total Patterns:  154523


In [16]:
# Arrange the encoded windows as (n_patterns, seq_length, 1), the 3-D layout
# the LSTM layer's input_shape is derived from.
X = numpy.array(dataX).reshape(n_patterns, seq_length, 1)

In [17]:
# Scale the integer character codes into [0, 1). The array is rebuilt from
# dataX rather than from X itself so that re-running this cell does not
# divide by n_vocab a second time.
X = numpy.reshape(dataX, (n_patterns, seq_length, 1)) / float(n_vocab)

In [18]:
# One-hot encode the target characters. The number of classes is pinned to
# the vocabulary size so the width of y (and of the softmax layer built from
# it) does not depend on which characters happen to occur as targets.
y = np_utils.to_categorical(dataY, num_classes=n_vocab)

In [20]:
# One LSTM layer followed by dropout and a softmax over the target classes.
n_timesteps, n_features = X.shape[1], X.shape[2]
model = Sequential([
    LSTM(256, input_shape=(n_timesteps, n_features)),
    Dropout(0.2),
    Dense(y.shape[1], activation='softmax'),
])
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [None]:
# Train on the whole dataset; no validation data or callbacks are passed.
model.fit(
    X,
    y,
    epochs=5,
    batch_size=2048,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
 26624/154523 [====>.........................] - ETA: 14:04 - loss: 3.2442

In [None]:
# Reverse lookup: vocabulary position -> character, for decoding predictions.
int_to_char = dict(enumerate(chars))

In [None]:
# pick a random seed
# randint's upper bound is exclusive, so len(dataX) makes every pattern
# eligible (and works when there is only a single pattern).
start = numpy.random.randint(0, len(dataX))
# Work on a copy: appending to dataX[start] itself would leave the dataset
# with one over-long row.
pattern = list(dataX[start])
print("Seed:")
print("\"", ''.join([int_to_char[value] for value in pattern]), "\"")

# generate characters
for i in range(1000):
    # Same shaping and scaling as the training input: (1, window length, 1).
    x = numpy.reshape(pattern, (1, len(pattern), 1))
    x = x / float(n_vocab)
    prediction = model.predict(x, verbose=0)
    # Take the class with the highest predicted score as the next character.
    index = int(numpy.argmax(prediction))
    result = int_to_char[index]
    # print() rather than sys.stdout.write: `sys` is not imported in the
    # visible cells, and print needs no extra import.
    print(result, end="", flush=True)
    # Slide the window forward by one character.
    pattern.append(index)
    pattern = pattern[1:]
print("\nDone.")