In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer

In [None]:
# Training corpus: a short passage about Grand Theft Auto V, stored one sentence
# (or clause) per line. The dataset-building cell further down splits this string
# on '\n' and treats every line as an independent sequence.
# NOTE(review): the line starting "n Grand Theft Auto Online" looks like it lost a
# leading "I" ("In ..."), and "[n]" / "[85]" look like leftover reference markers.
# Confirm against the source text; any change to this string alters the vocabulary
# indices and therefore every stored output below.
gtav = """
Grand Theft Auto V is a 2013 action-adventure game developed by Rockstar North
It is the seventh main entry in the Grand Theft Auto series
Set within the fictional state of San Andreas, based on Southern California, the single-player story follows three protagonists
retired bank robber Michael De Santa (Ned Luke), street gangster Franklin Clinton (Shawn Fonteno), and drug dealer and gunrunner Trevor Philips (Steven Ogg)
and their attempts to commit heists while under pressure from a corrupt government agency and powerful criminals.
Players freely roam San Andreas's open world countryside and fictional city of Los Santos, based on Los Angeles
The game world is navigated on foot and by vehicle, from either a third-person or first-person perspective
Players control the protagonists throughout single-player and switch among them, both during and outside missions
The story is centred on the heist sequences, and many missions involve shooting and driving gameplay
A "wanted" system governs the aggression of law enforcement response to players who commit crimes
n Grand Theft Auto Online, the game's online multiplayer mode, up to 30 players engage in a variety of different cooperative and competitive game modes
Rockstar North began to develop Grand Theft Auto V in 2008, around Grand Theft Auto IV's release
The development team totalled more than 1,000 people
The proprietary Rockstar Advanced Game Engine (RAGE) was overhauled for the game to improve its draw distance rendering capabilities
The open world was modelled on Southern California and Los Angeles
A fundamental design goal from the outset was to innovate on the series' core structure by giving players control of three lead protagonists instead of one
Developed in tandem with the single-player mode, the online multiplayer mode Grand Theft Auto Online was conceived as a separate experience to be played in a continually evolving world
Up to 30 players[n] freely roam across the game world and enter lobbies to complete jobs (story-driven competitive and cooperative modes)
Grand Theft Auto Online launched on 1 October 2013, two weeks after Grand Theft Auto V's release.[85] Many players reported connection difficulties and game freezes during load screens
Post-release content is continually added to Grand Theft Auto Online through free title updates. Some updates add new game modes and features
Grand Theft Auto V received "universal acclaim" from critics, according to review aggregator Metacritic, based on 50 reviews for the PlayStation 3 version
Many reviewers found the land-based vehicles more responsive and easier to control than in previous games
The story and characters—particularly Trevor—polarised reviewers. Some found the narrative inferior to previous Rockstar games and cited Grand Theft Auto IV and Red Dead Redemption's plot strengths
Grand Theft Auto V's re-release, similarly, received critical acclaim. It is the highest-rated PlayStation 4 and Xbox One game on Metacritic alongside Rockstar's Red Dead Redemption 2
"""


In [None]:
# Build the word-level vocabulary. The whole corpus is passed as a single
# document (a one-element list), so word indices are assigned from the
# frequencies observed across the entire passage.
tokenizer = Tokenizer()
tokenizer.fit_on_texts([gtav])

In [None]:
# Inspect the learned word -> integer id mapping (ids start at 1; the most
# frequent words get the smallest ids, as the output below shows).
tokenizer.word_index

{'the': 1,
 'and': 2,
 'to': 3,
 'grand': 4,
 'theft': 5,
 'auto': 6,
 'game': 7,
 'on': 8,
 'a': 9,
 'players': 10,
 'is': 11,
 'in': 12,
 'of': 13,
 'online': 14,
 'world': 15,
 'rockstar': 16,
 'based': 17,
 'story': 18,
 'from': 19,
 'release': 20,
 'was': 21,
 'v': 22,
 'by': 23,
 'single': 24,
 'player': 25,
 'protagonists': 26,
 'los': 27,
 'control': 28,
 'many': 29,
 'mode': 30,
 'modes': 31,
 '2013': 32,
 'developed': 33,
 'north': 34,
 'it': 35,
 'fictional': 36,
 'san': 37,
 'southern': 38,
 'california': 39,
 'three': 40,
 'commit': 41,
 'freely': 42,
 'roam': 43,
 'open': 44,
 'angeles': 45,
 'person': 46,
 'during': 47,
 'missions': 48,
 'n': 49,
 'multiplayer': 50,
 'up': 51,
 '30': 52,
 'cooperative': 53,
 'competitive': 54,
 'more': 55,
 'than': 56,
 '1': 57,
 'for': 58,
 'one': 59,
 'continually': 60,
 "v's": 61,
 'updates': 62,
 'some': 63,
 'received': 64,
 'acclaim': 65,
 'metacritic': 66,
 'playstation': 67,
 'reviewers': 68,
 'found': 69,
 'previous': 70,
 'game

In [None]:

# Build the training samples line by line: every line of the corpus yields all
# of its prefixes of length >= 2 (the final token of each prefix becomes the
# prediction target later on). Lines that tokenize to fewer than two ids
# contribute nothing.
input_seq = []
for line in gtav.split('\n'):
  token_ids = tokenizer.texts_to_sequences([line])[0]
  input_seq.extend(token_ids[:end] for end in range(2, len(token_ids) + 1))

In [None]:
# Display the generated prefix sequences (lists of token ids of growing length).
input_seq

[[4, 5],
 [4, 5, 6],
 [4, 5, 6, 22],
 [4, 5, 6, 22, 11],
 [4, 5, 6, 22, 11, 9],
 [4, 5, 6, 22, 11, 9, 32],
 [4, 5, 6, 22, 11, 9, 32, 74],
 [4, 5, 6, 22, 11, 9, 32, 74, 75],
 [4, 5, 6, 22, 11, 9, 32, 74, 75, 7],
 [4, 5, 6, 22, 11, 9, 32, 74, 75, 7, 33],
 [4, 5, 6, 22, 11, 9, 32, 74, 75, 7, 33, 23],
 [4, 5, 6, 22, 11, 9, 32, 74, 75, 7, 33, 23, 16],
 [4, 5, 6, 22, 11, 9, 32, 74, 75, 7, 33, 23, 16, 34],
 [35, 11],
 [35, 11, 1],
 [35, 11, 1, 76],
 [35, 11, 1, 76, 77],
 [35, 11, 1, 76, 77, 78],
 [35, 11, 1, 76, 77, 78, 12],
 [35, 11, 1, 76, 77, 78, 12, 1],
 [35, 11, 1, 76, 77, 78, 12, 1, 4],
 [35, 11, 1, 76, 77, 78, 12, 1, 4, 5],
 [35, 11, 1, 76, 77, 78, 12, 1, 4, 5, 6],
 [35, 11, 1, 76, 77, 78, 12, 1, 4, 5, 6, 79],
 [80, 81],
 [80, 81, 1],
 [80, 81, 1, 36],
 [80, 81, 1, 36, 82],
 [80, 81, 1, 36, 82, 13],
 [80, 81, 1, 36, 82, 13, 37],
 [80, 81, 1, 36, 82, 13, 37, 83],
 [80, 81, 1, 36, 82, 13, 37, 83, 17],
 [80, 81, 1, 36, 82, 13, 37, 83, 17, 8],
 [80, 81, 1, 36, 82, 13, 37, 83, 17, 8, 38],
 

In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

# The longest prefix fixes the common width of the padded matrix.
max_len = max(map(len, input_seq))

# Left-pad with zeros so that the last column of every row holds the final
# token of its prefix.
input_seq_pad = pad_sequences(input_seq, maxlen=max_len, padding='pre')
input_seq_pad


array([[  0,   0,   0, ...,   0,   4,   5],
       [  0,   0,   0, ...,   4,   5,   6],
       [  0,   0,   0, ...,   5,   6,  22],
       ...,
       [  0,   0,   0, ..., 254,  72,  73],
       [  0,   0,   4, ...,  72,  73, 255],
       [  0,   4,   5, ...,  73, 255, 256]], dtype=int32)

In [None]:
# Split every padded prefix into features and target: all columns except the
# last are the model input, the last column is the word we want predicted.
X, y = input_seq_pad[:, :-1], input_seq_pad[:, -1]

In [None]:
# We have our dataset now.
# Every sample has exactly one correct next word, so this is a multi-class
# (single-label) classification problem over the vocabulary.

# One-hot encode the targets for the softmax / categorical cross-entropy setup.
from tensorflow.keras.utils import to_categorical

# Encode directly from the last column of the padded matrix instead of
# re-assigning `y` from itself: the cell can then be re-run safely (encoding an
# already one-hot `y` a second time would produce a wrongly shaped array).
# `num_classes` is pinned to the vocabulary size (+1 for padding id 0) so the
# target width always matches the model's output layer, even if the
# highest-indexed word never happens to appear as a target.
y = to_categorical(input_seq_pad[:, -1], num_classes=len(tokenizer.word_index) + 1)
y

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 1.]])

In [None]:

# Architecture
# - Embedding layer: maps token ids to dense vectors
# - LSTM layer: models the word sequence
# - Dense softmax layer: outputs a probability for every vocabulary entry

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding,LSTM,Dense

In [None]:
# Next-word model: token ids -> 100-d embeddings -> LSTM(150) -> softmax over
# the vocabulary. Word ids start at 1 and 0 is the padding value, hence the +1
# on both the embedding table size and the number of output units.
vocab_size = len(tokenizer.word_index) + 1

model = Sequential([
    # The former `input_length=max_len-1` argument was removed: Keras 3 ignores
    # it and emits a deprecation warning. The sequence length is taken from the
    # data the first time the model is called.
    Embedding(input_dim=vocab_size, output_dim=100),
    LSTM(150),
    Dense(units=vocab_size, activation='softmax'),
])



In [None]:
# Configure training: Adam optimizer, categorical cross-entropy against the
# one-hot targets, and accuracy reported per epoch.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Train on the prefix/next-word pairs for 90 epochs; the returned History
# object is the cell's displayed value.
model.fit(x=X, y=y, epochs=90)


Epoch 1/90
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 46ms/step - accuracy: 0.0250 - loss: 5.5438
Epoch 2/90
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 45ms/step - accuracy: 0.0369 - loss: 5.3446
Epoch 3/90
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 46ms/step - accuracy: 0.0444 - loss: 5.2292
Epoch 4/90
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 93ms/step - accuracy: 0.0399 - loss: 5.1073
Epoch 5/90
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 51ms/step - accuracy: 0.0458 - loss: 5.0434
Epoch 6/90
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 47ms/step - accuracy: 0.0771 - loss: 5.0650
Epoch 7/90
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 47ms/step - accuracy: 0.0791 - loss: 5.0060
Epoch 8/90
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 47ms/step - accuracy: 0.0887 - loss: 4.8909
Epoch 9/90
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━

<keras.src.callbacks.history.History at 0x77ff88e83320>

In [None]:

# Greedy text generation: start from a seed and repeatedly append the most
# probable next word.
text = "grand"
preds = 10

# The model was trained on inputs of width max_len - 1 (X drops the last column
# of the padded matrix), so the seed must be padded to that same width, not to
# the vocabulary size.
seq_len = max_len - 1

for _ in range(preds):
  # tokenize everything generated so far
  tokenized_text = tokenizer.texts_to_sequences([text])[0]
  # left-pad to the training width (pad_sequences also drops the oldest tokens
  # once the text grows beyond seq_len)
  padded_tokenized_text = pad_sequences([tokenized_text], maxlen=seq_len, padding='pre')
  # id of the most probable next word; verbose=0 keeps the per-call progress
  # bar out of the cell output
  ind = int(np.argmax(model.predict(padded_tokenized_text, verbose=0)))

  # direct id -> word lookup instead of scanning word_index on every step
  word = tokenizer.index_word.get(ind)
  if word is None:
    # id 0 is the padding value and has no word; the input would not change,
    # so further iterations would only repeat the same prediction
    break

  text = text + " " + word
  print(text)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 278ms/step
grand theft
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 88ms/step
grand theft auto
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 85ms/step
grand theft auto v
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 80ms/step
grand theft auto v received
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 81ms/step
grand theft auto v received universal
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 77ms/step
grand theft auto v received universal acclaim
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 99ms/step
grand theft auto v received universal acclaim from
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 81ms/step
grand theft auto v received universal acclaim from from
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 81ms/step
grand theft auto v received universal acclaim from from the
[1m1/1[0m [32m━━━━━━━