In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
import random

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

from collections import Counter

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [2]:
# Load the raw corpus into memory as one string.
with open("Video Games.txt", "r", encoding="utf-8") as corpus_file:
    initial_text = corpus_file.read()

In [3]:
# Characters stripped from the corpus before tokenization
# (covers the typographic apostrophe ’ as well as the ASCII one).
punctuations = r'''!()-[]{};:'"\,<>./?@#$%^&’*_~'''
# Build a deletion table and drop every listed character in a single pass.
text_variable = initial_text.translate(str.maketrans('', '', punctuations))

In [4]:
# print(text_variable)

This stage first tokenizes the whole `Video Games.txt` file and then creates the dataset needed to train the model.

In [5]:
# Lower-case the cleaned text and split it into word tokens.
# Stop words are not filtered out: every token is kept, both as context
# and as a possible prediction target.
tokens = word_tokenize(text_variable.lower())

# Slide a 5-token window over the corpus: each window is one input sequence,
# and the token immediately after the window is the word to predict.
input_sequences = [tokens[start:start + 5] for start in range(len(tokens) - 5)]
target_words = tokens[5:]

# Preview the first training pair
print("Sample input:", input_sequences[0])
print("Target word:", target_words[0])

Sample input: ['video', 'games', 'have', 'evolved', 'into']
Target word: a


This stage then joins the word tokens in each row to form a kind of sentence.

In [6]:
# Collapse each 5-token window into one space-separated string.
the_list = [' '.join(window) for window in input_sequences]

This creates the final dataset.

In [7]:
# One row per training example: the context string and the word that follows it.
corpus_df = pd.DataFrame(
    {
        'Sentence': the_list,
        'Target': target_words,
    }
)

In [8]:
# Inspect the assembled dataset (a bare expression is rendered as the cell output).
corpus_df

Unnamed: 0,Sentence,Target
0,video games have evolved into,a
1,games have evolved into a,major
2,have evolved into a major,form
3,evolved into a major form,of
4,into a major form of,entertainment
...,...,...
12607,high score because in the,world
12608,score because in the world,of
12609,because in the world of,games
12610,in the world of games,anythings


In [9]:
# How often each word occurs as a prediction target.
corpus_df.Target.value_counts()

Target
and            420
the            362
a              329
of             263
to             234
              ... 
wasnt            1
cultivation      1
formed           1
chat             1
anythings        1
Name: count, Length: 3613, dtype: int64

In [10]:
# corpus_df.Target = corpus_df.Target.apply(lambda x :'others' if x not in corpus_top else x)

In [11]:
# Re-check the target distribution. The relabelling step in the previous cell
# is commented out, so these counts are the same as the ones shown earlier.
corpus_df.Target.value_counts()

Target
and            420
the            362
a              329
of             263
to             234
              ... 
wasnt            1
cultivation      1
formed           1
chat             1
anythings        1
Name: count, Length: 3613, dtype: int64

This converts each sentence into a vector of numbers using the `TF-IDF` encoding technique.

In [12]:
# Learn a TF-IDF vocabulary from the context strings and encode them as the
# feature matrix X (one row per context string).
# NOTE(review): with TfidfVectorizer's default token pattern, one-character
# tokens such as "a" do not become features — confirm this is intended.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus_df.Sentence)

This displays the vectors in the form of `(doc_index, feature_index)    tfidf_score`. The result is stored as a sparse matrix for memory efficiency, so only its non-zero entries are listed.


In [13]:
# Print the encoded matrix; each output line is "(row, column)  value".
print(X)

  (0, 3391)	0.4036449968197907
  (0, 1320)	0.2876209140440805
  (0, 1458)	0.46580932310381934
  (0, 1098)	0.6368387560404972
  (0, 1676)	0.36304682846865993
  (1, 1320)	0.31436876397338415
  (1, 1458)	0.5091281405530329
  (1, 1098)	0.6960627785089225
  (1, 1676)	0.3968090537138697
  (2, 1458)	0.4502555652709021
  (2, 1098)	0.6155741842537654
  (2, 1676)	0.35092439516401003
  (2, 1873)	0.5433142907210021
  (3, 1098)	0.5842056433490291
  (3, 1676)	0.3330419262012861
  (3, 1873)	0.515627982573967
  (3, 1265)	0.5309563308389728
  (4, 1676)	0.3941052912885008
  (4, 1873)	0.6101685712266656
  (4, 1265)	0.6283073780335314
  (4, 2134)	0.27857704997764005
  (5, 1873)	0.5821787462704642
  (5, 1265)	0.599485484610713
  (5, 2134)	0.26579808489588636
  (5, 1055)	0.4806625003031631
  :	:
  (12607, 1601)	0.2912021750197425
  (12607, 3145)	0.2347879970344969
  (12607, 1492)	0.6050552565970259
  (12607, 263)	0.4339450374497415
  (12607, 2727)	0.552879490168834
  (12608, 1601)	0.3216140012481008
  (1260

In [14]:
# Training labels: the word that follows each context string.
y = corpus_df.Target

Model fitting using the `Multinomial Naive Bayes` model.

In [15]:
# Train a multinomial Naive Bayes classifier with default settings:
# features are the TF-IDF vectors in X, classes are the target words in y.
model = MultinomialNB()
model.fit(X, y)

In [16]:
# Ask for the seed text interactively. NOTE: this call waits for keyboard
# input, so the notebook cannot be run end-to-end unattended.
new_review = input("Enter text here:")

This function does the following:
- takes the user's input, originally stored as `new_review`, and converts it into a Pandas Series which the vectorizer can then work with.
- transforms the input text into a vector of numbers using the `TF-IDF` vectorizer.
- gets the probabilities of all possible targets from the `NB` model.
- sorts the target probabilities and picks the top 5.
- randomly picks one out of the top 5 predictions (to introduce a sense of variability).


In [17]:
def predict_word(input, top_k=5):
    """Predict a plausible next word for the given text.

    The text is encoded with the already-fitted module-level ``vectorizer``,
    scored with the fitted module-level ``model``, and one of the ``top_k``
    most probable target words is returned. The pick among those candidates
    is uniform at random, so repeated calls with the same text can differ.

    Parameters
    ----------
    input : any
        Context text; it is passed through ``str()`` before encoding. The
        parameter name is kept for compatibility with existing callers (it
        shadows the built-in ``input`` only inside this function).
    top_k : int, default 5
        Number of most probable words to choose from. Values larger than the
        number of known target words simply select all of them.

    Returns
    -------
    str
        One of the ``top_k`` most probable target words, taken from
        ``model.classes_``.

    Raises
    ------
    ValueError
        If ``top_k`` is smaller than 1.
    """
    # Guard explicitly: a slice of ``[-0:]`` would silently select *every* class.
    if top_k < 1:
        raise ValueError("top_k must be at least 1")

    # The vectorizer expects an iterable of documents, so wrap the single text.
    features = vectorizer.transform(pd.Series(str(input)))

    # Class probabilities for the single input row.
    proba = model.predict_proba(features)[0]

    # Indices of the top_k most probable classes, most probable first.
    top_indices = np.argsort(proba)[-top_k:][::-1]

    # Map indices back to words and pick one at random for variety.
    candidates = model.classes_[top_indices]
    return random.choice(candidates)

The code above simply predicts the next word based on the input text.

In [18]:
# Predict a single next word for the text entered earlier.
print(predict_word(new_review))

and


This generates a sentence based on the given input.

In [19]:
def generate_sentence(words, count=10, context_size=5):
    """Extend ``words`` by repeatedly predicting the next word.

    Each step feeds the most recent ``context_size`` words to
    ``predict_word`` and appends the predicted word to the sentence. The
    default context size of 5 matches the 5-word windows the model was
    trained on, so a short prompt grows into a full-size context instead of
    being reduced to a window as short as the prompt itself.

    Parameters
    ----------
    words : str
        Seed text. It is split on any whitespace to form the initial context.
    count : int, default 10
        Number of words to generate and append.
    context_size : int, default 5
        Maximum number of trailing words used as context for each prediction.

    Returns
    -------
    str
        The original ``words`` followed by the generated words, separated by
        single spaces.

    Raises
    ------
    ValueError
        If ``context_size`` is smaller than 1.
    """
    # Guard explicitly: a slice of ``[-0:]`` would keep the whole list.
    if context_size < 1:
        raise ValueError("context_size must be at least 1")

    # split() without arguments ignores repeated/leading/trailing whitespace,
    # so empty strings never occupy a slot in the context window.
    context = words.split()[-context_size:]
    sentence = words

    for _ in range(count):
        next_word = str(predict_word(' '.join(context)))
        sentence = sentence + " " + next_word
        # Slide the window: append the new word, keep only the newest ones.
        context = (context + [next_word])[-context_size:]

    return sentence


The code above simply extends the input text with 10 generated words.

In [20]:
# Generate and show a continuation of the text entered earlier.
print(generate_sentence(new_review))

game ia good to play a and of the to a gaming the and the
