In [1]:
# ICE-8
# This ICE is pretty straight forward. Please follow the link below and solve all the coding problems that are explained
# https://medium.com/analytics-vidhya/a-comprehensive-guide-to-build-your-own-language-model-in-python-5141b3917d6d
# Once done with the above part, use any dataset of your choice and build your own language model with the techniques covered in the online tutorial.

# (50%)

In [2]:
import numpy as np
import pandas as pd
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, GRU, Embedding
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
import re

data_text = my_text = """
As I have noted repeatedly, liberalism and leftism have virtually nothing in common. In fact, leftism is the enemy of liberalism — as a handful of liberals such as former New York Times writer Bari Weiss, former Young Turk Dave Rubin, and others have come to recognize. 

The left has never believed in free speech and has suppressed dissent wherever it has assumed power. Free speech is a pillar of liberalism, and it has always embraced dissent.

The left rejects the anti-racist ideal of color-blindness. Colorblind is the liberal racial ideal.

The left supports racial segregation — such as all-black dorms and separate black graduations. Liberals have always advocated racial integration.

The left has always loathed capitalism. Liberals were always major advocates of capitalism — recognizing that only capitalism has lifted billions of people out of poverty. 

The left has always been anti-Israel. Liberals have always been fervent supporters of Israel. 

The left has always held America in contempt. Liberals loved this country. A liberal wrote, “God bless America.” No leftist would write such a song.

Leftists want to defund the police. No liberal does.

The list of liberal-left differences is as long as the list of left-wing positions.

Yet, it is liberals who keep the left in power. Were it not for the liberal vote, the left would have no power.

The question is all the more apt given that it is conservatives who protect virtually every liberal value. It is conservatives who seek to preserve free speech, racial integration, love of America, a strong Israel, and capitalism.

So why do liberals vote for the left, for the very people who hold liberals and their values in contempt?

There are two primary reasons.

One is brainwash. Liberals are brainwashed from childhood into believing that the right is their enemy and that pas d’ennemis a gauche (there are “no enemies on the left”). That is why there is no left-wing position, no matter how destructive or vile, that could move a liberal to vote Republican or identify with conservatives.

The second reason is fear. Liberals fear they will lose friends and even family if they do not vote Democrat or if they publicly criticize the left. And this is not an irrational fear. 

America and the West are being destroyed by the left. But this destruction of the universities, the high schools, art and music, journalism, and of freedom itself could not take place were it not for liberals.

The fate of America and the West lies largely in the hands of liberals. There are simply not enough leftists to destroy our most revered institutions. They need liberals to serve as fellow travelers to accomplish their ends. 

Should the American experiment fail — and it may — that profile in lack of courage, the liberal, will have made it possible. 
"""


def text_cleaner(text):
    # lower case text
    newString = text.lower()
    newString = re.sub(r"'s\b", "", newString)
    # remove punctuations
    newString = re.sub("[^a-zA-Z]", " ", newString)
    long_words = []
    # remove short word
    for i in newString.split():
        if len(i) >= 3:
            long_words.append(i)
    return (" ".join(long_words)).strip()


# preprocess the text
data_new = text_cleaner(data_text)

In [3]:
def create_seq(text):
    length = 30
    sequences = list()
    for i in range(length, len(text)):
        # select sequence of tokens
        seq = text[i - length : i + 1]
        # store
        sequences.append(seq)
    print("Total Sequences: %d" % len(sequences))
    return sequences


# create sequences
sequences = create_seq(data_new)

Total Sequences: 2440


In [4]:
# create a character mapping index
chars = sorted(list(set(data_new)))
mapping = dict((c, i) for i, c in enumerate(chars))


def encode_seq(seq):
    sequences = list()
    for line in seq:
        # integer encode line
        encoded_seq = [mapping[char] for char in line]
        # store
        sequences.append(encoded_seq)
    return sequences


# encode the sequences
sequences = encode_seq(sequences)

In [5]:
from sklearn.model_selection import train_test_split

# vocabulary size
vocab = len(mapping)
sequences = np.array(sequences)
# create X and y
X, y = sequences[:, :-1], sequences[:, -1]

# one hot encode y
y = to_categorical(y, num_classes=vocab)
# create train and validation sets
X_tr, X_val, y_tr, y_val = train_test_split(X, y, test_size=0.1, random_state=42)

print("Train shape:", X_tr.shape, "Val shape:", X_val.shape)

Train shape: (2196, 30) Val shape: (244, 30)


In [6]:
# define model
model = Sequential()
model.add(Embedding(vocab, 50, input_length=30, trainable=True))
model.add(GRU(150, recurrent_dropout=0.1, dropout=0.1))
model.add(Dense(vocab, activation='softmax'))
print(model.summary())

# compile the model
model.compile(loss='categorical_crossentropy', metrics=['acc'], optimizer='adam')
# fit the model
model.fit(X_tr, y_tr, epochs=100, verbose=2, validation_data=(X_val, y_val))

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 30, 50)            1350      
_________________________________________________________________
gru (GRU)                    (None, 150)               90900     
_________________________________________________________________
dense (Dense)                (None, 27)                4077      
Total params: 96,327
Trainable params: 96,327
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/100
69/69 - 19s - loss: 2.9825 - acc: 0.1526 - val_loss: 2.7990 - val_acc: 0.1557
Epoch 2/100
69/69 - 14s - loss: 2.7334 - acc: 0.2017 - val_loss: 2.5342 - val_acc: 0.2828
Epoch 3/100
69/69 - 14s - loss: 2.4324 - acc: 0.2864 - val_loss: 2.3276 - val_acc: 0.3320
Epoch 4/100
69/69 - 14s - loss: 2.2674 - acc: 0.3411 - val_loss: 2.2123 - val_acc: 0.3648
Epoch 5/100
69/69

<keras.callbacks.History at 0x7faec0043a10>

In [7]:
# generate a sequence of characters with a language model
def generate_seq(model, mapping, seq_length, seed_text, n_chars):
	in_text = seed_text
	# generate a fixed number of characters
	for _ in range(n_chars):
		# encode the characters as integers
		encoded = [mapping[char] for char in in_text]
		# truncate sequences to a fixed length
		encoded = pad_sequences([encoded], maxlen=seq_length, truncating='pre')
		# predict character
		yhat = np.argmax(model.predict(encoded), axis=-1)
		# reverse map integer to character
		out_char = ''
		for char, index in mapping.items():
			if index == yhat:
				out_char = char
				break
		# append to input
		in_text += char
	return in_text

In [8]:
to_be_written = 'i am afraid that'
print(generate_seq(model, mapping, 50, to_be_written.lower(), 100))


i am afraid that profile lack courag the left has always been anti israel liberals have always been anti israel libe


In [9]:
# Use any dataset and model to calculate Language Model's text level preplexity
# You may take help from the following link to have an idea about preplexity.
# Use any dataset and model to calculate Language Model's text level preplexity
# https://stackoverflow.com/questions/16509685/ngram-model-and-perplexity-in-nltk

# (50%)

In [56]:
import nltk
import re
from nltk.corpus import gutenberg
from nltk.model import NgramModel
from nltk.probability import LidstoneProbDist


# sample text
# nltk.download('gutenberg')
sample = gutenberg.raw("bible-kjv.txt")

# Clean the text
sample = sample.lower()
sample = re.sub(r'(\d+\:\d+)', '', sample)
sample = re.sub(r'[^\w]', ' ', sample)
corpus = ' '.join(sample.split())

# Remove rare words from the corpus
fdist = nltk.FreqDist(w for w in train)
vocabulary = set(map(lambda x: x[0], filter(lambda x: x[1] >= 5, fdist.items())))

train = map(lambda x: x if x in vocabulary else "*unknown*", train)
test = map(lambda x: x if x in vocabulary else "*unknown*", test)

estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.2) 
lm = NgramModel(5, train, estimator=estimator)

print ("len(corpus) = %s, len(vocabulary) = %s, len(train) = %s, len(test) = %s" % ( len(corpus), len(vocabulary), len(train), len(test)))
print ("perplexity(test) =", lm.perplexity(test))

ModuleNotFoundError: ignored