In [473]:
import numpy as np
import matplotlib.pyplot as plt
import spacy
from spacy import displacy
import nltk
import pandas as pd

In [474]:
nlp = spacy.load('en_core_web_sm')

In [475]:
# nlp.pipeline,
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [476]:
doc = nlp(u'it is a $10 doller gift card from Google.you can get it from "www.fgm.com" website! like seriously!!!')

In [477]:
for t in doc:
  print(t,t.pos_,t.lemma_)

it PRON it
is AUX be
a DET a
$ SYM $
10 NUM 10
doller NOUN doller
gift NOUN gift
card NOUN card
from ADP from
Google.you PROPN Google.you
can AUX can
get VERB get
it PRON it
from ADP from
" PUNCT "
www.fgm.com PROPN www.fgm.com
" PUNCT "
website NOUN website
! PUNCT !
like VERB like
seriously ADV seriously
! PUNCT !
! PUNCT !
! PUNCT !


In [478]:
displacy.render(doc,style = 'dep',jupyter = True,options={'distance':80})

### Stemming

In [479]:
pd.DataFrame(nlp.Defaults.stop_words)

Unnamed: 0,0
0,alone
1,even
2,out
3,perhaps
4,wherever
...,...
321,two
322,eight
323,yourself
324,unless


In [480]:
df = pd.read_csv('/content/smsspamcollection.tsv',sep='\t')
df.head(10)

Unnamed: 0,label,message,length,punct
0,ham,"Go until jurong point, crazy.. Available only ...",111,9
1,ham,Ok lar... Joking wif u oni...,29,6
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,155,6
3,ham,U dun say so early hor... U c already then say...,49,6
4,ham,"Nah I don't think he goes to usf, he lives aro...",61,2
5,spam,FreeMsg Hey there darling it's been 3 week's n...,147,8
6,ham,Even my brother is not like to speak with me. ...,77,2
7,ham,As per your request 'Melle Melle (Oru Minnamin...,160,6
8,spam,WINNER!! As a valued network customer you have...,157,6
9,spam,Had your mobile 11 months or more? U R entitle...,154,2


In [481]:
from sklearn.model_selection import train_test_split

In [482]:
X = df['message']
y = df['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [483]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [484]:
vectorizer = CountVectorizer()

# Fit and transform the text data
X_vect = vectorizer.fit_transform(X_train)

# feature names (words)
feature_names = vectorizer.get_feature_names_out()

#dense matrix
dense_matrix = X_vect.toarray()

print("Feature Names (Words):", feature_names)
print("Feature Matrix:\n", dense_matrix)

Feature Names (Words): ['00' '000' '000pes' ... 'zoom' 'zouk' 'èn']
Feature Matrix:
 [[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [485]:
tfidf = TfidfVectorizer()


# Fit and transform the text data
X_tfidf = tfidf.fit_transform(X_train)

# Get feature names (words)
feature_names_tfidf = tfidf.get_feature_names_out()

# Convert to dense matrix (for demonstration purposes)
dense_matrix_tfidf = X_tfidf.toarray()

print("Feature Names (Words):", feature_names_tfidf)
print("TF-IDF Feature Matrix:\n", dense_matrix_tfidf)


Feature Names (Words): ['00' '000' '000pes' ... 'zoom' 'zouk' 'èn']
TF-IDF Feature Matrix:
 [[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [486]:
movie_df = pd.read_csv('/content/moviereviews.tsv',sep='\t')
movie_df

Unnamed: 0,label,review
0,neg,how do films like mouse hunt get into theatres...
1,neg,some talented actresses are blessed with a dem...
2,pos,this has been an extraordinary year for austra...
3,pos,according to hollywood movies made in last few...
4,neg,my first press screening of 1998 and already i...
...,...,...
1995,pos,"i like movies with albert brooks , and i reall..."
1996,pos,it might surprise some to know that joel and e...
1997,pos,the verdict : spine-chilling drama from horror...
1998,pos,i want to correct what i wrote in a former ret...


In [487]:
movie_df.isnull().sum()

label      0
review    35
dtype: int64

In [488]:
movie_df.dropna(inplace = True)

In [489]:
X = movie_df['review']
y = movie_df['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [490]:
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC

# Naïve Bayes:
text_clf_nb = Pipeline([('tfidf', TfidfVectorizer()),
                     ('clf', MultinomialNB()),
])

# Linear SVC:
text_clf_lsvc = Pipeline([('tfidf', TfidfVectorizer()),
                     ('clf', LinearSVC()),
])

In [491]:
text_clf_nb.fit(X_train, y_train)

In [492]:
predictions = text_clf_nb.predict(X_test)
# predictions

In [493]:
from sklearn import metrics
print(metrics.confusion_matrix(y_test,predictions))

[[303  19]
 [114 213]]


In [494]:
print(metrics.classification_report(y_test,predictions))

              precision    recall  f1-score   support

         neg       0.73      0.94      0.82       322
         pos       0.92      0.65      0.76       327

    accuracy                           0.80       649
   macro avg       0.82      0.80      0.79       649
weighted avg       0.82      0.80      0.79       649



In [495]:
print(metrics.accuracy_score(y_test,predictions))

0.7950693374422187


In [496]:
text_clf_lsvc.fit(X_train, y_train)

In [497]:
predictions = text_clf_lsvc.predict(X_test)

In [498]:
print(metrics.confusion_matrix(y_test,predictions))

[[281  41]
 [ 56 271]]


In [499]:
print(metrics.accuracy_score(y_test,predictions))

0.8505392912172574


In [500]:
stopwords = ['a', 'about', 'an', 'and', 'are', 'as', 'at', 'be', 'been', 'but', 'by', 'can', \
             'even', 'ever', 'for', 'from', 'get', 'had', 'has', 'have', 'he', 'her', 'hers', 'his', \
             'how', 'i', 'if', 'in', 'into', 'is', 'it', 'its', 'just', 'me', 'my', 'of', 'on', 'or', \
             'see', 'seen', 'she', 'so', 'than', 'that', 'the', 'their', 'there', 'they', 'this', \
             'to', 'was', 'we', 'were', 'what', 'when', 'which', 'who', 'will', 'with', 'you']

### adding stop words to imporve calssification

In [501]:
text_clf_lsvc2 = Pipeline([('tfidf', TfidfVectorizer(stop_words=stopwords)),
                     ('clf', LinearSVC()),
])
text_clf_lsvc2.fit(X_train, y_train)

In [502]:
predictions = text_clf_lsvc2.predict(X_test)
print(metrics.confusion_matrix(y_test,predictions))

[[278  44]
 [ 54 273]]


In [503]:
print(metrics.classification_report(y_test,predictions))

              precision    recall  f1-score   support

         neg       0.84      0.86      0.85       322
         pos       0.86      0.83      0.85       327

    accuracy                           0.85       649
   macro avg       0.85      0.85      0.85       649
weighted avg       0.85      0.85      0.85       649



In [504]:
print(metrics.accuracy_score(y_test,predictions))

0.8489984591679507


In [505]:
myreview1 = "A movie I really wanted to love was terrible. \
I'm sure the producers had the best intentions, but the execution was lacking."

In [506]:
myreview2 = "as a thriller movie fan i was looking for a movie that is full of thrill and this movie \
did not dissapoint me. though i thought it won't meet my expection but i was wrong."

In [507]:
myreview3 = "Marvel's 'Avengers: Endgame' is a breathtaking cinematic masterpiece that beautifully wraps up over a decade of storytelling. \
 The movie impressively weaves together multiple story arcs, delivering an emotional and action-packed experience. \
  With stunning visual effects, compelling character development, and an epic conclusion,\
   it's a must-watch for every fan of the Marvel Cinematic Universe."

In [508]:
print(text_clf_nb.predict([myreview1]))
print(text_clf_lsvc2.predict([myreview2]))
print(text_clf_lsvc2.predict([myreview3]))

['neg']
['neg']
['pos']


In [509]:
### Using lemmatization for performence improvement

In [510]:
import spacy

# Load Spacy's English model
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

# Lemmatization and stopwords
def spacy_lemmatize(text):
    return ' '.join([token.lemma_ for token in nlp(text) if not token.is_stop])

# Applying Lemmatization and custom stopwords
X_train_lemmatized = [spacy_lemmatize(text) for text in X_train]
X_test_lemmatized = [spacy_lemmatize(text) for text in X_test]

# Updating the model pipeline with Lemmatized text
text_clf_lsvc3 = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', LinearSVC())
])

text_clf_lsvc3.fit(X_train_lemmatized, y_train)

predictions = text_clf_lsvc3.predict(X_test_lemmatized)

print(metrics.confusion_matrix(y_test, predictions))
print(metrics.classification_report(y_test, predictions))
print(metrics.accuracy_score(y_test, predictions))


[[264  58]
 [ 55 272]]
              precision    recall  f1-score   support

         neg       0.83      0.82      0.82       322
         pos       0.82      0.83      0.83       327

    accuracy                           0.83       649
   macro avg       0.83      0.83      0.83       649
weighted avg       0.83      0.83      0.83       649

0.8258859784283513


In [511]:
print(text_clf_lsvc3.predict([myreview1]))
print(text_clf_lsvc3.predict([myreview2])) #it should be posative
print(text_clf_lsvc3.predict([myreview3]))

['neg']
['neg']
['pos']


### semantic and sentiment analysis

In [512]:
!python -m spacy download en_core_web_lg

2024-01-01 15:37:05.477193: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-01-01 15:37:05.477246: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-01-01 15:37:05.478530: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
Collecting en-core-web-lg==3.6.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.6.0/en_core_web_lg-3.6.0-py3-none-any.whl (587.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m587.7/587.7 MB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now lo

In [513]:
nlp2 = spacy.load('en_core_web_lg',disable=['parser','tagger','ner'])

In [514]:
# nlp2(u'love').vector

In [515]:
tokens = nlp2(u'lion cat pet')
for i in tokens:
  for j in tokens:
    print(i.text,j.text,i.similarity(j))

lion lion 1.0
lion cat 0.3854507803916931
lion pet 0.20031584799289703
cat lion 0.3854507803916931
cat cat 1.0
cat pet 0.732966423034668
pet lion 0.20031584799289703
pet cat 0.732966423034668
pet pet 1.0




In [516]:
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [517]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [518]:
sid = SentimentIntensityAnalyzer()

In [519]:
a = 'this is a fucking good movie'
sid.polarity_scores(a)

{'neg': 0.0, 'neu': 0.556, 'pos': 0.444, 'compound': 0.4927}

In [520]:
sid.polarity_scores(myreview2)

{'neg': 0.119, 'neu': 0.748, 'pos': 0.132, 'compound': -0.3716}

In [521]:
sid.polarity_scores(movie_df.iloc[202]['review'])

{'neg': 0.098, 'neu': 0.752, 'pos': 0.15, 'compound': 0.9892}

In [522]:
movie_df['scores'] = movie_df['review'].apply(lambda r : sid.polarity_scores(r))
movie_df.head(10)

Unnamed: 0,label,review,scores
0,neg,how do films like mouse hunt get into theatres...,"{'neg': 0.121, 'neu': 0.778, 'pos': 0.101, 'co..."
1,neg,some talented actresses are blessed with a dem...,"{'neg': 0.12, 'neu': 0.775, 'pos': 0.105, 'com..."
2,pos,this has been an extraordinary year for austra...,"{'neg': 0.068, 'neu': 0.781, 'pos': 0.15, 'com..."
3,pos,according to hollywood movies made in last few...,"{'neg': 0.071, 'neu': 0.782, 'pos': 0.147, 'co..."
4,neg,my first press screening of 1998 and already i...,"{'neg': 0.091, 'neu': 0.817, 'pos': 0.093, 'co..."
5,neg,"to put it bluntly , ed wood would have been pr...","{'neg': 0.123, 'neu': 0.821, 'pos': 0.056, 'co..."
6,neg,"synopsis : melissa , a mentally-disturbed woma...","{'neg': 0.087, 'neu': 0.742, 'pos': 0.17, 'com..."
7,neg,tim robbins and martin lawernce team up in thi...,"{'neg': 0.118, 'neu': 0.709, 'pos': 0.172, 'co..."
8,neg,"in "" gia "" , angelina jolie plays the titular ...","{'neg': 0.082, 'neu': 0.862, 'pos': 0.056, 'co..."
9,neg,"in 1990 , the surprise success an unheralded l...","{'neg': 0.145, 'neu': 0.728, 'pos': 0.127, 'co..."


In [523]:
movie_df['compound'] = movie_df['scores'].apply(lambda d : d['compound'])
movie_df.head(10)

Unnamed: 0,label,review,scores,compound
0,neg,how do films like mouse hunt get into theatres...,"{'neg': 0.121, 'neu': 0.778, 'pos': 0.101, 'co...",-0.9125
1,neg,some talented actresses are blessed with a dem...,"{'neg': 0.12, 'neu': 0.775, 'pos': 0.105, 'com...",-0.8618
2,pos,this has been an extraordinary year for austra...,"{'neg': 0.068, 'neu': 0.781, 'pos': 0.15, 'com...",0.9951
3,pos,according to hollywood movies made in last few...,"{'neg': 0.071, 'neu': 0.782, 'pos': 0.147, 'co...",0.9972
4,neg,my first press screening of 1998 and already i...,"{'neg': 0.091, 'neu': 0.817, 'pos': 0.093, 'co...",-0.2484
5,neg,"to put it bluntly , ed wood would have been pr...","{'neg': 0.123, 'neu': 0.821, 'pos': 0.056, 'co...",-0.9855
6,neg,"synopsis : melissa , a mentally-disturbed woma...","{'neg': 0.087, 'neu': 0.742, 'pos': 0.17, 'com...",0.9871
7,neg,tim robbins and martin lawernce team up in thi...,"{'neg': 0.118, 'neu': 0.709, 'pos': 0.172, 'co...",0.9829
8,neg,"in "" gia "" , angelina jolie plays the titular ...","{'neg': 0.082, 'neu': 0.862, 'pos': 0.056, 'co...",-0.8278
9,neg,"in 1990 , the surprise success an unheralded l...","{'neg': 0.145, 'neu': 0.728, 'pos': 0.127, 'co...",-0.9147


In [524]:
movie_df['compound_score'] = movie_df['compound'].apply(lambda s : 'pos' if s > 0 else 'neg' )
movie_df.head(10)

Unnamed: 0,label,review,scores,compound,compound_score
0,neg,how do films like mouse hunt get into theatres...,"{'neg': 0.121, 'neu': 0.778, 'pos': 0.101, 'co...",-0.9125,neg
1,neg,some talented actresses are blessed with a dem...,"{'neg': 0.12, 'neu': 0.775, 'pos': 0.105, 'com...",-0.8618,neg
2,pos,this has been an extraordinary year for austra...,"{'neg': 0.068, 'neu': 0.781, 'pos': 0.15, 'com...",0.9951,pos
3,pos,according to hollywood movies made in last few...,"{'neg': 0.071, 'neu': 0.782, 'pos': 0.147, 'co...",0.9972,pos
4,neg,my first press screening of 1998 and already i...,"{'neg': 0.091, 'neu': 0.817, 'pos': 0.093, 'co...",-0.2484,neg
5,neg,"to put it bluntly , ed wood would have been pr...","{'neg': 0.123, 'neu': 0.821, 'pos': 0.056, 'co...",-0.9855,neg
6,neg,"synopsis : melissa , a mentally-disturbed woma...","{'neg': 0.087, 'neu': 0.742, 'pos': 0.17, 'com...",0.9871,pos
7,neg,tim robbins and martin lawernce team up in thi...,"{'neg': 0.118, 'neu': 0.709, 'pos': 0.172, 'co...",0.9829,pos
8,neg,"in "" gia "" , angelina jolie plays the titular ...","{'neg': 0.082, 'neu': 0.862, 'pos': 0.056, 'co...",-0.8278,neg
9,neg,"in 1990 , the surprise success an unheralded l...","{'neg': 0.145, 'neu': 0.728, 'pos': 0.127, 'co...",-0.9147,neg


###  Latent Dirichlet Allocation (LDA)

In [525]:
len(movie_df['review'])

1965

In [526]:
cv2 = CountVectorizer(max_df = 0.95,min_df = 2,stop_words='english')
dtm = cv2.fit_transform(movie_df['review'])
dtm

<1965x23298 sparse matrix of type '<class 'numpy.int64'>'
	with 455692 stored elements in Compressed Sparse Row format>

In [527]:
from sklearn.decomposition import LatentDirichletAllocation
lda = LatentDirichletAllocation(n_components=8)

In [528]:
lda.fit(dtm)

### Grab the topics

In [529]:
lda.components_

array([[ 0.12546822, 20.77678168,  1.74485208, ...,  3.11645265,
         0.125     ,  0.125     ],
       [ 4.03242929, 23.35098075,  0.82894977, ...,  0.125     ,
        14.44445604,  2.12467112],
       [ 0.125     ,  2.59243876,  0.12500001, ...,  0.125     ,
         0.125     ,  0.125     ],
       ...,
       [ 0.12540244,  8.40733056,  5.48987674, ...,  0.125     ,
         0.125     ,  0.12501402],
       [ 0.12504728, 14.36473637,  0.8452469 , ...,  0.13349239,
         0.12531307,  0.12509334],
       [ 3.21546601, 26.85342574,  1.0321611 , ...,  0.12505496,
         0.12510743,  0.12522152]])

In [530]:
single_topic = lda.components_[0]
toptenwords = single_topic.argsort()[-10:]

In [531]:
for i in toptenwords:
  print(cv2.get_feature_names_out()[i])

way
plot
characters
character
time
good
just
like
movie
film


In [532]:
for i,topic in enumerate(lda.components_):
   print(f'top 10 words for topic {i}')
   print([cv2.get_feature_names_out()[i] for i in topic.argsort()[-10:]])

top 10 words for topic 0
['way', 'plot', 'characters', 'character', 'time', 'good', 'just', 'like', 'movie', 'film']
top 10 words for topic 1
['story', 'life', 'way', 'good', 'character', 'time', 'just', 'movie', 'like', 'film']
top 10 words for topic 2
['man', 'apes', 'does', 'story', 'just', 'alien', 'life', 'like', 'movie', 'film']
top 10 words for topic 3
['does', 'people', 'really', 'good', 'time', 'story', 'just', 'like', 'movie', 'film']
top 10 words for topic 4
['good', 'make', 'story', 'action', 'new', 'time', 'like', 'just', 'movie', 'film']
top 10 words for topic 5
['character', 'really', 'time', 'love', 'good', 'just', 'story', 'like', 'movie', 'film']
top 10 words for topic 6
['character', 'good', 'time', 'characters', 'just', 'like', 'story', 'life', 'movie', 'film']
top 10 words for topic 7
['action', 'plot', 'good', 'bad', 'time', 'star', 'like', 'just', 'movie', 'film']


In [533]:
topic_result = lda.transform(dtm)
topic_result

array([[6.69891649e-04, 9.95312897e-01, 6.69446173e-04, ...,
        6.69528690e-04, 6.69500657e-04, 6.69743402e-04],
       [3.32439397e-03, 1.98046566e-01, 4.21406376e-04, ...,
        6.71783320e-01, 4.21516958e-04, 1.25159792e-01],
       [4.18772739e-04, 9.97068239e-01, 4.18768468e-04, ...,
        4.18803850e-04, 4.19002496e-04, 4.18817042e-04],
       ...,
       [2.73851814e-01, 3.79033990e-02, 3.26319119e-04, ...,
        6.32176532e-04, 3.26108230e-04, 3.26061581e-04],
       [9.23632024e-01, 4.36223101e-04, 4.36058729e-04, ...,
        4.36074589e-04, 4.36141934e-04, 4.35996103e-04],
       [2.16033259e-01, 4.05249372e-04, 4.05121913e-04, ...,
        4.05231536e-04, 1.61868042e-01, 5.55246696e-01]])

In [534]:
movie_df['Topic'] = topic_result.argmax(axis = 1)

In [535]:
movie_df

Unnamed: 0,label,review,scores,compound,compound_score,Topic
0,neg,how do films like mouse hunt get into theatres...,"{'neg': 0.121, 'neu': 0.778, 'pos': 0.101, 'co...",-0.9125,neg,1
1,neg,some talented actresses are blessed with a dem...,"{'neg': 0.12, 'neu': 0.775, 'pos': 0.105, 'com...",-0.8618,neg,5
2,pos,this has been an extraordinary year for austra...,"{'neg': 0.068, 'neu': 0.781, 'pos': 0.15, 'com...",0.9951,pos,1
3,pos,according to hollywood movies made in last few...,"{'neg': 0.071, 'neu': 0.782, 'pos': 0.147, 'co...",0.9972,pos,1
4,neg,my first press screening of 1998 and already i...,"{'neg': 0.091, 'neu': 0.817, 'pos': 0.093, 'co...",-0.2484,neg,0
...,...,...,...,...,...,...
1995,pos,"i like movies with albert brooks , and i reall...","{'neg': 0.073, 'neu': 0.763, 'pos': 0.164, 'co...",0.9991,pos,6
1996,pos,it might surprise some to know that joel and e...,"{'neg': 0.238, 'neu': 0.688, 'pos': 0.074, 'co...",-0.9993,neg,0
1997,pos,the verdict : spine-chilling drama from horror...,"{'neg': 0.15, 'neu': 0.702, 'pos': 0.147, 'com...",-0.5966,neg,3
1998,pos,i want to correct what i wrote in a former ret...,"{'neg': 0.131, 'neu': 0.71, 'pos': 0.16, 'comp...",0.9387,pos,0


In [536]:
def open_file(path):
  with open(path) as f:
    text = f.read()
    return text

In [537]:
# print(open_file('/content/melville-moby_dick.txt'))

In [538]:
nlp2.max_length = 1198623

In [539]:
def tokns(docmnt):
  return [token.text.lower() for token in nlp2(docmnt)]

In [540]:
d = open_file('/content/moby_dick_four_chapters.txt')

In [541]:
tokens = tokns(d)



In [542]:
len(tokens)

14480

In [543]:
train_len = 25+1
text_sequences = []

for i in range(train_len, len(tokens)):
    seq = tokens[ i - train_len : i ]
    text_sequences.append(seq)

In [544]:
# arr = [1,2,3,4,5,6,7,8,9,10,11,12,13,14]
# res = []
# for i in range(len(arr)):
#   seq = arr[ i - 5 : i]
#   res.append(seq)
# res

In [545]:
print(' '.join(text_sequences[0]))

call me ishmael .   some years ago -- never mind how long 
 precisely -- having little or no money in my purse , and


In [546]:
print(' '.join(text_sequences[20]))

money in my purse , and nothing 
 particular to interest me on shore , i thought i would sail about a 
 little and see


In [547]:
from keras.preprocessing.text import Tokenizer
tokenizer = Tokenizer(filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n')

In [548]:
tokenizer.fit_on_texts(text_sequences)

In [549]:
sequence = tokenizer.texts_to_sequences(text_sequences)

In [550]:
# tokenizer.index_word

In [551]:
# tokenizer.word_counts

In [552]:
sequences = np.array(sequence)

In [553]:
vocabulary_size = len(tokenizer.word_counts)
vocabulary_size

2734

In [554]:
# sequence

In [555]:
import keras
from keras.models import Sequential
from keras.layers import Dense,LSTM,Embedding

In [556]:
def create_model(vocabulary_size, seq_len):
    model = Sequential()
    model.add(Embedding(vocabulary_size, 25, input_length=seq_len))
    model.add(LSTM(150, return_sequences=True))
    model.add(LSTM(150))
    model.add(Dense(150, activation='relu'))

    model.add(Dense(vocabulary_size, activation='softmax'))

    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    model.summary()

    return model

In [557]:
from keras.utils import to_categorical

In [558]:
X = sequences[:,:-1]
y = sequences[:,-1]

In [559]:
y = to_categorical(y, num_classes=vocabulary_size+1)

In [560]:
seq_len = X.shape[1]

In [561]:
model = create_model(vocabulary_size+1, seq_len)

Model: "sequential_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_6 (Embedding)     (None, 25, 25)            68375     
                                                                 
 lstm_12 (LSTM)              (None, 25, 150)           105600    
                                                                 
 lstm_13 (LSTM)              (None, 150)               180600    
                                                                 
 dense_12 (Dense)            (None, 150)               22650     
                                                                 
 dense_13 (Dense)            (None, 2735)              412985    
                                                                 
Total params: 790210 (3.01 MB)
Trainable params: 790210 (3.01 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [563]:
model.fit(X, y, batch_size=128, epochs=200,verbose=1)

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

<keras.src.callbacks.History at 0x7f0897d5b0d0>

In [564]:
from pickle import dump,load

In [565]:
model.save('mymodel.h5')
dump(tokenizer, open('mymodel', 'wb'))

  saving_api.save_model(


In [566]:
from random import randint
from pickle import load
from keras.models import load_model
from keras.preprocessing.sequence import pad_sequences

In [567]:
def generate_text(model, tokenizer, seq_len, seed_text, num_gen_words):

    # Final Output
    output_text = []

    # Initial Seed Sequence
    input_text = seed_text

    for i in range(num_gen_words):
        encoded_text = tokenizer.texts_to_sequences([input_text])[0]
        pad_encoded = pad_sequences([encoded_text], maxlen=seq_len, truncating='pre')

        # Get predicted probabilities for the next word
        pred_probabilities = model.predict(pad_encoded, verbose=0)[0]

        # Get the index of the word with the highest probability
        pred_word_ind = np.argmax(pred_probabilities)

        # Retrieve the word corresponding to the index
        pred_word = tokenizer.index_word.get(pred_word_ind)  # Use get method to handle out-of-vocabulary indices

        # Append the predicted word to the input text
        input_text += ' ' + pred_word
        output_text.append(pred_word)

    # Make it look like a sentence
    return ' '.join(output_text)


In [568]:
import random
random_pick = random.randint(0,len(text_sequences))

In [569]:
random_seed_text = text_sequences[random_pick]

In [570]:
seed_text = ' '.join(random_seed_text)
print(seed_text)

and the crannies 
 though , and thrust in a little lint here and there .   but it 's too 
 late to make any


In [571]:
generate_text(model,tokenizer,seq_len,seed_text=seed_text,num_gen_words=20)

'planing about there from the brown similar put forth , , , , , , , , , , ,'

In [572]:
from keras.models import load_model

In [573]:
model = load_model('/content/epochBIG.h5')



In [574]:
# tokenizer = load(open('/content/epochBIG','rb'))

### Beam search

In [575]:
def beam_search(model, tokenizer, seq_len, seed_text, num_gen_words, k=3):
    sequences = [[tokenizer.texts_to_sequences([seed_text])[0], 0.0]]

    while len(sequences[0][0]) < num_gen_words:
        all_candidates = []
        for seq, score in sequences:
            pad_encoded = pad_sequences([seq], maxlen=seq_len, truncating='pre')
            pred_probabilities = model.predict(pad_encoded, verbose=0)[0]
            top_k_preds = np.argsort(pred_probabilities)[-k:]  # Top k predictions

            for pred in top_k_preds:
                candidate_seq = [seq + [pred], score - np.log(pred_probabilities[pred])]
                all_candidates.append(candidate_seq)

        # Sort candidates by score
        sequences = sorted(all_candidates, key=lambda tup: tup[1])[:k]

    # Retrieve the best sequence
    best_sequence = sequences[0][0]
    output_text = [tokenizer.index_word.get(idx, '[UNK]') for idx in best_sequence]

    return ' '.join(output_text)

In [576]:
generated_text = beam_search(model, tokenizer, seq_len, seed_text, num_gen_words=40)
print(generated_text)

and the crannies though and thrust in a little lint here and there but it 's too late to make any you more after space eyes [UNK] help in fields [UNK] creaking distracted was - remembered [UNK] , [UNK] 

