In [1]:
import numpy as np
import matplotlib.pyplot as plt
import spacy
from spacy import displacy
import nltk
import pandas as pd

In [2]:
nlp = spacy.load('en_core_web_sm')

In [3]:
# nlp.pipeline,
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [4]:
doc = nlp(u'it is a $10 doller gift card from Google.you can get it from "www.fgm.com" website! like seriously!!!')

In [5]:
for t in doc:
  print(t,t.pos_,t.lemma_)

it PRON it
is AUX be
a DET a
$ SYM $
10 NUM 10
doller NOUN doller
gift NOUN gift
card NOUN card
from ADP from
Google.you PROPN Google.you
can AUX can
get VERB get
it PRON it
from ADP from
" PUNCT "
www.fgm.com PROPN www.fgm.com
" PUNCT "
website NOUN website
! PUNCT !
like VERB like
seriously ADV seriously
! PUNCT !
! PUNCT !
! PUNCT !


In [6]:
displacy.render(doc,style = 'dep',jupyter = True,options={'distance':80})

### Stemming

In [7]:
pd.DataFrame(nlp.Defaults.stop_words)

Unnamed: 0,0
0,same
1,next
2,another
3,most
4,forty
...,...
321,no
322,you
323,see
324,up


In [8]:
df = pd.read_csv('/content/smsspamcollection.tsv',sep='\t')
df.head(10)

Unnamed: 0,label,message,length,punct
0,ham,"Go until jurong point, crazy.. Available only ...",111,9
1,ham,Ok lar... Joking wif u oni...,29,6
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,155,6
3,ham,U dun say so early hor... U c already then say...,49,6
4,ham,"Nah I don't think he goes to usf, he lives aro...",61,2
5,spam,FreeMsg Hey there darling it's been 3 week's n...,147,8
6,ham,Even my brother is not like to speak with me. ...,77,2
7,ham,As per your request 'Melle Melle (Oru Minnamin...,160,6
8,spam,WINNER!! As a valued network customer you have...,157,6
9,spam,Had your mobile 11 months or more? U R entitle...,154,2


In [9]:
from sklearn.model_selection import train_test_split

In [10]:
X = df['message']
y = df['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [11]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [12]:
vectorizer = CountVectorizer()

# Fit and transform the text data
X_vect = vectorizer.fit_transform(X_train)

# feature names (words)
feature_names = vectorizer.get_feature_names_out()

#dense matrix
dense_matrix = X_vect.toarray()

print("Feature Names (Words):", feature_names)
print("Feature Matrix:\n", dense_matrix)

Feature Names (Words): ['00' '000' '000pes' ... 'zoom' 'zouk' 'èn']
Feature Matrix:
 [[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [13]:
tfidf = TfidfVectorizer()


# Fit and transform the text data
X_tfidf = tfidf.fit_transform(X_train)

# Get feature names (words)
feature_names_tfidf = tfidf.get_feature_names_out()

# Convert to dense matrix (for demonstration purposes)
dense_matrix_tfidf = X_tfidf.toarray()

print("Feature Names (Words):", feature_names_tfidf)
print("TF-IDF Feature Matrix:\n", dense_matrix_tfidf)


Feature Names (Words): ['00' '000' '000pes' ... 'zoom' 'zouk' 'èn']
TF-IDF Feature Matrix:
 [[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [14]:
movie_df = pd.read_csv('/content/moviereviews.tsv',sep='\t')
movie_df

Unnamed: 0,label,review
0,neg,how do films like mouse hunt get into theatres...
1,neg,some talented actresses are blessed with a dem...
2,pos,this has been an extraordinary year for austra...
3,pos,according to hollywood movies made in last few...
4,neg,my first press screening of 1998 and already i...
...,...,...
1995,pos,"i like movies with albert brooks , and i reall..."
1996,pos,it might surprise some to know that joel and e...
1997,pos,the verdict : spine-chilling drama from horror...
1998,pos,i want to correct what i wrote in a former ret...


In [15]:
movie_df.isnull().sum()

label      0
review    35
dtype: int64

In [16]:
movie_df.dropna(inplace = True)

In [17]:
X = movie_df['review']
y = movie_df['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [18]:
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC

# Naïve Bayes:
text_clf_nb = Pipeline([('tfidf', TfidfVectorizer()),
                     ('clf', MultinomialNB()),
])

# Linear SVC:
text_clf_lsvc = Pipeline([('tfidf', TfidfVectorizer()),
                     ('clf', LinearSVC()),
])

In [19]:
text_clf_nb.fit(X_train, y_train)

In [20]:
predictions = text_clf_nb.predict(X_test)
# predictions

In [21]:
from sklearn import metrics
print(metrics.confusion_matrix(y_test,predictions))

[[303  19]
 [114 213]]


In [22]:
print(metrics.classification_report(y_test,predictions))

              precision    recall  f1-score   support

         neg       0.73      0.94      0.82       322
         pos       0.92      0.65      0.76       327

    accuracy                           0.80       649
   macro avg       0.82      0.80      0.79       649
weighted avg       0.82      0.80      0.79       649



In [23]:
print(metrics.accuracy_score(y_test,predictions))

0.7950693374422187


In [24]:
text_clf_lsvc.fit(X_train, y_train)

In [25]:
predictions = text_clf_lsvc.predict(X_test)

In [26]:
print(metrics.confusion_matrix(y_test,predictions))

[[281  41]
 [ 56 271]]


In [27]:
print(metrics.accuracy_score(y_test,predictions))

0.8505392912172574


In [28]:
stopwords = ['a', 'about', 'an', 'and', 'are', 'as', 'at', 'be', 'been', 'but', 'by', 'can', \
             'even', 'ever', 'for', 'from', 'get', 'had', 'has', 'have', 'he', 'her', 'hers', 'his', \
             'how', 'i', 'if', 'in', 'into', 'is', 'it', 'its', 'just', 'me', 'my', 'of', 'on', 'or', \
             'see', 'seen', 'she', 'so', 'than', 'that', 'the', 'their', 'there', 'they', 'this', \
             'to', 'was', 'we', 'were', 'what', 'when', 'which', 'who', 'will', 'with', 'you']

### adding stop words to imporve calssification

In [29]:
text_clf_lsvc2 = Pipeline([('tfidf', TfidfVectorizer(stop_words=stopwords)),
                     ('clf', LinearSVC()),
])
text_clf_lsvc2.fit(X_train, y_train)

In [30]:
predictions = text_clf_lsvc2.predict(X_test)
print(metrics.confusion_matrix(y_test,predictions))

[[278  44]
 [ 54 273]]


In [31]:
print(metrics.classification_report(y_test,predictions))

              precision    recall  f1-score   support

         neg       0.84      0.86      0.85       322
         pos       0.86      0.83      0.85       327

    accuracy                           0.85       649
   macro avg       0.85      0.85      0.85       649
weighted avg       0.85      0.85      0.85       649



In [32]:
print(metrics.accuracy_score(y_test,predictions))

0.8489984591679507


In [33]:
myreview1 = "A movie I really wanted to love was terrible. \
I'm sure the producers had the best intentions, but the execution was lacking."

In [34]:
myreview2 = "as a thriller movie fan i was looking for a movie that is full of thrill and this movie \
did not dissapoint me. though i thought it won't meet my expection but i was wrong."

In [35]:
myreview3 = "Marvel's 'Avengers: Endgame' is a breathtaking cinematic masterpiece that beautifully wraps up over a decade of storytelling. \
 The movie impressively weaves together multiple story arcs, delivering an emotional and action-packed experience. \
  With stunning visual effects, compelling character development, and an epic conclusion,\
   it's a must-watch for every fan of the Marvel Cinematic Universe."

In [36]:
print(text_clf_nb.predict([myreview1]))
print(text_clf_lsvc2.predict([myreview2]))
print(text_clf_lsvc2.predict([myreview3]))

['neg']
['neg']
['pos']


In [37]:
### Using lemmatization for performence improvement

In [38]:
import spacy

# Load Spacy's English model
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

# Lemmatization and stopwords
def spacy_lemmatize(text):
    return ' '.join([token.lemma_ for token in nlp(text) if not token.is_stop])

# Applying Lemmatization and custom stopwords
X_train_lemmatized = [spacy_lemmatize(text) for text in X_train]
X_test_lemmatized = [spacy_lemmatize(text) for text in X_test]

# Updating the model pipeline with Lemmatized text
text_clf_lsvc3 = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', LinearSVC())
])

text_clf_lsvc3.fit(X_train_lemmatized, y_train)

predictions = text_clf_lsvc3.predict(X_test_lemmatized)

print(metrics.confusion_matrix(y_test, predictions))
print(metrics.classification_report(y_test, predictions))
print(metrics.accuracy_score(y_test, predictions))


[[264  58]
 [ 55 272]]
              precision    recall  f1-score   support

         neg       0.83      0.82      0.82       322
         pos       0.82      0.83      0.83       327

    accuracy                           0.83       649
   macro avg       0.83      0.83      0.83       649
weighted avg       0.83      0.83      0.83       649

0.8258859784283513


In [39]:
print(text_clf_lsvc3.predict([myreview1]))
print(text_clf_lsvc3.predict([myreview2])) #it should be posative
print(text_clf_lsvc3.predict([myreview3]))

['neg']
['neg']
['pos']


### semantic and sentiment analysis

In [40]:
!python -m spacy download en_core_web_lg

2024-01-02 14:23:12.836458: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-01-02 14:23:12.836510: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-01-02 14:23:12.838240: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
Collecting en-core-web-lg==3.6.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.6.0/en_core_web_lg-3.6.0-py3-none-any.whl (587.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m587.7/587.7 MB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: en-core-web-lg
Successfully install

In [41]:
nlp2 = spacy.load('en_core_web_lg',disable=['parser','tagger','ner'])

In [42]:
# nlp2(u'love').vector

In [43]:
tokens = nlp2(u'lion cat pet')
for i in tokens:
  for j in tokens:
    print(i.text,j.text,i.similarity(j))

lion lion 1.0
lion cat 0.3854507803916931
lion pet 0.20031584799289703
cat lion 0.3854507803916931
cat cat 1.0
cat pet 0.732966423034668
pet lion 0.20031584799289703
pet cat 0.732966423034668
pet pet 1.0




In [44]:
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


True

In [45]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [46]:
sid = SentimentIntensityAnalyzer()

In [47]:
a = 'this is a fucking good movie'
sid.polarity_scores(a)

{'neg': 0.0, 'neu': 0.556, 'pos': 0.444, 'compound': 0.4927}

In [48]:
sid.polarity_scores(myreview2)

{'neg': 0.119, 'neu': 0.748, 'pos': 0.132, 'compound': -0.3716}

In [49]:
sid.polarity_scores(movie_df.iloc[202]['review'])

{'neg': 0.098, 'neu': 0.752, 'pos': 0.15, 'compound': 0.9892}

In [50]:
movie_df['scores'] = movie_df['review'].apply(lambda r : sid.polarity_scores(r))
movie_df.head(10)

Unnamed: 0,label,review,scores
0,neg,how do films like mouse hunt get into theatres...,"{'neg': 0.121, 'neu': 0.778, 'pos': 0.101, 'co..."
1,neg,some talented actresses are blessed with a dem...,"{'neg': 0.12, 'neu': 0.775, 'pos': 0.105, 'com..."
2,pos,this has been an extraordinary year for austra...,"{'neg': 0.068, 'neu': 0.781, 'pos': 0.15, 'com..."
3,pos,according to hollywood movies made in last few...,"{'neg': 0.071, 'neu': 0.782, 'pos': 0.147, 'co..."
4,neg,my first press screening of 1998 and already i...,"{'neg': 0.091, 'neu': 0.817, 'pos': 0.093, 'co..."
5,neg,"to put it bluntly , ed wood would have been pr...","{'neg': 0.123, 'neu': 0.821, 'pos': 0.056, 'co..."
6,neg,"synopsis : melissa , a mentally-disturbed woma...","{'neg': 0.087, 'neu': 0.742, 'pos': 0.17, 'com..."
7,neg,tim robbins and martin lawernce team up in thi...,"{'neg': 0.118, 'neu': 0.709, 'pos': 0.172, 'co..."
8,neg,"in "" gia "" , angelina jolie plays the titular ...","{'neg': 0.082, 'neu': 0.862, 'pos': 0.056, 'co..."
9,neg,"in 1990 , the surprise success an unheralded l...","{'neg': 0.145, 'neu': 0.728, 'pos': 0.127, 'co..."


In [51]:
movie_df['compound'] = movie_df['scores'].apply(lambda d : d['compound'])
movie_df.head(10)

Unnamed: 0,label,review,scores,compound
0,neg,how do films like mouse hunt get into theatres...,"{'neg': 0.121, 'neu': 0.778, 'pos': 0.101, 'co...",-0.9125
1,neg,some talented actresses are blessed with a dem...,"{'neg': 0.12, 'neu': 0.775, 'pos': 0.105, 'com...",-0.8618
2,pos,this has been an extraordinary year for austra...,"{'neg': 0.068, 'neu': 0.781, 'pos': 0.15, 'com...",0.9951
3,pos,according to hollywood movies made in last few...,"{'neg': 0.071, 'neu': 0.782, 'pos': 0.147, 'co...",0.9972
4,neg,my first press screening of 1998 and already i...,"{'neg': 0.091, 'neu': 0.817, 'pos': 0.093, 'co...",-0.2484
5,neg,"to put it bluntly , ed wood would have been pr...","{'neg': 0.123, 'neu': 0.821, 'pos': 0.056, 'co...",-0.9855
6,neg,"synopsis : melissa , a mentally-disturbed woma...","{'neg': 0.087, 'neu': 0.742, 'pos': 0.17, 'com...",0.9871
7,neg,tim robbins and martin lawernce team up in thi...,"{'neg': 0.118, 'neu': 0.709, 'pos': 0.172, 'co...",0.9829
8,neg,"in "" gia "" , angelina jolie plays the titular ...","{'neg': 0.082, 'neu': 0.862, 'pos': 0.056, 'co...",-0.8278
9,neg,"in 1990 , the surprise success an unheralded l...","{'neg': 0.145, 'neu': 0.728, 'pos': 0.127, 'co...",-0.9147


In [52]:
movie_df['compound_score'] = movie_df['compound'].apply(lambda s : 'pos' if s > 0 else 'neg' )
movie_df.head(10)

Unnamed: 0,label,review,scores,compound,compound_score
0,neg,how do films like mouse hunt get into theatres...,"{'neg': 0.121, 'neu': 0.778, 'pos': 0.101, 'co...",-0.9125,neg
1,neg,some talented actresses are blessed with a dem...,"{'neg': 0.12, 'neu': 0.775, 'pos': 0.105, 'com...",-0.8618,neg
2,pos,this has been an extraordinary year for austra...,"{'neg': 0.068, 'neu': 0.781, 'pos': 0.15, 'com...",0.9951,pos
3,pos,according to hollywood movies made in last few...,"{'neg': 0.071, 'neu': 0.782, 'pos': 0.147, 'co...",0.9972,pos
4,neg,my first press screening of 1998 and already i...,"{'neg': 0.091, 'neu': 0.817, 'pos': 0.093, 'co...",-0.2484,neg
5,neg,"to put it bluntly , ed wood would have been pr...","{'neg': 0.123, 'neu': 0.821, 'pos': 0.056, 'co...",-0.9855,neg
6,neg,"synopsis : melissa , a mentally-disturbed woma...","{'neg': 0.087, 'neu': 0.742, 'pos': 0.17, 'com...",0.9871,pos
7,neg,tim robbins and martin lawernce team up in thi...,"{'neg': 0.118, 'neu': 0.709, 'pos': 0.172, 'co...",0.9829,pos
8,neg,"in "" gia "" , angelina jolie plays the titular ...","{'neg': 0.082, 'neu': 0.862, 'pos': 0.056, 'co...",-0.8278,neg
9,neg,"in 1990 , the surprise success an unheralded l...","{'neg': 0.145, 'neu': 0.728, 'pos': 0.127, 'co...",-0.9147,neg


###  Latent Dirichlet Allocation (LDA)

In [53]:
len(movie_df['review'])

1965

In [54]:
cv2 = CountVectorizer(max_df = 0.95,min_df = 2,stop_words='english')
dtm = cv2.fit_transform(movie_df['review'])
dtm

<1965x23298 sparse matrix of type '<class 'numpy.int64'>'
	with 455692 stored elements in Compressed Sparse Row format>

In [55]:
from sklearn.decomposition import LatentDirichletAllocation
lda = LatentDirichletAllocation(n_components=8)

In [56]:
lda.fit(dtm)

### Grab the topics

In [57]:
lda.components_

array([[ 1.15777019, 36.13444957,  0.12501983, ...,  1.1246987 ,
         0.12559395,  0.125     ],
       [ 0.125     , 20.20736859,  0.12501025, ...,  0.125     ,
         1.63105845,  1.12499877],
       [ 0.12507972, 14.20950358, 10.12488618, ...,  0.12506018,
         0.12500097,  0.125     ],
       ...,
       [ 0.1252269 ,  8.96286655,  0.12502338, ...,  0.12541231,
        16.48534484,  0.125     ],
       [ 5.00635763, 16.75544876,  0.125     , ...,  0.125     ,
         0.125     ,  0.125     ],
       [ 0.125     ,  8.12208508,  0.125     , ...,  0.125     ,
         0.125     ,  0.125     ]])

In [58]:
single_topic = lda.components_[0]
toptenwords = single_topic.argsort()[-10:]

In [59]:
for i in toptenwords:
  print(cv2.get_feature_names_out()[i])

alien
big
really
time
bad
good
like
just
film
movie


In [60]:
for i,topic in enumerate(lda.components_):
   print(f'top 10 words for topic {i}')
   print([cv2.get_feature_names_out()[i] for i in topic.argsort()[-10:]])

top 10 words for topic 0
['alien', 'big', 'really', 'time', 'bad', 'good', 'like', 'just', 'film', 'movie']
top 10 words for topic 1
['does', 'action', 'time', 'story', 'character', 'just', 'good', 'like', 'movie', 'film']
top 10 words for topic 2
['characters', 'really', 'films', 'good', 'character', 'time', 'just', 'like', 'movie', 'film']
top 10 words for topic 3
['character', 'make', 'action', 'little', 'time', 'story', 'just', 'like', 'movie', 'film']
top 10 words for topic 4
['people', 'time', 'smith', 'way', 'make', 'characters', 'story', 'like', 'movie', 'film']
top 10 words for topic 5
['love', 'time', 'character', 'just', 'characters', 'story', 'like', 'life', 'movie', 'film']
top 10 words for topic 6
['characters', 'character', 'way', 'story', 'time', 'good', 'just', 'movie', 'like', 'film']
top 10 words for topic 7
['way', 'love', 'life', 'story', 'good', 'just', 'like', 'time', 'movie', 'film']


In [61]:
topic_result = lda.transform(dtm)
topic_result

array([[6.69862145e-04, 6.69493481e-04, 6.69605002e-04, ...,
        6.69546112e-04, 6.69667672e-04, 6.69575011e-04],
       [4.73640580e-01, 4.21595676e-04, 4.21666195e-04, ...,
        3.49584229e-01, 1.74667449e-01, 4.21475954e-04],
       [7.28796311e-02, 1.69348177e-01, 4.08424109e-01, ...,
        3.47673050e-01, 4.18833595e-04, 4.18811358e-04],
       ...,
       [3.26108956e-04, 3.26100171e-04, 5.45116732e-01, ...,
        3.26060835e-04, 3.26072956e-04, 4.52926796e-01],
       [4.35992895e-04, 4.36124951e-04, 4.36058318e-04, ...,
        9.96947718e-01, 4.36035375e-04, 4.36001477e-04],
       [9.97163763e-01, 4.05241808e-04, 4.05241768e-04, ...,
        4.05174414e-04, 4.05211868e-04, 4.05080560e-04]])

In [62]:
movie_df['Topic'] = topic_result.argmax(axis = 1)

In [63]:
movie_df

Unnamed: 0,label,review,scores,compound,compound_score,Topic
0,neg,how do films like mouse hunt get into theatres...,"{'neg': 0.121, 'neu': 0.778, 'pos': 0.101, 'co...",-0.9125,neg,3
1,neg,some talented actresses are blessed with a dem...,"{'neg': 0.12, 'neu': 0.775, 'pos': 0.105, 'com...",-0.8618,neg,0
2,pos,this has been an extraordinary year for austra...,"{'neg': 0.068, 'neu': 0.781, 'pos': 0.15, 'com...",0.9951,pos,2
3,pos,according to hollywood movies made in last few...,"{'neg': 0.071, 'neu': 0.782, 'pos': 0.147, 'co...",0.9972,pos,4
4,neg,my first press screening of 1998 and already i...,"{'neg': 0.091, 'neu': 0.817, 'pos': 0.093, 'co...",-0.2484,neg,1
...,...,...,...,...,...,...
1995,pos,"i like movies with albert brooks , and i reall...","{'neg': 0.073, 'neu': 0.763, 'pos': 0.164, 'co...",0.9991,pos,7
1996,pos,it might surprise some to know that joel and e...,"{'neg': 0.238, 'neu': 0.688, 'pos': 0.074, 'co...",-0.9993,neg,2
1997,pos,the verdict : spine-chilling drama from horror...,"{'neg': 0.15, 'neu': 0.702, 'pos': 0.147, 'com...",-0.5966,neg,2
1998,pos,i want to correct what i wrote in a former ret...,"{'neg': 0.131, 'neu': 0.71, 'pos': 0.16, 'comp...",0.9387,pos,5


In [64]:
def open_file(path):
  with open(path) as f:
    text = f.read()
    return text

In [65]:
# print(open_file('/content/melville-moby_dick.txt'))

In [66]:
nlp2.max_length = 1198623

In [67]:
def tokns(docmnt):
  return [token.text.lower() for token in nlp2(docmnt)]

In [68]:
d = open_file('/content/moby_dick_four_chapters.txt')

In [69]:
tokens = tokns(d)



In [70]:
len(tokens)

14480

In [71]:
train_len = 25+1
text_sequences = []

for i in range(train_len, len(tokens)):
    seq = tokens[ i - train_len : i ]
    text_sequences.append(seq)

In [72]:
# arr = [1,2,3,4,5,6,7,8,9,10,11,12,13,14]
# res = []
# for i in range(len(arr)):
#   seq = arr[ i - 5 : i]
#   res.append(seq)
# res

In [73]:
print(' '.join(text_sequences[0]))

call me ishmael .   some years ago -- never mind how long 
 precisely -- having little or no money in my purse , and


In [74]:
print(' '.join(text_sequences[20]))

money in my purse , and nothing 
 particular to interest me on shore , i thought i would sail about a 
 little and see


In [75]:
from keras.preprocessing.text import Tokenizer
tokenizer = Tokenizer(filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n')

In [76]:
tokenizer.fit_on_texts(text_sequences)

In [77]:
sequence = tokenizer.texts_to_sequences(text_sequences)

In [78]:
# tokenizer.index_word

In [79]:
# tokenizer.word_counts

In [80]:
sequences = np.array(sequence)

In [81]:
vocabulary_size = len(tokenizer.word_counts)
vocabulary_size

2734

In [82]:
# sequence

In [83]:
import keras
from keras.models import Sequential
from keras.layers import Dense,LSTM,Embedding

In [84]:
def create_model(vocabulary_size, seq_len):
    model = Sequential()
    model.add(Embedding(vocabulary_size, 25, input_length=seq_len))
    model.add(LSTM(150, return_sequences=True))
    model.add(LSTM(150))
    model.add(Dense(150, activation='relu'))

    model.add(Dense(vocabulary_size, activation='softmax'))

    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    model.summary()

    return model

In [85]:
from keras.utils import to_categorical

In [86]:
X = sequences[:,:-1]
y = sequences[:,-1]

In [87]:
y = to_categorical(y, num_classes=vocabulary_size+1)

In [88]:
seq_len = X.shape[1]

In [89]:
model = create_model(vocabulary_size+1, seq_len)

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 25, 25)            68375     
                                                                 
 lstm (LSTM)                 (None, 25, 150)           105600    
                                                                 
 lstm_1 (LSTM)               (None, 150)               180600    
                                                                 
 dense (Dense)               (None, 150)               22650     
                                                                 
 dense_1 (Dense)             (None, 2735)              412985    
                                                                 
Total params: 790210 (3.01 MB)
Trainable params: 790210 (3.01 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [90]:
model.fit(X, y, batch_size=128, epochs=160,verbose=1)

Epoch 1/160
Epoch 2/160
Epoch 3/160
Epoch 4/160
Epoch 5/160
Epoch 6/160
Epoch 7/160
Epoch 8/160
Epoch 9/160
Epoch 10/160
Epoch 11/160
Epoch 12/160
Epoch 13/160
Epoch 14/160
Epoch 15/160
Epoch 16/160
Epoch 17/160
Epoch 18/160
Epoch 19/160
Epoch 20/160
Epoch 21/160
Epoch 22/160
Epoch 23/160
Epoch 24/160
Epoch 25/160
Epoch 26/160
Epoch 27/160
Epoch 28/160
Epoch 29/160
Epoch 30/160
Epoch 31/160
Epoch 32/160
Epoch 33/160
Epoch 34/160
Epoch 35/160
Epoch 36/160
Epoch 37/160
Epoch 38/160
Epoch 39/160
Epoch 40/160
Epoch 41/160
Epoch 42/160
Epoch 43/160
Epoch 44/160
Epoch 45/160
Epoch 46/160
Epoch 47/160
Epoch 48/160
Epoch 49/160
Epoch 50/160
Epoch 51/160
Epoch 52/160
Epoch 53/160
Epoch 54/160
Epoch 55/160
Epoch 56/160
Epoch 57/160
Epoch 58/160
Epoch 59/160
Epoch 60/160
Epoch 61/160
Epoch 62/160
Epoch 63/160
Epoch 64/160
Epoch 65/160
Epoch 66/160
Epoch 67/160
Epoch 68/160
Epoch 69/160
Epoch 70/160
Epoch 71/160
Epoch 72/160
Epoch 73/160
Epoch 74/160
Epoch 75/160
Epoch 76/160
Epoch 77/160
Epoch 78

<keras.src.callbacks.History at 0x7a0a8c4f3310>

In [91]:
from pickle import dump,load

In [92]:
model.save('mymodel.h5')
dump(tokenizer, open('mymodel', 'wb'))

  saving_api.save_model(


In [93]:
from random import randint
from pickle import load
from keras.models import load_model
from keras.preprocessing.sequence import pad_sequences

In [94]:
def generated_text(model, tokenizer, seq_len, seed_text, num_gen_words):

    # Final Output
    output_text = []

    # Initial Seed Sequence
    input_text = seed_text

    for i in range(num_gen_words):
        encoded_text = tokenizer.texts_to_sequences([input_text])[0]
        pad_encoded = pad_sequences([encoded_text], maxlen=seq_len, truncating='pre')

        # Get predicted probabilities for the next word
        pred_probabilities = model.predict(pad_encoded, verbose=0)[0]

        # Get the index of the word with the highest probability
        pred_word_ind = np.argmax(pred_probabilities)

        # Retrieve the word corresponding to the index
        pred_word = tokenizer.index_word.get(pred_word_ind)  # Use get method to handle out-of-vocabulary indices

        # Append the predicted word to the input text
        input_text += ' ' + pred_word
        output_text.append(pred_word)

    # Make it look like a sentence
    return ' '.join(output_text)


In [95]:
import random
random_pick = random.randint(0,len(text_sequences))

In [96]:
random_seed_text = text_sequences[random_pick]

In [97]:
seed_text = ' '.join(random_seed_text)
print(seed_text)

  yes , as every one knows , meditation 
 and water are wedded for ever . 

 but here is an artist .   he


In [98]:
generated_text(model,tokenizer,seq_len,seed_text=seed_text,num_gen_words=20)

'deep unusual regarding . . . . . . . . . . . . . . . . .'

In [99]:
from keras.models import load_model

In [100]:
# model = load_model('/content/epochBIG.h5')

In [101]:
# tokenizer = load(open('/content/epochBIG','rb'))

### Beam search

In [102]:
def beam_search(model, tokenizer, seq_len, seed_text, num_gen_words, k=3):
    sequences = [[tokenizer.texts_to_sequences([seed_text])[0], 0.0]]

    while len(sequences[0][0]) < num_gen_words:
        all_candidates = []
        for seq, score in sequences:
            pad_encoded = pad_sequences([seq], maxlen=seq_len, truncating='pre')
            pred_probabilities = model.predict(pad_encoded, verbose=0)[0]
            top_k_preds = np.argsort(pred_probabilities)[-k:]  # Top k predictions

            for pred in top_k_preds:
                candidate_seq = [seq + [pred], score - np.log(pred_probabilities[pred])]
                all_candidates.append(candidate_seq)

        # Sort candidates by score
        sequences = sorted(all_candidates, key=lambda tup: tup[1])[:k]

    # Retrieve the best sequence
    best_sequence = sequences[0][0]
    output_text = [tokenizer.index_word.get(idx, '[UNK]') for idx in best_sequence]

    return ' '.join(output_text)

In [103]:
generated_text = beam_search(model, tokenizer, seq_len, seed_text, num_gen_words=40)
print(generated_text)

yes as every one knows meditation and water are wedded for ever but here is an artist he deep unusual regarding . 

 " landlord , " said i , sir ,   this were 
 storied weapons , and


#chatbots



In [104]:
import pickle
with open('/content/train_qa.txt','rb') as f:
  train_data = pickle.load(f)

In [105]:
with open('/content/test_qa.txt','rb') as f:
  test_data = pickle.load(f)

In [106]:
all_data  = train_data + test_data

In [107]:
' '.join(train_data[0][0])

'Mary moved to the bathroom . Sandra journeyed to the bedroom .'

In [108]:
' '.join(train_data[0][0])

'Mary moved to the bathroom . Sandra journeyed to the bedroom .'

In [109]:
' '.join(train_data[0][2])

'n o'

In [110]:
df  = pd.DataFrame(all_data,columns=['Story','Question','Answer'])
df

Unnamed: 0,Story,Question,Answer
0,"[Mary, moved, to, the, bathroom, ., Sandra, jo...","[Is, Sandra, in, the, hallway, ?]",no
1,"[Mary, moved, to, the, bathroom, ., Sandra, jo...","[Is, Daniel, in, the, bathroom, ?]",no
2,"[Mary, moved, to, the, bathroom, ., Sandra, jo...","[Is, Daniel, in, the, office, ?]",no
3,"[Mary, moved, to, the, bathroom, ., Sandra, jo...","[Is, Daniel, in, the, bedroom, ?]",yes
4,"[Mary, moved, to, the, bathroom, ., Sandra, jo...","[Is, Daniel, in, the, bedroom, ?]",yes
...,...,...,...
10995,"[Mary, moved, to, the, kitchen, ., Mary, trave...","[Is, Mary, in, the, bedroom, ?]",no
10996,"[Mary, moved, to, the, kitchen, ., Mary, trave...","[Is, Sandra, in, the, kitchen, ?]",no
10997,"[Mary, moved, to, the, kitchen, ., Mary, trave...","[Is, Mary, in, the, bedroom, ?]",no
10998,"[Mary, moved, to, the, kitchen, ., Mary, trave...","[Is, Sandra, in, the, garden, ?]",yes


In [111]:
vocab = set()
for story,question,answer in all_data:
  vocab = vocab.union(set(story))
  vocab = vocab.union(set(question))


In [112]:
vocab.add('yes')
vocab.add('no')
vocab

{'.',
 '?',
 'Daniel',
 'Is',
 'John',
 'Mary',
 'Sandra',
 'apple',
 'back',
 'bathroom',
 'bedroom',
 'discarded',
 'down',
 'dropped',
 'football',
 'garden',
 'got',
 'grabbed',
 'hallway',
 'in',
 'journeyed',
 'kitchen',
 'left',
 'milk',
 'moved',
 'no',
 'office',
 'picked',
 'put',
 'the',
 'there',
 'to',
 'took',
 'travelled',
 'up',
 'went',
 'yes'}

In [113]:
vocab_len = len(vocab) + 1
# for i in range(len(all_data)):
#   for data in all_data[i]:
#     print(' '.join(data))

In [114]:
stories_len = [len(data[0]) for data in all_data]
question_len = [len(data[1]) for data in all_data]

In [115]:
maxlen_stories = max(stories_len)
maxlen_stories

156

In [116]:
maxlen_question = max(question_len)
maxlen_question

6

In [117]:
tokenizer = Tokenizer(filters = [])

In [118]:
tokenizer.fit_on_texts(vocab)

In [119]:
tokenizer.word_index['yes']

12

In [120]:
tokenizer.word_index['no']

29

In [121]:
tokenizer.word_index

{'travelled': 1,
 'apple': 2,
 '?': 3,
 'put': 4,
 'moved': 5,
 'discarded': 6,
 'office': 7,
 'john': 8,
 'bathroom': 9,
 'there': 10,
 'to': 11,
 'yes': 12,
 'left': 13,
 'journeyed': 14,
 'is': 15,
 '.': 16,
 'down': 17,
 'football': 18,
 'milk': 19,
 'garden': 20,
 'took': 21,
 'got': 22,
 'mary': 23,
 'hallway': 24,
 'went': 25,
 'in': 26,
 'kitchen': 27,
 'grabbed': 28,
 'no': 29,
 'dropped': 30,
 'picked': 31,
 'daniel': 32,
 'sandra': 33,
 'back': 34,
 'up': 35,
 'the': 36,
 'bedroom': 37}

In [122]:
train_story_text = []
train_question_text = []
train_answer_text = []
for s,q,a in train_data:
  train_story_text.append(s)
  train_question_text.append(q)
  train_answer_text.append(q)

In [123]:
# train_story_text

In [124]:
train_story_seq = tokenizer.texts_to_sequences(train_story_text)
# train_story_seq = np.array(train_story_seq)
# train_story_seq

In [125]:
def vectorize_stories(data, word_index=tokenizer.word_index, max_story_len=maxlen_stories,max_question_len=maxlen_question):
    '''
    OUTPUT:
    Vectorizes the stories,questions, and answers into padded sequences. We first loop for every story, query , and
    answer in the data. Then we convert the raw words to an word index value. Then we append each set to their appropriate
    output list. Then once we have converted the words to numbers, we pad the sequences so they are all of equal length.
    Returns this in the form of a tuple (X,Xq,Y) (padded based on max lengths)
    '''
    # X = STORIES
    X = []
    # Xq = QUERY/QUESTION
    Xq = []
    # Y = CORRECT ANSWER
    Y = []


    for story, query, answer in data:

        # Grab the word index for every word in story
        x = [word_index[word.lower()] for word in story]
        xq = [word_index[word.lower()] for word in query]
        y = np.zeros(len(word_index) + 1)
        y[word_index[answer]] = 1

        X.append(x)
        Xq.append(xq)
        Y.append(y)

    return (pad_sequences(X, maxlen=max_story_len),pad_sequences(Xq, maxlen=max_question_len), np.array(Y))

In [126]:
inputs_train, queries_train, answers_train = vectorize_stories(train_data)
inputs_test, queries_test, answers_test = vectorize_stories(test_data)

In [127]:
import keras.layers
from keras.models import Sequential, Model
from keras.layers import Embedding,Input, Activation, Dense, Permute, Dropout,add, dot, concatenate,LSTM

In [131]:
input_sequence = Input((maxlen_stories,))
question_sequence = Input((maxlen_question,))
question_sequence

<KerasTensor: shape=(None, 6) dtype=float32 (created by layer 'input_6')>

### input encoder M

In [135]:
# Input gets embedded to a sequence of vectors
input_encoder_m = Sequential()
input_encoder_m.add(Embedding(input_dim=vocab_len,output_dim=64))
input_encoder_m.add(Dropout(0.5))

# This encoder will output:
# (samples, story_maxlen, embedding_dim)

### input encoder C

In [137]:
# embed the input into a sequence of vectors of size query_maxlen
input_encoder_c = Sequential()
input_encoder_c.add(Embedding(input_dim=vocab_len,output_dim=maxlen_question))
input_encoder_c.add(Dropout(0.5))
# output: (samples, story_maxlen, query_maxlen)

### question encoder


In [139]:
# embed the question into a sequence of vectors
question_encoder = Sequential()
question_encoder.add(Embedding(input_dim=vocab_len,
                               output_dim=64,
                               input_length=maxlen_question))
question_encoder.add(Dropout(0.5))
# output: (samples, query_maxlen, embedding_dim)

In [140]:
# encode input sequence and questions (which are indices)
# to sequences of dense vectors
input_encoded_m = input_encoder_m(input_sequence)
input_encoded_c = input_encoder_c(input_sequence)
question_encoded = question_encoder(question_sequence)

In [141]:
match = dot([input_encoded_m, question_encoded], axes=(2, 2))
match = Activation('softmax')(match)

In [142]:
# add the match matrix with the second input vector sequence
response = add([match, input_encoded_c])  # (samples, story_maxlen, query_maxlen)
response = Permute((2, 1))(response)  # (samples, query_maxlen, story_maxlen)

In [146]:
# concatenate the match matrix with the question vector sequence
answer = concatenate([response, question_encoded])
answer

<KerasTensor: shape=(None, 6, 220) dtype=float32 (created by layer 'concatenate_1')>

In [147]:
# Reduce with RNN (LSTM)
answer = LSTM(32)(answer)  # (samples, 32)
# Regularization with Dropout
answer = Dropout(0.5)(answer)
answer = Dense(vocab_len)(answer)  # (samples, vocab_size)
# we output a probability distribution over the vocabulary
answer = Activation('softmax')(answer)

In [149]:
model = Model([input_sequence, question_sequence], answer)
model.compile(optimizer='rmsprop', loss='categorical_crossentropy',
              metrics=['accuracy'])

In [150]:
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_5 (InputLayer)        [(None, 156)]                0         []                            
                                                                                                  
 input_6 (InputLayer)        [(None, 6)]                  0         []                            
                                                                                                  
 sequential_4 (Sequential)   (None, None, 64)             2432      ['input_5[0][0]']             
                                                                                                  
 sequential_8 (Sequential)   (None, 6, 64)                2432      ['input_6[0][0]']             
                                                                                            

In [151]:
# train
history = model.fit([inputs_train, queries_train], answers_train,batch_size=32,epochs=100,validation_data=([inputs_test, queries_test], answers_test))

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [152]:
filename = 'chatbot.h5'
model.save(filename)

  saving_api.save_model(


In [153]:
model.load_weights(filename)
pred_results = model.predict(([inputs_test, queries_test]))



In [156]:
story =' '.join(word for word in test_data[1][0])
print(story)

Mary got the milk there . John moved to the bedroom . Mary discarded the milk . John went to the garden .


In [157]:
query = ' '.join(word for word in test_data[1][1])
print(query)

Is John in the kitchen ?


In [158]:
print("Answer is:",test_data[1][2])

Answer is: no


In [159]:
#Generate prediction from model
val_max = np.argmax(pred_results[0])

for key, val in tokenizer.word_index.items():
    if val == val_max:
        k = key

print("Predicted answer is: ", k)
print("Probability of certainty was: ", pred_results[0][val_max])

Predicted answer is:  no
Probability of certainty was:  0.9860724


In [160]:
# Note the whitespace of the periods
my_story = "John left the kitchen . Sandra dropped the football in the garden ."
my_story.split()

['John',
 'left',
 'the',
 'kitchen',
 '.',
 'Sandra',
 'dropped',
 'the',
 'football',
 'in',
 'the',
 'garden',
 '.']

In [162]:
my_question = "Is the football in the garden ?"
my_question.split()

['Is', 'the', 'football', 'in', 'the', 'garden', '?']

In [163]:
mydata = [(my_story.split(),my_question.split(),'yes')]

In [164]:
my_story,my_ques,my_ans = vectorize_stories(mydata)

In [166]:
pred_results = model.predict(([ my_story, my_ques]))
pred_results



array([[3.0167868e-08, 3.2038216e-08, 2.6341205e-08, 2.5152840e-08,
        3.0435984e-08, 3.5684948e-08, 2.9524221e-08, 2.4299165e-08,
        3.3395718e-08, 2.4177348e-08, 2.7120091e-08, 2.4886557e-08,
        9.2139941e-01, 2.9354421e-08, 3.1034265e-08, 2.7397046e-08,
        3.3228297e-08, 2.8366317e-08, 2.8547330e-08, 2.2071832e-08,
        3.1515892e-08, 2.8837432e-08, 2.6241411e-08, 2.7352248e-08,
        2.2664567e-08, 3.1332306e-08, 2.6604470e-08, 3.1240170e-08,
        2.8283443e-08, 7.8599535e-02, 2.6137410e-08, 3.4757790e-08,
        4.1513395e-08, 3.2912972e-08, 3.0688390e-08, 2.8270231e-08,
        3.5254814e-08, 2.9769804e-08]], dtype=float32)

In [167]:
#Generate prediction from model
val_max = np.argmax(pred_results[0])

for key, val in tokenizer.word_index.items():
    if val == val_max:
        k = key

print("Predicted answer is: ", k)
print("Probability of certainty was: ", pred_results[0][val_max])

Predicted answer is:  yes
Probability of certainty was:  0.9213994
