In [2]:
import numpy as np
import matplotlib.pyplot as plt
import spacy
from spacy import displacy
import nltk
from nltk.tokenize import word_tokenize
import pandas as pd

In [None]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [3]:
nlp = spacy.load('en_core_web_sm')

In [None]:
doc = nlp("As the Spring comes there is a variety of Flower in the trees and lots of bees")
tokens = [token.text for token in doc]
print(tokens)

['As', 'the', 'Spring', 'comes', 'there', 'is', 'a', 'variety', 'of', 'Flower', 'in', 'the', 'trees', 'and', 'lots', 'of', 'bees']


In [None]:
# nlp.pipeline
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [None]:
text = "The sky is Cloudy,it could be rain today Right?"
text = text.lower()
tokens = word_tokenize(text)
print(tokens)

['the', 'sky', 'is', 'cloudy', ',', 'it', 'could', 'be', 'rain', 'today', 'right', '?']


In [None]:
doc = nlp(u'it is a $10 doller gift card from Google.you can get it from "www.fgm.com" website! like seriously!!!')

In [None]:
pd.DataFrame(nlp.Defaults.stop_words)

Unnamed: 0,0
0,via
1,several
2,some
3,whereby
4,at
...,...
321,somewhere
322,whereafter
323,back
324,moreover


###stop word removal

In [None]:
doc = nlp("Once upon a time there was a fox.He was very cleaver,one day he was walking in the jungle when he fell into a trap and loose his tail.so he become very upset!")
filtered_tokens = [token.text for token in doc if not token.is_stop]
print(filtered_tokens)

['time', 'fox', '.', 'cleaver', ',', 'day', 'walking', 'jungle', 'fell', 'trap', 'loose', 'tail.so', 'upset', '!']


###Stemming and Lemmatization:

In [None]:
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()
word = "running"
stemmed_word = stemmer.stem(word)
print(stemmed_word)

run


In [None]:
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()
word = "running"
lemmatized_word = lemmatizer.lemmatize(word)
print(lemmatized_word)

[nltk_data] Downloading package wordnet to /root/nltk_data...


running


###Named entity recogantion(NER)

In [None]:
nlp = spacy.load("en_core_web_sm")
doc = nlp("Apple is headquartered in Cupertino, California.")
ner_tags = [(ent.text, ent.label_) for ent in doc.ents]
print(ner_tags)

[('Apple', 'ORG'), ('Cupertino', 'GPE'), ('California', 'GPE')]


###Bag of Words (BoW) Representation

In [1]:
from sklearn.feature_extraction.text import CountVectorizer
corpus = ["The cat sat on the mat.", "The dog played in the garden."]
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)
print(vectorizer.get_feature_names_out())
print(X.toarray())

['cat' 'dog' 'garden' 'in' 'mat' 'on' 'played' 'sat' 'the']
[[1 0 0 0 1 1 0 1 2]
 [0 1 1 1 0 0 1 0 2]]


###TF-IDF (Term Frequency-Inverse Document Frequency)

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
corpus = ["The cat sat on the mat.", "The dog played in the garden."]
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus)
print(vectorizer.get_feature_names_out())
print(X.toarray())

['cat' 'dog' 'garden' 'in' 'mat' 'on' 'played' 'sat' 'the']
[[0.40740124 0.         0.         0.         0.40740124 0.40740124
  0.         0.40740124 0.57973867]
 [0.         0.40740124 0.40740124 0.40740124 0.         0.
  0.40740124 0.         0.57973867]]


###Text Classification

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
X_train = ["I love this movie.", "This movie is terrible.","This movie is fantastic.","This movie is good.","This movie is worse."]
y_train = ["positive", "negative","positive","positive","negative"]
X_test = ["This movie is great."]
model = make_pipeline(TfidfVectorizer(), SVC())
model.fit(X_train, y_train)
predictions = model.predict(X_test)
print(predictions)

['positive']


###Sentiment Analysis

In [15]:
from textblob import TextBlob
text = "titanic is the best romantic movie ever"
blob = TextBlob(text)
sentiment = blob.sentiment
print(sentiment)

Sentiment(polarity=0.5, subjectivity=0.4)


###Regular expression

In [None]:
import re
txt = 'the phone number is 458-566-1202 ,and our email is example@gmail.com.'

In [None]:
'phone' in txt

True

In [None]:
patrn = r'\d\d\d-\d\d\d-\d\d\d\d'
re.search(patrn,txt)

<re.Match object; span=(20, 32), match='458-566-1202'>

In [None]:
re.findall(r'.at','this is a flat,there is a cat wearing a')

['lat', 'cat']

In [None]:
pattern = re.compile(r'm.n')
result = pattern.findall('men can do everything if the man has patient')
result

['men', 'man']

In [None]:
email_pattern = re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b')
email_text = "Contact support@example.com for assistance or info@company.co.uk for more details."
emails = email_pattern.findall(email_text)
emails

['support@example.com', 'info@company.co.uk']

In [None]:
#removing numbers and punctuation from the sentence
' ' .join(re.findall(r'[^!?.,\d]+','there is 3 numbers, in  1 lines out of 10 sentence!right?'))

'there is   numbers  in    lines out of   sentence right'

In [None]:
for t in doc:
  print(t,t.pos_,t.lemma_)


it PRON it
is AUX be
a DET a
$ SYM $
10 NUM 10
doller NOUN doller
gift NOUN gift
card NOUN card
from ADP from
Google.you PROPN Google.you
can AUX can
get VERB get
it PRON it
from ADP from
" PUNCT "
www.fgm.com PROPN www.fgm.com
" PUNCT "
website NOUN website
! PUNCT !
like VERB like
seriously ADV seriously
! PUNCT !
! PUNCT !
! PUNCT !


In [None]:
displacy.render(doc,style = 'dep',jupyter = True,options={'distance':80})

### Stemming

In [None]:
pd.DataFrame(nlp.Defaults.stop_words)

Unnamed: 0,0
0,throughout
1,'ll
2,again
3,another
4,rather
...,...
321,seemed
322,under
323,whenever
324,various


In [None]:
df = pd.read_csv('/content/smsspamcollection.tsv',sep='\t')
df.head(10)

Unnamed: 0,label,message,length,punct
0,ham,"Go until jurong point, crazy.. Available only ...",111,9
1,ham,Ok lar... Joking wif u oni...,29,6
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,155,6
3,ham,U dun say so early hor... U c already then say...,49,6
4,ham,"Nah I don't think he goes to usf, he lives aro...",61,2
5,spam,FreeMsg Hey there darling it's been 3 week's n...,147,8
6,ham,Even my brother is not like to speak with me. ...,77,2
7,ham,As per your request 'Melle Melle (Oru Minnamin...,160,6
8,spam,WINNER!! As a valued network customer you have...,157,6
9,spam,Had your mobile 11 months or more? U R entitle...,154,2


In [None]:
from sklearn.model_selection import train_test_split

In [None]:
X = df['message']
y = df['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
vectorizer = CountVectorizer()

X_vect = vectorizer.fit_transform(X_train)

# feature names (words)
feature_names = vectorizer.get_feature_names_out()

#dense matrix
dense_matrix = X_vect.toarray()

print("Feature Names (Words):", pd.DataFrame(feature_names))
print("Feature Matrix:\n", dense_matrix)

Feature Names (Words):               0
0            00
1           000
2        000pes
3          0089
4          0121
...         ...
7077        zoe
7078  zogtorius
7079       zoom
7080       zouk
7081         èn

[7082 rows x 1 columns]
Feature Matrix:
 [[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [None]:
tfidf = TfidfVectorizer()


# Fit and transform the text data
X_tfidf = tfidf.fit_transform(X_train)

# Get feature names (words)
feature_names_tfidf = tfidf.get_feature_names_out()

# Convert to dense matrix (for demonstration purposes)
dense_matrix_tfidf = X_tfidf.toarray()

print("Feature Names (Words):", feature_names_tfidf)
print("TF-IDF Feature Matrix:\n", dense_matrix_tfidf)


Feature Names (Words): ['00' '000' '000pes' ... 'zoom' 'zouk' 'èn']
TF-IDF Feature Matrix:
 [[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [None]:
movie_df = pd.read_csv('/content/moviereviews.tsv',sep='\t')
movie_df

Unnamed: 0,label,review
0,neg,how do films like mouse hunt get into theatres...
1,neg,some talented actresses are blessed with a dem...
2,pos,this has been an extraordinary year for austra...
3,pos,according to hollywood movies made in last few...
4,neg,my first press screening of 1998 and already i...
...,...,...
1995,pos,"i like movies with albert brooks , and i reall..."
1996,pos,it might surprise some to know that joel and e...
1997,pos,the verdict : spine-chilling drama from horror...
1998,pos,i want to correct what i wrote in a former ret...


In [None]:
movie_df.isnull().sum()

label      0
review    35
dtype: int64

In [None]:
movie_df.dropna(inplace = True)

In [None]:
X = movie_df['review']
y = movie_df['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC

# Naïve Bayes:
text_clf_nb = Pipeline([('tfidf', TfidfVectorizer()),
                     ('clf', MultinomialNB()),
])

# Linear SVC:
text_clf_lsvc = Pipeline([('tfidf', TfidfVectorizer()),
                     ('clf', LinearSVC()),
])

In [None]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'tfidf__max_features': [1000, 5000],
    'tfidf__ngram_range': [(1, 1), (1, 2)],
    'clf__C': [0.1, 1.0, 10.0],
    'clf__penalty': ['l1', 'l2'],
}

# Grid Search
grid_search = GridSearchCV(text_clf_lsvc, param_grid, cv=5)
grid_search.fit(X_train, y_train)

#best parameters and best score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

#best model for prediction
best_model = grid_search.best_estimator_
predictions = best_model.predict(X_test)

60 fits failed out of a total of 120.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
60 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/pipeline.py", line 405, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/svm/_classes.py", line 274, in fit
    self.coef_, self.intercept_, n_iter_ = _fit_liblinear(
  File "/usr/local/lib/python3.10/dist-packages/sklearn/svm/_base.py", line 1223, in _fit_liblinear
    solver_type = _g

In [None]:
print(best_model)
print(predictions)

Pipeline(steps=[('tfidf',
                 TfidfVectorizer(max_features=5000, ngram_range=(1, 2))),
                ('clf', LinearSVC())])
['pos' 'neg' 'neg' 'neg' 'neg' 'pos' 'neg' 'pos' 'neg' 'pos' 'neg' 'pos'
 'neg' 'pos' 'neg' 'pos' 'neg' 'neg' 'neg' 'neg' 'neg' 'neg' 'pos' 'neg'
 'neg' 'neg' 'pos' 'neg' 'neg' 'pos' 'neg' 'pos' 'neg' 'neg' 'neg' 'neg'
 'pos' 'pos' 'neg' 'neg' 'neg' 'pos' 'neg' 'neg' 'neg' 'neg' 'neg' 'pos'
 'pos' 'pos' 'neg' 'neg' 'neg' 'pos' 'neg' 'neg' 'neg' 'neg' 'neg' 'neg'
 'neg' 'pos' 'neg' 'pos' 'pos' 'neg' 'neg' 'neg' 'neg' 'pos' 'neg' 'neg'
 'neg' 'pos' 'pos' 'neg' 'neg' 'pos' 'neg' 'pos' 'neg' 'neg' 'pos' 'neg'
 'pos' 'neg' 'neg' 'pos' 'pos' 'neg' 'pos' 'neg' 'neg' 'pos' 'neg' 'neg'
 'pos' 'neg' 'neg' 'pos' 'neg' 'neg' 'neg' 'pos' 'neg' 'neg' 'pos' 'neg'
 'pos' 'pos' 'pos' 'pos' 'neg' 'neg' 'neg' 'neg' 'pos' 'neg' 'pos' 'neg'
 'neg' 'neg' 'neg' 'neg' 'pos' 'pos' 'neg' 'pos' 'neg' 'pos' 'pos' 'pos'
 'pos' 'pos' 'neg' 'pos' 'pos' 'neg' 'pos' 'pos' 'neg' 'ne

In [None]:
from sklearn import metrics
print(metrics.confusion_matrix(y_test,predictions))

[[274  48]
 [ 51 276]]


In [None]:
print(metrics.classification_report(y_test,predictions))

              precision    recall  f1-score   support

         neg       0.84      0.85      0.85       322
         pos       0.85      0.84      0.85       327

    accuracy                           0.85       649
   macro avg       0.85      0.85      0.85       649
weighted avg       0.85      0.85      0.85       649



In [None]:
print(metrics.accuracy_score(y_test,predictions))

0.847457627118644


In [None]:
text_clf_lsvc.fit(X_train, y_train)

In [None]:
predictions = text_clf_lsvc.predict(X_test)

In [None]:
print(metrics.confusion_matrix(y_test,predictions))

[[281  41]
 [ 56 271]]


In [None]:
print(metrics.accuracy_score(y_test,predictions))

0.8505392912172574


In [None]:
stopwords = ['a', 'about', 'an', 'and', 'are', 'as', 'at', 'be', 'been', 'but', 'by', 'can', \
             'even', 'ever', 'for', 'from', 'get', 'had', 'has', 'have', 'he', 'her', 'hers', 'his', \
             'how', 'i', 'if', 'in', 'into', 'is', 'it', 'its', 'just', 'me', 'my', 'of', 'on', 'or', \
             'see', 'seen', 'she', 'so', 'than', 'that', 'the', 'their', 'there', 'they', 'this', \
             'to', 'was', 'we', 'were', 'what', 'when', 'which', 'who', 'will', 'with', 'you']

### adding stop words to imporve calssification

In [None]:
text_clf_lsvc2 = Pipeline([('tfidf', TfidfVectorizer(stop_words=stopwords)),
                     ('clf', LinearSVC()),
])
grid_search2 = GridSearchCV(text_clf_lsvc2, param_grid, cv=5)
grid_search2.fit(X_train, y_train)

60 fits failed out of a total of 120.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
60 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/pipeline.py", line 405, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/svm/_classes.py", line 274, in fit
    self.coef_, self.intercept_, n_iter_ = _fit_liblinear(
  File "/usr/local/lib/python3.10/dist-packages/sklearn/svm/_base.py", line 1223, in _fit_liblinear
    solver_type = _g

In [None]:
best_model2 = grid_search2.best_estimator_
predictions = best_model2.predict(X_test)
print(metrics.confusion_matrix(y_test,predictions))

[[269  53]
 [ 50 277]]


In [None]:
print(metrics.classification_report(y_test,predictions))

              precision    recall  f1-score   support

         neg       0.84      0.84      0.84       322
         pos       0.84      0.85      0.84       327

    accuracy                           0.84       649
   macro avg       0.84      0.84      0.84       649
weighted avg       0.84      0.84      0.84       649



In [None]:
print(metrics.accuracy_score(y_test,predictions))

0.8412942989214176


In [None]:
myreview1 = "A movie I really wanted to love was terrible. \
I'm sure the producers had the best intentions, but the execution was lacking."

In [None]:
myreview2 = "as a thriller movie fan i was looking for a movie that is full of thrill and this movie \
did not dissapoint me. though i thought it won't meet my expection but i was wrong."

In [None]:
myreview3 = "Marvel's 'Avengers: Endgame' is a breathtaking cinematic masterpiece that beautifully wraps up over a decade of storytelling. \
 The movie impressively weaves together multiple story arcs, delivering an emotional and action-packed experience. \
  With stunning visual effects, compelling character development, and an epic conclusion,\
   it's a must-watch for every fan of the Marvel Cinematic Universe."

In [None]:
print(best_model.predict([myreview1]))
print(best_model2.predict([myreview2]))
print(best_model.predict([myreview3]))

['neg']
['neg']
['pos']


In [None]:
### Using lemmatization for performence improvement

In [None]:
import spacy

# Load Spacy's English model
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

# Lemmatization and stopwords
def spacy_lemmatize(text):
    return ' '.join([token.lemma_ for token in nlp(text) if not token.is_stop])

# Applying Lemmatization and custom stopwords
X_train_lemmatized = [spacy_lemmatize(text) for text in X_train]
X_test_lemmatized = [spacy_lemmatize(text) for text in X_test]

# Updating the model pipeline with Lemmatized text
text_clf_lsvc3 = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', LinearSVC())
])

text_clf_lsvc3.fit(X_train_lemmatized, y_train)

predictions = text_clf_lsvc3.predict(X_test_lemmatized)

print(metrics.confusion_matrix(y_test, predictions))
print(metrics.classification_report(y_test, predictions))
print(metrics.accuracy_score(y_test, predictions))


[[264  58]
 [ 55 272]]
              precision    recall  f1-score   support

         neg       0.83      0.82      0.82       322
         pos       0.82      0.83      0.83       327

    accuracy                           0.83       649
   macro avg       0.83      0.83      0.83       649
weighted avg       0.83      0.83      0.83       649

0.8258859784283513


In [None]:
print(text_clf_lsvc3.predict([myreview1]))
print(text_clf_lsvc3.predict([myreview2])) #it should be posative
print(text_clf_lsvc3.predict([myreview3]))

['neg']
['neg']
['pos']


### semantic and sentiment analysis

In [None]:
!python -m spacy download en_core_web_lg

2024-01-19 05:16:58.832917: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-01-19 05:16:58.832976: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-01-19 05:16:58.834259: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
Collecting en-core-web-lg==3.6.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.6.0/en_core_web_lg-3.6.0-py3-none-any.whl (587.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m587.7/587.7 MB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: en-core-web-lg
Successfully install

In [None]:
nlp2 = spacy.load('en_core_web_lg',disable=['parser','tagger','ner'])

In [None]:
# nlp2(u'love').vector

In [None]:
tokens = nlp2(u'lion cat pet')
for i in tokens:
  for j in tokens:
    print(i.text,j.text,i.similarity(j))

lion lion 1.0
lion cat 0.3854507803916931
lion pet 0.20031584799289703
cat lion 0.3854507803916931
cat cat 1.0
cat pet 0.732966423034668
pet lion 0.20031584799289703
pet cat 0.732966423034668
pet pet 1.0




In [None]:
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


True

In [None]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [None]:
sid = SentimentIntensityAnalyzer()

In [None]:
a = 'this is a fucking good movie'
sid.polarity_scores(a)

{'neg': 0.0, 'neu': 0.556, 'pos': 0.444, 'compound': 0.4927}

In [None]:
sid.polarity_scores(myreview2)

{'neg': 0.119, 'neu': 0.748, 'pos': 0.132, 'compound': -0.3716}

In [None]:
sid.polarity_scores(movie_df.iloc[202]['review'])

{'neg': 0.098, 'neu': 0.752, 'pos': 0.15, 'compound': 0.9892}

In [None]:
movie_df['scores'] = movie_df['review'].apply(lambda r : sid.polarity_scores(r))
movie_df.head(10)

Unnamed: 0,label,review,scores
0,neg,how do films like mouse hunt get into theatres...,"{'neg': 0.121, 'neu': 0.778, 'pos': 0.101, 'co..."
1,neg,some talented actresses are blessed with a dem...,"{'neg': 0.12, 'neu': 0.775, 'pos': 0.105, 'com..."
2,pos,this has been an extraordinary year for austra...,"{'neg': 0.068, 'neu': 0.781, 'pos': 0.15, 'com..."
3,pos,according to hollywood movies made in last few...,"{'neg': 0.071, 'neu': 0.782, 'pos': 0.147, 'co..."
4,neg,my first press screening of 1998 and already i...,"{'neg': 0.091, 'neu': 0.817, 'pos': 0.093, 'co..."
5,neg,"to put it bluntly , ed wood would have been pr...","{'neg': 0.123, 'neu': 0.821, 'pos': 0.056, 'co..."
6,neg,"synopsis : melissa , a mentally-disturbed woma...","{'neg': 0.087, 'neu': 0.742, 'pos': 0.17, 'com..."
7,neg,tim robbins and martin lawernce team up in thi...,"{'neg': 0.118, 'neu': 0.709, 'pos': 0.172, 'co..."
8,neg,"in "" gia "" , angelina jolie plays the titular ...","{'neg': 0.082, 'neu': 0.862, 'pos': 0.056, 'co..."
9,neg,"in 1990 , the surprise success an unheralded l...","{'neg': 0.145, 'neu': 0.728, 'pos': 0.127, 'co..."


In [None]:
movie_df['compound'] = movie_df['scores'].apply(lambda d : d['compound'])
movie_df.head(10)

Unnamed: 0,label,review,scores,compound
0,neg,how do films like mouse hunt get into theatres...,"{'neg': 0.121, 'neu': 0.778, 'pos': 0.101, 'co...",-0.9125
1,neg,some talented actresses are blessed with a dem...,"{'neg': 0.12, 'neu': 0.775, 'pos': 0.105, 'com...",-0.8618
2,pos,this has been an extraordinary year for austra...,"{'neg': 0.068, 'neu': 0.781, 'pos': 0.15, 'com...",0.9951
3,pos,according to hollywood movies made in last few...,"{'neg': 0.071, 'neu': 0.782, 'pos': 0.147, 'co...",0.9972
4,neg,my first press screening of 1998 and already i...,"{'neg': 0.091, 'neu': 0.817, 'pos': 0.093, 'co...",-0.2484
5,neg,"to put it bluntly , ed wood would have been pr...","{'neg': 0.123, 'neu': 0.821, 'pos': 0.056, 'co...",-0.9855
6,neg,"synopsis : melissa , a mentally-disturbed woma...","{'neg': 0.087, 'neu': 0.742, 'pos': 0.17, 'com...",0.9871
7,neg,tim robbins and martin lawernce team up in thi...,"{'neg': 0.118, 'neu': 0.709, 'pos': 0.172, 'co...",0.9829
8,neg,"in "" gia "" , angelina jolie plays the titular ...","{'neg': 0.082, 'neu': 0.862, 'pos': 0.056, 'co...",-0.8278
9,neg,"in 1990 , the surprise success an unheralded l...","{'neg': 0.145, 'neu': 0.728, 'pos': 0.127, 'co...",-0.9147


In [None]:
movie_df['compound_score'] = movie_df['compound'].apply(lambda s : 'pos' if s > 0 else 'neg' )
movie_df.head(10)

Unnamed: 0,label,review,scores,compound,compound_score
0,neg,how do films like mouse hunt get into theatres...,"{'neg': 0.121, 'neu': 0.778, 'pos': 0.101, 'co...",-0.9125,neg
1,neg,some talented actresses are blessed with a dem...,"{'neg': 0.12, 'neu': 0.775, 'pos': 0.105, 'com...",-0.8618,neg
2,pos,this has been an extraordinary year for austra...,"{'neg': 0.068, 'neu': 0.781, 'pos': 0.15, 'com...",0.9951,pos
3,pos,according to hollywood movies made in last few...,"{'neg': 0.071, 'neu': 0.782, 'pos': 0.147, 'co...",0.9972,pos
4,neg,my first press screening of 1998 and already i...,"{'neg': 0.091, 'neu': 0.817, 'pos': 0.093, 'co...",-0.2484,neg
5,neg,"to put it bluntly , ed wood would have been pr...","{'neg': 0.123, 'neu': 0.821, 'pos': 0.056, 'co...",-0.9855,neg
6,neg,"synopsis : melissa , a mentally-disturbed woma...","{'neg': 0.087, 'neu': 0.742, 'pos': 0.17, 'com...",0.9871,pos
7,neg,tim robbins and martin lawernce team up in thi...,"{'neg': 0.118, 'neu': 0.709, 'pos': 0.172, 'co...",0.9829,pos
8,neg,"in "" gia "" , angelina jolie plays the titular ...","{'neg': 0.082, 'neu': 0.862, 'pos': 0.056, 'co...",-0.8278,neg
9,neg,"in 1990 , the surprise success an unheralded l...","{'neg': 0.145, 'neu': 0.728, 'pos': 0.127, 'co...",-0.9147,neg


###  Latent Dirichlet Allocation (LDA)

In [None]:
len(movie_df['review'])

1965

In [None]:
cv2 = CountVectorizer(max_df = 0.95,min_df = 2,stop_words='english')
dtm = cv2.fit_transform(movie_df['review'])
dtm

<1965x23298 sparse matrix of type '<class 'numpy.int64'>'
	with 455692 stored elements in Compressed Sparse Row format>

In [None]:
from sklearn.decomposition import LatentDirichletAllocation
lda = LatentDirichletAllocation(n_components=8)

In [None]:
lda.fit(dtm)

### Grab the topics

In [None]:
lda.components_

array([[ 1.24448984, 10.14522402,  0.12510645, ...,  0.125     ,
         3.58156273,  0.125     ],
       [ 0.12504038, 14.10591513,  0.125     , ...,  0.125     ,
         0.12504787,  0.125     ],
       [ 0.53891449, 30.0612454 ,  0.125034  , ...,  0.125     ,
         2.01305555,  0.12519916],
       ...,
       [ 0.125     ,  1.6830778 ,  0.12523659, ...,  0.12517741,
        13.78020235,  1.12480177],
       [ 0.16295909, 17.65983534,  4.43725228, ...,  3.12476278,
         0.12501823,  1.12499906],
       [ 0.12502772,  3.69790851,  2.89658755, ...,  0.12504099,
         0.125     ,  0.125     ]])

In [None]:
single_topic = lda.components_[0]
toptenwords = single_topic.argsort()[-10:]

In [None]:
for i in toptenwords:
  print(cv2.get_feature_names_out()[i])

star
jackie
films
story
just
time
good
like
movie
film


In [None]:
for i,topic in enumerate(lda.components_):
   print(f'top 10 words for topic {i}')
   print([cv2.get_feature_names_out()[i] for i in topic.argsort()[-10:]])

top 10 words for topic 0
['star', 'jackie', 'films', 'story', 'just', 'time', 'good', 'like', 'movie', 'film']
top 10 words for topic 1
['character', 'little', 'way', 'time', 'good', 'make', 'just', 'like', 'movie', 'film']
top 10 words for topic 2
['characters', 'man', 'life', 'character', 'time', 'story', 'just', 'movie', 'like', 'film']
top 10 words for topic 3
['plot', 'really', 'character', 'time', 'bad', 'good', 'like', 'just', 'film', 'movie']
top 10 words for topic 4
['really', 'characters', 'time', 'action', 'good', 'just', 'story', 'like', 'movie', 'film']
top 10 words for topic 5
['best', 'characters', 'movies', 'character', 'good', 'just', 'time', 'like', 'film', 'movie']
top 10 words for topic 6
['world', 'people', 'character', 'good', 'just', 'time', 'life', 'movie', 'like', 'film']
top 10 words for topic 7
['movies', 'wars', 'films', 'characters', 'star', 'time', 'scream', 'like', 'movie', 'film']


In [None]:
topic_result = lda.transform(dtm)
topic_result

array([[6.69269818e-04, 3.62629237e-01, 6.69413379e-04, ...,
        6.69657320e-04, 6.69502291e-04, 6.69305307e-04],
       [4.21574986e-04, 4.21633159e-04, 4.21548775e-04, ...,
        5.05730294e-02, 3.16984006e-01, 4.21502767e-04],
       [4.18911508e-04, 4.18821489e-04, 4.18817234e-04, ...,
        4.18723148e-04, 9.97068538e-01, 4.18718260e-04],
       ...,
       [3.25990239e-04, 3.26048708e-04, 3.26110063e-04, ...,
        2.65025632e-01, 3.26094486e-04, 3.26066646e-04],
       [4.36049703e-04, 9.96947715e-01, 4.36124877e-04, ...,
        4.36045365e-04, 4.36068945e-04, 4.35967389e-04],
       [4.05301160e-04, 4.05200357e-04, 4.05170040e-04, ...,
        4.05027634e-04, 4.05167581e-04, 4.05066455e-04]])

In [None]:
movie_df['Topic'] = topic_result.argmax(axis = 1)

In [None]:
movie_df

Unnamed: 0,label,review,scores,compound,compound_score,Topic
0,neg,how do films like mouse hunt get into theatres...,"{'neg': 0.121, 'neu': 0.778, 'pos': 0.101, 'co...",-0.9125,neg,4
1,neg,some talented actresses are blessed with a dem...,"{'neg': 0.12, 'neu': 0.775, 'pos': 0.105, 'com...",-0.8618,neg,3
2,pos,this has been an extraordinary year for austra...,"{'neg': 0.068, 'neu': 0.781, 'pos': 0.15, 'com...",0.9951,pos,6
3,pos,according to hollywood movies made in last few...,"{'neg': 0.071, 'neu': 0.782, 'pos': 0.147, 'co...",0.9972,pos,6
4,neg,my first press screening of 1998 and already i...,"{'neg': 0.091, 'neu': 0.817, 'pos': 0.093, 'co...",-0.2484,neg,0
...,...,...,...,...,...,...
1995,pos,"i like movies with albert brooks , and i reall...","{'neg': 0.073, 'neu': 0.763, 'pos': 0.164, 'co...",0.9991,pos,6
1996,pos,it might surprise some to know that joel and e...,"{'neg': 0.238, 'neu': 0.688, 'pos': 0.074, 'co...",-0.9993,neg,2
1997,pos,the verdict : spine-chilling drama from horror...,"{'neg': 0.15, 'neu': 0.702, 'pos': 0.147, 'com...",-0.5966,neg,4
1998,pos,i want to correct what i wrote in a former ret...,"{'neg': 0.131, 'neu': 0.71, 'pos': 0.16, 'comp...",0.9387,pos,1


In [None]:
def open_file(path):
  with open(path) as f:
    text = f.read()
    return text

In [None]:
# print(open_file('/content/melville-moby_dick.txt'))

In [None]:
nlp2.max_length = 1198623

In [None]:
def tokns(docmnt):
  return [token.text.lower() for token in nlp2(docmnt)]

In [None]:
d = open_file('/content/moby_dick_four_chapters.txt')

FileNotFoundError: [Errno 2] No such file or directory: '/content/moby_dick_four_chapters.txt'

In [None]:
tokens = tokns(d)

In [None]:
len(tokens)

In [None]:
train_len = 25+1
text_sequences = []

for i in range(train_len, len(tokens)):
    seq = tokens[ i - train_len : i ]
    text_sequences.append(seq)

In [None]:
# arr = [1,2,3,4,5,6,7,8,9,10,11,12,13,14]
# res = []
# for i in range(len(arr)):
#   seq = arr[ i - 5 : i]
#   res.append(seq)
# res

In [None]:
print(' '.join(text_sequences[0]))

In [None]:
print(' '.join(text_sequences[20]))

In [None]:
from keras.preprocessing.text import Tokenizer
tokenizer = Tokenizer(filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n')

In [None]:
tokenizer.fit_on_texts(text_sequences)

In [None]:
sequence = tokenizer.texts_to_sequences(text_sequences)

In [None]:
# tokenizer.index_word

In [None]:
# tokenizer.word_counts

In [None]:
sequences = np.array(sequence)

In [None]:
vocabulary_size = len(tokenizer.word_counts)
vocabulary_size

In [None]:
# sequence

In [None]:
import keras
from keras.models import Sequential
from keras.layers import Dense,LSTM,Embedding

In [None]:
def create_model(vocabulary_size, seq_len):
    model = Sequential()
    model.add(Embedding(vocabulary_size, 25, input_length=seq_len))
    model.add(LSTM(150, return_sequences=True))
    model.add(LSTM(150))
    model.add(Dense(150, activation='relu'))

    model.add(Dense(vocabulary_size, activation='softmax'))

    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    model.summary()

    return model

In [None]:
from keras.utils import to_categorical

In [None]:
X = sequences[:,:-1]
y = sequences[:,-1]

In [None]:
y = to_categorical(y, num_classes=vocabulary_size+1)

In [None]:
seq_len = X.shape[1]

In [None]:
model = create_model(vocabulary_size+1, seq_len)

In [None]:
model.fit(X, y, batch_size=128, epochs=160,verbose=1)

In [None]:
from pickle import dump,load

In [None]:
model.save('mymodel.h5')
dump(tokenizer, open('mymodel', 'wb'))

In [None]:
from random import randint
from pickle import load
from keras.models import load_model
from keras.preprocessing.sequence import pad_sequences

In [None]:
def generated_text(model, tokenizer, seq_len, seed_text, num_gen_words):

    # Final Output
    output_text = []

    # Initial Seed Sequence
    input_text = seed_text

    for i in range(num_gen_words):
        encoded_text = tokenizer.texts_to_sequences([input_text])[0]
        pad_encoded = pad_sequences([encoded_text], maxlen=seq_len, truncating='pre')

        # Get predicted probabilities for the next word
        pred_probabilities = model.predict(pad_encoded, verbose=0)[0]

        # Get the index of the word with the highest probability
        pred_word_ind = np.argmax(pred_probabilities)

        # Retrieve the word corresponding to the index
        pred_word = tokenizer.index_word.get(pred_word_ind)  # Use get method to handle out-of-vocabulary indices

        # Append the predicted word to the input text
        input_text += ' ' + pred_word
        output_text.append(pred_word)

    # Make it look like a sentence
    return ' '.join(output_text)


In [None]:
import random
random_pick = random.randint(0,len(text_sequences))

In [None]:
random_seed_text = text_sequences[random_pick]

In [None]:
seed_text = ' '.join(random_seed_text)
print(seed_text)

In [None]:
generated_text(model,tokenizer,seq_len,seed_text=seed_text,num_gen_words=20)

In [None]:
from keras.models import load_model

In [None]:
# model = load_model('/content/epochBIG.h5')

In [None]:
# tokenizer = load(open('/content/epochBIG','rb'))

### Beam search

In [None]:
def beam_search(model, tokenizer, seq_len, seed_text, num_gen_words, k=3):
    sequences = [[tokenizer.texts_to_sequences([seed_text])[0], 0.0]]

    while len(sequences[0][0]) < num_gen_words:
        all_candidates = []
        for seq, score in sequences:
            pad_encoded = pad_sequences([seq], maxlen=seq_len, truncating='pre')
            pred_probabilities = model.predict(pad_encoded, verbose=0)[0]
            top_k_preds = np.argsort(pred_probabilities)[-k:]  # Top k predictions

            for pred in top_k_preds:
                candidate_seq = [seq + [pred], score - np.log(pred_probabilities[pred])]
                all_candidates.append(candidate_seq)

        # Sort candidates by score
        sequences = sorted(all_candidates, key=lambda tup: tup[1])[:k]

    # Retrieve the best sequence
    best_sequence = sequences[0][0]
    output_text = [tokenizer.index_word.get(idx, '[UNK]') for idx in best_sequence]

    return ' '.join(output_text)

In [None]:
generated_text = beam_search(model, tokenizer, seq_len, seed_text, num_gen_words=40)
print(generated_text)

#chatbots



In [None]:
import pickle
with open('/content/train_qa.txt','rb') as f:
  train_data = pickle.load(f)

In [None]:
with open('/content/test_qa.txt','rb') as f:
  test_data = pickle.load(f)

In [None]:
all_data  = train_data + test_data

In [None]:
' '.join(train_data[0][0])

In [None]:
' '.join(train_data[0][0])

In [None]:
' '.join(train_data[0][2])

In [None]:
df  = pd.DataFrame(all_data,columns=['Story','Question','Answer'])
df

In [None]:
vocab = set()
for story,question,answer in all_data:
  vocab = vocab.union(set(story))
  vocab = vocab.union(set(question))


In [None]:
vocab.add('yes')
vocab.add('no')
vocab

In [None]:
vocab_len = len(vocab) + 1
# for i in range(len(all_data)):
#   for data in all_data[i]:
#     print(' '.join(data))

In [None]:
stories_len = [len(data[0]) for data in all_data]
question_len = [len(data[1]) for data in all_data]

In [None]:
maxlen_stories = max(stories_len)
maxlen_stories

In [None]:
maxlen_question = max(question_len)
maxlen_question

In [None]:
tokenizer = Tokenizer(filters = [])

In [None]:
tokenizer.fit_on_texts(vocab)

In [None]:
tokenizer.word_index['yes']

In [None]:
tokenizer.word_index['no']

In [None]:
tokenizer.word_index

In [None]:
train_story_text = []
train_question_text = []
train_answer_text = []
for s,q,a in train_data:
  train_story_text.append(s)
  train_question_text.append(q)
  train_answer_text.append(q)

In [None]:
# train_story_text

In [None]:
train_story_seq = tokenizer.texts_to_sequences(train_story_text)
# train_story_seq = np.array(train_story_seq)
# train_story_seq

In [None]:
def vectorize_stories(data, word_index=tokenizer.word_index, max_story_len=maxlen_stories,max_question_len=maxlen_question):
    '''
    OUTPUT:
    Vectorizes the stories,questions, and answers into padded sequences. We first loop for every story, query , and
    answer in the data. Then we convert the raw words to an word index value. Then we append each set to their appropriate
    output list. Then once we have converted the words to numbers, we pad the sequences so they are all of equal length.
    Returns this in the form of a tuple (X,Xq,Y) (padded based on max lengths)
    '''
    # X = STORIES
    X = []
    # Xq = QUERY/QUESTION
    Xq = []
    # Y = CORRECT ANSWER
    Y = []


    for story, query, answer in data:

        # Grab the word index for every word in story
        x = [word_index[word.lower()] for word in story]
        xq = [word_index[word.lower()] for word in query]
        y = np.zeros(len(word_index) + 1)
        y[word_index[answer]] = 1

        X.append(x)
        Xq.append(xq)
        Y.append(y)

    return (pad_sequences(X, maxlen=max_story_len),pad_sequences(Xq, maxlen=max_question_len), np.array(Y))

In [None]:
inputs_train, queries_train, answers_train = vectorize_stories(train_data)
inputs_test, queries_test, answers_test = vectorize_stories(test_data)

In [None]:
import keras.layers
from keras.models import Sequential, Model
from keras.layers import Embedding,Input, Activation, Dense, Permute, Dropout,add, dot, concatenate,LSTM

In [None]:
input_sequence = Input((maxlen_stories,))
question_sequence = Input((maxlen_question,))
question_sequence

### input encoder M

In [None]:
# Input gets embedded to a sequence of vectors
input_encoder_m = Sequential()
input_encoder_m.add(Embedding(input_dim=vocab_len,output_dim=64))
input_encoder_m.add(Dropout(0.5))

# This encoder will output:
# (samples, story_maxlen, embedding_dim)

### input encoder C

In [None]:
# embed the input into a sequence of vectors of size query_maxlen
input_encoder_c = Sequential()
input_encoder_c.add(Embedding(input_dim=vocab_len,output_dim=maxlen_question))
input_encoder_c.add(Dropout(0.5))
# output: (samples, story_maxlen, query_maxlen)

### question encoder


In [None]:
# embed the question into a sequence of vectors
question_encoder = Sequential()
question_encoder.add(Embedding(input_dim=vocab_len,
                               output_dim=64,
                               input_length=maxlen_question))
question_encoder.add(Dropout(0.5))
# output: (samples, query_maxlen, embedding_dim)

In [None]:
# encode input sequence and questions (which are indices)
# to sequences of dense vectors
input_encoded_m = input_encoder_m(input_sequence)
input_encoded_c = input_encoder_c(input_sequence)
question_encoded = question_encoder(question_sequence)

In [None]:
match = dot([input_encoded_m, question_encoded], axes=(2, 2))
match = Activation('softmax')(match)

In [None]:
# add the match matrix with the second input vector sequence
response = add([match, input_encoded_c])  # (samples, story_maxlen, query_maxlen)
response = Permute((2, 1))(response)  # (samples, query_maxlen, story_maxlen)

In [None]:
# concatenate the match matrix with the question vector sequence
answer = concatenate([response, question_encoded])
answer

In [None]:
# Reduce with RNN (LSTM)
answer = LSTM(32)(answer)  # (samples, 32)
# Regularization with Dropout
answer = Dropout(0.5)(answer)
answer = Dense(vocab_len)(answer)  # (samples, vocab_size)
# we output a probability distribution over the vocabulary
answer = Activation('softmax')(answer)

In [None]:
model = Model([input_sequence, question_sequence], answer)
model.compile(optimizer='rmsprop', loss='categorical_crossentropy',
              metrics=['accuracy'])

In [None]:
model.summary()

In [None]:
# train
history = model.fit([inputs_train, queries_train], answers_train,batch_size=32,epochs=100,validation_data=([inputs_test, queries_test], answers_test))

In [None]:
filename = 'chatbot.h5'
model.save(filename)

In [None]:
model.load_weights(filename)
pred_results = model.predict(([inputs_test, queries_test]))

In [None]:
story =' '.join(word for word in test_data[1][0])
print(story)

In [None]:
query = ' '.join(word for word in test_data[1][1])
print(query)

In [None]:
print("Answer is:",test_data[1][2])

In [None]:
#Generate prediction from model
val_max = np.argmax(pred_results[0])

for key, val in tokenizer.word_index.items():
    if val == val_max:
        k = key

print("Predicted answer is: ", k)
print("Probability of certainty was: ", pred_results[0][val_max])

In [None]:
# Note the whitespace of the periods
my_story = "John left the kitchen . Sandra dropped the football in the garden ."
my_story.split()

In [None]:
my_question = "Is the football in the kitchen ?"
my_question.split()

In [None]:
mydata = [(my_story.split(),my_question.split(),'yes')]

In [None]:
my_story,my_ques,my_ans = vectorize_stories(mydata)

In [None]:
pred_results = model.predict(([ my_story, my_ques]))
pred_results

In [None]:
#Generate prediction from model
val_max = np.argmax(pred_results[0])

for key, val in tokenizer.word_index.items():
    if val == val_max:
        k = key

print("Predicted answer is: ", k)
print("Probability of certainty was: ", pred_results[0][val_max])