In [2]:
# Importing required libraries

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import string
import random
import nltk
from nltk.corpus import stopwords
import re
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

## 1. LDA

### 1.1 Clean the data to remove stop-words, punctuation, and emoticons.


In [3]:
# Opening the file
f = open("amazon_cells_labelled.txt", "r")

data =[]
# Converting it to pandas dataframe
for line in f:
    review = line[:len(line) - 2]
    sentiment = "neg" if line[len(line)-2] == "0" else "pos"
    row = [review, sentiment]
    data.append(row)

df = pd.DataFrame(data, columns = ['reviews', 'sentiment'])
# The below pandas dataframe has reviews and sentiment
print(df)

                                               reviews sentiment
0    So there is no way for me to plug it in here i...       neg
1                        Good case, Excellent value.\t       pos
2                             Great for the jawbone.\t       pos
3    Tied to charger for conversations lasting more...       neg
4                                  The mic is great.\t       pos
..                                                 ...       ...
995  The screen does get smudged easily because it ...       neg
996  What a piece of junk.. I lose more calls on th...       neg
997                     Item Does Not Match Picture.\t       neg
998  The only thing that disappoint me is the infra...       neg
999  You can not answer calls with the unit, never ...       neg

[1000 rows x 2 columns]


In [4]:
# Clean the data to remove stop-words, punctuation, and emoticons

nltk.download('stopwords')

stop = stopwords.words('english')

def remove_punctuation(text):
    translator = str.maketrans('','', string.punctuation)
    return text.translate(translator)

def remove_stopwords(text):
    text = [word.lower() for word in text.split() if word.lower() not in stop]
    return " ".join(text)

df['reviews'] = df['reviews'].apply(remove_punctuation).apply(remove_stopwords)
df.head()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Unnamed: 0,reviews,sentiment
0,way plug us unless go converter,neg
1,good case excellent value,pos
2,great jawbone,pos
3,tied charger conversations lasting 45 minutesm...,neg
4,mic great,pos


### 1.2 Apply LDA and print out 10 topics.

In [5]:
vect = CountVectorizer(max_features = 5000, max_df=.15)
X = vect.fit_transform(df['reviews'])

In [6]:
lda = LatentDirichletAllocation(n_components=10, learning_method="batch", max_iter=25, random_state=0)
document_topics = lda.fit_transform(X)
print(lda.components_.shape)
document_topics

(10, 1788)


array([[0.87141694, 0.0142878 , 0.01428764, ..., 0.01428817, 0.01428655,
        0.01428601],
       [0.02000096, 0.0200025 , 0.02000158, ..., 0.02000055, 0.02000212,
        0.81998241],
       [0.03334133, 0.03333333, 0.03333919, ..., 0.03333333, 0.03333333,
        0.29768756],
       ...,
       [0.02500393, 0.02500102, 0.025     , ..., 0.02500125, 0.025001  ,
        0.025005  ],
       [0.01428746, 0.01428571, 0.01428599, ..., 0.01428603, 0.01428745,
        0.87142414],
       [0.21770012, 0.01666792, 0.01666897, ..., 0.01666814, 0.01667184,
        0.01666977]])

In [7]:
sorting = np.argsort(lda.components_, axis=1)[:, ::-1]
sorting = np.argsort(lda.components_, axis=1)[:, ::-1]
print(len(sorting))
print(sorting)

10
[[1767  697  717 ...  668 1592 1392]
 [1469 1261 1238 ... 1072 1592 1392]
 [ 697 1667 1274 ...  515  668 1392]
 ...
 [ 461  998 1709 ... 1592 1524 1392]
 [1764  893  439 ...  515 1072 1392]
 [ 697  547 1238 ... 1592  515 1392]]


In [8]:
feature_names = np.array(vect.get_feature_names_out())
print(len(feature_names))
print(feature_names)

1788
['10' '100' '11' ... 'youll' 'z500a' 'zero']


In [9]:
def print_topics(topics, feature_names, sorting, topics_per_chunk, n_words):
    for i in range(0, len(topics), topics_per_chunk):
        # for each chunk:
        these_topics = topics[i: i + topics_per_chunk]
        # maybe we have less than topics_per_chunk left
        len_this_chunk = len(these_topics)
        print(these_topics)
        print(*these_topics)
        print(len_this_chunk)
        # print topic headers
        print(("topic {:<8}" * len_this_chunk).format(*these_topics))
        print(("-------- {0:<5}" * len_this_chunk).format(""))
        # print top n_words frequent words
        for i in range(n_words):
            try:
                print(("{:<14}" * len_this_chunk).format(*feature_names[sorting[these_topics, i]]))
            except:
                pass
        print("\n")

print_topics(topics=range(10), feature_names=feature_names, sorting=sorting, topics_per_chunk=5, n_words=10)

range(0, 5)
0 1 2 3 4
5
topic 0       topic 1       topic 2       topic 3       topic 4       
--------      --------      --------      --------      --------      
works         sound         great         recommend     battery       
great         really        use           would         good          
happy         quality       reception     service       horrible      
easy          good          make          customer      software      
battery       headset       car           one           life          
junk          product       like          highly        also          
use           bad           new           ear           product       
piece         well          working       right         cell          
cheap         bluetooth     product       stay          never         
item          service       light         terrible      nice          


range(5, 10)
5 6 7 8 9
5
topic 5       topic 6       topic 7       topic 8       topic 9       
--------      --------    

## 2. Chatbot

### 2.1 Clean the data as you did for LDA

In [10]:
nltk.download('punkt') # first-time use only
nltk.download('wordnet') # first-time use only

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [31]:
# The chatbot needs to be based on the 'reviews' dataset above.
# Combine the 3 datasets into one and then build the chatbot as mentioned in the module.

datasets = ["amazon_cells_labelled.txt", "imdb_labelled.txt", "yelp_labelled.txt"]
data = []

for dataset_file in datasets:
    with open(dataset_file, "r") as f:
        for line in f:
            review = line[:len(line) - 2]
            sentiment = "neg" if line[len(line)-2] == "0" else "pos"
            row = [review, sentiment]
            data.append(row)

# Converting data to pandas dataframe
df = pd.DataFrame(data, columns=['reviews', 'sentiment'])

In [57]:
stop = stopwords.words('english')

def LemTokens(tokens):
    lemmer = nltk.stem.WordNetLemmatizer()
    return [lemmer.lemmatize(token) for token in tokens]

def LemNormalize(text):
    remove_punct_dict = dict((ord(punct), None) for punct in string.punctuation)
    return LemTokens(nltk.word_tokenize(text.lower().translate(remove_punct_dict)))

df['reviews'] = df['reviews'].apply(remove_punctuation).apply(remove_stopwords)

# Combine all the reviews into a single string
reviews = " ".join(df["reviews"])

sent_tokens = nltk.sent_tokenize(reviews)  # converts to list of sentences
word_tokens = nltk.word_tokenize(reviews)  # converts to list of words

In [55]:
# Define TF-IDF vectorizer
tfidfvec = TfidfVectorizer(tokenizer=LemNormalize, stop_words='english')

# Train TF-IDF vectorizer
tfidf = tfidfvec.fit_transform(sent_tokens)

# Define response function
def response(user_response):
    robo_response = ''
    sent_tokens.append(user_response)
    tfidf = tfidfvec.fit_transform(sent_tokens)
    vals = cosine_similarity(tfidf[-1], tfidf)
    idx = vals.argsort()[0][-2]
    flat = vals.flatten()
    flat.sort()
    similarity = flat[-2]
    if similarity == 0:
        robo_response = robo_response + "I am sorry! I don't understand you."
        return robo_response
    else:
        robo_response = robo_response + sent_tokens[idx]
        return robo_response

### 2.2 Build a chatbot based on the reviews


In [56]:
# Define chatbot function
def chatbot(user_input):
    user_input = user_input.lower()
    if user_input != 'bye':
        if user_input in reviews.lower():
            # If user input matches any review, return the review
            matching_reviews = [review for review in df['reviews'] if user_input in review.lower()]
            return random.choice(matching_reviews)
        else:
            if greeting(user_input) is not None:
                return greeting(user_input)
            else:
                return response(user_input)
    else:
        return "Goodbye! Have a great day!"

# Define greeting inputs and responses
GREETING_INPUTS = ("hello", "hi", "greetings", "sup", "what's up", "hey")
GREETING_RESPONSES = ["hi", "hey", "*nods*", "hi there", "hello", "I am glad! You are talking to me"]

def greeting(sentence):
    for word in sentence.split():
        if word.lower() in GREETING_INPUTS:
            return random.choice(GREETING_RESPONSES)

# Interact with the chatbot
print("ROBO: My name is Robo. I will answer your queries about reviews. Just type one word, and I will display its reviews. If you want to exit, type 'bye'!")
while True:
    user_response = input("You: ")
    print("ROBO:", chatbot(user_response))
    if user_response.lower() == 'bye':
        break

ROBO: My name is Robo. I will answer your queries about reviews. Just type one word, and I will display its reviews. If you want to exit, type 'bye'!
You: greetings
ROBO: I am glad! You are talking to me
You: keyboard
ROBO: keyboard really worthwhile usefulness sturdy enough dont expect problems
You: burger
ROBO: great place relax awesome burger beer
You: pizza
ROBO: ordered appetizer took 40 minutes pizza another 10 minutes
You: Mickey
ROBO: dont yet hear mickey speak tons sound effects music throughout filmsomething take granted huge crowd pleaser 1928
You: Vatsal
ROBO: I am sorry! I don't understand you.
You: bye
ROBO: Goodbye! Have a great day!
