In [22]:

import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import pairwise_distances
import nltk
from nltk.tokenize import RegexpTokenizer
# Word tokenizer: keeps runs of word characters and drops punctuation.
# The pattern is a raw string so that `\w` reaches the regex engine untouched
# (a plain '\w+' is an invalid string escape and warns on modern Python).
tokenizer = RegexpTokenizer(r'\w+')
# WordNet-based lemmatizer, accessed through its documented location in nltk.stem.
# NOTE(review): lemmatizing needs the WordNet corpus to be installed locally
# (e.g. via nltk.download('wordnet')) — confirm it is available in this environment.
lemm = nltk.stem.WordNetLemmatizer()


In [35]:
# Load the question/answer pairs that drive the bot.
data = pd.read_csv('chatbot.csv')

# Display the number of rows together with a preview of the first few.
(data.shape[0], data.head())

(17,
                           user_quote                           answer
 0                                 Hi        Hi there, how can I help?
 1                                Bye                   See you again!
 2  what is the name of your creators               I'm not telling ya
 3                 do you have a name  maybe, but iI'm not telling you
 4                    how old are you                  today years old)

In [8]:
def preprocess(text):
    """Normalize a piece of text for vectorization.

    The text is lower-cased, split into word tokens with the module-level
    ``tokenizer``, each token is lemmatized with the module-level ``lemm``,
    and the results are joined back into one space-separated string.

    Parameters
    ----------
    text : str
        Raw text to normalize.

    Returns
    -------
    str
        The lemmatized tokens joined by single spaces.
    """
    words = tokenizer.tokenize(text.lower())
    return ' '.join(lemm.lemmatize(word) for word in words)

In [11]:
# Store the normalized form of every known user phrase next to the original.
data['lemmed'] = data['user_quote'].map(preprocess)
data.head()

Unnamed: 0,user_quote,answer,lemmed
0,Hi,"Hi there, how can I help?",hi
1,Bye,See you again!,bye
2,what is the name of your creators,I'm not telling ya,what is the name of your creator
3,do you have a name,"maybe, but iI'm not telling you",do you have a name
4,how old are you,today years old,how old are you


In [13]:
# TF-IDF vectorizer with default settings; it is fitted on the lemmatized
# questions in a later cell and reused by chatbot() to vectorize user input.
tfidf = TfidfVectorizer()

In [18]:
# Fit the vectorizer on the known phrases, then densify the result so that
# individual rows can be inspected directly.
tfidf_matrix = tfidf.fit_transform(data['lemmed'])
vectorized = tfidf_matrix.toarray()
vectorized[0]

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [20]:

# Wrap the dense TF-IDF matrix in a DataFrame whose columns are the vocabulary terms.
feature_names = tfidf.get_feature_names_out()
df_tfidf = pd.DataFrame(data=vectorized, columns=feature_names)
df_tfidf.head()

Unnamed: 0,affect,am,are,bye,can,could,creator,do,doe,don,...,tell,the,today,understand,what,who,why,worthless,you,your
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.406565,0.0,0.0,0.0,...,0.0,0.406565,0.0,0.0,0.355005,0.0,0.0,0.0,0.0,0.355005
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.513213,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.357453,0.0
4,0.0,0.0,0.493369,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.343632,0.0


In [27]:
def chatbot(user_input, fallback=None):
    """Return the stored answer whose question best matches ``user_input``.

    The input is normalized with ``preprocess``, vectorized with the fitted
    ``tfidf`` vectorizer, and compared against every row of ``df_tfidf`` by
    cosine similarity. The answer of the most similar row is returned.

    Parameters
    ----------
    user_input : str
        Raw text typed by the user.
    fallback : str, optional
        Answer to return when the best similarity is not positive, i.e. the
        input has nothing in common with any stored question. With the
        default ``None`` the previous behaviour is kept: ``argmax`` over
        all-zero similarities selects the first row, so the first stored
        answer is returned.

    Returns
    -------
    str
        The selected entry of ``data['answer']``, or ``fallback``.
    """
    query = preprocess(user_input)
    query_vector = tfidf.transform([query]).toarray()
    # pairwise_distances yields cosine *distance*; similarity is its complement.
    # One query row is compared against every stored row, giving one column.
    similarities = 1 - pairwise_distances(df_tfidf, query_vector, metric='cosine')
    best_row = int(similarities.argmax())

    if fallback is not None and similarities[best_row, 0] <= 0:
        return fallback

    # argmax is a position, not an index label, so select positionally.
    return data['answer'].iloc[best_row]

In [28]:
# Quick check: a greeting should produce the greeting answer.
chatbot('hello')

'Hi there, how can I help?'

In [29]:
# Quick check: an insult should produce the apologetic answer.
chatbot('you stupid')

"I wish you wouldn't say such hurtful things. I'm sorry if I wasn't useful"

In [34]:
# Interactive session: read a line, echo it, and print the bot's reply.
# Typing 'stop' ends the session; the sentinel is checked before replying so
# that it is not echoed or answered as if it were a real question.
while True:
    text = input()
    if text == 'stop':
        break
    print('--- ', text)
    print(chatbot(text))

---  hi
Hi there, how can I help?
---  Hello
Hi there, how can I help?
---  bye
See you again!
---  im joking
Hi there, how can I help?
---  oh no
Hi there, how can I help?
---  well
Hi there, how can I help?
---  i'm feeling bad
I'm sorry to hear that. I'm here for you. Talking about it might help. So, tell me why do you think you're feeling this way?
---  tell a joke
mental health is not a joke
---  whats your name
I'm not telling ya
---  how old are you
today years old
---  tell me about mental health
mental health is not a joke
---  who is it for
It is estimated that mental illness affects 1 in 5 adults in America, and that 1 in 24 adults have a serious mental illness. Mental illness does not discriminate; it can affect anyone, regardless of gender, age, income, social status, ethnicity, religion, sexual orientation, or background. Although mental illness can affect anyone, certain conditions may be more common in different populations. For instance, eating disorders tend to occur 