<a href="https://colab.research.google.com/github/yuja28/sw_Bootcamp/blob/main/VirginTrains.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import re
import nltk
import spacy
import string
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

# Load the customer-support tweets dataset
df = pd.read_csv("/content/drive/MyDrive/customer-support-on-twitter/input/twcs/twcs.csv")
df.head()

#Specify which company we want to work with
company = "VirginTrains"

#Filter for answers only made by that company.
#.copy() makes `answers` an independent frame: its 'text' column is
#overwritten further down, and assigning into a slice of `df` would
#otherwise trigger pandas' SettingWithCopyWarning (chained assignment).
answers = df.loc[df['author_id'] == company].copy()

#Single-column view of every tweet's text, for inspection
df_text = df[['text']]
df_text.head()

#Normalise every answer in a single pass over the 'text' column:
#  1. convert to lower case
#  2. strip off any trailing full stops
#  3. remove any mentions to users e.g. "@johnsmith you can do this by...."
#The pattern is a raw string so that \B and \w reach the regex engine
#unchanged, and it is compiled once instead of once per row.
mention_pattern = re.compile(r"\B@\w+")
answers['text'] = answers['text'].map(
    lambda text: mention_pattern.sub("", text.lower().rstrip('.'))
)

#Concatenate all answers sent by the company into one string. Every answer
#is prefixed with ". " (including the first one), exactly as before, but the
#string is built with a single join rather than repeated += in a loop.
raw = "".join(". " + text for text in answers['text'])

#Download the NLTK resources used below.
#'punkt' is the tokenizer model used by older NLTK releases; newer releases
#(3.8.2 and later) load 'punkt_tab' instead and raise LookupError without it,
#so both are fetched. 'wordnet' backs the WordNet lemmatizer.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('wordnet')

#convert our raw text into sentence tokens
sentence_tokens = nltk.sent_tokenize(raw)
#convert our raw text into word tokens
word_tokens = nltk.word_tokenize(raw)

# Shared WordNet lemmatizer instance used by LemTokens
lemmer = nltk.stem.WordNetLemmatizer()


def LemTokens(tokens):
    """Return a list holding the lemmatized form of every token in `tokens`.

    Each token is passed to `lemmer.lemmatize` with its default arguments;
    the order of the tokens is preserved.
    """
    return list(map(lemmer.lemmatize, tokens))
# Translation table mapping the code point of every ASCII punctuation
# character to None, so that str.translate deletes those characters.
remove_punct_dict = str.maketrans('', '', string.punctuation)


def LemNormalize(text):
    """Lower-case `text`, delete punctuation, tokenize and lemmatize it.

    Returns the list produced by `LemTokens` for the word tokens of the
    cleaned text.
    """
    cleaned = text.lower().translate(remove_punct_dict)
    words = nltk.word_tokenize(cleaned)
    return LemTokens(words)

#import necessary libraries
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

#define our function for processing a response
def response(user_response):
    """Return the stored sentence most similar to `user_response`.

    The query is appended to the module-level `sentence_tokens` list and the
    whole list is TF-IDF vectorized (tokenized by `LemNormalize`, with
    scikit-learn's English stop words) so that the query and the stored
    sentences share one vocabulary. The stored sentence ranked second by
    cosine similarity to the query is returned; the top rank is normally the
    query itself.

    Side effect: `user_response` is left at the end of `sentence_tokens`.
    The caller is responsible for removing it afterwards.

    Args:
        user_response: the user's message as a string.

    Returns:
        The selected sentence, or a fixed apology string when the selected
        similarity is zero or when there is no stored sentence to compare to.
    """
    fallback = "Sorry! I don't think I can help you with that."
    #add our users input as the last document
    sentence_tokens.append(user_response)
    #with nothing but the query in the list there is no second-ranked entry
    if len(sentence_tokens) < 2:
        return fallback
    #create our vectorizer
    vectorizer = TfidfVectorizer(tokenizer=LemNormalize, stop_words='english')
    #process our tokens
    diff = vectorizer.fit_transform(sentence_tokens)
    #similarity of the query (last row) against every document
    vals = cosine_similarity(diff[-1], diff)
    #indices in ascending order of similarity; take the second highest
    idx = vals.argsort()[0][-2]
    #its similarity, read directly instead of sorting the values a second time
    req_diff = vals[0, idx]
    if req_diff == 0:
        #if no appropriate response can be made
        return fallback
    #if an appropriate response is found
    return sentence_tokens[idx]

import gensim
from gensim import corpora

# Fetch NLTK's stop-word lists and keep the English one as a set
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))


def _content_words(sentence):
    """Return the lower-cased alphanumeric tokens of `sentence` that are not stop words."""
    candidates = nltk.word_tokenize(sentence.lower())
    return [token for token in candidates
            if token.isalnum() and token not in stop_words]


# One filtered token list per sentence
texts = [_content_words(sentence) for sentence in sentence_tokens]

# Map every distinct token to an integer id
dictionary = corpora.Dictionary(texts)

# Bag-of-words representation of each sentence
corpus = list(map(dictionary.doc2bow, texts))

# Train the LDA topic model
lda_model = gensim.models.ldamodel.LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=5,  # Specify the number of topics (you can change this)
    random_state=100,
    update_every=1,
    chunksize=100,
    passes=10,
    alpha='auto',
    per_word_topics=True,
)

# Interactive chat loop: greet the user, then answer each message with
# response() until the user types 'bye' (or the input stream ends).
print(f"{company} Support: Welcome to {company} Support. I will answer your queries about {company}. If you wish to end the chat, type bye!")
while True:
    #get an input, lower-cased and without surrounding whitespace so that
    #e.g. "Bye " still ends the chat
    try:
        user_response = input().lower().strip()
    except EOFError:
        #no more input available: treat it like an explicit goodbye
        user_response = 'bye'
    if user_response == 'bye':
        #exit the loop
        print(f"{company} Support: Thanks for chatting. I hope we could assist you today.")
        break
    #show bot is typing
    print(f"{company} Support: ", end="")
    try:
        #print our AI response
        print(response(user_response))
    finally:
        #response() appended the query to the end of sentence_tokens; drop
        #that last element (not the first equal sentence) even if
        #response() raised, so the list is left as it was
        sentence_tokens.pop()


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


VirginTrains Support: Welcome to VirginTrains Support. I will answer your queries about VirginTrains. If you wish to end the chat, type bye!
Can I cancel my ticket?
VirginTrains Support: i see, how did you cancel this?
What is the ticket cancellation policy?
VirginTrains Support: we have a 28 day policy for delay repay.
bye
VirginTrains Support: Thanks for chatting. I hope we could assist you today.
