<a href="https://colab.research.google.com/github/yuja28/sw_Bootcamp/blob/main/text_processing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import re
import string
import warnings

import nltk
import numpy as np
import pandas as pd
import spacy
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.decomposition import LatentDirichletAllocation as LDA
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
warnings.filterwarnings('ignore')

In [4]:

# Load the data: read the customer-support tweet CSV from the mounted Drive
# folder and preview the first rows.
csv_path = "/content/drive/MyDrive/customer-support-on-twitter/input/twcs/twcs.csv"
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,tweet_id,author_id,inbound,created_at,text,response_tweet_id,in_response_to_tweet_id
0,1,sprintcare,False,Tue Oct 31 22:10:47 +0000 2017,@115712 I understand. I would like to assist y...,2.0,3.0
1,2,115712,True,Tue Oct 31 22:11:45 +0000 2017,@sprintcare and how do you propose we do that,,1.0
2,3,115712,True,Tue Oct 31 22:08:27 +0000 2017,@sprintcare I have sent several private messag...,1.0,4.0
3,4,sprintcare,False,Tue Oct 31 21:54:49 +0000 2017,@115712 Please send us a Private Message so th...,3.0,5.0
4,5,115712,True,Tue Oct 31 21:49:35 +0000 2017,@sprintcare I did.,4.0,6.0


In [5]:
# Specify which company we want to work with
company = "VirginTrains"

# Keep only the tweets authored by that company. The explicit .copy() makes
# `answers` an independent frame, so the later assignments to answers['text']
# do not trigger SettingWithCopyWarning and clearly leave `df` untouched.
answers = df.loc[df['author_id'] == company].copy()

In [6]:
# Single-column frame holding only the tweet text, previewed below.
df_text = df.loc[:, ['text']]
df_text.head()

Unnamed: 0,text
0,@115712 I understand. I would like to assist y...
1,@sprintcare and how do you propose we do that
2,@sprintcare I have sent several private messag...
3,@115712 Please send us a Private Message so th...
4,@sprintcare I did.


In [7]:
# Data preprocessing
def preprocess_text(text):
    """Normalize a piece of text for vectorization.

    Lowercases the text, deletes every character that is neither a word
    character nor whitespace, then deletes all runs of digits.

    Parameters
    ----------
    text : str
        Raw text. Any non-string value (e.g. None or the float NaN pandas
        uses for missing cells) is treated as empty text.

    Returns
    -------
    str
        The cleaned text; "" for non-string input.
    """
    # Missing cells in a pandas column are not strings and have no .lower();
    # map them to empty text instead of raising AttributeError.
    if not isinstance(text, str):
        return ""
    # Convert to lowercase
    text = text.lower()
    # Remove special characters (anything that is not a word char or whitespace)
    text = re.sub(r'[^\w\s]', '', text)
    # Remove digits
    text = re.sub(r'\d+', '', text)
    return text

# Clean every company reply element-wise and store the result back in the 'text' column.
answers['text'] = answers['text'].map(preprocess_text)

In [8]:
# Tokenization and stop-word handling: fetch the NLTK data used below.
# - 'wordnet' backs WordNetLemmatizer; without it the first lemmatize() call
#   raises LookupError on a fresh runtime.
# - 'punkt_tab' is the tokenizer data that newer NLTK releases load in
#   word_tokenize; older releases use 'punkt', so both are requested.
for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(resource)
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def tokenize_and_lemmatize(text):
    """Tokenize text, drop stop words, lemmatize the rest, and re-join.

    Parameters
    ----------
    text : str
        Text to tokenize with ``nltk.word_tokenize``.

    Returns
    -------
    str
        The lemmatized tokens that are not in ``stop_words``, joined by
        single spaces.
    """
    kept = []
    for word in nltk.word_tokenize(text):
        if word in stop_words:
            continue
        kept.append(lemmatizer.lemmatize(word))
    return " ".join(kept)

# Tokenize and lemmatize every reply element-wise, storing the result back in the 'text' column.
answers['text'] = answers['text'].map(tokenize_and_lemmatize)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [9]:
# TF-IDF vectorization: unigram word features, capped at 1000 terms.
tfidf_options = {
    'max_features': 1000,
    'lowercase': True,
    'analyzer': 'word',
    'stop_words': 'english',
    'ngram_range': (1, 1),
}
vectorizer = TfidfVectorizer(**tfidf_options)
tfidf_matrix = vectorizer.fit_transform(answers['text'])

# Topic modeling (LDA): fit a 5-component model on the TF-IDF matrix and keep
# the transformed output for the similarity lookup.
lda_model = LDA(n_components=5, random_state=42)
lda_topic_matrix = lda_model.fit_transform(tfidf_matrix)


In [None]:
# Define the function that answers a user question
def get_response(user_question):
    """Return the company reply whose topic mix best matches the question.

    The question goes through the same cleaning, lemmatization, TF-IDF and LDA
    steps as the fitted corpus. Its topic vector is then compared by cosine
    similarity with the topic vector of every stored reply.

    Parameters
    ----------
    user_question : str
        Raw text typed by the user.

    Returns
    -------
    str
        The original (un-preprocessed) tweet text of the best-matching reply.
    """
    cleaned = tokenize_and_lemmatize(preprocess_text(user_question))
    question_topics = lda_model.transform(vectorizer.transform([cleaned]))
    # One query row against all replies -> similarity array of shape (1, n_replies).
    similarities = cosine_similarity(question_topics, lda_topic_matrix)
    best_position = int(np.argmax(similarities))
    # answers['text'] has been overwritten with lemmatized tokens, so fetch the
    # readable tweet from df, which shares index labels with answers.
    best_label = answers.index[best_position]
    return df.loc[best_label, 'text']

# Run the chatbot: greet the user, then answer each typed line until 'bye'.
print("{companyname} Support: Welcome to {companyname} Support. I will answer your queries about {companyname}. If you wish to end the chat, type bye!".format(companyname = company))

while True:
    # Strip surrounding whitespace so inputs such as " bye " still end the chat.
    user_input = input().strip()
    if not user_input:
        # Nothing was typed; wait for a real question instead of matching on empty text.
        continue
    if user_input.lower() == 'bye':
        print("{companyname} Support: Thanks for chatting. I hope we could assist you today.".format(companyname = company))
        break
    response = get_response(user_input)
    print("{companyname} Support: {response}".format(companyname = company, response=response))

VirginTrains Support: Welcome to VirginTrains Support. I will answer your queries about VirginTrains. If you wish to end the chat, type bye!
