## Customer Complaints Analysis

In [None]:
## Standard library
import string

## Numerics and data frames
import numpy as np
import pandas as pd

## NLTK text-processing tools
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

## Download NLTK's "popular" resource collection, then create the Porter
## stemmer instance that the later cells share.
nltk.download("popular")
stemmer = PorterStemmer()

### 1. Read raw data

In [None]:
## Read the complaints table and keep (at most) the first 5000 values of its
## Consumer_complaint column as the raw corpus.
complaintsUrl = "https://raw.githubusercontent.com/wsko/Text_Analytics_Case_Studies/master/complaints01.csv"
complaints = pd.read_csv(complaintsUrl)
complaintsRaw = complaints["Consumer_complaint"].values[:5000]
complaintsRaw

### 2. Standardize / clean up text

In [None]:
## Show NLTK's English stopword list (the words filtered out in step 2.2)
print(stopwords.words('english'))

#### 2.1 Tokenization

In [None]:
## Take one complaint as a worked example and split it into tokens
s = complaintsRaw[9]
z = word_tokenize(s)
## Show the raw text, a separator line, then the token list
print(s, " ", z, sep="\n")

#### 2.2 Remove stopwords and convert to lower case

In [None]:
## Load the stopword list once into a set. Calling stopwords.words() inside the
## comprehension rebuilds the list for every token and searches it linearly.
englishStopwords = set(stopwords.words('english'))
## Drop stopwords (compared case-insensitively) and lower-case what remains
z = [word.lower() for word in z if word.lower() not in englishStopwords]
print(s)
print(" ")
print(z)

#### 2.3 Stemming

In [None]:
## Reduce every token to its Porter stem and join the stems into one string
z = ' '.join(stemmer.stem(word) for word in z)
## Show the original text, a separator line, then the stemmed text
print(s, " ", z, sep="\n")

#### 2.4 Create a text clean-up function and apply it to the raw text

In [None]:
## Tokens dropped after stemming: punctuation, redaction placeholders
## ("xx", "xxxx", masked dates) and zero-amount strings.
_CUSTOM_NOISE_TOKENS = frozenset([
    ".", ",", " ", "xx", "xxx", "xxxx", "xxxxxxxx",
    "xx/xx/xxxx", "xx/xx/2019", "xx/xx/2018", "xx/xx/2017",
    "00", "0.00", "000",
])


def cleantext(s):
    """Return a cleaned, space-joined version of the complaint text *s*.

    Steps, in order: tokenize with ``word_tokenize``, lower-case every token,
    drop English stopwords, stem with the shared ``stemmer``, and finally drop
    the custom noise tokens listed in ``_CUSTOM_NOISE_TOKENS``.

    Parameters
    ----------
    s : str
        Raw complaint text.

    Returns
    -------
    str
        The surviving stems joined by single spaces.
    """
    # Build the stopword set once per call; looking it up inside the
    # comprehension rebuilds the list for every single token.
    englishStopwords = set(stopwords.words('english'))
    lowered = (word.lower() for word in word_tokenize(s))
    stems = (stemmer.stem(word) for word in lowered if word not in englishStopwords)
    return ' '.join(stem for stem in stems if stem not in _CUSTOM_NOISE_TOKENS)

In [None]:
## Apply the clean-up function to the sample complaint chosen in step 2.1
cleantext(s)

In [None]:
## The cleaned corpus could be built in-session by running cleantext over
## every raw complaint:
#complaintsClean = complaintsRaw.copy()
#for i in range(len(complaintsRaw)):
#  complaintsClean[i] = cleantext(complaintsRaw[i])
## Instead a stored CSV is read (presumably the saved output of that loop).
complaintsCleanUrl = "https://raw.githubusercontent.com/wsko/Text_Analytics_Case_Studies/master/complaintsClean.csv"
complaintsClean = pd.read_csv(complaintsCleanUrl)['complaintsClean'].values
complaintsClean[:5]

### 3. Build a document-term matrix

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
## Learn the vocabulary and count the terms in a single pass. fit() followed by
## transform() on the same corpus tokenizes every document twice.
vec = CountVectorizer()
dtm = vec.fit_transform(complaintsClean)  ## document-term matrix

In [None]:
## Print the document-term matrix exactly as the vectorizer returned it
print(dtm)

In [None]:
## Vocabulary table: one row per term, ordered by the term's column number
vocabTerms = list(vec.vocabulary_.keys())
vocabColumns = list(vec.vocabulary_.values())
vocab = pd.DataFrame({'Number': vocabColumns, 'Key': vocabTerms}).sort_values(by='Number')
## Total count of each term over all documents, most frequent first
termTotals = dtm.toarray().sum(axis=0)
topWords = pd.DataFrame({"Keys": vocab.Key, 'Freq': termTotals})
topWords = topWords.sort_values(by='Freq', ascending=False)
topWords.head(20)

In [None]:
## Dense document-term table with the terms as column labels
DTM = pd.DataFrame(dtm.toarray(), columns=vocab.Key.values)
## Counts of the 20 most frequent terms in the first few documents
top20Terms = topWords.Keys.values[:20]
DTM[top20Terms].head()

In [None]:
## Text representation as a bag of words
firstDoc = complaintsClean[0]
print("Clean Text:", firstDoc, "  ", sep="\n")
## Counts stored in the DTM's first row for each token of the first document
DTM[firstDoc.split()].iloc[0, :]

#### 3.1 Perform TF-IDF transformation

In [None]:
from sklearn.feature_extraction.text import TfidfTransformer
## Fit the TF-IDF weights on the count matrix, re-weight it, and densify
tfidf = TfidfTransformer()
X = tfidf.fit_transform(dtm).toarray()
X.shape

#### 3.2 Reduce the number of columns based on per-term variance

In [None]:
## Number of terms whose TF-IDF variance exceeds 2% of the largest term
## variance. The variances are computed once and counted with NumPy's
## vectorized sum instead of the element-by-element builtin sum().
termVariance = X.var(axis=0)
(termVariance > 0.02 * termVariance.max()).sum()

In [None]:
## Keep only terms whose TF-IDF variance exceeds 2% of the largest term
## variance. The mask is built once, so X1 and vocab1 always select exactly
## the same columns and the variance is not recomputed.
termVariance = X.var(axis=0)
keepTerm = termVariance > 0.02 * termVariance.max()
X1 = X[:, keepTerm]
vocab1 = vocab[keepTerm]
## Total TF-IDF weight per retained term, heaviest first
topWords1 = pd.DataFrame({"Keys": vocab1.Key, 'Freq': X1.sum(axis=0)})
topWords1 = topWords1.sort_values(by='Freq', ascending=False)
## TF-IDF table with the retained terms as column labels
TFIDF = pd.DataFrame(X1, columns=vocab1.Key.values)
TFIDF[topWords1.Keys.values[:10]].head()

### 4. Cluster analysis

In [None]:
### create a column "complaintType"
from sklearn.cluster import KMeans
## Fit a single seeded model and take the labels from it, so the cluster
## assignments are reproducible between runs and KMeans is fitted only once.
## Labels are shifted by one so complaint types are numbered from 1.
kmeans = KMeans(n_clusters=4, random_state=12).fit(X1)
complaintType = kmeans.labels_ + 1
## Number of complaints assigned to each type
typeCounts = pd.Series(complaintType).value_counts()
pd.DataFrame({"TYPE": typeCounts.index, "Counts": typeCounts.values})

#### Most frequent words in each complaint cluster

In [None]:
## Ten terms with the largest total TF-IDF weight inside each cluster. The
## range is derived from complaintType itself rather than a hard-coded 1..4,
## so it stays correct if the number of clusters is changed.
for i in range(1, int(complaintType.max()) + 1):
    inCluster = complaintType == i
    clusterWeights = pd.DataFrame({"Keys": vocab1.Key, 'Freq': X1[inCluster].sum(axis=0)})
    clusterTop = clusterWeights.sort_values(by='Freq', ascending=False)['Keys'].values[:10]
    print("complaint type", i, ":", "\n", clusterTop)
    print("  ")

### 5. Categorize customer complaints; save output

In [None]:
## List categories based on the most frequent words: one label per cluster,
## in cluster order (entry 0 names complaint type 1, entry 1 names type 2, ...).
Categories = []
nTypes = int(complaintType.max())
if len(Categories) < nTypes:
    # Fail with an actionable message instead of a bare IndexError.
    raise ValueError(
        "Categories has {} label(s) but {} complaint types were found; "
        "add one label per cluster before running this cell.".format(len(Categories), nTypes)
    )
## Build the label column in one pass. Assigning cell by cell through chained
## indexing (Categorized["Complaint_Type"][i] = ...) is not guaranteed by
## pandas to write back to the frame.
typeLabels = [Categories[t - 1] for t in complaintType[:len(complaintsRaw)]]
Categorized = pd.DataFrame({'Complaint_Type': typeLabels, "Complaint": complaintsRaw})
Categorized.head()

In [None]:
from google.colab import files
## Write the categorized complaints to a CSV in the working directory, then
## pass that file to Colab's files.download helper.
outputCsv = 'Complaints_Categorized.csv'
Categorized.to_csv(outputCsv)
files.download(outputCsv)