In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# suppress warnings
import warnings;
warnings.filterwarnings('ignore');

# common imports
import pandas as pd
import numpy as np
import math
import re
import glob
import os
import sys
import json
import random
import pprint as pp
import textwrap
import sqlite3
import logging

import spacy
import nltk

from tqdm.auto import tqdm
# register `pandas.progress_apply` and `pandas.Series.map_apply` with `tqdm`
tqdm.pandas()

# pandas display options
# https://pandas.pydata.org/pandas-docs/stable/user_guide/options.html#available-options
pd.options.display.max_columns = 30 # default 20
pd.options.display.max_rows = 60 # default 60
pd.options.display.float_format = '{:.2f}'.format
# pd.options.display.precision = 2
pd.options.display.max_colwidth = 200 # default 50; -1 = all
# otherwise text between $ signs will be interpreted as formula and printed in italic
pd.set_option('display.html.use_mathjax', False)

# np.set_printoptions(edgeitems=3) # default 3

import matplotlib
from matplotlib import pyplot as plt

plot_params = {'figure.figsize': (8, 4), 
               'axes.labelsize': 'large',
               'axes.titlesize': 'large',
               'xtick.labelsize': 'large',
               'ytick.labelsize':'large',
               'figure.dpi': 100}
# adjust matplotlib defaults
matplotlib.rcParams.update(plot_params)

import seaborn as sns
sns.set_style("darkgrid")

In [3]:
BASE_DIR = '/content/drive/MyDrive/ESCP NLP dataset/group project dataset'
os.chdir(BASE_DIR)

CASE I – Business Intelligence in Social Customer Service

In [None]:
Airline_Tweets = './airline_date.csv'

In [None]:
df = pd.read_csv(Airline_Tweets)
df.head()

Unnamed: 0,weekday,month,date,year,text
0,Thu,Oct,1,2015,@mjdout I know that can be frustrating..we hope to have you parked and deplaned shortly. Thanks for your patience. *AA
1,Thu,Oct,1,2015,"@rmarkerm Terribly sorry for the inconvenience. If we can be of assistance at this time, pls let us know. *AA"
2,Thu,Oct,1,2015,"@checho85 I can check, pls follow and DM your confirmation # for review. *AA"
3,Thu,Oct,1,2015,"@nealaa ...Alerts, pls check here: http://t.co/0jlcZnT95Q *JH 3/3"
4,Thu,Oct,1,2015,"@nealaa ...advisory has only been issued for the Bahamas, but that could change. To check for updates on Weather advisories &amp;... 2/3"


In [None]:
#What is the average length of a social customer service reply ?
df['text'].str.len().mean()

print("average length of a social customer service reply", df['text'].str.len().mean())

average length of a social customer service reply 92.1445170660857


In [None]:
#What type of links were referenced most often ?
counts = {"url": 0, "phone": 0, "dm": 0}

counts['url'] = df['text'].str.count('http').sum()
counts['phone'] = df['text'].str.count(r"\d{3}-\d{3}-\d{4}").sum()
counts['dm'] = df['text'].str.count('DM').sum()

max_type = max(counts, key=counts.get)

print("Type referenced most often:", max_type)


Type referenced most often: dm


In [None]:
#How many people should be on a social media customer service team? 
df[['text', 'date']].groupby('date').count()

Unnamed: 0_level_0,text
date,Unnamed: 1_level_1
1,77
2,100
3,100
4,80
5,100
6,100
7,100
8,93
9,67
10,100


In [None]:
#according to the daily number of text, maximum is 100 per day, I think the customer team should hire 5 people.

In [None]:
#How many social replies are reasonable for a customer service representative to handle?
#number of replies range from 14 ~ 20 per day for a customer service representative.

CASE III – An exploration of the impact of vectorization on classification

In [4]:
import os, json, gzip 
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('words')
nltk.download('averaged_perceptron_tagger')
from nltk.corpus import wordnet, stopwords

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression

import string
from nltk import pos_tag
from nltk.tokenize import WhitespaceTokenizer
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


In [5]:
data = []
with open('reviews_5_balanced.json') as f:
    for l in f:
        data.append(json.loads(l.strip()))
df3 = pd.DataFrame.from_dict(data)

In [6]:
df3.head()

Unnamed: 0,overall,verified,reviewTime,reviewerID,asin,reviewText,summary,unixReviewTime
0,1,True,"03 12, 2018",A3QY3THQ42WSCQ,B000YFSR5G,Waaaay too BIG,One Star,1520812800
1,1,True,"03 12, 2018",A3QY3THQ42WSCQ,B000YFSR4W,Waaaay too BIG,One Star,1520812800
2,1,True,"02 8, 2017",A21HH0VIBKK80J,B000YFSR5G,"Was terribly disappointed, the pants were way too large in the legs, my husband looked like he was wearing blown up clown pants.","Was terribly disappointed, the pants were way too large in ...",1486512000
3,1,True,"02 8, 2017",A21HH0VIBKK80J,B000YFSR4W,"Was terribly disappointed, the pants were way too large in the legs, my husband looked like he was wearing blown up clown pants.","Was terribly disappointed, the pants were way too large in ...",1486512000
4,1,True,"02 19, 2018",A276HQXYS553QW,B0014F8TIU,Constantly rolls down,One Star,1518998400


  Step 1. Data Preparation

In [7]:
df3.isna().sum()

overall             0
verified            0
reviewTime          0
reviewerID          0
asin                0
reviewText          0
summary           108
unixReviewTime      0
dtype: int64

In [8]:
df3 = df3.dropna()

In [9]:
df3['overall'].value_counts()

5    127110
1     84805
2     59363
4     22854
Name: overall, dtype: int64

In [10]:
df3['sentiment'] = df3['overall'].apply(lambda x: 0 if x == 1 or x == 2 else 1 if x == 4 or x == 5 else None)

In [11]:
df3['sentiment'].value_counts(normalize=True)

1   0.51
0   0.49
Name: sentiment, dtype: float64

In [12]:
df3.drop(columns=[
    'reviewTime', 'unixReviewTime', 'overall', 'reviewerID', 'summary'],
        inplace=True)
df3.sample(3)

Unnamed: 0,verified,asin,reviewText,sentiment
159637,True,B00IZP4W4W,Great snack for lower carb.,1
85272,True,B001KVXJPG,NEVER DOWNLOADED,0
220418,True,B00CIXSW0M,I'm happy with the quality.,1


In [13]:
stopwords_list = stopwords.words('english')

def get_wordnet_pos(word):
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}

    return tag_dict.get(tag, wordnet.NOUN)

lemmatizer = nltk.stem.WordNetLemmatizer()

def get_lemmatize(tokens):
    return [lemmatizer.lemmatize(w, get_wordnet_pos(w)) for w in tokens]

def clean(text):
    # lower text
    text = text.lower()
    # tokenize text and remove puncutation
    text = [word.strip(string.punctuation) for word in text.split(" ")]
    # remove words that contain numbers
    text = [word for word in text if not any(c.isdigit() for c in word)]
    # remove stop words
    stop = stopwords.words('english')
    text = [x for x in text if x not in stop]
    # remove empty tokens
    text = [t for t in text if len(t) > 0]
    # pos tag text
    pos_tags = pos_tag(text)
    # lemmatize text
    text = [WordNetLemmatizer().lemmatize(t[0], get_wordnet_pos(t[1])) for t in pos_tags]
    # remove words with only one letter
    text = [t for t in text if len(t) > 1]
    # join all
    text = " ".join(text)
    return(text)

In [14]:
df3_60, df3_40 = train_test_split(df3, test_size=0.4, stratify=df3['sentiment'], random_state=42)

In [15]:
df3_40["reviewText"] = df3_40["reviewText"].apply(clean)

In [16]:
df3_40 = df3_40[df3_40['reviewText'].str.len() != 0]

Step 2. Train-Test Split

In [17]:
X_train, X_test, Y_train, Y_test = train_test_split(df3_40['reviewText'],
                                                    df3_40['sentiment'],
                                                    test_size=0.2,
                                                    random_state=42,
                                                    stratify=df3_40['sentiment'])

print ('Size of Training Data ', X_train.shape[0])
print ('Size of Test Data ', X_test.shape[0])

print ('Distribution of classes in Training Data :')
print ('Positive Sentiment ', str(sum(Y_train == 1)/ len(Y_train) * 100.0))
print ('Negative Sentiment ', str(sum(Y_train == 0)/ len(Y_train) * 100.0))

print ('Distribution of classes in Testing Data :')
print ('Positive Sentiment ', str(sum(Y_test == 1)/ len(Y_test) * 100.0))
print ('Negative Sentiment ', str(sum(Y_test == 0)/ len(Y_test) * 100.0))

Size of Training Data  93590
Size of Test Data  23398
Distribution of classes in Training Data :
Positive Sentiment  50.91249065071055
Negative Sentiment  49.087509349289455
Distribution of classes in Testing Data :
Positive Sentiment  50.91460808616121
Negative Sentiment  49.08539191383879


Step 3 - Text Vectorization

In [18]:
# Option 1 --	Bag of words with TFIDF weighing
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(min_df = 10, ngram_range=(1,1))
X_train_tf = tfidf.fit_transform(X_train)
X_test_tf = tfidf.transform(X_test)

In [19]:
# Option 2 -- -	Vectorisation with Word2Vec embeddings
from gensim.models import Word2Vec

# Convert each document into a list of words
documents = []
for doc in X_train:
    words = doc.split()
    documents.append(words)

# Train Word2Vec model
modelVec = Word2Vec(documents, window=5, min_count=1, workers=4)

In [20]:
# Get the Word2Vec embeddings for each document
X_train_w2v = []
for doc in documents:
    doc_vec = np.zeros((100,))
    for word in doc:
        if word in modelVec.wv.key_to_index:
            doc_vec += modelVec.wv.get_vector(word)
    X_train_w2v.append(doc_vec)

X_test_w2v = []
for doc in X_test:
    words = doc.split()
    doc_vec = np.zeros((100,))
    for word in words:
        if word in modelVec.wv.key_to_index:
            doc_vec += modelVec.wv.get_vector(word)
    X_test_w2v.append(doc_vec)

In [21]:
# Option 3 -- Count Vectorizer
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()
X_train_counts = vectorizer.fit_transform(X_train)
X_test_counts = vectorizer.transform(X_test)

Step 4 - Training the Machine Learning model

In [22]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score

In [23]:
from sklearn.svm import SVC
model1 = SVC(kernel='linear', probability=True, random_state=42)

In [None]:
model1.fit(X_train_tf, Y_train)

In [None]:
# LinearSVC model result with tfidf victorization
Y_pred_tf = model1.predict(X_test_tf)
Y_pred_tf_proba = model1.predict_proba(X_test_tf)[:,1]
print ('Accuracy Score - ', accuracy_score(Y_test, Y_pred_tf))
print ('ROC-AUC Score - ', roc_auc_score(Y_test, Y_pred_tf))

Accuracy Score -  0.8640482092486538
ROC-AUC Score -  0.8644257903613416


In [None]:
model1.fit(X_train_w2v, Y_train)

In [None]:
# LinearSVC model result with Word2Vec embeddings
Y_pred_w2v = model1.predict(X_test_w2v)
Y_pred_w2v_proba = model1.predict_proba(X_test_w2v)[:,1]
print ('Accuracy Score - ', accuracy_score(Y_test, Y_pred_w2v))
print ('ROC-AUC Score - ', roc_auc_score(Y_test, Y_pred_w2v))

In [None]:
model1.fit(X_train_counts, Y_train)

In [None]:
# LinearSVC model result with Count Vectorizer
Y_pred_counts = model1.predict(X_test_counts)
Y_pred_counts_proba = model1.predict_proba(X_test_counts)[:,1]
print ('Accuracy Score - ', accuracy_score(Y_test, Y_pred_counts))
print ('ROC-AUC Score - ', roc_auc_score(Y_test, Y_pred_counts))

Accuracy Score -  0.8630652192495085
ROC-AUC Score -  0.8631757940614367


In [None]:
# Random Forest Classifier
from sklearn.ensemble import RandomForestClassifier

model2 = RandomForestClassifier(random_state=42, n_jobs=-1, n_estimators=100)

In [None]:
model2.fit(X_train_tf, Y_train)

In [None]:
# Random Forest Classifier model result with tfidf victorization
Y_pred_tf = model2.predict(X_test_tf)
Y_pred_tf_proba = model2.predict_proba(X_test_tf)[:,1]
print ('Accuracy Score - ', accuracy_score(Y_test, Y_pred_tf))
print ('ROC-AUC Score - ', roc_auc_score(Y_test, Y_pred_tf))

Accuracy Score -  0.8575519275151723
ROC-AUC Score -  0.8579742569121706


In [None]:
model2.fit(X_train_w2v, Y_train)

In [None]:
# Random Forest Classifier model result with Word2Vec embeddings
Y_pred_w2v = model2.predict(X_test_w2v)
Y_pred_w2v_proba = model2.predict_proba(X_test_w2v)[:,1]
print ('Accuracy Score - ', accuracy_score(Y_test, Y_pred_w2v))
print ('ROC-AUC Score - ', roc_auc_score(Y_test, Y_pred_w2v))

Accuracy Score -  0.8400290623130182
ROC-AUC Score -  0.8405378078282758


In [None]:
model2.fit(X_train_counts, Y_train)

In [None]:
# Random Forest Classifier model result with Count Vectorizer
Y_pred_counts = model2.predict(X_test_counts)
Y_pred_counts_proba = model2.predict_proba(X_test_counts)[:,1]
print ('Accuracy Score - ', accuracy_score(Y_test, Y_pred_counts))
print ('ROC-AUC Score - ', roc_auc_score(Y_test, Y_pred_counts))

Accuracy Score -  0.8599025557739978
ROC-AUC Score -  0.8600761996686104
