# Apple Tweets Preprocessing

In [3]:
import numpy as np
import pandas as pd
import datetime as dt
import random
from collections import Counter
from scipy import sparse
import pickle
import os
import re
import string
from pymongo import MongoClient
from pprint import pprint

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tag import pos_tag
from nltk.chunk import ne_chunk
from nltk.tokenize import MWETokenizer
import spacy
import gensim
import emoji
from spellchecker import SpellChecker
from textblob import TextBlob
from langdetect import detect

## Load tweet data

In [360]:
# set up client instance
client = MongoClient()

In [11]:
# set up db instance (where the tweet data is pre-stored in mongodb)
db = client.customersupport

In [17]:
# check collections in db
db.list_collection_names()

['tweets']

In [13]:
# check out one tweet
cursor = db.tweets.find({}, {'_id':0}).limit(1)
list(cursor)

[{'tweet_id': 1,
  'author_id': 'sprintcare',
  'inbound': 'False',
  'created_at': 'Tue Oct 31 22:10:47 +0000 2017',
  'text': '@115712 I understand. I would like to assist you. We would need to get you into a private secured link to further assist.',
  'response_tweet_id': 2,
  'in_response_to_tweet_id': 3}]

In [108]:
# load collection into dataframe 
cursor = db.tweets.find()
df = pd.DataFrame(list(cursor))
df.to_pickle('customer_tweets.pkl')

In [87]:
# read pickle
df = pd.read_pickle('customer_tweets.pkl')

In [155]:
# shape of df
df.shape

(2811774, 8)

In [23]:
# basic info about the df
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2811774 entries, 0 to 2811773
Data columns (total 8 columns):
_id                        object
tweet_id                   int64
author_id                  object
inbound                    object
created_at                 object
text                       object
response_tweet_id          object
in_response_to_tweet_id    object
dtypes: int64(1), object(7)
memory usage: 171.6+ MB


In [231]:
# check out a few sample records
df.head(3)

Unnamed: 0,_id,tweet_id,author_id,inbound,created_at,text,response_tweet_id,in_response_to_tweet_id
0,5ebc7ad2507a19aa9e7ccae9,1,sprintcare,False,Tue Oct 31 22:10:47 +0000 2017,@115712 I understand. I would like to assist y...,2,3
1,5ebc7ad2507a19aa9e7ccaea,5,115712,True,Tue Oct 31 21:49:35 +0000 2017,@sprintcare I did.,4,6
2,5ebc7ad2507a19aa9e7ccaeb,6,sprintcare,False,Tue Oct 31 21:46:24 +0000 2017,@115712 Can you please send us a private messa...,57,8


In [27]:
# how many unique author ids are there
len(df.author_id.unique())

702777

In [28]:
# how many tweets from each author id?
df.author_id.value_counts()

AmazonHelp      169840
AppleSupport    106860
Uber_Support     56270
SpotifyCares     43265
Delta            42253
                 ...  
403265               1
403266               1
640356               1
640354               1
746645               1
Name: author_id, Length: 702777, dtype: int64

In [88]:
# drop column _id
df = df.drop(['_id'], axis=1)

In [89]:
# let's focus on tweets to and from apple support only
df = df[(df.author_id == 'AppleSupport') | (df.text.str.contains('@applesupport', na=False, flags=re.IGNORECASE, regex=True))]

In [234]:
df.head(3)

Unnamed: 0,tweet_id,author_id,inbound,created_at,text,response_tweet_id,in_response_to_tweet_id
396,696,AppleSupport,False,Tue Oct 31 22:27:49 +0000 2017,@115854 We're here for you. Which version of t...,697.0,698
397,697,115854,True,Tue Oct 31 22:31:23 +0000 2017,@AppleSupport The newest update. I️ made sure ...,699.0,696
398,699,AppleSupport,False,Tue Oct 31 22:36:27 +0000 2017,@115854 Lets take a closer look into this issu...,,697


In [166]:
# how many unique users who tweeted to apple support?
len(df.author_id.unique())

58583

In [113]:
# sample tweet and its replies between a user and apple support
df[(df.author_id == 115854) | (df.tweet_id.isin([696, 699]))].sort_values(by='created_at')

Unnamed: 0,tweet_id,author_id,inbound,created_at,text,response_tweet_id,in_response_to_tweet_id
400,700,115854,True,Tue Oct 31 22:16:56 +0000 2017,@AppleSupport why are my I️’s changing not sho...,698.0,
399,698,115854,True,Tue Oct 31 22:17:40 +0000 2017,@AppleSupport https://t.co/NV0yucs0lB,696.0,700.0
396,696,AppleSupport,False,Tue Oct 31 22:27:49 +0000 2017,@115854 We're here for you. Which version of t...,697.0,698.0
397,697,115854,True,Tue Oct 31 22:31:23 +0000 2017,@AppleSupport The newest update. I️ made sure ...,699.0,696.0
398,699,AppleSupport,False,Tue Oct 31 22:36:27 +0000 2017,@115854 Lets take a closer look into this issu...,,697.0


In [90]:
# remove outbound messages that are not from apple support 
df = df[~((df.inbound == 'False') & (df.author_id != 'AppleSupport'))]

## Cleaning

### Clean up datetime column and add date only column

In [513]:
# check the current format of created at date
df.created_at[400]

'Tue Oct 31 22:16:56 +0000 2017'

In [91]:
# convert created at column to datetime type
df['created_at'] = pd.to_datetime(df['created_at'], format='%a %b %d %H:%M:%S +0000 %Y')

# add date only column
df['date_only'] = df['created_at'].dt.normalize()

### Clean up text column

In [92]:
# fix word lengthening, such as the word 'amazingggggg'
def reduce_lengthening(text):
    pattern = re.compile(r"(.)\1{2,}")
    return pattern.sub(r"\1\1", text)

df['text_clean'] = df.text.apply(lambda x: reduce_lengthening(x))

# lower case text
df.text_clean = df.text_clean.str.lower()

# remove punctuation
punc = (lambda x: re.sub('[%s]' % re.escape(string.punctuation), ' ', str(x))) 
df.text_clean = df.text_clean.map(punc)

# remove curly open and closing quotes (for both single and double quotes)
# single curly open quote
df.text_clean = df.text_clean.map(lambda x: re.sub("‘", ' ', str(x)))
# single curly closing quote
df.text_clean = df.text_clean.map(lambda x: re.sub("’", ' ', str(x)))
# double curly open quote
df.text_clean = df.text_clean.map(lambda x: re.sub("“", ' ', str(x)))
# double curly closing quote
df.text_clean = df.text_clean.map(lambda x: re.sub("”", ' ', str(x)))

# remove numbers
num = (lambda x: re.sub('\w*\d\w*', ' ', str(x)))
df.text_clean = df.text_clean.map(num)

In [94]:
# convert slang / abbreviated phrases to words, such as brb to be right back    
chat_words_map_dict = {}
chat_words_list = []
with open('chat_words_str.txt', 'r') as file:
    chat_words_str = file.read()
for line in chat_words_str.split("\n"):
    if line != "":
        cw = line.split("=")[0]
        cw_expanded = line.split("=")[1]
        chat_words_list.append(cw)
        chat_words_map_dict[cw] = cw_expanded
chat_words_list = set(chat_words_list)

def chat_words_conversion(text):
    new_text = []
    for w in text.split():
        if w.upper() in chat_words_list:
            new_text.append(chat_words_map_dict[w.upper()])
        else:
            new_text.append(w)
    return " ".join(new_text)

df.text_clean = df.text_clean.apply(lambda x: chat_words_conversion(x))

In [95]:
# remove stop words
stop = stopwords.words('english')
df.text_clean = df.text_clean.apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))

In [123]:
df.head(5)

Unnamed: 0,tweet_id,author_id,inbound,created_at,text,response_tweet_id,in_response_to_tweet_id,date_only
396,696,AppleSupport,False,2017-10-31 22:27:49,version ios running check settings gt general gt,697.0,698.0,2017-10-31
397,697,115854,True,2017-10-31 22:31:23,applesupport newest update i️ made sure downlo...,699.0,696.0,2017-10-31
398,699,AppleSupport,False,2017-10-31 22:36:27,lets take closer look issue select following l...,,697.0,2017-10-31
399,698,115854,True,2017-10-31 22:17:40,applesupport https co,696.0,700.0,2017-10-31
400,700,115854,True,2017-10-31 22:16:56,applesupport i️ changing showing correctly soc...,698.0,,2017-10-31


In [99]:
# most frequently occurring words 
word_counter = Counter()
for text in df.text_clean.values:
    for word in text.split():
        word_counter[word] += 1
        
word_counter.most_common(10)

[('co', 101956),
 ('https', 101944),
 ('applesupport', 98885),
 ('us', 71483),
 ('dm', 56712),
 ('help', 46891),
 ('let', 34422),
 ('ios', 34388),
 ('iphone', 32952),
 ('update', 24023)]

In [100]:
# least frequently occurring words 
word_counter = Counter()
for text in df.text_clean.values:
    for word in text.split():
        word_counter[word] += 1
        
word_counter.most_common()[:-10-1:-1]

[('stopmakingnewphonesuntillyoulearntomaketheoldonerunrite', 1),
 ('tyouguysgetitright', 1),
 ('whycan', 1),
 ('🤬😡🤬😡🤬', 1),
 ('cqngyqnslz', 1),
 ('🙎🏾\u200d♂️', 1),
 ('unsaving', 1),
 ('quickquestion', 1),
 ('dzxhazufio', 1),
 ('censoring', 1)]

In [101]:
# lemmatize
lemmatizer = WordNetLemmatizer()
def lemmatize_words(text):
    return " ".join([lemmatizer.lemmatize(word) for word in text.split()])
df.text_clean = df.text_clean.apply(lambda text: lemmatize_words(text))

# remove emoji 
def give_emoji_free_text(text):
    allchars = [str for str in text] 
    emoji_list = [c for c in allchars if c in emoji.UNICODE_EMOJI]
    clean_text = ' '.join([str for str in text.split() if not any(i in str for i in emoji_list)]) 
    return clean_text
df.text_clean = df.text_clean.apply(lambda x: give_emoji_free_text(x))

# remove urls
def remove_urls(text):
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    return url_pattern.sub(r'', text)
df.text_clean = df.text_clean.apply(lambda x: remove_urls(x))

In [102]:
df.text_clean.sample(10)

2733943    applesupport second iphone x order said shippe...
1840266    happy look issue dm u detail seeing box amp qu...
1217157    thanks letting u know like look let u know app...
66377                   thanks let look send u dm go http co
2444894    applesupport find video app erased mistake ava...
2201833    applesupport happened tried install update itu...
244953                   help contact device currently using
1034961    offer support via twitter english contact u he...
1710261    control center disconnect bluetooth temporaril...
1872896                   applesupport minuet fkdkxkkxkzkznz
Name: text_clean, dtype: object

In [103]:
len(df.text_clean)

204756

In [None]:
# correct spelling using text blob
# for tweet in sample_text:
#     # TextBlob is providing correct method
#     sample_text = TextBlob(tweet).correct()

In [286]:
# correct spelling using spell checker
# spell = SpellChecker()
# def correct_spellings(text):
#     corrected_text = []
#     misspelled_words = spell.unknown(text.split())
#     for word in text.split():
#         if word in misspelled_words:
#             corrected_text.append(spell.correction(word))
#         else:
#             corrected_text.append(word)
#     return " ".join(corrected_text)
        
# sample_text = sample_text.apply(lambda x: correct_spellings(x))

In [527]:
# any non-english characters?

# -*- coding: utf-8 -*-
# def isEnglish(s):
#     try:
#         s.encode(encoding='utf-8').decode('ascii')
#     except UnicodeDecodeError:
#         return False
#     else:
#         return True

# assert not isEnglish('slabiky, ale liší se podle významu')
# assert isEnglish('English')
# assert not isEnglish('ގެ ފުރަތަމަ ދެ އަކުރު ކަ')
# assert not isEnglish('how about this one : 通 asfަ')
# assert isEnglish('?fd4))45s&')

In [528]:
#df[~df.text_clean.apply(lambda x: isEnglish(x))]

Unnamed: 0,tweet_id,author_id,inbound,created_at,text,response_tweet_id,in_response_to_tweet_id,date_only,text_clean
397,697,115854,True,2017-10-31 22:31:23,@AppleSupport The newest update. I️ made sure ...,699,696,2017-10-31,applesupport newest update i️ made sure downlo...
400,700,115854,True,2017-10-31 22:16:56,@AppleSupport why are my I️’s changing not sho...,698,,2017-10-31,applesupport i️ changing showing correctly soc...
406,707,115855,True,2017-10-31 21:48:51,@AppleSupport I️ have an iPhone 7 Plus and yes...,705,708,2017-10-31,applesupport i️ iphone plus yes i️
408,709,115855,True,2017-10-31 21:34:45,@AppleSupport I️ need answers because it’s ann...,708,710,2017-10-31,applesupport i️ need answer annoying
413,714,115856,True,2017-10-31 22:19:32,Hey @AppleSupport and anyone else who upgraded...,712715,,2017-10-31,hey applesupport anyone else upgraded issue ca...
...,...,...,...,...,...,...,...,...,...
2809832,2986064,691757,True,2017-10-31 21:53:55,@AppleSupport Hi! I saw this page earlier and ...,,2986063,2017-10-31,applesupport hi saw page earlier say op temp w...
2809833,2986065,691757,True,2017-10-31 21:44:59,@AppleSupport Just a little question: Will it ...,2986063,,2017-10-31,applesupport little question hurt macbook pro ...
2810028,2986242,823415,True,2017-10-31 21:43:50,@AppleSupport why is “I️ “ showing up like thi...,2986241,,2017-10-31,applesupport i️ showing like annoying
2811116,2987300,823685,True,2017-11-21 22:10:42,Travelled over an hour to @115858 Store to get...,2987299,,2017-11-21,travelled hour store get repair £ iphonex told...


In [None]:
# def detect_lang(x):   
#     b = TextBlob(x)
#     return b.detect_language()   

# sample_text.apply(lambda x: detect_lang(x))

In [680]:
#lang = detect("hello worlds!")
#text_lang = sample_text.apply(lambda x: detect(x))

In [374]:
# text_lang

1059998    en
1648152    en
376891     en
859531     en
1399583    fr
           ..
2234310    it
2782313    en
490659     en
1446715    en
1765172    en
Name: text_clean, Length: 1000, dtype: object

In [379]:
# text_lang.value_counts()

en    849
fr     73
nl     26
no     13
it      8
af      8
es      6
da      4
tr      3
ca      2
et      2
tl      2
pt      2
hr      1
cy      1
Name: text_clean, dtype: int64

In [None]:
# df['text_lang'] = df.text_clean.apply(lambda x: detect(x))

In [110]:
# pickle dataframe for modeling
df.to_pickle('data/tweet_clean.pkl')

## Create a dataframe for user tweets to Apple Support only

In [None]:
# create dataframe for tweets sent to Apple Support only, keeping each tweet as an individual document
df_tweet_user = df.copy()
df_tweet_user = df_tweet_user[df_tweet_user.inbound == 'True']

In [None]:
# pickle dataframe for modeling
df_tweet_user.to_pickle('data/df_tweet_user.pkl')

## Create a dataframe for a user's initial tweet to Apple Support

In [None]:
# create dataframe for a user's initial tweet to Apple Support (i.e., excluding replies)
df_first_tweet_user = df.copy()
df_first_tweet_user = df_first_tweet_user[df_first_tweet_user.inbound == 'True']

In [None]:
# filter dataframe to first tweet per user only
df_first_tweet_user = df_first_tweet_user.loc[df_first_tweet_user.groupby('author_id').
                                              created_at.idxmin()].reset_index(drop=True)

In [None]:
# pickle dataframe for modeling
df_first_tweet_user.to_pickle('data/df_first_tweet_user.pkl')

## Create a dataframe that combines user tweets from a conversation into one document

In [116]:
# create new dataframe where all tweets from a user are combined (tweet conversation treated as a document)
df_convo_user = df.copy()
df_convo_user = df_convo_user[df_convo_user.inbound == 'True']

In [118]:
# aggregate text by user
df_orig_text = df_convo_user.groupby(['author_id'])['text'].apply(' '.join).reset_index()

# get min and max date per user
df_min_max_date = df_convo_user.groupby('author_id').agg({'date_only':['min', 'max']}).reset_index()

# add column names to min and max date
df_min_max_date.columns = ['_'.join(col).strip() for col in df_min_max_date.columns.values]

In [119]:
# aggregate text clean by user
df_convo_user = df_convo_user.groupby(['author_id'])['text_clean'].apply(' '.join).reset_index()

In [120]:
# merge df convo user with aggregated text
df_convo_user = pd.merge(df_convo_user, df_orig_text[['author_id', 'text']], how='left', on='author_id')

In [121]:
# merge df convo user with min and max date
df_convo_user = pd.merge(df_convo_user, df_min_max_date[['author_id_', 'date_only_min', 'date_only_max']], how='left', left_on='author_id', right_on='author_id_')
df_convo_user = df_convo_user.drop(['author_id_'], axis=1)
df_convo_user = df_convo_user[['author_id', 'text', 'text_clean', 'date_only_min', 'date_only_max']]
df_convo_user.head()

Unnamed: 0,author_id,text,text_clean,date_only_min,date_only_max
0,408,"@AppleSupport Uh, weirdness after watchOS 4.1 ...",applesupport uh weirdness watchos update http co,2017-11-02,2017-11-02
1,1437,I'm not sure what the F is happening with iOS ...,sure f happening io trying let battery run lie...,2017-11-17,2017-11-17
2,1501,@AppleSupport all good now thanks. i’ve been a...,applesupport good thanks able send least half ...,2017-11-03,2017-11-03
3,2084,@AppleSupport Just updated iTunes and it said ...,applesupport updated itunes said longer subscr...,2017-10-31,2017-10-31
4,3922,@AppleSupport Pretty sure it started with iOS ...,applesupport pretty sure started io seems happ...,2017-10-07,2017-10-08


In [123]:
# pickle dataframe for modeling
df_convo_user.to_pickle('data/df_convo_user.pkl')