# Apple Tweets Preprocessing

In [139]:
from pymongo import MongoClient
from pprint import pprint
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
sns.set_style("whitegrid", {'axes.grid' : False})
import datetime as dt
import random
from collections import Counter
from scipy import sparse

import pickle
import os
import re
import string
import spacy
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tag import pos_tag
from nltk.chunk import ne_chunk
from nltk.tokenize import MWETokenizer
import gensim
import emoji
from spellchecker import SpellChecker
from textblob import TextBlob
from langdetect import detect

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction import text 
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import NMF
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import pairwise_distances
from sklearn.manifold import TSNE

import scattertext as st
from biterm.utility import vec_to_biterms
from biterm.btm import oBTM
from corextopic import corextopic as ct
from corextopic import vis_topic as vt

from helper_functions import display_topics

## Load tweet data

In [360]:
# set up client instance
client = MongoClient()

In [11]:
# set up db instance
db = client.customersupport

In [17]:
# check collections in db
db.list_collection_names()

['tweets']

In [13]:
# check out one tweet
cursor = db.tweets.find({}, {'_id':0}).limit(1)
list(cursor)

[{'tweet_id': 1,
  'author_id': 'sprintcare',
  'inbound': 'False',
  'created_at': 'Tue Oct 31 22:10:47 +0000 2017',
  'text': '@115712 I understand. I would like to assist you. We would need to get you into a private secured link to further assist.',
  'response_tweet_id': 2,
  'in_response_to_tweet_id': 3}]

In [108]:
# load collection into dataframe 
cursor = db.tweets.find()
df = pd.DataFrame(list(cursor))
df.to_pickle('customer_tweets.pkl')

In [87]:
# read pickle
df = pd.read_pickle('customer_tweets.pkl')

In [155]:
# shape of df
df.shape

(2811774, 8)

In [23]:
# basic info about the df
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2811774 entries, 0 to 2811773
Data columns (total 8 columns):
_id                        object
tweet_id                   int64
author_id                  object
inbound                    object
created_at                 object
text                       object
response_tweet_id          object
in_response_to_tweet_id    object
dtypes: int64(1), object(7)
memory usage: 171.6+ MB


In [231]:
# check out a few sample records
df.head(3)

Unnamed: 0,_id,tweet_id,author_id,inbound,created_at,text,response_tweet_id,in_response_to_tweet_id
0,5ebc7ad2507a19aa9e7ccae9,1,sprintcare,False,Tue Oct 31 22:10:47 +0000 2017,@115712 I understand. I would like to assist y...,2,3
1,5ebc7ad2507a19aa9e7ccaea,5,115712,True,Tue Oct 31 21:49:35 +0000 2017,@sprintcare I did.,4,6
2,5ebc7ad2507a19aa9e7ccaeb,6,sprintcare,False,Tue Oct 31 21:46:24 +0000 2017,@115712 Can you please send us a private messa...,57,8


In [27]:
# how many unique author ids are there
len(df.author_id.unique())

702777

In [28]:
# how many tweets from each author id?
df.author_id.value_counts()

AmazonHelp      169840
AppleSupport    106860
Uber_Support     56270
SpotifyCares     43265
Delta            42253
                 ...  
403265               1
403266               1
640356               1
640354               1
746645               1
Name: author_id, Length: 702777, dtype: int64

In [88]:
# drop column _id
df = df.drop(['_id'], axis=1)

In [89]:
# let's focus on tweets to and from apple support only
df = df[(df.author_id == 'AppleSupport') | (df.text.str.contains('@applesupport', na=False, flags=re.IGNORECASE, regex=True))]

In [234]:
df.head(3)

Unnamed: 0,tweet_id,author_id,inbound,created_at,text,response_tweet_id,in_response_to_tweet_id
396,696,AppleSupport,False,Tue Oct 31 22:27:49 +0000 2017,@115854 We're here for you. Which version of t...,697.0,698
397,697,115854,True,Tue Oct 31 22:31:23 +0000 2017,@AppleSupport The newest update. I️ made sure ...,699.0,696
398,699,AppleSupport,False,Tue Oct 31 22:36:27 +0000 2017,@115854 Lets take a closer look into this issu...,,697


In [166]:
# how many unique users who tweeted to apple support?
len(df.author_id.unique())

58583

In [113]:
# sample tweet and its replies between a user and apple support
df[(df.author_id == 115854) | (df.tweet_id.isin([696, 699]))].sort_values(by='created_at')

Unnamed: 0,tweet_id,author_id,inbound,created_at,text,response_tweet_id,in_response_to_tweet_id
400,700,115854,True,Tue Oct 31 22:16:56 +0000 2017,@AppleSupport why are my I️’s changing not sho...,698.0,
399,698,115854,True,Tue Oct 31 22:17:40 +0000 2017,@AppleSupport https://t.co/NV0yucs0lB,696.0,700.0
396,696,AppleSupport,False,Tue Oct 31 22:27:49 +0000 2017,@115854 We're here for you. Which version of t...,697.0,698.0
397,697,115854,True,Tue Oct 31 22:31:23 +0000 2017,@AppleSupport The newest update. I️ made sure ...,699.0,696.0
398,699,AppleSupport,False,Tue Oct 31 22:36:27 +0000 2017,@115854 Lets take a closer look into this issu...,,697.0


In [90]:
# remove outbound messages that are not from apple support 
df = df[~((df.inbound == 'False') & (df.author_id != 'AppleSupport'))]

## Cleaning

### Clean up datetime column and add date only column

In [513]:
# check the current format of created at date
df.created_at[400]

'Tue Oct 31 22:16:56 +0000 2017'

In [91]:
# convert created at column to datetime type
df['created_at'] = pd.to_datetime(df['created_at'], format='%a %b %d %H:%M:%S +0000 %Y')

# add date only column
df['date_only'] = df['created_at'].dt.normalize()

### Clean up text column

In [92]:
# fix word lengthening, such as the word 'amazingggggg'
def reduce_lengthening(text):
    pattern = re.compile(r"(.)\1{2,}")
    return pattern.sub(r"\1\1", text)

df['text_clean'] = df.text.apply(lambda x: reduce_lengthening(x))

# lower case text
df.text_clean = df.text_clean.str.lower()

# remove punctuation
punc = (lambda x: re.sub('[%s]' % re.escape(string.punctuation), ' ', str(x))) 
df.text_clean = df.text_clean.map(punc)

# remove curly open and closing quotes (for both single and double quotes)
# single curly open quote
df.text_clean = df.text_clean.map(lambda x: re.sub("‘", ' ', str(x)))
# single curly closing quote
df.text_clean = df.text_clean.map(lambda x: re.sub("’", ' ', str(x)))
# double curly open quote
df.text_clean = df.text_clean.map(lambda x: re.sub("“", ' ', str(x)))
# double curly closing quote
df.text_clean = df.text_clean.map(lambda x: re.sub("”", ' ', str(x)))

# remove numbers
num = (lambda x: re.sub('\w*\d\w*', ' ', str(x)))
df.text_clean = df.text_clean.map(num)

In [94]:
# convert slang / abbreviated phrases to words, such as brb to be right back    
chat_words_map_dict = {}
chat_words_list = []
with open('chat_words_str.txt', 'r') as file:
    chat_words_str = file.read()
for line in chat_words_str.split("\n"):
    if line != "":
        cw = line.split("=")[0]
        cw_expanded = line.split("=")[1]
        chat_words_list.append(cw)
        chat_words_map_dict[cw] = cw_expanded
chat_words_list = set(chat_words_list)

def chat_words_conversion(text):
    new_text = []
    for w in text.split():
        if w.upper() in chat_words_list:
            new_text.append(chat_words_map_dict[w.upper()])
        else:
            new_text.append(w)
    return " ".join(new_text)

df.text_clean = df.text_clean.apply(lambda x: chat_words_conversion(x))

In [95]:
# remove stop words
stop = stopwords.words('english')
df.text_clean = df.text_clean.apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))

In [123]:
df.head(5)

Unnamed: 0,tweet_id,author_id,inbound,created_at,text,response_tweet_id,in_response_to_tweet_id,date_only
396,696,AppleSupport,False,2017-10-31 22:27:49,version ios running check settings gt general gt,697.0,698.0,2017-10-31
397,697,115854,True,2017-10-31 22:31:23,applesupport newest update i️ made sure downlo...,699.0,696.0,2017-10-31
398,699,AppleSupport,False,2017-10-31 22:36:27,lets take closer look issue select following l...,,697.0,2017-10-31
399,698,115854,True,2017-10-31 22:17:40,applesupport https co,696.0,700.0,2017-10-31
400,700,115854,True,2017-10-31 22:16:56,applesupport i️ changing showing correctly soc...,698.0,,2017-10-31


In [99]:
# most frequently occurring words 
word_counter = Counter()
for text in df.text_clean.values:
    for word in text.split():
        word_counter[word] += 1
        
word_counter.most_common(10)

[('co', 101956),
 ('https', 101944),
 ('applesupport', 98885),
 ('us', 71483),
 ('dm', 56712),
 ('help', 46891),
 ('let', 34422),
 ('ios', 34388),
 ('iphone', 32952),
 ('update', 24023)]

In [100]:
# least frequently occurring words 
word_counter = Counter()
for text in df.text_clean.values:
    for word in text.split():
        word_counter[word] += 1
        
word_counter.most_common()[:-10-1:-1]

[('stopmakingnewphonesuntillyoulearntomaketheoldonerunrite', 1),
 ('tyouguysgetitright', 1),
 ('whycan', 1),
 ('🤬😡🤬😡🤬', 1),
 ('cqngyqnslz', 1),
 ('🙎🏾\u200d♂️', 1),
 ('unsaving', 1),
 ('quickquestion', 1),
 ('dzxhazufio', 1),
 ('censoring', 1)]

In [101]:
# lemmatize
lemmatizer = WordNetLemmatizer()
def lemmatize_words(text):
    return " ".join([lemmatizer.lemmatize(word) for word in text.split()])
df.text_clean = df.text_clean.apply(lambda text: lemmatize_words(text))

# remove emoji 
def give_emoji_free_text(text):
    allchars = [str for str in text] 
    emoji_list = [c for c in allchars if c in emoji.UNICODE_EMOJI]
    clean_text = ' '.join([str for str in text.split() if not any(i in str for i in emoji_list)]) 
    return clean_text
df.text_clean = df.text_clean.apply(lambda x: give_emoji_free_text(x))

# remove urls
def remove_urls(text):
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    return url_pattern.sub(r'', text)
df.text_clean = df.text_clean.apply(lambda x: remove_urls(x))

In [102]:
df.text_clean.sample(10)

2733943    applesupport second iphone x order said shippe...
1840266    happy look issue dm u detail seeing box amp qu...
1217157    thanks letting u know like look let u know app...
66377                   thanks let look send u dm go http co
2444894    applesupport find video app erased mistake ava...
2201833    applesupport happened tried install update itu...
244953                   help contact device currently using
1034961    offer support via twitter english contact u he...
1710261    control center disconnect bluetooth temporaril...
1872896                   applesupport minuet fkdkxkkxkzkznz
Name: text_clean, dtype: object

In [103]:
len(df.text_clean)

204756

In [None]:
# correct spelling using text blob
# for tweet in sample_text:
#     # TextBlob is providing correct method
#     sample_text = TextBlob(tweet).correct()

In [286]:
# correct spelling using spell checker
# spell = SpellChecker()
# def correct_spellings(text):
#     corrected_text = []
#     misspelled_words = spell.unknown(text.split())
#     for word in text.split():
#         if word in misspelled_words:
#             corrected_text.append(spell.correction(word))
#         else:
#             corrected_text.append(word)
#     return " ".join(corrected_text)
        
# sample_text = sample_text.apply(lambda x: correct_spellings(x))

In [527]:
# any non-english characters?

# -*- coding: utf-8 -*-
# def isEnglish(s):
#     try:
#         s.encode(encoding='utf-8').decode('ascii')
#     except UnicodeDecodeError:
#         return False
#     else:
#         return True

# assert not isEnglish('slabiky, ale liší se podle významu')
# assert isEnglish('English')
# assert not isEnglish('ގެ ފުރަތަމަ ދެ އަކުރު ކަ')
# assert not isEnglish('how about this one : 通 asfަ')
# assert isEnglish('?fd4))45s&')

In [528]:
#df[~df.text_clean.apply(lambda x: isEnglish(x))]

Unnamed: 0,tweet_id,author_id,inbound,created_at,text,response_tweet_id,in_response_to_tweet_id,date_only,text_clean
397,697,115854,True,2017-10-31 22:31:23,@AppleSupport The newest update. I️ made sure ...,699,696,2017-10-31,applesupport newest update i️ made sure downlo...
400,700,115854,True,2017-10-31 22:16:56,@AppleSupport why are my I️’s changing not sho...,698,,2017-10-31,applesupport i️ changing showing correctly soc...
406,707,115855,True,2017-10-31 21:48:51,@AppleSupport I️ have an iPhone 7 Plus and yes...,705,708,2017-10-31,applesupport i️ iphone plus yes i️
408,709,115855,True,2017-10-31 21:34:45,@AppleSupport I️ need answers because it’s ann...,708,710,2017-10-31,applesupport i️ need answer annoying
413,714,115856,True,2017-10-31 22:19:32,Hey @AppleSupport and anyone else who upgraded...,712715,,2017-10-31,hey applesupport anyone else upgraded issue ca...
...,...,...,...,...,...,...,...,...,...
2809832,2986064,691757,True,2017-10-31 21:53:55,@AppleSupport Hi! I saw this page earlier and ...,,2986063,2017-10-31,applesupport hi saw page earlier say op temp w...
2809833,2986065,691757,True,2017-10-31 21:44:59,@AppleSupport Just a little question: Will it ...,2986063,,2017-10-31,applesupport little question hurt macbook pro ...
2810028,2986242,823415,True,2017-10-31 21:43:50,@AppleSupport why is “I️ “ showing up like thi...,2986241,,2017-10-31,applesupport i️ showing like annoying
2811116,2987300,823685,True,2017-11-21 22:10:42,Travelled over an hour to @115858 Store to get...,2987299,,2017-11-21,travelled hour store get repair £ iphonex told...


In [None]:
# def detect_lang(x):   
#     b = TextBlob(x)
#     return b.detect_language()   

# sample_text.apply(lambda x: detect_lang(x))

In [680]:
#lang = detect("hello worlds!")
#text_lang = sample_text.apply(lambda x: detect(x))

In [374]:
# text_lang

1059998    en
1648152    en
376891     en
859531     en
1399583    fr
           ..
2234310    it
2782313    en
490659     en
1446715    en
1765172    en
Name: text_clean, Length: 1000, dtype: object

In [379]:
# text_lang.value_counts()

en    849
fr     73
nl     26
no     13
it      8
af      8
es      6
da      4
tr      3
ca      2
et      2
tl      2
pt      2
hr      1
cy      1
Name: text_clean, dtype: int64

In [None]:
# df['text_lang'] = df.text_clean.apply(lambda x: detect(x))

In [110]:
# pickle dataframe
df.to_pickle('tweet_clean.pkl')

## Create a dataframe that keeps each tweet or reply as a document

In [140]:
# read pickle
df = pd.read_pickle('tweet_clean.pkl')

In [141]:
# create copy of df
df_all = df.copy()

In [142]:
# head of df all
df_all.head()

Unnamed: 0,tweet_id,author_id,inbound,created_at,text,response_tweet_id,in_response_to_tweet_id,date_only,text_clean
396,696,AppleSupport,False,2017-10-31 22:27:49,@115854 We're here for you. Which version of t...,697.0,698.0,2017-10-31,version io running check setting gt general gt
397,697,115854,True,2017-10-31 22:31:23,@AppleSupport The newest update. I️ made sure ...,699.0,696.0,2017-10-31,applesupport newest update i️ made sure downlo...
398,699,AppleSupport,False,2017-10-31 22:36:27,@115854 Lets take a closer look into this issu...,,697.0,2017-10-31,let take closer look issue select following li...
399,698,115854,True,2017-10-31 22:17:40,@AppleSupport https://t.co/NV0yucs0lB,696.0,700.0,2017-10-31,applesupport http co
400,700,115854,True,2017-10-31 22:16:56,@AppleSupport why are my I️’s changing not sho...,698.0,,2017-10-31,applesupport i️ changing showing correctly soc...


In [143]:
# vectorize text data using tf-idf vectorizer
my_additional_stop_words = ['apple', 'applesupport', 'want', 'hey', 'hi', 'hello', 'http', 'thank', 'thanks', 'ok']
my_stop_words = text.ENGLISH_STOP_WORDS.union(my_additional_stop_words)

tfidf = TfidfVectorizer(stop_words=my_stop_words, max_df=0.10, min_df=0.01) #ngram_range=(1,2)
doc_word = tfidf.fit_transform(df_all.text_clean)

df_all_vec = pd.DataFrame(doc_word.toarray(), index=df_all.text_clean, columns=tfidf.get_feature_names())

In [144]:
# shape of df all vec
df_all_vec.shape

(204756, 103)

In [145]:
# sample of df all vec
df_all_vec.sample(3)

Unnamed: 0_level_0,able,amp,app,apps,article,assist,available,battery,better,change,...,updated,updating,use,using,version,watch,way,work,working,yes
text_clean,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
applesupport make newer io update work well phone iphone old yet io working well,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.421086,0.538276,0.0
annoyed keep autocorrecting i️ applesupport,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
applesupport iphone io,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [146]:
# use NMF to reduce dimensionality to some # of topics
nmf_model = NMF(20)
doc_topic = nmf_model.fit_transform(doc_word)

In [147]:
# topics by words
nmf_model.components_.shape

(20, 103)

In [148]:
# documents by topics matrix
doc_topic.shape

(204756, 20)

In [149]:
# distribution of words by topic
np.unique(doc_topic.argmax(axis=1), return_counts = True)[1]

array([35160,  6329,  9680, 12547,  7040,  9193,  8591,  4791, 10267,
       12077,  7048, 11943, 17329,  8175,  6858,  5424,  6351, 12275,
        6665,  7013])

In [150]:
# topics by words into dataframe
topic_word = pd.DataFrame(nmf_model.components_.round(3),
                         #index = ["component_1", "component_2", "component_3"],
                         columns = tfidf.get_feature_names())
topic_word.head(5)

Unnamed: 0,able,amp,app,apps,article,assist,available,battery,better,change,...,updated,updating,use,using,version,watch,way,work,working,yes
0,0.0,0.0,0.0,0.0,0.0,0.015,0.007,0.0,0.017,0.0,...,0.022,0.02,0.0,0.0,0.032,0.0,0.0,0.0,0.0,0.0
1,0.009,0.004,0.0,0.127,0.0,0.272,0.0,0.0,0.213,0.062,...,0.0,0.0,0.075,0.0,0.152,0.068,0.517,0.0,0.0,0.0
2,0.033,0.182,0.0,0.211,0.0,0.0,0.0,0.0,0.015,0.041,...,0.611,0.061,0.153,0.019,0.0,0.031,0.056,0.0,0.0,0.068
3,0.175,0.019,0.0,0.139,0.0,0.209,0.017,0.0,0.018,0.048,...,0.044,0.48,0.113,5.581,1.194,0.026,0.0,0.0,0.0,0.019
4,0.0,0.085,0.0,0.0,0.0,0.0,0.01,0.0,0.049,0.021,...,0.092,0.485,0.033,0.0,0.0,0.011,0.046,0.0,0.0,0.0


In [151]:
# display top words per topic
display_topics(nmf_model, tfidf.get_feature_names(), 5)


Topic  0
fixed, software, future, need, reach

Topic  1
like, started, meet, experiencing, happening

Topic  2
phone, updated, apps, make, amp

Topic  3
device, using, version, link, model

Topic  4
fix, problem, updating, sure, released

Topic  5
send, country, closer, updated, started

Topic  6
battery, life, important, updated, day

Topic  7
reaching, need, love, support, glad

Topic  8
version, gt, setting, general, running

Topic  9
app, music, apps, store, use

Topic  10
continue, meet, got, link, country

Topic  11
happy, started, reach, assist, experiencing

Topic  12
step, article, check, try, question

Topic  13
support, twitter, english, offer, contact

Topic  14
new, got, problem, plus, amp

Topic  15
work, meet, future, tried, apps

Topic  16
message, getting, country, link, seeing

Topic  17
time, screen, day, problem, type

Topic  18
going, tell, love, experiencing, experience

Topic  19
working, updated, apps, sure, glad


In [152]:
# create dataframe for docs x topics
doc_topic_nmf = pd.DataFrame(doc_topic.round(5),
                             index = df_all.text_clean)
doc_topic_nmf.head(5)

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
text_clean,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
version io running check setting gt general gt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.09218,0.0,0.0,0.0,0.01448,0.0,0.0,0.0,0.0,0.0,0.0,0.0
applesupport newest update i️ made sure download yesterday,0.0,0.0,0.0,0.00578,0.00616,0.00062,0.0,0.0,0.0,0.00042,0.0,0.0,0.00801,0.00027,0.00019,0.0,0.0,0.0,0.0,0.00463
let take closer look issue select following link join u dm go http co,0.0,0.00165,0.0,0.00535,0.0,0.00519,0.00044,2e-05,0.0,0.0,0.00565,0.00134,0.00312,0.02579,0.0,0.00057,0.00508,0.0,0.0,0.0
applesupport http co,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
applesupport i️ changing showing correctly social medium platform http co gyrvpyvnke,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [153]:
# let's look at the cosine similarity between documents 1 and 3 by each component
cosine_similarity((doc_topic_nmf.values[0], doc_topic_nmf.values[2]))

array([[1.        , 0.01719122],
       [0.01719122, 1.        ]])

## Create two dataframes, one from apple support and other from all users

In [154]:
# read pickle
df = pd.read_pickle('tweet_clean.pkl')

In [155]:
# head of dataframe
df.head(5)

Unnamed: 0,tweet_id,author_id,inbound,created_at,text,response_tweet_id,in_response_to_tweet_id,date_only,text_clean
396,696,AppleSupport,False,2017-10-31 22:27:49,@115854 We're here for you. Which version of t...,697.0,698.0,2017-10-31,version io running check setting gt general gt
397,697,115854,True,2017-10-31 22:31:23,@AppleSupport The newest update. I️ made sure ...,699.0,696.0,2017-10-31,applesupport newest update i️ made sure downlo...
398,699,AppleSupport,False,2017-10-31 22:36:27,@115854 Lets take a closer look into this issu...,,697.0,2017-10-31,let take closer look issue select following li...
399,698,115854,True,2017-10-31 22:17:40,@AppleSupport https://t.co/NV0yucs0lB,696.0,700.0,2017-10-31,applesupport http co
400,700,115854,True,2017-10-31 22:16:56,@AppleSupport why are my I️’s changing not sho...,698.0,,2017-10-31,applesupport i️ changing showing correctly soc...


In [156]:
# one dataframe for apple support and other for users
df_apple = df[df.inbound == 'False']

df_users = df[df.inbound == 'True']

In [157]:
# apply tf-idf vectorizer to apple df
my_additional_stop_words = ['apple', 'applesupport', 'want', 'hey', 'hi', 'hello', 'http', 'thank', 'thanks', 'ok']
my_stop_words = text.ENGLISH_STOP_WORDS.union(my_additional_stop_words)

tfidf_apple = TfidfVectorizer(stop_words=my_stop_words, max_df=0.10, min_df=0.005) #ngram_range=(1,2)
doc_word_apple = tfidf_apple.fit_transform(df_apple.text_clean)

df_apple_vec = pd.DataFrame(doc_word_apple.toarray(), index=df_apple.text_clean, columns=tfidf_apple.get_feature_names())

In [158]:
df_apple_vec.shape

(106860, 189)

In [159]:
# use NMF to reduce dimensionality to some # of topics
nmf_model_apple = NMF(20)
doc_topic_apple = nmf_model_apple.fit_transform(doc_word_apple)

In [160]:
# topics by words
nmf_model_apple.components_.shape

(20, 189)

In [161]:
# distribution of words by topic
np.unique(doc_topic_apple.argmax(axis=1), return_counts = True)[1]

array([11121,  2590,  5051,  5613,  4617,  3976, 11989,  4401,  1886,
        3902,  6522,  4053,  3327,  4446,  3020,  4668, 11216,  4417,
        6150,  3895])

In [162]:
# topics by words into dataframe
topic_word_apple = pd.DataFrame(nmf_model_apple.components_.round(3),
                         columns = tfidf_apple.get_feature_names())
topic_word_apple

Unnamed: 0,able,account,additional,address,ahead,amp,answer,app,appreciate,apps,...,understand,updated,updating,use,watch,way,welcome,wi,workaround,working
0,0.001,0.0,0.0,0.003,0.0,0.003,0.0,0.0,0.002,0.004,...,0.001,0.004,0.006,0.004,0.001,0.0,0.012,0.0,0.009,0.003
1,0.0,0.0,0.041,0.0,0.059,0.052,0.062,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.006,0.029,0.0,0.0,0.0,0.0
2,0.019,0.018,0.0,0.007,0.007,0.099,0.009,0.044,0.011,0.047,...,0.006,0.028,0.0,0.014,0.004,0.0,0.0,0.064,0.0,0.059
3,0.001,0.001,0.029,0.009,0.113,0.049,0.015,0.0,0.029,0.0,...,0.005,0.007,0.015,0.0,0.002,0.0,0.0,0.005,0.0,0.833
4,0.019,0.018,0.004,0.036,0.0,0.009,0.0,0.009,0.096,0.009,...,0.023,0.0,0.002,0.02,0.02,0.002,0.012,0.005,1.366,0.177
5,0.017,0.023,0.028,0.0,0.0,0.0,0.006,0.005,0.0,0.0,...,0.015,0.0,0.0,0.004,0.002,0.009,0.0,0.0,0.0,0.0
6,0.232,0.063,0.081,0.163,0.075,0.081,0.038,0.0,0.036,0.011,...,0.063,0.107,0.344,0.156,0.08,0.109,0.037,0.109,0.085,0.113
7,0.0,0.0,0.067,0.004,0.0,0.023,0.0,0.0,0.0,0.0,...,0.0,0.059,0.0,0.001,0.015,0.055,0.0,0.0,0.0,0.167
8,0.0,0.0,0.0,0.0,0.012,0.041,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.018,0.085,0.0,0.0,0.0,0.135
9,0.019,0.0,0.13,0.004,0.138,0.0,0.013,0.0,0.02,0.0,...,0.044,0.0,0.005,0.001,0.017,0.023,0.0,0.0,0.0,0.004


In [163]:
# display top words per topic
display_topics(nmf_model_apple, tfidf_apple.get_feature_names(), 5)


Topic  0
fixed, future, software, need, start

Topic  1
happy, start, today, installed, provide

Topic  2
gt, setting, general, check, installed

Topic  3
continue, working, received, got, letting

Topic  4
reaching, workaround, need, support, specific

Topic  5
support, twitter, english, offer, join

Topic  6
step, article, check, try, question

Topic  7
started, join, specific, current, model

Topic  8
tell, experience, bit, happening, certainly

Topic  9
meet, information, gather, closer, experience

Topic  10
reach, team, glad, need, question

Topic  11
assist, better, info, information, released

Topic  12
country, located, option, message, direct

Topic  13
going, exactly, hear, experience, bit

Topic  14
experiencing, behavior, provide, exactly, glad

Topic  15
link, closer, following, use, message

Topic  16
happening, app, apps, message, music

Topic  17
running, currently, exact, installed, model

Topic  18
love, start, model, updated, support

Topic  19
battery, life, impor

In [164]:
doc_topic_nmf_apple = pd.DataFrame(doc_topic_apple.round(5),
                             index = df_apple.text_clean)
doc_topic_nmf_apple.head(3)

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
text_clean,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
version io running check setting gt general gt,0.0,0.0,0.12859,0.0,0.0,0.0,0.01207,0.0,0.0,0.0,0.0,0.00159,0.0,0.0,0.0,0.00089,0.0,0.04889,0.0,0.0
let take closer look issue select following link join u dm go http co,0.0,0.0,0.0,0.0004,0.0,0.01962,0.00323,0.00338,0.0,0.00137,0.0,0.00313,0.0,0.0,0.00077,0.10539,0.0,0.00077,0.0,0.00117
let go dm next step dm u http co,0.0,0.0,0.0,0.0,0.0,0.0,0.09149,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [165]:
doc_topic_nmf_apple.iloc[1].argmax()

The current behaviour of 'Series.argmax' is deprecated, use 'idxmax'
instead.
The behavior of 'argmax' will be corrected to return the positional
maximum in the future. For now, use 'series.values.argmax' or
'np.argmax(np.array(values))' to get the position of the maximum
row.
  """Entry point for launching an IPython kernel.


15

In [166]:
# let's look at the cosine similarity between documents 1 and 3 by each component
cosine_similarity((doc_topic_nmf_apple.values[0], doc_topic_nmf_apple.values[2]))

array([[1.        , 0.08739353],
       [0.08739353, 1.        ]])

In [167]:
# pairwise_distances(doc_topic_nmf_apple, metric='cosine')[0].argsort()

In [168]:
# apply tf-idf vectorizer to users df
my_additional_stop_words = ['apple', 'applesupport', 'want', 'hey', 'hi', 'hello', 'http', 'thank', 'thanks', 'ok']
my_stop_words = text.ENGLISH_STOP_WORDS.union(my_additional_stop_words)

tfidf_users = TfidfVectorizer(stop_words=my_stop_words, max_df=0.10, min_df=0.005) #ngram_range=(1,2)
doc_word_users = tfidf_users.fit_transform(df_users.text_clean)

df_users_vec = pd.DataFrame(doc_word_users.toarray(), index=df_users.text_clean, columns=tfidf_users.get_feature_names())

In [169]:
df_users_vec.shape

(97896, 241)

In [170]:
df_users_vec.sample(3)

Unnamed: 0_level_0,able,access,account,actually,ago,alarm,amp,android,annoying,answer,...,went,wifi,work,worked,working,worst,wrong,year,yes,yesterday
text_clean,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
applesupport every time type iphone corrects w e tried resetting keyboard dictionary continues please help,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
applesupport i️ question phone http co,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
applesupport isnt way turn auto brightnes,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [171]:
# use NMF to reduce dimensionality to some # of topics
nmf_model_users = NMF(20)
doc_topic_users = nmf_model_users.fit_transform(doc_word_users)

In [172]:
# distribution of words by topic
np.unique(doc_topic_users.argmax(axis=1), return_counts = True)[1]

array([15310,  2912,  3665,  5578,  3134,  2153,  1986,  2648,  3918,
        5078,  2468,  3132,  3613,  2936,  4221,  2899, 21629,  3289,
        2898,  4429])

In [173]:
# topics by words
nmf_model_users.components_.shape

(20, 241)

In [174]:
# topics by words into dataframe
topic_word_users = pd.DataFrame(nmf_model_users.components_.round(3),
                         columns = tfidf_users.get_feature_names())
topic_word_users

Unnamed: 0,able,access,account,actually,ago,alarm,amp,android,annoying,answer,...,went,wifi,work,worked,working,worst,wrong,year,yes,yesterday
0,0.0,0.0,0.0,0.01,0.0,0.015,0.0,0.062,0.335,0.0,...,0.0,0.0,0.0,0.0,0.0,0.016,0.0,0.0,0.0,0.0
1,0.049,0.056,0.0,0.025,0.0,0.003,0.0,0.0,0.006,0.0,...,0.044,0.0,0.0,0.0,0.0,0.0,0.035,0.0,0.0,0.017
2,0.023,0.012,0.011,0.005,0.0,0.008,0.0,0.0,0.0,0.002,...,0.011,0.0,0.0,0.009,0.0,0.0,0.005,0.0,0.0,0.004
3,0.0,0.0,0.0,0.011,0.036,0.0,0.016,0.023,0.0,0.0,...,0.098,0.005,0.0,0.0,0.0,0.078,0.015,0.047,0.0,0.01
4,0.023,0.0,0.0,0.021,0.001,0.008,0.0,0.013,0.023,0.0,...,0.0,0.064,0.0,0.014,0.0,0.0,0.0,0.009,0.0,0.01
5,0.013,0.0,0.0,0.013,0.007,0.007,0.0,0.015,0.001,0.012,...,0.0,0.009,0.0,0.017,0.0,0.018,0.0,0.012,0.0,0.021
6,0.021,0.002,0.001,0.01,0.004,0.001,0.0,0.0,0.0,0.0,...,0.014,0.05,0.0,0.039,0.0,0.0,0.0,0.001,6.01,0.017
7,0.017,0.012,0.0,0.035,0.0,0.102,0.0,0.01,0.0,0.0,...,0.013,0.16,6.144,0.053,0.0,0.021,0.016,0.015,0.0,0.011
8,0.026,0.005,0.0,0.004,0.007,0.019,0.0,0.013,0.019,0.0,...,0.007,0.0,0.0,0.007,0.0,0.021,0.026,0.049,0.0,0.008
9,0.017,0.0,0.0,0.014,0.05,0.051,0.0,0.009,0.041,0.003,...,0.003,0.021,0.0,0.035,0.0,0.035,0.046,0.052,0.0,0.008


In [175]:
# display top words by topic
display_topics(nmf_model_users, tfidf_users.get_feature_names(), 5)


Topic  0
fix, shit, bug, glitch, soon

Topic  1
app, store, open, download, using

Topic  2
help, need, pls, id, trying

Topic  3
battery, life, drain, draining, hour

Topic  4
issue, fixed, software, people, updating

Topic  5
problem, fixed, people, know, solution

Topic  6
yes, tried, restarted, using, version

Topic  7
work, tried, fine, wifi, button

Topic  8
new, old, buy, slow, software

Topic  9
time, day, tried, type, freeze

Topic  10
dm, sent, check, message, reply

Topic  11
updated, latest, version, software, happening

Topic  12
screen, lock, home, black, touch

Topic  13
like, look, sound, hour, shit

Topic  14
letter, question, mark, type, box

Topic  15
working, stop, fine, tried, touch

Topic  16
need, amp, got, message, know

Topic  17
music, song, itunes, play, playing

Topic  18
plus, using, version, latest, running

Topic  19
apps, freezing, freeze, download, crashing


In [176]:
doc_topic_nmf_users = pd.DataFrame(doc_topic_users.round(5),
                             index = df_users.text_clean)
doc_topic_nmf_users

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
text_clean,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
applesupport newest update i️ made sure download yesterday,0.00000,0.00638,0.00045,0.00000,0.00062,0.00054,0.00049,0.00002,0.00097,0.00000,0.00059,0.00260,0.00055,0.00077,0.00000,0.00100,0.00502,0.00477,0.00000,0.01187
applesupport http co,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000
applesupport i️ changing showing correctly social medium platform http co gyrvpyvnke,0.00000,0.00142,0.00065,0.00000,0.00024,0.00022,0.00016,0.00000,0.00014,0.00028,0.00000,0.00078,0.00323,0.00073,0.00359,0.00029,0.00135,0.00182,0.00000,0.00000
applesupport tried resetting setting restarting phone,0.00000,0.00115,0.00119,0.00000,0.00058,0.00091,0.00375,0.00652,0.00000,0.00763,0.00000,0.00061,0.00136,0.00000,0.00000,0.00700,0.01547,0.00000,0.00029,0.00053
applesupport look like http co,0.00000,0.00000,0.00000,0.00000,0.00045,0.00013,0.00000,0.00000,0.00000,0.00000,0.00062,0.00000,0.00000,0.13342,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
applesupport update slack everything seems working well thanks follow issue stuff tomorrow,0.00000,0.00000,0.00000,0.00000,0.11245,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.15205,0.00000,0.00000,0.00000,0.00000
hey applesupport able duplicate file page searched really annoying fix http co cqngyqnslz,0.02906,0.00000,0.00000,0.00046,0.00052,0.00000,0.00009,0.00000,0.00106,0.00060,0.00000,0.00000,0.00025,0.00265,0.00241,0.00000,0.01411,0.00127,0.00000,0.00220
yo applesupport weird glitch w capital i️ attempt make tweet le i️,0.00217,0.00000,0.00012,0.00187,0.00022,0.00006,0.00000,0.00087,0.00229,0.00055,0.00019,0.00020,0.00045,0.00317,0.00830,0.00000,0.01014,0.00007,0.00147,0.00109
fuck applesupport phone keep hanging call showing call failure,0.00000,0.00142,0.00065,0.00000,0.00024,0.00022,0.00016,0.00000,0.00014,0.00028,0.00000,0.00078,0.00323,0.00073,0.00359,0.00029,0.00135,0.00182,0.00000,0.00000


In [177]:
# let's look at the cosine similarity between documents 1 and 3 by each component
cosine_similarity((doc_topic_nmf_users.values[0], doc_topic_nmf_users.values[2]))

array([[1.        , 0.33945407],
       [0.33945407, 1.        ]])

In [178]:
# pairwise_distances(doc_topic, metric='cosine')[0].argsort()

In [179]:
# head of dataframe
df.head()

Unnamed: 0,tweet_id,author_id,inbound,created_at,text,response_tweet_id,in_response_to_tweet_id,date_only,text_clean
396,696,AppleSupport,False,2017-10-31 22:27:49,@115854 We're here for you. Which version of t...,697.0,698.0,2017-10-31,version io running check setting gt general gt
397,697,115854,True,2017-10-31 22:31:23,@AppleSupport The newest update. I️ made sure ...,699.0,696.0,2017-10-31,applesupport newest update i️ made sure downlo...
398,699,AppleSupport,False,2017-10-31 22:36:27,@115854 Lets take a closer look into this issu...,,697.0,2017-10-31,let take closer look issue select following li...
399,698,115854,True,2017-10-31 22:17:40,@AppleSupport https://t.co/NV0yucs0lB,696.0,700.0,2017-10-31,applesupport http co
400,700,115854,True,2017-10-31 22:16:56,@AppleSupport why are my I️’s changing not sho...,698.0,,2017-10-31,applesupport i️ changing showing correctly soc...


## Create a dataframe that combines user tweets from a conversation into one document

In [111]:
# read pickle
df = pd.read_pickle('tweet_clean.pkl')

In [112]:
# shape of dataframe
df.shape

(204756, 9)

In [113]:
# head of dataframe
df.head(3)

Unnamed: 0,tweet_id,author_id,inbound,created_at,text,response_tweet_id,in_response_to_tweet_id,date_only,text_clean
396,696,AppleSupport,False,2017-10-31 22:27:49,@115854 We're here for you. Which version of t...,697.0,698,2017-10-31,version io running check setting gt general gt
397,697,115854,True,2017-10-31 22:31:23,@AppleSupport The newest update. I️ made sure ...,699.0,696,2017-10-31,applesupport newest update i️ made sure downlo...
398,699,AppleSupport,False,2017-10-31 22:36:27,@115854 Lets take a closer look into this issu...,,697,2017-10-31,let take closer look issue select following li...


In [114]:
# how many tweet conversations started by user
df[df.in_response_to_tweet_id == ''].shape

(51658, 9)

In [115]:
# how many unique authors initialized tweet conversation with apple support
len(df[df.in_response_to_tweet_id == ''].author_id.unique())

46959

In [116]:
# copy dataframe as a new one
df_convo_user = df.copy()

In [117]:
# tweets from users only
df_convo_user = df_convo_user[df_convo_user.inbound == 'True']

In [118]:
# aggregate text by user
df_orig_text = df_convo_user.groupby(['author_id'])['text'].apply(' '.join).reset_index()

# get min and max date per user
df_min_max_date = df_convo_user.groupby('author_id').agg({'date_only':['min', 'max']}).reset_index()

# add column names to min and max date
df_min_max_date.columns = ['_'.join(col).strip() for col in df_min_max_date.columns.values]

In [119]:
# aggregate text clean by user
df_convo_user = df_convo_user.groupby(['author_id'])['text_clean'].apply(' '.join).reset_index()

In [120]:
# merge df convo user with aggregated text
df_convo_user = pd.merge(df_convo_user, df_orig_text[['author_id', 'text']], how='left', on='author_id')

In [121]:
# merge df convo user with min and max date
df_convo_user = pd.merge(df_convo_user, df_min_max_date[['author_id_', 'date_only_min', 'date_only_max']], how='left', left_on='author_id', right_on='author_id_')
df_convo_user = df_convo_user.drop(['author_id_'], axis=1)
df_convo_user = df_convo_user[['author_id', 'text', 'text_clean', 'date_only_min', 'date_only_max']]
df_convo_user.head()

Unnamed: 0,author_id,text,text_clean,date_only_min,date_only_max
0,408,"@AppleSupport Uh, weirdness after watchOS 4.1 ...",applesupport uh weirdness watchos update http co,2017-11-02,2017-11-02
1,1437,I'm not sure what the F is happening with iOS ...,sure f happening io trying let battery run lie...,2017-11-17,2017-11-17
2,1501,@AppleSupport all good now thanks. i’ve been a...,applesupport good thanks able send least half ...,2017-11-03,2017-11-03
3,2084,@AppleSupport Just updated iTunes and it said ...,applesupport updated itunes said longer subscr...,2017-10-31,2017-10-31
4,3922,@AppleSupport Pretty sure it started with iOS ...,applesupport pretty sure started io seems happ...,2017-10-07,2017-10-08


In [122]:
# check combined tweets for a user
df_convo_user[df_convo_user.author_id == 115854].text_clean.values

array(['applesupport newest update i️ made sure download yesterday applesupport http co applesupport i️ changing showing correctly social medium platform http co gyrvpyvnke'],
      dtype=object)

In [123]:
# pickle df convo user
df_convo_user.to_pickle('df_convo_user.pkl')