# Apple Tweets Preprocessing

In [53]:
from pymongo import MongoClient
from pprint import pprint
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
sns.set_style("whitegrid", {'axes.grid' : False})
import datetime as dt
import random
from collections import Counter
from scipy import sparse

import pickle
import os
import re
import string
import spacy
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tag import pos_tag
from nltk.chunk import ne_chunk
from nltk.tokenize import MWETokenizer
import gensim
import emoji
from spellchecker import SpellChecker
from textblob import TextBlob
from langdetect import detect

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction import text 
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import NMF
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import pairwise_distances
from sklearn.manifold import TSNE

import scattertext as st
from biterm.utility import vec_to_biterms
from biterm.btm import oBTM
from corextopic import corextopic as ct
from corextopic import vis_topic as vt

from helper_functions import display_topics

## Load tweet data

In [360]:
# set up client instance
client = MongoClient()

In [11]:
# set up db instance
db = client.customersupport

In [17]:
# check collections in db
db.list_collection_names()

['tweets']

In [13]:
# check out one tweet
cursor = db.tweets.find({}, {'_id':0}).limit(1)
list(cursor)

[{'tweet_id': 1,
  'author_id': 'sprintcare',
  'inbound': 'False',
  'created_at': 'Tue Oct 31 22:10:47 +0000 2017',
  'text': '@115712 I understand. I would like to assist you. We would need to get you into a private secured link to further assist.',
  'response_tweet_id': 2,
  'in_response_to_tweet_id': 3}]

In [108]:
# load collection into dataframe 
cursor = db.tweets.find()
df = pd.DataFrame(list(cursor))
df.to_pickle('customer_tweets.pkl')

In [507]:
# read pickle
df = pd.read_pickle('customer_tweets.pkl')

In [155]:
# shape of df
df.shape

(2811774, 8)

In [23]:
# basic info about the df
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2811774 entries, 0 to 2811773
Data columns (total 8 columns):
_id                        object
tweet_id                   int64
author_id                  object
inbound                    object
created_at                 object
text                       object
response_tweet_id          object
in_response_to_tweet_id    object
dtypes: int64(1), object(7)
memory usage: 171.6+ MB


In [231]:
# check out a few sample records
df.head(3)

Unnamed: 0,_id,tweet_id,author_id,inbound,created_at,text,response_tweet_id,in_response_to_tweet_id
0,5ebc7ad2507a19aa9e7ccae9,1,sprintcare,False,Tue Oct 31 22:10:47 +0000 2017,@115712 I understand. I would like to assist y...,2,3
1,5ebc7ad2507a19aa9e7ccaea,5,115712,True,Tue Oct 31 21:49:35 +0000 2017,@sprintcare I did.,4,6
2,5ebc7ad2507a19aa9e7ccaeb,6,sprintcare,False,Tue Oct 31 21:46:24 +0000 2017,@115712 Can you please send us a private messa...,57,8


In [27]:
# how many unique author ids are there
len(df.author_id.unique())

702777

In [28]:
# how many tweets from each author id?
df.author_id.value_counts()

AmazonHelp      169840
AppleSupport    106860
Uber_Support     56270
SpotifyCares     43265
Delta            42253
                 ...  
403265               1
403266               1
640356               1
640354               1
746645               1
Name: author_id, Length: 702777, dtype: int64

In [508]:
# drop column _id
df = df.drop(['_id'], axis=1)

In [510]:
# let's focus on tweets to and from apple support only
df = df[(df.author_id == 'AppleSupport') | (df.text.str.contains('@applesupport', na=False, flags=re.IGNORECASE, regex=True))]

In [234]:
df.head(3)

Unnamed: 0,tweet_id,author_id,inbound,created_at,text,response_tweet_id,in_response_to_tweet_id
396,696,AppleSupport,False,Tue Oct 31 22:27:49 +0000 2017,@115854 We're here for you. Which version of t...,697.0,698
397,697,115854,True,Tue Oct 31 22:31:23 +0000 2017,@AppleSupport The newest update. I️ made sure ...,699.0,696
398,699,AppleSupport,False,Tue Oct 31 22:36:27 +0000 2017,@115854 Lets take a closer look into this issu...,,697


In [166]:
# how many unique users who tweeted to apple support?
len(df.author_id.unique())

58583

In [113]:
# sample tweet and its replies between a user and apple support
df[(df.author_id == 115854) | (df.tweet_id.isin([696, 699]))].sort_values(by='created_at')

Unnamed: 0,tweet_id,author_id,inbound,created_at,text,response_tweet_id,in_response_to_tweet_id
400,700,115854,True,Tue Oct 31 22:16:56 +0000 2017,@AppleSupport why are my I️’s changing not sho...,698.0,
399,698,115854,True,Tue Oct 31 22:17:40 +0000 2017,@AppleSupport https://t.co/NV0yucs0lB,696.0,700.0
396,696,AppleSupport,False,Tue Oct 31 22:27:49 +0000 2017,@115854 We're here for you. Which version of t...,697.0,698.0
397,697,115854,True,Tue Oct 31 22:31:23 +0000 2017,@AppleSupport The newest update. I️ made sure ...,699.0,696.0
398,699,AppleSupport,False,Tue Oct 31 22:36:27 +0000 2017,@115854 Lets take a closer look into this issu...,,697.0


In [511]:
# remove outbound messages that are not from apple support 
df = df[~((df.inbound == 'False') & (df.author_id != 'AppleSupport'))]

## Cleaning

### Clean up datetime column and add date only column

In [513]:
# check the current format of created at date
df.created_at[400]

'Tue Oct 31 22:16:56 +0000 2017'

In [514]:
# convert created at column to datetime type
df['created_at'] = pd.to_datetime(df['created_at'], format='%a %b %d %H:%M:%S +0000 %Y')

# add date only column
df['date_only'] = df['created_at'].dt.normalize()

### Clean up text column

In [515]:
# convert slang / abbreviated phrases to words, such as brb to be right back
chat_words_map_dict = {}
chat_words_list = []
for line in chat_words_str.split("\n"):
    if line != "":
        cw = line.split("=")[0]
        cw_expanded = line.split("=")[1]
        chat_words_list.append(cw)
        chat_words_map_dict[cw] = cw_expanded
chat_words_list = set(chat_words_list)

def chat_words_conversion(text):
    new_text = []
    for w in text.split():
        if w.upper() in chat_words_list:
            new_text.append(chat_words_map_dict[w.upper()])
        else:
            new_text.append(w)
    return " ".join(new_text)

chat_words_str = """
AFAIK=As Far As I Know
AFK=Away From Keyboard
ASAP=As Soon As Possible
ATK=At The Keyboard
ATM=At The Moment
A3=Anytime, Anywhere, Anyplace
BAK=Back At Keyboard
BBL=Be Back Later
BBS=Be Back Soon
BFN=Bye For Now
B4N=Bye For Now
BRB=Be Right Back
BRT=Be Right There
BTW=By The Way
B4=Before
B4N=Bye For Now
CU=See You
CUL8R=See You Later
CYA=See You
FAQ=Frequently Asked Questions
FC=Fingers Crossed
FWIW=For What It's Worth
FYI=For Your Information
GAL=Get A Life
GG=Good Game
GN=Good Night
GMTA=Great Minds Think Alike
GR8=Great!
G9=Genius
IC=I See
ICQ=I Seek you (also a chat program)
ILU=ILU: I Love You
IMHO=In My Honest/Humble Opinion
IMO=In My Opinion
IOW=In Other Words
IRL=In Real Life
KISS=Keep It Simple, Stupid
LDR=Long Distance Relationship
LMAO=Laugh My A.. Off
LOL=Laughing Out Loud
LTNS=Long Time No See
L8R=Later
MTE=My Thoughts Exactly
M8=Mate
NRN=No Reply Necessary
OIC=Oh I See
PITA=Pain In The A..
PRT=Party
PRW=Parents Are Watching
ROFL=Rolling On The Floor Laughing
ROFLOL=Rolling On The Floor Laughing Out Loud
ROTFLMAO=Rolling On The Floor Laughing My A.. Off
SK8=Skate
STATS=Your sex and age
ASL=Age, Sex, Location
THX=Thank You
TTFN=Ta-Ta For Now!
TTYL=Talk To You Later
U=You
U2=You Too
U4E=Yours For Ever
WB=Welcome Back
WTF=What The F...
WTG=Way To Go!
WUF=Where Are You From?
W8=Wait...
7K=Sick:-D Laugher
"""

#chat_words_conversion("one minute BRB")

df['text_clean'] = df.text.apply(lambda x: chat_words_conversion(x))

In [516]:
# fix word lengthening, such as amazingggggg
def reduce_lengthening(text):
    pattern = re.compile(r"(.)\1{2,}")
    return pattern.sub(r"\1\1", text)

df.text_clean = df.text_clean.apply(lambda x: reduce_lengthening(x))

# lower case text
df.text_clean = df.text_clean.str.lower()

# remove punctuation
punc = (lambda x: re.sub('[%s]' % re.escape(string.punctuation), ' ', str(x))) 
df.text_clean = df.text_clean.map(punc)

# remove curly open and closing quotes (for both single and double quotes)
# single curly open quote
df.text_clean = df.text_clean.map(lambda x: re.sub("‘", ' ', str(x)))
# single curly closing quote
df.text_clean = df.text_clean.map(lambda x: re.sub("’", ' ', str(x)))
# double curly open quote
df.text_clean = df.text_clean.map(lambda x: re.sub("“", ' ', str(x)))
# double curly closing quote
df.text_clean = df.text_clean.map(lambda x: re.sub("”", ' ', str(x)))

# remove numbers
num = (lambda x: re.sub('\w*\d\w*', ' ', str(x)))
df.text_clean = df.text_clean.map(num)

# remove stop words
stop = stopwords.words('english')
df.text_clean = df.text_clean.apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))

In [123]:
df.head(5)

Unnamed: 0,tweet_id,author_id,inbound,created_at,text,response_tweet_id,in_response_to_tweet_id,date_only
396,696,AppleSupport,False,2017-10-31 22:27:49,version ios running check settings gt general gt,697.0,698.0,2017-10-31
397,697,115854,True,2017-10-31 22:31:23,applesupport newest update i️ made sure downlo...,699.0,696.0,2017-10-31
398,699,AppleSupport,False,2017-10-31 22:36:27,lets take closer look issue select following l...,,697.0,2017-10-31
399,698,115854,True,2017-10-31 22:17:40,applesupport https co,696.0,700.0,2017-10-31
400,700,115854,True,2017-10-31 22:16:56,applesupport i️ changing showing correctly soc...,698.0,,2017-10-31


In [521]:
# most frequently occurring words 
word_counter = Counter()
for text in df.text_clean.values:
    for word in text.split():
        word_counter[word] += 1
        
word_counter.most_common(10)

[('co', 101956),
 ('https', 101944),
 ('applesupport', 98885),
 ('us', 71483),
 ('dm', 56712),
 ('help', 46891),
 ('let', 34422),
 ('ios', 34388),
 ('iphone', 32952),
 ('update', 24023),
 ('look', 22334),
 ('know', 21671),
 ('please', 21316),
 ('issue', 20687),
 ('get', 19889),
 ('like', 18986),
 ('version', 17826),
 ('phone', 17205),
 ('work', 15970),
 ('thanks', 15911)]

In [523]:
# least frequently occurring words 
word_counter = Counter()
for text in df.text_clean.values:
    for word in text.split():
        word_counter[word] += 1
        
word_counter.most_common()[:-10-1:-1]

[('stopmakingnewphonesuntillyoulearntomaketheoldonerunrite', 1),
 ('tyouguysgetitright', 1),
 ('whycan', 1),
 ('🤬😡🤬😡🤬', 1),
 ('cqngyqnslz', 1),
 ('🙎🏾\u200d♂️', 1),
 ('unsaving', 1),
 ('quickquestion', 1),
 ('dzxhazufio', 1),
 ('censoring', 1)]

In [524]:
# lemmatize
lemmatizer = WordNetLemmatizer()
def lemmatize_words(text):
    return " ".join([lemmatizer.lemmatize(word) for word in text.split()])
df.text_clean = df.text_clean.apply(lambda text: lemmatize_words(text))

# remove emoji 
def give_emoji_free_text(text):
    allchars = [str for str in text] 
    emoji_list = [c for c in allchars if c in emoji.UNICODE_EMOJI]
    clean_text = ' '.join([str for str in text.split() if not any(i in str for i in emoji_list)]) 
    return clean_text
df.text_clean = df.text_clean.apply(lambda x: give_emoji_free_text(x))

# remove urls
def remove_urls(text):
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    return url_pattern.sub(r'', text)
df.text_clean = df.text_clean.apply(lambda x: remove_urls(x))

In [525]:
df.text_clean.sample(10)

1435394    i️ miss freaking keyboard glitch get together ...
379670      brightness iphone keep turning stop applesupport
117464     thanks confirming let go ahead remove case see...
2246217    applesupport saludos buen día al hacer respald...
2118338    applesupport place use play button let know pe...
430129     happen use different set headphone also tell u...
2542070                          applesupport iphone http co
1694155    know important device working properly happy h...
2113356    applesupport hi still see bug even io updated ...
1699579    want help get thing working smoothly send u dm...
Name: text_clean, dtype: object

In [526]:
len(df.text_clean)

204756

In [None]:
# correct spelling using text blob
# for tweet in sample_text:
#     # TextBlob is providing correct method
#     sample_text = TextBlob(tweet).correct()

In [286]:
# correct spelling using spell checker
# spell = SpellChecker()
# def correct_spellings(text):
#     corrected_text = []
#     misspelled_words = spell.unknown(text.split())
#     for word in text.split():
#         if word in misspelled_words:
#             corrected_text.append(spell.correction(word))
#         else:
#             corrected_text.append(word)
#     return " ".join(corrected_text)
        
# sample_text = sample_text.apply(lambda x: correct_spellings(x))

In [527]:
# any non-english characters?

# -*- coding: utf-8 -*-
# def isEnglish(s):
#     try:
#         s.encode(encoding='utf-8').decode('ascii')
#     except UnicodeDecodeError:
#         return False
#     else:
#         return True

# assert not isEnglish('slabiky, ale liší se podle významu')
# assert isEnglish('English')
# assert not isEnglish('ގެ ފުރަތަމަ ދެ އަކުރު ކަ')
# assert not isEnglish('how about this one : 通 asfަ')
# assert isEnglish('?fd4))45s&')

In [528]:
#df[~df.text_clean.apply(lambda x: isEnglish(x))]

Unnamed: 0,tweet_id,author_id,inbound,created_at,text,response_tweet_id,in_response_to_tweet_id,date_only,text_clean
397,697,115854,True,2017-10-31 22:31:23,@AppleSupport The newest update. I️ made sure ...,699,696,2017-10-31,applesupport newest update i️ made sure downlo...
400,700,115854,True,2017-10-31 22:16:56,@AppleSupport why are my I️’s changing not sho...,698,,2017-10-31,applesupport i️ changing showing correctly soc...
406,707,115855,True,2017-10-31 21:48:51,@AppleSupport I️ have an iPhone 7 Plus and yes...,705,708,2017-10-31,applesupport i️ iphone plus yes i️
408,709,115855,True,2017-10-31 21:34:45,@AppleSupport I️ need answers because it’s ann...,708,710,2017-10-31,applesupport i️ need answer annoying
413,714,115856,True,2017-10-31 22:19:32,Hey @AppleSupport and anyone else who upgraded...,712715,,2017-10-31,hey applesupport anyone else upgraded issue ca...
...,...,...,...,...,...,...,...,...,...
2809832,2986064,691757,True,2017-10-31 21:53:55,@AppleSupport Hi! I saw this page earlier and ...,,2986063,2017-10-31,applesupport hi saw page earlier say op temp w...
2809833,2986065,691757,True,2017-10-31 21:44:59,@AppleSupport Just a little question: Will it ...,2986063,,2017-10-31,applesupport little question hurt macbook pro ...
2810028,2986242,823415,True,2017-10-31 21:43:50,@AppleSupport why is “I️ “ showing up like thi...,2986241,,2017-10-31,applesupport i️ showing like annoying
2811116,2987300,823685,True,2017-11-21 22:10:42,Travelled over an hour to @115858 Store to get...,2987299,,2017-11-21,travelled hour store get repair £ iphonex told...


In [None]:
# def detect_lang(x):   
#     b = TextBlob(x)
#     return b.detect_language()   

# sample_text.apply(lambda x: detect_lang(x))

In [680]:
#lang = detect("hello worlds!")
#text_lang = sample_text.apply(lambda x: detect(x))

In [374]:
# text_lang

1059998    en
1648152    en
376891     en
859531     en
1399583    fr
           ..
2234310    it
2782313    en
490659     en
1446715    en
1765172    en
Name: text_clean, Length: 1000, dtype: object

In [379]:
# text_lang.value_counts()

en    849
fr     73
nl     26
no     13
it      8
af      8
es      6
da      4
tr      3
ca      2
et      2
tl      2
pt      2
hr      1
cy      1
Name: text_clean, dtype: int64

In [None]:
# df['text_lang'] = df.text_clean.apply(lambda x: detect(x))

In [530]:
# pickle dataframe
df.to_pickle('tweet_clean.pkl')

## Create a dataframe that keeps each tweet or reply as a document

In [3]:
# read pickle
df = pd.read_pickle('tweet_clean.pkl')

In [415]:
# create copy of df
df_all = df.copy()

In [416]:
# head of df all
df_all.head()

Unnamed: 0,tweet_id,author_id,inbound,created_at,text,response_tweet_id,in_response_to_tweet_id,date_only,text_clean
396,696,AppleSupport,False,2017-10-31 22:27:49,@115854 We're here for you. Which version of t...,697.0,698.0,2017-10-31,version io running check setting gt general gt
397,697,115854,True,2017-10-31 22:31:23,@AppleSupport The newest update. I️ made sure ...,699.0,696.0,2017-10-31,applesupport newest update i️ made sure downlo...
398,699,AppleSupport,False,2017-10-31 22:36:27,@115854 Lets take a closer look into this issu...,,697.0,2017-10-31,let take closer look issue select following li...
399,698,115854,True,2017-10-31 22:17:40,@AppleSupport https://t.co/NV0yucs0lB,696.0,700.0,2017-10-31,applesupport http co
400,700,115854,True,2017-10-31 22:16:56,@AppleSupport why are my I️’s changing not sho...,698.0,,2017-10-31,applesupport i️ changing showing correctly soc...


In [428]:
# vectorize text data using tf-idf vectorizer
my_additional_stop_words = ['apple', 'applesupport', 'want', 'hey', 'hi', 'hello', 'http', 'thank', 'thanks', 'ok']
my_stop_words = text.ENGLISH_STOP_WORDS.union(my_additional_stop_words)

tfidf = TfidfVectorizer(stop_words=my_stop_words, max_df=0.10, min_df=0.01) #ngram_range=(1,2)
doc_word = tfidf.fit_transform(df_all.text_clean)

df_all_vec = pd.DataFrame(doc_word.toarray(), index=df_all.text_clean, columns=tfidf.get_feature_names())

In [429]:
# shape of df all vec
df_all_vec.shape

(204756, 103)

In [430]:
# sample of df all vec
df_all_vec.sample(3)

Unnamed: 0_level_0,able,amp,app,apps,article,assist,available,battery,better,change,...,updated,updating,use,using,version,watch,way,work,working,yes
text_clean,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
applesupport explain shortage room since day got brand new iphone charger came work,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.393532,0.0,0.0
let look closer link dm u http co,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
let take look together recently updated macos affected apps send u dm http co,0.0,0.0,0.0,0.607011,0.0,0.0,0.0,0.0,0.0,0.0,...,0.62227,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [435]:
# use NMF to reduce dimensionality to some # of topics
nmf_model = NMF(20)
doc_topic = nmf_model.fit_transform(doc_word)

In [436]:
# topics by words
nmf_model.components_.shape

(20, 103)

In [437]:
# documents by topics matrix
doc_topic.shape

(204756, 20)

In [438]:
# distribution of words by topic
np.unique(doc_topic.argmax(axis=1), return_counts = True)[1]

array([35220,  6407,  9879, 12826,  6006,  9474,  8552,  4661,  5754,
       14157,  7828, 12646,  8814,  7902,  6680,  4914, 17345, 12968,
        6046,  6677])

In [440]:
# topics by words into dataframe
topic_word = pd.DataFrame(nmf_model.components_.round(3),
                         #index = ["component_1", "component_2", "component_3"],
                         columns = tfidf.get_feature_names())
topic_word.head(5)

Unnamed: 0,able,amp,app,apps,article,assist,available,battery,better,change,...,updated,updating,use,using,version,watch,way,work,working,yes
0,0.0,0.0,0.0,0.0,0.0,0.014,0.007,0.0,0.017,0.0,...,0.023,0.02,0.0,0.0,0.001,0.0,0.0,0.03,0.0,0.0
1,0.013,0.008,0.0,0.105,0.0,0.26,0.0,0.0,0.207,0.06,...,0.0,0.0,0.077,0.0,0.0,0.064,0.498,0.0,0.0,0.0
2,0.035,0.184,0.0,0.21,0.0,0.0,0.0,0.0,0.015,0.042,...,0.614,0.067,0.156,0.0,0.0,0.031,0.055,0.0,0.0,0.07
3,0.047,0.056,0.0,0.191,0.0,0.0,0.0,0.0,0.0,0.0,...,0.092,0.0,0.0,5.776,5.811,0.064,0.0,0.0,0.0,0.018
4,0.0,0.083,0.0,0.002,0.0,0.0,0.008,0.0,0.047,0.018,...,0.085,0.441,0.029,0.0,0.0,0.012,0.045,0.0,0.0,0.0


In [443]:
# display top words per topic
display_topics(nmf_model, tfidf.get_feature_names(), 5)


Topic  0
fixed, software, future, need, reach

Topic  1
like, started, meet, experiencing, happening

Topic  2
phone, updated, apps, make, amp

Topic  3
version, using, running, currently, tell

Topic  4
fix, problem, updating, sure, released

Topic  5
send, country, message, closer, updated

Topic  6
battery, life, important, updated, day

Topic  7
reaching, need, love, support, glad

Topic  8
gt, setting, general, version, check

Topic  9
app, music, apps, store, message

Topic  10
continue, meet, got, country, link

Topic  11
happy, started, reach, assist, experiencing

Topic  12
device, released, updating, sure, experiencing

Topic  13
support, twitter, english, offer, contact

Topic  14
new, got, problem, plus, amp

Topic  15
work, meet, future, tried, apps

Topic  16
step, article, check, try, question

Topic  17
time, screen, day, problem, type

Topic  18
going, tell, love, experiencing, experience

Topic  19
working, updated, apps, glad, amp


In [442]:
# create dataframe for docs x topics
doc_topic_nmf = pd.DataFrame(doc_topic.round(5),
                             index = df_all.text_clean)
doc_topic_nmf.head(5)

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
text_clean,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
version io running check setting gt general gt,0.0,0.0,0.0,0.01028,0.0,0.0,0.00039,0.0,0.10493,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.01156,0.0,0.0,0.0
applesupport newest update i️ made sure download yesterday,0.0,0.0,0.0,0.0,0.00551,0.00052,6e-05,0.0,0.00112,0.00054,0.0,0.0,0.01104,0.00036,0.0001,0.0,0.00616,0.0,0.0,0.00379
let take closer look issue select following link join u dm go http co,0.0,0.00169,0.0,0.00668,0.0,0.00522,0.00034,0.0,0.0,0.0,0.00616,0.001,0.0,0.02588,0.0,0.00052,0.00481,0.0,0.0,0.0
applesupport http co,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
applesupport i️ changing showing correctly social medium platform http co gyrvpyvnke,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [550]:
# let's look at the cosine similarity between documents 1 and 3 by each component
cosine_similarity((doc_topic_nmf.values[0], doc_topic_nmf.values[2]))

array([[1.        , 0.97219565],
       [0.97219565, 1.        ]])

## Create two dataframes, one from apple support and other from all users

In [3]:
# read pickle
df = pd.read_pickle('tweet_clean.pkl')

In [3]:
# head of dataframe
df.head(5)

Unnamed: 0,tweet_id,author_id,inbound,created_at,text,response_tweet_id,in_response_to_tweet_id,date_only,text_clean
396,696,AppleSupport,False,2017-10-31 22:27:49,@115854 We're here for you. Which version of t...,697.0,698.0,2017-10-31,version io running check setting gt general gt
397,697,115854,True,2017-10-31 22:31:23,@AppleSupport The newest update. I️ made sure ...,699.0,696.0,2017-10-31,applesupport newest update i️ made sure downlo...
398,699,AppleSupport,False,2017-10-31 22:36:27,@115854 Lets take a closer look into this issu...,,697.0,2017-10-31,let take closer look issue select following li...
399,698,115854,True,2017-10-31 22:17:40,@AppleSupport https://t.co/NV0yucs0lB,696.0,700.0,2017-10-31,applesupport http co
400,700,115854,True,2017-10-31 22:16:56,@AppleSupport why are my I️’s changing not sho...,698.0,,2017-10-31,applesupport i️ changing showing correctly soc...


In [52]:
# one dataframe for apple support and other for users
df_apple = df[df.inbound == 'False']

df_users = df[df.inbound == 'True']

In [229]:
# apply tf-idf vectorizer to apple df
my_additional_stop_words = ['apple', 'applesupport', 'want', 'hey', 'hi', 'hello', 'http', 'thank', 'thanks', 'ok']
my_stop_words = text.ENGLISH_STOP_WORDS.union(my_additional_stop_words)

tfidf_apple = TfidfVectorizer(stop_words=my_stop_words, max_df=0.10, min_df=0.005) #ngram_range=(1,2)
doc_word_apple = tfidf_apple.fit_transform(df_apple.text_clean)

df_apple_vec = pd.DataFrame(doc_word_apple.toarray(), index=df_apple.text_clean, columns=tfidf_apple.get_feature_names())

In [230]:
df_apple_vec.shape

(106860, 189)

In [231]:
# use NMF to reduce dimensionality to some # of topics
nmf_model_apple = NMF(20)
doc_topic_apple = nmf_model_apple.fit_transform(doc_word_apple)

In [232]:
# topics by words
nmf_model_apple.components_.shape

(20, 189)

In [233]:
# distribution of words by topic
np.unique(doc_topic_apple.argmax(axis=1), return_counts = True)[1]

array([11105,  2611,  5179,  5978,  4713,  4088, 11236,  4622,  4785,
        4202,  6856,  3675,  4259,  3466,  2316,  2355,  4621,  6321,
        7649,  6823])

In [220]:
# topics by words into dataframe
topic_word_apple = pd.DataFrame(nmf_model_apple.components_.round(3),
                         columns = tfidf_apple.get_feature_names())
topic_word_apple

Unnamed: 0,account,additional,address,ahead,answer,appreciate,autocorrect,begin,bit,bluetooth,...,tap,thing,today,touch,trouble,troubleshooting,trying,turn,watch,welcome
0,0.0,0.0,0.186,0.0,0.0,0.001,4.518,0.0,0.0,0.0,...,0.041,0.0,0.0,0.004,0.005,0.0,0.0,0.007,0.007,0.0
1,0.01,0.0,0.0,0.0,0.172,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.037,0.0,0.0,0.0,0.036,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.006,4.63,0.0,0.045,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.002,0.072,0.0,0.004,0.002,0.027,0.0
4,0.006,0.0,0.018,0.0,0.0,0.012,0.0,0.0,0.0,0.005,...,0.01,0.0,0.0,0.005,0.008,0.0,0.011,0.004,0.0,0.0
5,0.0,0.0,0.0,0.0,0.031,0.059,0.009,0.0,0.0,0.0,...,0.008,0.0,0.0,0.003,0.023,0.0,0.002,0.009,0.038,0.0
6,0.03,0.0,0.0,0.0,0.0,0.064,0.0,0.0,0.0,0.011,...,0.0,0.0,0.0,0.0,0.07,11.865,0.074,0.019,0.0,0.0
7,0.014,0.0,0.052,0.0,0.0,0.022,0.0,0.0,0.0,0.0,...,0.205,0.0,0.0,0.002,0.058,0.0,0.187,0.005,0.002,0.0
8,0.078,6.546,0.0,0.0,0.044,0.106,0.0,0.0,0.0,0.0,...,0.014,0.0,0.0,0.018,0.034,0.0,0.06,0.03,0.009,0.0
9,0.024,0.0,0.0,0.0,0.016,0.027,0.0,0.0,0.0,0.0,...,0.0,0.0,0.002,0.0,0.008,0.0,0.0,0.0,0.061,0.0


In [234]:
# display top words per topic
display_topics(nmf_model_apple, tfidf_apple.get_feature_names(), 5)


Topic  0
fixed, future, software, need, glad

Topic  1
happy, start, today, provide, installed

Topic  2
gt, setting, general, check, installed

Topic  3
continue, working, received, got, letting

Topic  4
reaching, workaround, need, support, specific

Topic  5
support, twitter, english, offer, join

Topic  6
step, try, tried, follow, article

Topic  7
started, specific, join, current, model

Topic  8
tell, happening, app, apps, experience

Topic  9
meet, information, gather, experience, option

Topic  10
reach, team, glad, need, store

Topic  11
assist, better, info, released, information

Topic  12
country, located, message, option, direct

Topic  13
going, exactly, hear, experience, bit

Topic  14
experiencing, behavior, provide, tell, glad

Topic  15
closer, current, join, message, got

Topic  16
link, following, use, message, join

Topic  17
check, article, question, info, great

Topic  18
running, battery, currently, life, exact

Topic  19
love, start, model, updated, support


In [235]:
doc_topic_nmf_apple = pd.DataFrame(doc_topic_apple.round(5),
                             index = df_apple.text_clean)
doc_topic_nmf_apple.head(3)

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
text_clean,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
version io running check setting gt general gt,0.0,0.00000,0.12615,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.03200,0.03700,0.00000
let take closer look issue select following link join u dm go http co,0.0,0.00000,0.00000,0.00019,0.00000,0.01911,0.00269,0.00228,0.00000,0.00000,0.00000,0.00379,0.00000,0.00000,0.00022,0.08189,0.09393,0.00143,0.00000,0.00000
let go dm next step dm u http co,0.0,0.00000,0.00000,0.00000,0.00000,0.00000,0.12345,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000
step tried since started last night,0.0,0.00000,0.00000,0.00000,0.00000,0.00000,0.08201,0.08627,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000
great io rule outdated step tried since started recall started,0.0,0.00000,0.00000,0.00000,0.00000,0.00000,0.05435,0.11493,0.00000,0.00000,0.00126,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00256,0.00000,0.00000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
happy help concern begin made sure apps date via app store gt update tab slower performance happen specific pattern send u dm work together http co,0.0,0.01425,0.03173,0.00001,0.00165,0.00039,0.00833,0.00203,0.00809,0.00096,0.00571,0.00121,0.00000,0.00000,0.00103,0.00073,0.00087,0.00000,0.00654,0.00487
certainly glad get pointed right direction please reach dm let u know ipad model using version io installed well country tweeting http co,0.0,0.00047,0.00109,0.00000,0.00000,0.00000,0.00124,0.00262,0.00219,0.00069,0.05106,0.00002,0.03770,0.00171,0.00062,0.00214,0.00032,0.00000,0.01441,0.00579
love help device using latest version io include fix unexpected result auto correction typing chance advise backing updating guide help http co ahjigcvfrg,0.0,0.00000,0.00012,0.00000,0.00000,0.00000,0.01046,0.00000,0.00000,0.00000,0.00000,0.00375,0.00000,0.00000,0.00108,0.00000,0.00000,0.00143,0.00000,0.06329
like help send u dm start http co,0.0,0.00524,0.00000,0.00000,0.00000,0.00000,0.00428,0.00000,0.00000,0.00000,0.00000,0.00000,0.00511,0.00000,0.00150,0.00033,0.00000,0.00000,0.00453,0.05456


In [65]:
doc_topic_nmf_apple.iloc[1].argmax()

The current behaviour of 'Series.argmax' is deprecated, use 'idxmax'
instead.
The behavior of 'argmax' will be corrected to return the positional
maximum in the future. For now, use 'series.values.argmax' or
'np.argmax(np.array(values))' to get the position of the maximum
row.
  """Entry point for launching an IPython kernel.


7

In [20]:
# let's look at the cosine similarity between documents 1 and 3 by each component
cosine_similarity((doc_topic_nmf_apple.values[0], doc_topic_nmf_apple.values[2]))

array([[1.       , 0.0080298],
       [0.0080298, 1.       ]])

In [None]:
# pairwise_distances(doc_topic_nmf_apple, metric='cosine')[0].argsort()

In [238]:
# apply tf-idf vectorizer to users df
my_additional_stop_words = ['apple', 'applesupport', 'want', 'hey', 'hi', 'hello', 'http', 'thank', 'thanks', 'ok']
my_stop_words = text.ENGLISH_STOP_WORDS.union(my_additional_stop_words)

tfidf_users = TfidfVectorizer(stop_words=my_stop_words, max_df=0.10, min_df=0.005) #ngram_range=(1,2)
doc_word_users = tfidf_users.fit_transform(df_users.text_clean)

df_users_vec = pd.DataFrame(doc_word_users.toarray(), index=df_users.text_clean, columns=tfidf_users.get_feature_names())

In [239]:
df_users_vec.shape

(97896, 241)

In [106]:
df_users_vec.sample(3)

Unnamed: 0_level_0,able,account,amp,annoying,app,apple,apps,battery,bluetooth,box,...,using,version,want,watch,way,week,wifi,work,working,yes
text_clean,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
good user experience iphonex applesupport http co,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
applesupport open apps phone namely safari iheartradio twitter freeze close happens almost every time,0.0,0.0,0.0,0.0,0.0,0.0,0.395439,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
applesupport,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [240]:
# use NMF to reduce dimensionality to some # of topics
nmf_model_users = NMF(20)
doc_topic_users = nmf_model_users.fit_transform(doc_word_users)

In [241]:
# distribution of words by topic
np.unique(doc_topic_users.argmax(axis=1), return_counts = True)[1]

array([15481,  3880,  3340,  6419,  3099,  2210,  2023,  2646,  4236,
        7361,  3324,  3573,  4970,  3290,  5919,  3920,  4866,  3629,
        8912,  4798])

In [206]:
# topics by words
nmf_model_users.components_.shape

(20, 137)

In [207]:
# topics by words into dataframe
topic_word_users = pd.DataFrame(nmf_model_users.components_.round(3),
                         columns = tfidf_users.get_feature_names())
topic_word_users

Unnamed: 0,access,actually,ago,alarm,android,answer,anymore,auto,automatically,available,...,user,video,volume,weird,went,worked,worst,wrong,year,yesterday
0,0.036,0.255,0.0,0.023,0.196,0.479,0.275,0.018,0.0,0.0,...,0.177,0.0,0.089,0.267,0.056,0.0,0.051,0.111,0.0,0.135
1,0.015,0.023,0.0,0.005,0.01,0.027,0.022,0.024,0.005,0.006,...,0.004,0.0,0.01,0.062,0.027,0.0,0.003,0.003,0.013,0.028
2,0.072,0.0,0.0,0.0,0.0,0.016,0.089,0.0,0.005,0.036,...,0.053,0.0,0.031,0.0,0.0,0.0,0.006,0.0,0.0,0.029
3,0.024,0.003,0.0,0.01,0.0,0.023,0.004,0.012,0.0,0.009,...,0.003,0.0,0.003,0.033,0.035,0.0,0.003,0.0,0.0,0.001
4,0.013,0.012,0.0,0.015,0.043,0.026,0.02,0.017,0.019,0.017,...,0.023,0.0,0.029,0.001,0.0,0.0,0.006,0.083,0.044,0.0
5,0.015,0.024,0.0,0.049,0.025,0.001,0.056,0.0,0.001,0.0,...,0.016,0.0,0.014,0.0,0.0,0.0,0.085,0.005,0.001,0.007
6,0.0,0.063,0.0,0.068,0.041,0.0,0.022,0.01,0.002,0.006,...,0.063,0.0,0.018,0.0,0.083,0.0,0.01,0.003,0.029,0.034
7,0.0,0.012,0.0,0.027,0.009,0.0,0.016,0.0,0.001,0.0,...,0.015,0.0,0.041,0.0,0.076,5.084,0.004,0.008,0.06,0.118
8,0.02,0.135,0.0,0.033,0.22,0.16,0.023,0.013,0.036,0.223,...,0.157,0.0,0.0,0.018,0.211,0.0,0.098,0.03,0.02,0.058
9,0.067,0.042,0.0,0.0,0.036,0.257,0.02,0.004,0.004,0.044,...,0.025,0.0,0.002,0.0,0.054,0.0,0.697,0.012,0.117,0.036


In [242]:
# display top words by topic
display_topics(nmf_model_users, tfidf_users.get_feature_names(), 5)


Topic  0
fix, shit, bug, glitch, going

Topic  1
app, store, open, download, message

Topic  2
help, pls, tried, trying, got

Topic  3
battery, life, drain, draining, hour

Topic  4
issue, fixed, software, people, wifi

Topic  5
problem, fixed, know, people, solution

Topic  6
yes, tried, restarted, using, version

Topic  7
work, tried, fine, wifi, button

Topic  8
new, got, old, buy, slow

Topic  9
time, day, tried, freeze, type

Topic  10
dm, sent, message, check, reply

Topic  11
updated, latest, version, amp, software

Topic  12
screen, lock, home, black, touch

Topic  13
like, look, know, got, day

Topic  14
letter, question, mark, type, box

Topic  15
working, tried, stop, fine, macbook

Topic  16
music, itunes, song, play, playing

Topic  17
plus, got, using, amp, version

Topic  18
need, know, fixed, really, support

Topic  19
apps, freezing, amp, freeze, download


In [37]:
doc_topic_nmf_users = pd.DataFrame(doc_topic_users.round(5),
                             index = df_users.text_clean)
doc_topic_nmf_users

Unnamed: 0_level_0,component_1,component_2,component_3,component_4,component_5
text_clean,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
applesupport newest update i️ made sure download yesterday,0.00136,0.00000,0.00010,0.00178,0.00464
applesupport http co,0.00000,0.00000,0.00000,0.00000,0.00000
applesupport i️ changing showing correctly social medium platform http co gyrvpyvnke,0.00000,0.00000,0.00000,0.00000,0.00000
applesupport tried resetting setting restarting phone,0.00135,0.00000,0.00068,0.00344,0.01114
applesupport look like http co,0.00044,0.00000,0.00056,0.00000,0.01911
...,...,...,...,...,...
applesupport update slack everything seems working well thanks follow issue stuff tomorrow,0.00048,0.00660,0.09234,0.00098,0.02833
hey applesupport able duplicate file page searched really annoying fix http co cqngyqnslz,0.00175,0.05617,0.00110,0.00240,0.00690
yo applesupport weird glitch w capital i️ attempt make tweet le i️,0.00101,0.00794,0.00058,0.00053,0.00644
fuck applesupport phone keep hanging call showing call failure,0.00000,0.00000,0.00000,0.00000,0.00000


In [38]:
# let's look at the cosine similarity between documents 1 and 3 by each component
cosine_similarity((doc_topic_nmf_users.values[0], doc_topic_nmf_users.values[2]))

array([[1., 0.],
       [0., 0.]])

In [None]:
# pairwise_distances(doc_topic, metric='cosine')[0].argsort()

In [63]:
# head of dataframe
df.head()

Unnamed: 0,tweet_id,author_id,inbound,created_at,text,response_tweet_id,in_response_to_tweet_id,date_only,text_clean
396,696,AppleSupport,False,2017-10-31 22:27:49,@115854 We're here for you. Which version of t...,697.0,698.0,2017-10-31,version io running check setting gt general gt
397,697,115854,True,2017-10-31 22:31:23,@AppleSupport The newest update. I️ made sure ...,699.0,696.0,2017-10-31,applesupport newest update i️ made sure downlo...
398,699,AppleSupport,False,2017-10-31 22:36:27,@115854 Lets take a closer look into this issu...,,697.0,2017-10-31,let take closer look issue select following li...
399,698,115854,True,2017-10-31 22:17:40,@AppleSupport https://t.co/NV0yucs0lB,696.0,700.0,2017-10-31,applesupport http co
400,700,115854,True,2017-10-31 22:16:56,@AppleSupport why are my I️’s changing not sho...,698.0,,2017-10-31,applesupport i️ changing showing correctly soc...


## Create a dataframe where we combine user tweets from a conversation into one document

In [35]:
# read pickle
df = pd.read_pickle('tweet_clean.pkl')

In [36]:
# shape of dataframe
df.shape

(204756, 9)

In [38]:
# head of dataframe
df.head(3)

Unnamed: 0,tweet_id,author_id,inbound,created_at,text,response_tweet_id,in_response_to_tweet_id,date_only,text_clean
396,696,AppleSupport,False,2017-10-31 22:27:49,@115854 We're here for you. Which version of t...,697.0,698,2017-10-31,version io running check setting gt general gt
397,697,115854,True,2017-10-31 22:31:23,@AppleSupport The newest update. I️ made sure ...,699.0,696,2017-10-31,applesupport newest update i️ made sure downlo...
398,699,AppleSupport,False,2017-10-31 22:36:27,@115854 Lets take a closer look into this issu...,,697,2017-10-31,let take closer look issue select following li...


In [39]:
# how many tweet conversations started by user
df[df.in_response_to_tweet_id == ''].shape

(51658, 9)

In [40]:
# how many unique authors initialized tweet conversation with apple support
len(df[df.in_response_to_tweet_id == ''].author_id.unique())

46959

In [44]:
# copy dataframe as a new one
df_convo_user = df.copy()

In [45]:
# tweets from users only
df_convo_user = df_convo_user[df_convo_user.inbound == 'True']

In [46]:
# aggregate text by user
df_orig_text = df_convo_user.groupby(['author_id'])['text'].apply(' '.join).reset_index()

# get min and max date per user
df_min_max_date = df_convo_user.groupby('author_id').agg({'date_only':['min', 'max']}).reset_index()

# add column names to min and max date
df_min_max_date.columns = ['_'.join(col).strip() for col in df_min_max_date.columns.values]

In [47]:
# aggregate text clean by user
df_convo_user = df_convo_user.groupby(['author_id'])['text_clean'].apply(' '.join).reset_index()

In [48]:
# merge df convo user with aggregated text
df_convo_user = pd.merge(df_convo_user, df_orig_text[['author_id', 'text']], how='left', on='author_id')

In [49]:
# merge df convo user with min and max date
df_convo_user = pd.merge(df_convo_user, df_min_max_date[['author_id_', 'date_only_min', 'date_only_max']], how='left', left_on='author_id', right_on='author_id_')
df_convo_user = df_convo_user.drop(['author_id_'], axis=1)
df_convo_user = df_convo_user[['author_id', 'text', 'text_clean', 'date_only_min', 'date_only_max']]
df_convo_user.head()

Unnamed: 0,author_id,text,text_clean,date_only_min,date_only_max
0,408,"@AppleSupport Uh, weirdness after watchOS 4.1 ...",applesupport uh weirdness watchos update http co,2017-11-02,2017-11-02
1,1437,I'm not sure what the F is happening with iOS ...,sure f happening io trying let battery run lie...,2017-11-17,2017-11-17
2,1501,@AppleSupport all good now thanks. i’ve been a...,applesupport good thanks able send least half ...,2017-11-03,2017-11-03
3,2084,@AppleSupport Just updated iTunes and it said ...,applesupport updated itunes said longer subscr...,2017-10-31,2017-10-31
4,3922,@AppleSupport Pretty sure it started with iOS ...,applesupport pretty sure started io seems happ...,2017-10-07,2017-10-08


In [51]:
# check combined tweets for a user
df_convo_user[df_convo_user.author_id == 115854].text_clean.values

array(['applesupport newest update i️ made sure download yesterday applesupport http co applesupport i️ changing showing correctly social medium platform http co gyrvpyvnke'],
      dtype=object)

In [50]:
# pickle df convo user
df_convo_user.to_pickle('df_convo_user.pkl')