In [1]:
# Base
import pandas as pd
import numpy as np
import glob, os, re, string
from datetime import datetime

# Graphs
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from wordcloud import WordCloud

# Language Detection
from polyglot.detect import Detector

# NLTK
from nltk.tokenize import sent_tokenize, word_tokenize, RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from scipy import stats



In [2]:
FDIR = 'data/ca_cleaned/cdge/'

files = [file for file in glob.glob(FDIR + '*.csv')]
files

['data/ca_cleaned/cdge/twitter_comfortdelgro.csv',
 'data/ca_cleaned/cdge/reddit_comfortdelgro.csv',
 'data/ca_cleaned/cdge/twitter_cdgtaxi.csv',
 'data/ca_cleaned/cdge/gplay_cdge.csv',
 'data/ca_cleaned/cdge/reddit_cdgtaxi.csv']

In [3]:
prefixes = ['gplay','reddit','twitter']

files_prefixed = [[f for f in files if prefix in f] for prefix in prefixes]
files_prefixed

[['data/ca_cleaned/cdge/gplay_cdge.csv'],
 ['data/ca_cleaned/cdge/reddit_comfortdelgro.csv',
  'data/ca_cleaned/cdge/reddit_cdgtaxi.csv'],
 ['data/ca_cleaned/cdge/twitter_comfortdelgro.csv',
  'data/ca_cleaned/cdge/twitter_cdgtaxi.csv']]

## Google Play

In [4]:
df_gp = pd.read_csv(files_prefixed[0][0])
df_gp['date'] = pd.to_datetime(df_gp['date'], errors='coerce')
df_gp['date'] = df_gp['date'].dt.date
df_gp = df_gp[['date','comment']]
df_gp.head(10)

Unnamed: 0,date,comment
0,2018-11-03,Horrible experience. waited 8mins for the cab ...
1,2018-11-02,Can be better. Please benchmark to Grab to get...
2,2018-11-01,Not so good
3,2018-10-31,Good one
4,2018-10-29,Unable to use app. SMS OTP not delivered (Mala...
5,2018-10-29,Easy to use
6,2018-10-26,Good app.
7,2018-10-24,It s not easy to use
8,2018-10-23,This app not working with Samsung S7. I try to...
9,2018-10-22,Great app to rely on when grab is having ridic...


### Reddit

In [5]:
df_re_com = pd.read_csv('data/reddit_comments.csv')
df_re_com['created_utc'] = pd.to_datetime(df_re_com['created_utc'], unit='s')
df_re_com['ts'] = df_re_com['created_utc'].dt.tz_localize('UTC').dt.tz_convert('Asia/Singapore')
df_re_com['date'] = df_re_com['ts'].dt.date
df_re_com = df_re_com[df_re_com['body'].str.contains('[deleted]')]
df_re_com.rename(columns={'body':'comment'},inplace=True)
df_re_com.sort_values('ts')
df_re_com

Unnamed: 0,submission_id,id,created_utc,comment,score,author_name,parent,ts,date
0,841u7q,dvm8qmt,2018-03-13 06:33:42,The auntie in me thank you a thousand times ov...,52,dancinginthesunlight,841u7q,2018-03-13 14:33:42+08:00,2018-03-13
1,841u7q,dvmf1t6,2018-03-13 10:46:37,The hero we need,7,chickennutbreadd,841u7q,2018-03-13 18:46:37+08:00,2018-03-13
3,841u7q,dvmaki1,2018-03-13 07:40:51,Damnnnnnn. Thanks mate!,2,barbaraimout,841u7q,2018-03-13 15:40:51+08:00,2018-03-13
4,841u7q,dvmbe2a,2018-03-13 08:13:42,most liked thread.\r\nWe all love Freebies,2,foodieandthebeast,841u7q,2018-03-13 16:13:42+08:00,2018-03-13
5,841u7q,dvmw2y8,2018-03-13 16:26:46,"Cool, you're a deals aggregator? Is somebody m...",2,haemuljeon,841u7q,2018-03-14 00:26:46+08:00,2018-03-14
6,841u7q,dvmct05,2018-03-13 09:13:48,Damn this is so cool!,1,DaniloIce,841u7q,2018-03-13 17:13:48+08:00,2018-03-13
7,841u7q,dvmfjih,2018-03-13 11:04:40,Thanks a lot,1,_whatcanbe,841u7q,2018-03-13 19:04:40+08:00,2018-03-13
8,841u7q,dvmwetd,2018-03-13 16:31:31,Mobike pass is still $5 on the website?,1,madsonic,841u7q,2018-03-14 00:31:31+08:00,2018-03-14
9,841u7q,dvn2jyl,2018-03-13 18:01:41,Free pads. Girlfriend confirm happy one.,1,El3ctr1c4l,841u7q,2018-03-14 02:01:41+08:00,2018-03-14
10,841u7q,dvqb04z,2018-03-15 07:44:20,thank you op!!!!,1,waffle5ky,841u7q,2018-03-15 15:44:20+08:00,2018-03-15


In [6]:
dfr = [pd.read_csv(f) for f in files_prefixed[1]]
df_re_topics = pd.concat(dfr)
df_re_topics['created'] = pd.to_datetime(df_re_topics['created'], unit='s')
df_re_topics['date'] = df_re_topics['created'].dt.date
df_re_topics['body'] = df_re_topics['body'].replace(np.nan, '')
df_re_topics['comment'] = df_re_topics['title'] + ' ' + df_re_topics['body']
df_re_topics.head(10)

Unnamed: 0,title,score,id,url,comms_num,created,body,timestamp,date,comment
0,Tomica now makes miniature versions of the Com...,187,8kqrc3,https://i.redd.it/wppfk8ju4yy01.jpg,27,2018-05-20 13:26:14,,2018-05-20 21:26:14,2018-05-20,Tomica now makes miniature versions of the Com...
1,ComfortDelGro sees biggest taxi booking rise s...,50,8p614q,https://www.reddit.com/r/singapore/comments/8p...,24,2018-06-07 08:53:37,In what seems to cement a turnaround for the t...,2018-06-07 16:53:37,2018-06-07,ComfortDelGro sees biggest taxi booking rise s...
2,ComfortDelGro drivers unhappy with new levy st...,7,9gx4ka,https://www.todayonline.com/singapore/comfortd...,3,2018-09-19 01:55:57,,2018-09-19 09:55:57,2018-09-19,ComfortDelGro drivers unhappy with new levy st...
3,Grab confirms acquisition of Uber in Southeast...,127,875q3l,https://www.channelnewsasia.com/news/business/...,120,2018-03-26 10:20:47,,2018-03-26 18:20:47,2018-03-26,Grab confirms acquisition of Uber in Southeast...
4,ComfortDelGro launches on-demand bus booking app,5,94z0ev,https://www.channelnewsasia.com/news/singapore...,4,2018-08-06 14:44:06,,2018-08-06 22:44:06,2018-08-06,ComfortDelGro launches on-demand bus booking app
5,[Premium]ComfortDelGro buying 500 more cabs am...,13,8jr0bx,https://www.straitstimes.com/singapore/transpo...,13,2018-05-16 09:23:42,,2018-05-16 17:23:42,2018-05-16,[Premium]ComfortDelGro buying 500 more cabs am...
6,ComfortDelGro witnessing 'uptick' in bookings ...,38,8oxwgl,https://www.channelnewsasia.com/news/singapore...,45,2018-06-06 12:57:17,,2018-06-06 20:57:17,2018-06-06,ComfortDelGro witnessing 'uptick' in bookings ...
7,"No, “Temasek Holdings” will not lose money fro...",52,87yoh5,https://mothership.sg/2018/03/did-temasek-hold...,12,2018-03-29 13:05:03,,2018-03-29 21:05:03,2018-03-29,"No, “Temasek Holdings” will not lose money fro..."
8,[Premium]Uber's retreat to focus elsewhere a s...,13,875ghd,http://www.straitstimes.com/singapore/transpor...,4,2018-03-26 09:36:33,,2018-03-26 17:36:33,2018-03-26,[Premium]Uber's retreat to focus elsewhere a s...
9,Reports of Uber-Grab regional deal 'speculation',8,7ybwf9,http://www.straitstimes.com/singapore/transpor...,7,2018-02-18 11:53:49,,2018-02-18 19:53:49,2018-02-18,Reports of Uber-Grab regional deal 'speculation'


In [7]:
df_re_com_spec = df_re_com[df_re_com['submission_id'].isin(df_re_topics.id.tolist())]
df_re_com_spec = df_re_com_spec[['date','comment']]
df_re_com_spec.head(10)

Unnamed: 0,date,comment
0,2018-03-13,The auntie in me thank you a thousand times ov...
1,2018-03-13,The hero we need
3,2018-03-13,Damnnnnnn. Thanks mate!
4,2018-03-13,most liked thread.\r\nWe all love Freebies
5,2018-03-14,"Cool, you're a deals aggregator? Is somebody m..."
6,2018-03-13,Damn this is so cool!
7,2018-03-13,Thanks a lot
8,2018-03-14,Mobike pass is still $5 on the website?
9,2018-03-14,Free pads. Girlfriend confirm happy one.
10,2018-03-15,thank you op!!!!


In [8]:
df_re_topics_spec = df_re_topics[['date','comment']]
df_re_topics_spec.head(10)

Unnamed: 0,date,comment
0,2018-05-20,Tomica now makes miniature versions of the Com...
1,2018-06-07,ComfortDelGro sees biggest taxi booking rise s...
2,2018-09-19,ComfortDelGro drivers unhappy with new levy st...
3,2018-03-26,Grab confirms acquisition of Uber in Southeast...
4,2018-08-06,ComfortDelGro launches on-demand bus booking app
5,2018-05-16,[Premium]ComfortDelGro buying 500 more cabs am...
6,2018-06-06,ComfortDelGro witnessing 'uptick' in bookings ...
7,2018-03-29,"No, “Temasek Holdings” will not lose money fro..."
8,2018-03-26,[Premium]Uber's retreat to focus elsewhere a s...
9,2018-02-18,Reports of Uber-Grab regional deal 'speculation'


In [9]:
df_reddit = pd.concat([df_re_topics_spec, df_re_com_spec])
df_reddit.head(10)

Unnamed: 0,date,comment
0,2018-05-20,Tomica now makes miniature versions of the Com...
1,2018-06-07,ComfortDelGro sees biggest taxi booking rise s...
2,2018-09-19,ComfortDelGro drivers unhappy with new levy st...
3,2018-03-26,Grab confirms acquisition of Uber in Southeast...
4,2018-08-06,ComfortDelGro launches on-demand bus booking app
5,2018-05-16,[Premium]ComfortDelGro buying 500 more cabs am...
6,2018-06-06,ComfortDelGro witnessing 'uptick' in bookings ...
7,2018-03-29,"No, “Temasek Holdings” will not lose money fro..."
8,2018-03-26,[Premium]Uber's retreat to focus elsewhere a s...
9,2018-02-18,Reports of Uber-Grab regional deal 'speculation'


## Twitter

In [10]:
dft = [pd.read_csv(f) for f in files_prefixed[2]]
dft_cons = pd.concat(dft)

df_tw = dft_cons.loc[:,('date','tweet')]
df_tw.rename(columns={'tweet':'comment'},inplace=True)
df_tw.head(10)

Unnamed: 0,date,comment
0,2018-11-01,ComfortDelGro former CEO Kua Hong Pak dies aft...
1,2018-11-01,ComfortDelGro ex-CEO Kua Hong Pakdies https:/...
2,2018-11-01,ComfortDelGro ex-CEO Kua Hong Pak dies http:...
3,2018-11-01,Ironic @TfL's Claire Mann is speaking re #BusS...
4,2018-10-31,Research Reports : 31st October 2018 ***Repor...
5,2018-10-30,ComfortDelGro (SGX:C52) - CGS-CIMB Research 2...
6,2018-10-28,I'm at ComfortDelGro Driving Centre (CDC) in S...
7,2018-10-27,I'm at ComfortDelGro Driving Centre (CDC) in S...
8,2018-10-27,Go-Jek is in talks with former Uber ally Comfo...
9,2018-10-24,I thought #VisionZero meant City Hall & TfL wo...


### NEW

In [11]:
df_gp['source'] = 'gplay'
df_tw['source'] = 'twitter'
df_reddit['source'] = 'reddit'

In [12]:
df_merge = pd.concat([df_gp,df_tw,df_reddit])
# df_merge = df_merge.sort_values('date',ascending=False)
df_merge

Unnamed: 0,date,comment,source
0,2018-11-03,Horrible experience. waited 8mins for the cab ...,gplay
1,2018-11-02,Can be better. Please benchmark to Grab to get...,gplay
2,2018-11-01,Not so good,gplay
3,2018-10-31,Good one,gplay
4,2018-10-29,Unable to use app. SMS OTP not delivered (Mala...,gplay
5,2018-10-29,Easy to use,gplay
6,2018-10-26,Good app.,gplay
7,2018-10-24,It s not easy to use,gplay
8,2018-10-23,This app not working with Samsung S7. I try to...,gplay
9,2018-10-22,Great app to rely on when grab is having ridic...,gplay


In [13]:
df_merge.to_csv('data/ca_cleaned/cdge_consolidated.csv')