In [1]:
# Base
import pandas as pd
import numpy as np
import glob, os, re, string
from datetime import datetime

# Graphs
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from wordcloud import WordCloud

# Language Detection
from polyglot.detect import Detector

# NLTK
from nltk.tokenize import sent_tokenize, word_tokenize, RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.sentiment.vader import SentimentIntensityAnalyzer



### Twitter

In [2]:
BASE_DIR = 'data/ca_cleaned/gojek/'

files = [file for file in glob.glob(BASE_DIR + 'twitter_*.csv')]
files

['data/ca_cleaned/gojek/twitter_gojek.csv',
 'data/ca_cleaned/gojek/twitter_go-jek.csv']

In [3]:
df_tw_all = [pd.read_csv(f) for f in files]

df_tw = pd.concat(df_tw_all)
df_tw['date'] = pd.to_datetime(df_tw['date'], errors='coerce')
df_tw['date'] = df_tw['date'].dt.date
df_tw['comment'] = df_tw['tweet']
df_tw = df_tw[['date','comment']]
df_tw.head(10)

Unnamed: 0,date,comment
0,2018-11-04,I wonder why Tokopedia's lucky egg n gojek's g...
1,2018-11-04,What's your favorite method of transportation?...
2,2018-11-04,"Long before witty gojek, dirjen pajak, or netf..."
3,2018-11-04,Are you angry?
4,2018-11-04,Are you angry? https://twitter.com/persebayaup...
5,2018-11-04,What's your favorite method of transportation?...
6,2018-11-04,3 or 4 days left before uninstalling Gojek. Ju...
7,2018-11-04,"Just spied my Gojek rating, after months of cu..."
8,2018-11-04,What's your favorite method of transportation?...
9,2018-11-04,I added a video to a @YouTube playlist http:/...


In [4]:
df_tw.shape

(21067, 2)

### Reddit

In [5]:
df_re_com = pd.read_csv('data/reddit_comments.csv')
df_re_com['created_utc'] = pd.to_datetime(df_re_com['created_utc'], unit='s')
df_re_com['ts'] = df_re_com['created_utc'].dt.tz_localize('UTC').dt.tz_convert('Asia/Singapore')
df_re_com['date'] = df_re_com['ts'].dt.date
df_re_com.rename(columns={'body':'comment'},inplace=True)
df_re_com.sort_values('ts')
df_re_com

Unnamed: 0,submission_id,id,created_utc,comment,score,author_name,parent,ts,date
0,841u7q,dvm8qmt,2018-03-13 06:33:42,The auntie in me thank you a thousand times ov...,52,dancinginthesunlight,841u7q,2018-03-13 14:33:42+08:00,2018-03-13
1,841u7q,dvmf1t6,2018-03-13 10:46:37,The hero we need,7,chickennutbreadd,841u7q,2018-03-13 18:46:37+08:00,2018-03-13
2,841u7q,dvm8vqf,2018-03-13 06:38:30,Thank you!!,4,JaceTan,841u7q,2018-03-13 14:38:30+08:00,2018-03-13
3,841u7q,dvmaki1,2018-03-13 07:40:51,Damnnnnnn. Thanks mate!,2,barbaraimout,841u7q,2018-03-13 15:40:51+08:00,2018-03-13
4,841u7q,dvmbe2a,2018-03-13 08:13:42,most liked thread.\r\nWe all love Freebies,2,foodieandthebeast,841u7q,2018-03-13 16:13:42+08:00,2018-03-13
5,841u7q,dvmw2y8,2018-03-13 16:26:46,"Cool, you're a deals aggregator? Is somebody m...",2,haemuljeon,841u7q,2018-03-14 00:26:46+08:00,2018-03-14
6,841u7q,dvmct05,2018-03-13 09:13:48,Damn this is so cool!,1,DaniloIce,841u7q,2018-03-13 17:13:48+08:00,2018-03-13
7,841u7q,dvmfjih,2018-03-13 11:04:40,Thanks a lot,1,_whatcanbe,841u7q,2018-03-13 19:04:40+08:00,2018-03-13
8,841u7q,dvmwetd,2018-03-13 16:31:31,Mobike pass is still $5 on the website?,1,madsonic,841u7q,2018-03-14 00:31:31+08:00,2018-03-14
9,841u7q,dvn2jyl,2018-03-13 18:01:41,Free pads. Girlfriend confirm happy one.,1,El3ctr1c4l,841u7q,2018-03-14 02:01:41+08:00,2018-03-14


In [6]:
df_re_topics = pd.read_csv('data/ca_cleaned/gojek/reddit_gojek.csv')
df_re_topics['created'] = pd.to_datetime(df_re_topics['created'], unit='s')
df_re_topics['date'] = df_re_topics['created'].dt.date
df_re_topics['body'] = df_re_topics['body'].replace(np.nan, '')
df_re_topics['comment'] = df_re_topics['title'] + ' ' + df_re_topics['body']
df_re_topics.head(10)

Unnamed: 0,title,score,id,url,comms_num,created,body,timestamp,date,comment
0,why delivery took so long,890,9u3b7o,https://i.redd.it/sac0qww1ebw11.jpg,32,2018-11-04 21:27:35,,2018-11-05 05:27:35,2018-11-04,why delivery took so long
1,"In aggressive expansion, Go-Jek to enter Singa...",96,8lpg4r,https://www.reddit.com/r/singapore/comments/8l...,34,2018-05-24 11:07:07,*Indonesian ride\-hailing giant Go\-Jek is set...,2018-05-24 19:07:07,2018-05-24,"In aggressive expansion, Go-Jek to enter Singa..."
2,Grab unveils open platform strategy to build S...,8,8xr04q,https://www.grab.com/sg/press/business/grab-un...,12,2018-07-11 00:48:04,,2018-07-11 08:48:04,2018-07-11,Grab unveils open platform strategy to build S...
3,Last 12 hours with Uber,76,8hkvz9,https://i.redd.it/48m8gxyy1dw01.jpg,16,2018-05-07 12:24:30,,2018-05-07 20:24:30,2018-05-07,Last 12 hours with Uber
4,Go-Jek to expand to 4 Southeast Asian countrie...,40,87s2hh,https://www.reddit.com/r/singapore/comments/87...,27,2018-03-28 21:54:38,Having used their services several times befor...,2018-03-29 05:54:38,2018-03-28,Go-Jek to expand to 4 Southeast Asian countrie...


In [7]:
df_re_com_gojek = df_re_com[df_re_com['submission_id'].isin(df_re_topics.id.tolist())]
df_re_com_gojek = df_re_com_gojek[['date','comment']]
df_re_com_gojek.head(10)

Unnamed: 0,date,comment
2810,2018-03-28,Awesome. Let’s hope they come to Singapore soo...
2811,2018-03-28,Gojek is an unicorn start up and work just nic...
2812,2018-03-29,[deleted]
2813,2018-03-28,Given the article said 1 country in the next f...
2814,2018-03-29,excellent news if they come here. We really ne...
2815,2018-03-29,"Gojek is phenomenal! From gofood , gosend , go..."
2816,2018-03-29,Here Comes A New Challenger!
2817,2018-03-29,Am i missing something here.... has gojek expa...
2818,2018-03-29,Go-food!!!
2819,2018-03-29,So interesting\r\n\r\nSBS vs SMRT \r\n\r\nGrab...


In [8]:
df_re_topics_gojek = df_re_topics[['date','comment']]
df_re_topics_gojek.head(10)

Unnamed: 0,date,comment
0,2018-11-04,why delivery took so long
1,2018-05-24,"In aggressive expansion, Go-Jek to enter Singa..."
2,2018-07-11,Grab unveils open platform strategy to build S...
3,2018-05-07,Last 12 hours with Uber
4,2018-03-28,Go-Jek to expand to 4 Southeast Asian countrie...


In [9]:
df_reddit = pd.concat([df_re_topics_gojek, df_re_com_gojek])
df_reddit.head(10)

Unnamed: 0,date,comment
0,2018-11-04,why delivery took so long
1,2018-05-24,"In aggressive expansion, Go-Jek to enter Singa..."
2,2018-07-11,Grab unveils open platform strategy to build S...
3,2018-05-07,Last 12 hours with Uber
4,2018-03-28,Go-Jek to expand to 4 Southeast Asian countrie...
2810,2018-03-28,Awesome. Let’s hope they come to Singapore soo...
2811,2018-03-28,Gojek is an unicorn start up and work just nic...
2812,2018-03-29,[deleted]
2813,2018-03-28,Given the article said 1 country in the next f...
2814,2018-03-29,excellent news if they come here. We really ne...


In [10]:
df_reddit.shape

(135, 2)

### All Together

In [11]:
df_tw['source'] = 'twitter'
df_reddit['source'] = 'reddit'

In [12]:
df_merge = pd.concat([df_tw,df_reddit])
df_merge = df_merge.sort_values('date',ascending=False)
df_merge

Unnamed: 0,date,comment,source
3048,2018-11-05,They'll probably only launch their food delive...,reddit
3041,2018-11-05,"actually not bad leh, they usually quote a 45m...",reddit
3053,2018-11-05,People are lazy lol. When I worked pt for mcd ...,reddit
3054,2018-11-05,ya sia! mktg oversight!,reddit
3055,2018-11-05,"Yes, they've announced their intention to ente...",reddit
3047,2018-11-05,cfm wait long long,reddit
3046,2018-11-05,gojek coming to sg?,reddit
3045,2018-11-05,they are trying their best alr. \r\n\r\nif the...,reddit
3056,2018-11-05,They already have an office ins SG actually :),reddit
3043,2018-11-05,Amos Yee,reddit


In [13]:
df_merge.shape

(21202, 3)

In [14]:
df_merge.to_csv('data/ca_cleaned/gojek_consolidated.csv')