In [1]:
# Base
import pandas as pd
import numpy as np
import glob, os, re, string
from datetime import datetime

# Graphs
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from wordcloud import WordCloud

# Language Detection
from polyglot.detect import Detector

# NLTK
from nltk.tokenize import sent_tokenize, word_tokenize, RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from scipy import stats



In [None]:
df_fb_1 = [pd.read_csv()]

In [None]:
FDIR = 'data/'

files = [file for file in glob.glob(FDIR + '*.csv')]
files

In [None]:
prefixes = ['reddit']

files_prefixed = [[f for f in files if prefix in f] for prefix in prefixes]
files_prefixed

In [None]:
def add_lang(df, col_name):
    detect_lang = lambda comment: Detector(str(comment), quiet=True).languages[0]

    df['lang_code'] = df.apply(lambda row: detect_lang(row[col_name]).code, axis=1)
    df['lang_sig'] = df.apply(lambda row: detect_lang(row[col_name]).confidence, axis=1)
    
    return df

In [None]:
df_titles = pd.read_csv(files[0])
df_titles['created'] = pd.to_datetime(df_titles['created'],unit='s')
df_titles.head()

In [None]:
df_com = pd.read_csv(files[1])
df_com.head(10)

In [None]:
df_com = add_lang(df_com, 'body')
df_com.head(10)

In [None]:
len(df_com[df_com['lang_code'] == 'en'].index) / len(df_com.index)

In [None]:
df_com[df_com['lang_code'] != 'en']

In [None]:
new_dir = 'data/clean'

if not os.path.exists(new_dir):
    os.makedirs(new_dir)

In [None]:
files[0][5:]

In [None]:
df_com[df_com['lang_code'] == 'en'].to_csv(new_dir + '/' + files[0][5:])

In [None]:
DDIR = 'data/ca_cleaned/grab/'

files = [file for file in glob.glob(DDIR + 'reddit_*.csv')]
files

In [None]:
dfs = []

for f in files:
    df = pd.read_csv(f)
    df['source'] = f.split('/')[3][:-4]
    dfs.append(df)

df_reddit = pd.concat(dfs)
df_reddit.to_csv(DDIR + 'reddit_grab_combined.csv')

In [2]:
DDIR = 'data/ca_cleaned/grab/'

files = [file for file in glob.glob(DDIR + 'fb_*.csv')]
files

['data/ca_cleaned/grab/fb_groups_grab_en.csv',
 'data/ca_cleaned/grab/fb_grab_hitchsg.csv']

In [3]:
dfsb = [pd.read_csv(f) for f in files]

df_fb_com = pd.concat(dfsb)
df_fb_com = df_fb_com[['date','comment']]
df_fb_com.head(10)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,date,comment
0,2018-10-30 21:23:00,He sounds a little dodgy.
1,2018-10-30 21:23:00,Is dodgy. Report and see what ryde says. I can...
2,2018-10-30 21:23:00,Why can't ur bf use his acc and book himself?
3,2018-10-30 21:23:00,sorry but i dun find anything wrong with his m...
4,2018-10-30 21:23:00,"To expedite your request., pls indicate in not..."
5,2018-10-30 21:23:00,"Maybe army regular, talk as though he is in ar..."
6,2018-10-30 21:23:00,I think he is trying to be funny with you.
7,2018-10-30 21:23:00,Born cheap and too much sex
8,2018-10-30 21:23:00,Your mother must be a whore.
9,2018-10-30 21:23:00,0330am after booking u slept? U must be really...


In [4]:
df_fb_com.shape

(34891, 2)

In [5]:
df_fb_com.to_csv('data/ca_cleaned/grab/fb_grab_combined.csv')

In [None]:
dfst = []

for f in files:
    df = pd.read_csv(f)
    df['source'] = f.split('/')[3][:-4]
    dfst.append(df)
    
df_twitter = pd.concat(dfst)
df_twitter.to_csv(DDIR + 'twitter_grab_combined.csv')

### NEW

In [2]:
BASE_DIR = 'data/ca_cleaned/grab/'

prefixes = ['fb', 'gplay', 'reddit', 'twitter']
combined_text = 'combined'

In [3]:
def proc_df(prefix_index):
    filename = BASE_DIR + prefixes[prefix_index] + '_grab_' + combined_text + '.csv'
    return pd.read_csv(filename)

In [4]:
# Process Facebook

df_fb = proc_df(0)
df_fb['date'] = pd.to_datetime(df_fb['date'], errors='coerce')
df_fb['date'] = df_fb['date'].dt.date
df_fb = df_fb[['date','comment']]
df_fb.head(10)

Unnamed: 0,date,comment
0,2018-10-30,He sounds a little dodgy.
1,2018-10-30,Is dodgy. Report and see what ryde says. I can...
2,2018-10-30,Why can't ur bf use his acc and book himself?
3,2018-10-30,sorry but i dun find anything wrong with his m...
4,2018-10-30,"To expedite your request., pls indicate in not..."
5,2018-10-30,"Maybe army regular, talk as though he is in ar..."
6,2018-10-30,I think he is trying to be funny with you.
7,2018-10-30,Born cheap and too much sex
8,2018-10-30,Your mother must be a whore.
9,2018-10-30,0330am after booking u slept? U must be really...


In [5]:
# Process Google Play

df_gp = proc_df(1)
df_gp['date'] = pd.to_datetime(df_gp['date'], errors='coerce')
df_gp['date'] = df_gp['date'].dt.date
df_gp = df_gp[['date','comment']]
df_gp.head(10)

Unnamed: 0,date,comment
0,2018-11-02,Very usefull n advantages
1,2018-11-02,"Registry / login with a number, I am using my ..."
2,2018-11-02,I think it is a excellent app 😀😀.
3,2018-11-02,Good application
4,2018-11-02,You don't imagine how irritated when you alrea...
5,2018-11-02,Slow connection for location
6,2018-11-02,Your old version was much more customerfriendl...
7,2018-11-02,Good app
8,2018-11-02,My favorite online transportation
9,2018-11-02,No drivers whenever suck chair


In [6]:
# Process Twitter

df_tw = proc_df(3)
df_tw['date'] = pd.to_datetime(df_tw['date'], errors='coerce')
df_tw['date'] = df_tw['date'].dt.date
df_tw['comment'] = df_tw['tweet']
df_tw = df_tw[['date','comment']]
df_tw.head(10)

  if (yield from self.run_code(code, result)):


Unnamed: 0,date,comment
0,2018-11-01,Who plays mahjong but still takes grabshare
1,2018-11-01,First time to sa grabshare lintek
2,2018-11-01,why did the grabshare algorithm accept another...
3,2018-11-01,Yes!!! Uber/Lyft/Ola/Grab/Didi = taxis Uberpoo...
4,2018-10-31,Sana pwede rin i-rate yung mga kasabay sa Grab...
5,2018-10-31,"Boo, witches and wizards! Heres a little trea..."
6,2018-10-31,Does grabshare only take passenger with simila...
7,2018-10-31,accidentally booked a GrabShare instead of a G...
8,2018-10-31,i took a cab at 6am para sana makarating ng of...
9,2018-10-31,"Boo, witches and wizards! Heres a little trea..."


### Reddit

In [7]:
df_re_com = pd.read_csv('data/reddit_comments.csv')
df_re_com['created_utc'] = pd.to_datetime(df_re_com['created_utc'], unit='s')
df_re_com['ts'] = df_re_com['created_utc'].dt.tz_localize('UTC').dt.tz_convert('Asia/Singapore')
df_re_com['date'] = df_re_com['ts'].dt.date
df_re_com.rename(columns={'body':'comment'},inplace=True)
df_re_com.sort_values('ts')
df_re_com

Unnamed: 0,submission_id,id,created_utc,comment,score,author_name,parent,ts,date
0,841u7q,dvm8qmt,2018-03-13 06:33:42,The auntie in me thank you a thousand times ov...,52,dancinginthesunlight,841u7q,2018-03-13 14:33:42+08:00,2018-03-13
1,841u7q,dvmf1t6,2018-03-13 10:46:37,The hero we need,7,chickennutbreadd,841u7q,2018-03-13 18:46:37+08:00,2018-03-13
2,841u7q,dvm8vqf,2018-03-13 06:38:30,Thank you!!,4,JaceTan,841u7q,2018-03-13 14:38:30+08:00,2018-03-13
3,841u7q,dvmaki1,2018-03-13 07:40:51,Damnnnnnn. Thanks mate!,2,barbaraimout,841u7q,2018-03-13 15:40:51+08:00,2018-03-13
4,841u7q,dvmbe2a,2018-03-13 08:13:42,most liked thread.\r\nWe all love Freebies,2,foodieandthebeast,841u7q,2018-03-13 16:13:42+08:00,2018-03-13
5,841u7q,dvmw2y8,2018-03-13 16:26:46,"Cool, you're a deals aggregator? Is somebody m...",2,haemuljeon,841u7q,2018-03-14 00:26:46+08:00,2018-03-14
6,841u7q,dvmct05,2018-03-13 09:13:48,Damn this is so cool!,1,DaniloIce,841u7q,2018-03-13 17:13:48+08:00,2018-03-13
7,841u7q,dvmfjih,2018-03-13 11:04:40,Thanks a lot,1,_whatcanbe,841u7q,2018-03-13 19:04:40+08:00,2018-03-13
8,841u7q,dvmwetd,2018-03-13 16:31:31,Mobike pass is still $5 on the website?,1,madsonic,841u7q,2018-03-14 00:31:31+08:00,2018-03-14
9,841u7q,dvn2jyl,2018-03-13 18:01:41,Free pads. Girlfriend confirm happy one.,1,El3ctr1c4l,841u7q,2018-03-14 02:01:41+08:00,2018-03-14


In [8]:
df_re_topics = proc_df(2)
df_re_topics['created'] = pd.to_datetime(df_re_topics['created'], unit='s')
df_re_topics['date'] = df_re_topics['created'].dt.date
df_re_topics['body'] = df_re_topics['body'].replace(np.nan, '')
df_re_topics['comment'] = df_re_topics['title'] + ' ' + df_re_topics['body']
df_re_topics.head(10)

Unnamed: 0.1,Unnamed: 0,title,score,id,url,comms_num,created,body,timestamp,source,date,comment
0,0,Is it worth to be a Grab driver?,16,8ogbqp,https://www.reddit.com/r/singapore/comments/8o...,35,2018-06-04 18:50:42,Gonna ORD this August and will be enrolling in...,2018-06-05 02:50:42,reddit_grab_driver,2018-06-04,Is it worth to be a Grab driver? Gonna ORD thi...
1,1,"Uber, Grab drivers in Singapore now need to be...",42,5skz0u,http://www.channelnewsasia.com/news/singapore/...,21,2017-02-07 19:52:07,,2017-02-08 03:52:07,reddit_grab_driver,2017-02-07,"Uber, Grab drivers in Singapore now need to be..."
2,2,Grab driver AMA,55,8pr1yy,https://www.reddit.com/r/singapore/comments/8p...,47,2018-06-09 14:49:17,"Hey Reddiporeans,\r\n\r\nThere has been a lot ...",2018-06-09 22:49:17,reddit_grab_driver,2018-06-09,"Grab driver AMA Hey Reddiporeans,\r\n\r\nThere..."
3,3,"Dear Uber/Grab Drivers, Do you think it can be...",24,6vbaep,https://www.reddit.com/r/singapore/comments/6v...,20,2017-08-22 22:15:13,Had a random showerthought about Grab/Uber as ...,2017-08-23 06:15:13,reddit_grab_driver,2017-08-22,"Dear Uber/Grab Drivers, Do you think it can be..."
4,4,New feature for Grab drivers,27,8gqbaz,https://www.reddit.com/r/singapore/comments/8g...,11,2018-05-03 21:20:59,"A new Grab feature was updated for drivers, wh...",2018-05-04 05:20:59,reddit_grab_driver,2018-05-03,New feature for Grab drivers A new Grab featur...
5,5,Grab drivers complain about taxi driver allege...,64,4u213v,http://mothership.sg/2016/07/one-taxi-driver-i...,28,2016-07-22 17:51:39,,2016-07-23 01:51:39,reddit_grab_driver,2016-07-22,Grab drivers complain about taxi driver allege...
6,6,TMI grab driver,229,9o3j1w,https://i.redd.it/t969xogl56s11.jpg,32,2018-10-14 23:24:32,,2018-10-15 07:24:32,reddit_grab_driver,2018-10-14,TMI grab driver
7,7,Unhappy Grab drivers hope for Go-Jek lifeline,16,9sjvyj,https://www.straitstimes.com/singapore/transpo...,19,2018-10-30 10:30:00,,2018-10-30 18:30:00,reddit_grab_driver,2018-10-30,Unhappy Grab drivers hope for Go-Jek lifeline
8,8,PASSENGER: Why you chee hong me? GRAB DRIVER: ...,422,97o0aq,https://i.redd.it/n1afdju8ycg11.jpg,36,2018-08-16 09:26:13,,2018-08-16 17:26:13,reddit_grab_driver,2018-08-16,PASSENGER: Why you chee hong me? GRAB DRIVER: ...
9,9,Grab driver claims he earned $912 in 1 day by ...,32,97gte0,https://stomp.straitstimes.com/singapore-seen/...,47,2018-08-15 17:16:29,,2018-08-16 01:16:29,reddit_grab_driver,2018-08-15,Grab driver claims he earned $912 in 1 day by ...


In [9]:
df_re_com_grab = df_re_com[df_re_com['submission_id'].isin(df_re_topics.id.tolist())]
df_re_com_grab = df_re_com_grab[['date','comment']]
df_re_com_grab.head(10)

Unnamed: 0,date,comment
0,2018-03-13,The auntie in me thank you a thousand times ov...
1,2018-03-13,The hero we need
2,2018-03-13,Thank you!!
3,2018-03-13,Damnnnnnn. Thanks mate!
4,2018-03-13,most liked thread.\r\nWe all love Freebies
5,2018-03-14,"Cool, you're a deals aggregator? Is somebody m..."
6,2018-03-13,Damn this is so cool!
7,2018-03-13,Thanks a lot
8,2018-03-14,Mobike pass is still $5 on the website?
9,2018-03-14,Free pads. Girlfriend confirm happy one.


In [10]:
df_re_topics_grab = df_re_topics[['date','comment']]
df_re_topics_grab.head(10)

Unnamed: 0,date,comment
0,2018-06-04,Is it worth to be a Grab driver? Gonna ORD thi...
1,2017-02-07,"Uber, Grab drivers in Singapore now need to be..."
2,2018-06-09,"Grab driver AMA Hey Reddiporeans,\r\n\r\nThere..."
3,2017-08-22,"Dear Uber/Grab Drivers, Do you think it can be..."
4,2018-05-03,New feature for Grab drivers A new Grab featur...
5,2016-07-22,Grab drivers complain about taxi driver allege...
6,2018-10-14,TMI grab driver
7,2018-10-30,Unhappy Grab drivers hope for Go-Jek lifeline
8,2018-08-16,PASSENGER: Why you chee hong me? GRAB DRIVER: ...
9,2018-08-15,Grab driver claims he earned $912 in 1 day by ...


In [11]:
df_reddit = pd.concat([df_re_topics_grab, df_re_com_grab])
df_reddit.head(10)

Unnamed: 0,date,comment
0,2018-06-04,Is it worth to be a Grab driver? Gonna ORD thi...
1,2017-02-07,"Uber, Grab drivers in Singapore now need to be..."
2,2018-06-09,"Grab driver AMA Hey Reddiporeans,\r\n\r\nThere..."
3,2017-08-22,"Dear Uber/Grab Drivers, Do you think it can be..."
4,2018-05-03,New feature for Grab drivers A new Grab featur...
5,2016-07-22,Grab drivers complain about taxi driver allege...
6,2018-10-14,TMI grab driver
7,2018-10-30,Unhappy Grab drivers hope for Go-Jek lifeline
8,2018-08-16,PASSENGER: Why you chee hong me? GRAB DRIVER: ...
9,2018-08-15,Grab driver claims he earned $912 in 1 day by ...


In [12]:
df_fb['source'] = 'fb'
df_gp['source'] = 'gplay'
df_tw['source'] = 'twitter'
df_reddit['source'] = 'reddit'

In [13]:
df_merge = pd.concat([df_fb,df_gp,df_tw,df_reddit])
df_merge = df_merge.sort_values('date',ascending=False)
df_merge

Unnamed: 0,date,comment,source
10991,2018-11-17,there will be ego people around... just pray u...,fb
10999,2018-11-17,Blacklist this cheapskate,fb
10990,2018-11-17,"There still be ego,cheapo riders who think 5 c...",fb
11005,2018-11-17,No hitch driver will take 4pax in the rush hou...,fb
11004,2018-11-17,Take bus lah.,fb
11002,2018-11-17,It could be driver lo... Cuz no need to pay 10...,fb
11001,2018-11-17,Only sorhai will pick up. I believe this notic...,fb
11000,2018-11-17,Like that also can innovativev,fb
11003,2018-11-17,Omg....really no word to describe.....,fb
10998,2018-11-17,Knn micron staff all very rich yet so cheapo,fb


In [14]:
df_merge.to_csv('data/ca_cleaned/grab_consolidated.csv')